]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
userspace: Define and use struct eth_addr.
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
c7952afb 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d 22#include <fcntl.h>
55bc98d6 23#include <arpa/inet.h>
8b61709d 24#include <inttypes.h>
32383c3b 25#include <linux/filter.h>
c1c9c9c4 26#include <linux/gen_stats.h>
bb7d0e22 27#include <linux/if_ether.h>
8b61709d
BP
28#include <linux/if_tun.h>
29#include <linux/types.h>
30#include <linux/ethtool.h>
63331829 31#include <linux/mii.h>
f8500004 32#include <linux/pkt_cls.h>
6f42c8ea 33#include <linux/pkt_sched.h>
e9e28be3 34#include <linux/rtnetlink.h>
8b61709d 35#include <linux/sockios.h>
8b61709d
BP
36#include <sys/types.h>
37#include <sys/ioctl.h>
38#include <sys/socket.h>
ac3e3aaa 39#include <sys/utsname.h>
55bc98d6 40#include <netpacket/packet.h>
8b61709d
BP
41#include <net/if.h>
42#include <net/if_arp.h>
55bc98d6 43#include <net/if_packet.h>
8b61709d
BP
44#include <net/route.h>
45#include <netinet/in.h>
e9e28be3 46#include <poll.h>
8b61709d
BP
47#include <stdlib.h>
48#include <string.h>
49#include <unistd.h>
e9e28be3
BP
50
51#include "coverage.h"
e14deea0 52#include "dp-packet.h"
93451a0a 53#include "dpif-netlink.h"
df1e5a3b 54#include "dpif-netdev.h"
8b61709d
BP
55#include "dynamic-string.h"
56#include "fatal-signal.h"
93b13be8
BP
57#include "hash.h"
58#include "hmap.h"
8b61709d 59#include "netdev-provider.h"
7fbef77a 60#include "netdev-vport.h"
45c8d3a1 61#include "netlink-notifier.h"
2fe27d5a 62#include "netlink-socket.h"
c060c4cf 63#include "netlink.h"
e9e28be3 64#include "ofpbuf.h"
8b61709d 65#include "openflow/openflow.h"
19c8e9c1 66#include "ovs-atomic.h"
8b61709d
BP
67#include "packets.h"
68#include "poll-loop.h"
7e9dcc0f 69#include "rtnetlink.h"
8b61709d 70#include "shash.h"
c060c4cf 71#include "socket-util.h"
19993ef3 72#include "sset.h"
1670c579 73#include "timer.h"
c060c4cf 74#include "unaligned.h"
e6211adc 75#include "openvswitch/vlog.h"
5136ce49 76
d98e6007 77VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea 78
d76f09ea
BP
79COVERAGE_DEFINE(netdev_set_policing);
80COVERAGE_DEFINE(netdev_arp_lookup);
81COVERAGE_DEFINE(netdev_get_ifindex);
82COVERAGE_DEFINE(netdev_get_hwaddr);
83COVERAGE_DEFINE(netdev_set_hwaddr);
ab985a77
BP
84COVERAGE_DEFINE(netdev_get_ethtool);
85COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 86
8b61709d
BP
87\f
/* ADVERTISED_Pause and ADVERTISED_Asym_Pause first appeared in Linux 2.6.14.
 * Provide them ourselves when building against older kernel headers. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause (1 << 14)
#endif
96
e47bd51a
JP
/* ETHTOOL_GFLAGS and ETHTOOL_SFLAGS first appeared in Linux 2.6.24.  Provide
 * them ourselves when building against older kernel headers. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
#endif
105
c1c9c9c4
BP
/* TC_RTAB_SIZE first appeared in Linux 2.6.25.  Provide it ourselves when
 * building against older kernel headers. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif
111
b73c8518
SH
/* History of struct tpacket_auxdata in the kernel headers:
 *
 *   - Linux 2.6.21 introduced the structure.
 *   - Linux 2.6.27 added the tp_vlan_tci member.
 *   - Linux 3.0 defined TP_STATUS_VLAN_VALID.
 *   - Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
 *     TP_STATUS_VLAN_TPID_VALID.
 *
 * Rather than probing for each variation, we always define our own
 * replacement structure (renamed through a macro) that carries every member
 * we want, and supply any constants the headers lack.
 */
#ifndef PACKET_AUXDATA
#define PACKET_AUXDATA 8
#endif
#ifndef TP_STATUS_VLAN_VALID
#define TP_STATUS_VLAN_VALID (1 << 4)
#endif
#ifndef TP_STATUS_VLAN_TPID_VALID
#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
#endif
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;
    uint16_t tp_vlan_tpid;
};
141
fa373af4
BP
/* IFLA_STATS64 and struct rtnl_link_stats64 first appeared in Linux 2.6.35.
 *
 * Detecting rtnl_link_stats64 at build time does not seem to work
 * consistently, e.g. on 2.6.32-431.29.2.el6.x86_64 (see report at
 * http://openvswitch.org/pipermail/dev/2014-October/047978.html).  Maybe
 * if_link.h is not self-contained on those kernels.  The simplest remedy is
 * to always define our own replacement, renamed through a macro. */
#ifndef IFLA_STATS64
#define IFLA_STATS64 23
#endif
#define rtnl_link_stats64 rpl_rtnl_link_stats64
struct rtnl_link_stats64 {
    /* General counters. */
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Receive error breakdown. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Transmit error breakdown. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    /* Compression counters. */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
337c9b99 181
/* Bits for the 'cache_valid' member of struct netdev_linux.  A set bit means
 * the corresponding cached value is valid. */
enum {
    VALID_IFINDEX           = 0x001,
    VALID_ETHERADDR         = 0x002,
    VALID_IN4               = 0x004,
    VALID_IN6               = 0x008,
    VALID_MTU               = 0x010,
    VALID_POLICING          = 0x020,
    VALID_VPORT_STAT_ERROR  = 0x040,
    VALID_DRVINFO           = 0x080,
    VALID_FEATURES          = 0x100,
};
c1c9c9c4
BP
193\f
194/* Traffic control. */
195
196/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
197 * network device.
198 *
199 * Each TC implementation subclasses this with whatever additional data it
200 * needs. */
c1c9c9c4
BP
201struct tc {
202 const struct tc_ops *ops;
93b13be8
BP
203 struct hmap queues; /* Contains "struct tc_queue"s.
204 * Read by generic TC layer.
205 * Written only by TC implementation. */
206};
c1c9c9c4 207
559eb230
BP
/* Static initializer for a "struct tc" named by pointer TC whose operations
 * are OPS. */
#define TC_INITIALIZER(TC, OPS) \
    { .ops = OPS, .queues = HMAP_INITIALIZER(&(TC)->queues) }
209
93b13be8
BP
210/* One traffic control queue.
211 *
212 * Each TC implementation subclasses this with whatever additional data it
213 * needs. */
214struct tc_queue {
215 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
216 unsigned int queue_id; /* OpenFlow queue ID. */
6dc34a0d 217 long long int created; /* Time queue was created, in msecs. */
c1c9c9c4
BP
218};
219
220/* A particular kind of traffic control. Each implementation generally maps to
221 * one particular Linux qdisc class.
222 *
223 * The functions below return 0 if successful or a positive errno value on
224 * failure, except where otherwise noted. All of them must be provided, except
225 * where otherwise noted. */
226struct tc_ops {
227 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
228 * This is null for tc_ops_default and tc_ops_other, for which there are no
229 * appropriate values. */
230 const char *linux_name;
231
232 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
233 const char *ovs_name;
234
235 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
236 * queues. The queues are numbered 0 through n_queues - 1. */
237 unsigned int n_queues;
238
239 /* Called to install this TC class on 'netdev'. The implementation should
240 * make the Netlink calls required to set up 'netdev' with the right qdisc
241 * and configure it according to 'details'. The implementation may assume
242 * that the current qdisc is the default; that is, there is no need for it
243 * to delete the current qdisc before installing itself.
244 *
245 * The contents of 'details' should be documented as valid for 'ovs_name'
246 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
247 * (which is built as ovs-vswitchd.conf.db(8)).
248 *
249 * This function must return 0 if and only if it sets 'netdev->tc' to an
250 * initialized 'struct tc'.
251 *
252 * (This function is null for tc_ops_other, which cannot be installed. For
253 * other TC classes it should always be nonnull.) */
79f1cbe9 254 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
255
256 /* Called when the netdev code determines (through a Netlink query) that
257 * this TC class's qdisc is installed on 'netdev', but we didn't install
258 * it ourselves and so don't know any of the details.
259 *
260 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
261 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
262 * implementation should parse the other attributes of 'nlmsg' as
263 * necessary to determine its configuration. If necessary it should also
264 * use Netlink queries to determine the configuration of queues on
265 * 'netdev'.
266 *
267 * This function must return 0 if and only if it sets 'netdev->tc' to an
268 * initialized 'struct tc'. */
269 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
270
271 /* Destroys the data structures allocated by the implementation as part of
272 * 'tc'. (This includes destroying 'tc->queues' by calling
273 * tc_destroy(tc).
274 *
275 * The implementation should not need to perform any Netlink calls. If
276 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
277 * (But it may not be desirable.)
278 *
279 * This function may be null if 'tc' is trivial. */
280 void (*tc_destroy)(struct tc *tc);
281
282 /* Retrieves details of 'netdev->tc' configuration into 'details'.
283 *
284 * The implementation should not need to perform any Netlink calls, because
285 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
286 * cached the configuration.
287 *
288 * The contents of 'details' should be documented as valid for 'ovs_name'
289 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
290 * (which is built as ovs-vswitchd.conf.db(8)).
291 *
292 * This function may be null if 'tc' is not configurable.
293 */
79f1cbe9 294 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
295
296 /* Reconfigures 'netdev->tc' according to 'details', performing any
297 * required Netlink calls to complete the reconfiguration.
298 *
299 * The contents of 'details' should be documented as valid for 'ovs_name'
300 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
301 * (which is built as ovs-vswitchd.conf.db(8)).
302 *
303 * This function may be null if 'tc' is not configurable.
304 */
79f1cbe9 305 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 306
93b13be8
BP
307 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
308 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
309 *
310 * The contents of 'details' should be documented as valid for 'ovs_name'
311 * in the "other_config" column in the "Queue" table in
312 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
313 *
314 * The implementation should not need to perform any Netlink calls, because
315 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
316 * cached the queue configuration.
317 *
318 * This function may be null if 'tc' does not have queues ('n_queues' is
319 * 0). */
93b13be8 320 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 321 struct smap *details);
c1c9c9c4
BP
322
323 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
324 * 'details', perfoming any required Netlink calls to complete the
325 * reconfiguration. The caller ensures that 'queue_id' is less than
326 * 'n_queues'.
327 *
328 * The contents of 'details' should be documented as valid for 'ovs_name'
329 * in the "other_config" column in the "Queue" table in
330 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
331 *
332 * This function may be null if 'tc' does not have queues or its queues are
333 * not configurable. */
334 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 335 const struct smap *details);
c1c9c9c4 336
93b13be8
BP
337 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
338 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
339 *
340 * This function may be null if 'tc' does not have queues or its queues
341 * cannot be deleted. */
93b13be8 342 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 343
93b13be8
BP
344 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
345 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
346 *
347 * On success, initializes '*stats'.
348 *
349 * This function may be null if 'tc' does not have queues or if it cannot
350 * report queue statistics. */
93b13be8
BP
351 int (*class_get_stats)(const struct netdev *netdev,
352 const struct tc_queue *queue,
c1c9c9c4
BP
353 struct netdev_queue_stats *stats);
354
355 /* Extracts queue stats from 'nlmsg', which is a response to a
356 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
357 *
358 * This function may be null if 'tc' does not have queues or if it cannot
359 * report queue statistics. */
360 int (*class_dump_stats)(const struct netdev *netdev,
361 const struct ofpbuf *nlmsg,
362 netdev_dump_queue_stats_cb *cb, void *aux);
363};
364
365static void
366tc_init(struct tc *tc, const struct tc_ops *ops)
367{
368 tc->ops = ops;
93b13be8 369 hmap_init(&tc->queues);
c1c9c9c4
BP
370}
371
372static void
373tc_destroy(struct tc *tc)
374{
93b13be8 375 hmap_destroy(&tc->queues);
c1c9c9c4
BP
376}
377
378static const struct tc_ops tc_ops_htb;
a339aa81 379static const struct tc_ops tc_ops_hfsc;
677d9158
JV
380static const struct tc_ops tc_ops_codel;
381static const struct tc_ops tc_ops_fqcodel;
382static const struct tc_ops tc_ops_sfq;
c1c9c9c4
BP
383static const struct tc_ops tc_ops_default;
384static const struct tc_ops tc_ops_other;
385
559eb230 386static const struct tc_ops *const tcs[] = {
c1c9c9c4 387 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 388 &tc_ops_hfsc, /* Hierarchical fair service curve. */
677d9158
JV
389 &tc_ops_codel, /* Controlled delay */
390 &tc_ops_fqcodel, /* Fair queue controlled delay */
391 &tc_ops_sfq, /* Stochastic fair queueing */
c1c9c9c4
BP
392 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
393 &tc_ops_other, /* Some other qdisc. */
394 NULL
395};
149f577a 396
c1c9c9c4
BP
/* TC handle helpers. */
static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
static unsigned int tc_get_major(unsigned int handle);
static unsigned int tc_get_minor(unsigned int handle);

/* Unit conversions. */
static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
static unsigned int tc_buffer_per_jiffy(unsigned int rate);

/* Netlink request helpers. */
static struct tcmsg *tc_make_request(const struct netdev *netdev, int type,
                                     unsigned int flags,
                                     struct ofpbuf *request);
static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
static int tc_add_policer(struct netdev *netdev,
                          uint32_t kbits_rate, uint32_t kbits_burst);

/* Parsing and querying of qdiscs and classes. */
static int tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
                          struct nlattr **options);
static int tc_parse_class(const struct ofpbuf *msg, unsigned int *queue_id,
                          struct nlattr **options,
                          struct netdev_queue_stats *stats);
static int tc_query_class(const struct netdev *netdev,
                          unsigned int handle, unsigned int parent,
                          struct ofpbuf **replyp);
static int tc_delete_class(const struct netdev *netdev, unsigned int handle);

static int tc_del_qdisc(struct netdev *netdev);
static int tc_query_qdisc(const struct netdev *netdev);

/* Rate table helpers. */
static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static void tc_put_rtab(struct ofpbuf *msg, uint16_t type,
                        const struct tc_ratespec *rate);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
430\f
b5d57fc8
BP
431struct netdev_linux {
432 struct netdev up;
149f577a 433
86383816
BP
434 /* Protects all members below. */
435 struct ovs_mutex mutex;
436
149f577a 437 unsigned int cache_valid;
8b61709d 438
1670c579
EJ
439 bool miimon; /* Link status of last poll. */
440 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
441 struct timer miimon_timer;
442
8722022c
BP
443 /* The following are figured out "on demand" only. They are only valid
444 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d 445 int ifindex;
74ff3298 446 struct eth_addr etheraddr;
f1acd62b 447 struct in_addr address, netmask;
8b61709d
BP
448 struct in6_addr in6;
449 int mtu;
059e5f4f 450 unsigned int ifi_flags;
65c3058c 451 long long int carrier_resets;
80a86fbe
BP
452 uint32_t kbits_rate; /* Policing data. */
453 uint32_t kbits_burst;
bba1e6f3
PS
454 int vport_stats_error; /* Cached error code from vport_get_stats().
455 0 or an errno value. */
90a6637d 456 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 457 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 458 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 459 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 460 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
49af9a3d 461 int in4_error; /* Cached error code from reading in4 addr. */
7df6932e 462 int in6_error; /* Cached error code from reading in6 addr. */
51f87458 463
a00ca915
EJ
464 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
465 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
466 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
90a6637d 467
4f925bd3 468 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 469 struct tc *tc;
149f577a 470
d0d08f8a
BP
471 /* For devices of class netdev_tap_class only. */
472 int tap_fd;
8b61709d
BP
473};
474
f7791740
PS
475struct netdev_rxq_linux {
476 struct netdev_rxq up;
796223f5 477 bool is_tap;
5b7448ed 478 int fd;
149f577a 479};
8b61709d 480
8b61709d
BP
481/* This is set pretty low because we probably won't learn anything from the
482 * additional log messages. */
483static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
484
19c8e9c1
JS
485/* Polling miimon status for all ports causes performance degradation when
486 * handling a large number of ports. If there are no devices using miimon, then
812c272c
JR
487 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
488 *
489 * Readers do not depend on this variable synchronizing with the related
490 * changes in the device miimon status, so we can use atomic_count. */
491static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
19c8e9c1 492
259e0b1a 493static void netdev_linux_run(void);
6f643e49 494
0b0544d7 495static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 496 int cmd, const char *cmd_name);
f1acd62b
BP
497static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
498 int cmd, const char *cmd_name);
b5d57fc8 499static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 500static int set_flags(const char *, unsigned int flags);
4f9f3f21
BP
501static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
502 enum netdev_flags on, enum netdev_flags *old_flagsp)
503 OVS_REQUIRES(netdev->mutex);
8b61709d
BP
504static int do_get_ifindex(const char *netdev_name);
505static int get_ifindex(const struct netdev *, int *ifindexp);
506static int do_set_addr(struct netdev *netdev,
507 int ioctl_nr, const char *ioctl_name,
508 struct in_addr addr);
74ff3298
JR
509static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
510static int set_etheraddr(const char *netdev_name, const struct eth_addr);
35eef899 511static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
488d734d 512static int af_packet_sock(void);
19c8e9c1 513static bool netdev_linux_miimon_enabled(void);
1670c579
EJ
514static void netdev_linux_miimon_run(void);
515static void netdev_linux_miimon_wait(void);
df1e5a3b 516static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
8b61709d 517
15b3596a
JG
518static bool
519is_netdev_linux_class(const struct netdev_class *netdev_class)
520{
259e0b1a 521 return netdev_class->run == netdev_linux_run;
15b3596a
JG
522}
523
796223f5
BP
524static bool
525is_tap_netdev(const struct netdev *netdev)
526{
b5d57fc8 527 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577
JP
528}
529
8b61709d
BP
530static struct netdev_linux *
531netdev_linux_cast(const struct netdev *netdev)
532{
b5d57fc8 533 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
15b3596a 534
180c6d0b 535 return CONTAINER_OF(netdev, struct netdev_linux, up);
8b61709d 536}
796223f5 537
f7791740
PS
538static struct netdev_rxq_linux *
539netdev_rxq_linux_cast(const struct netdev_rxq *rx)
796223f5 540{
9dc63482 541 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
f7791740 542 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
796223f5 543}
ff4ed3c9 544\f
cee87338 545static void netdev_linux_update(struct netdev_linux *netdev,
7e9dcc0f 546 const struct rtnetlink_change *)
86383816 547 OVS_REQUIRES(netdev->mutex);
cee87338 548static void netdev_linux_changed(struct netdev_linux *netdev,
86383816
BP
549 unsigned int ifi_flags, unsigned int mask)
550 OVS_REQUIRES(netdev->mutex);
cee87338 551
d6384a3a
AW
552/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
553 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
cee87338
BP
554 * if no such socket could be created. */
555static struct nl_sock *
556netdev_linux_notify_sock(void)
557{
558 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
559 static struct nl_sock *sock;
d6384a3a
AW
560 unsigned int mcgroups[3] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
561 RTNLGRP_IPV6_IFADDR};
cee87338
BP
562
563 if (ovsthread_once_start(&once)) {
564 int error;
565
566 error = nl_sock_create(NETLINK_ROUTE, &sock);
567 if (!error) {
d6384a3a
AW
568 size_t i;
569
570 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
571 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
572 if (error) {
573 nl_sock_destroy(sock);
574 sock = NULL;
575 break;
576 }
cee87338
BP
577 }
578 }
579 ovsthread_once_done(&once);
580 }
581
582 return sock;
583}
584
19c8e9c1
JS
585static bool
586netdev_linux_miimon_enabled(void)
587{
812c272c 588 return atomic_count_get(&miimon_cnt) > 0;
19c8e9c1
JS
589}
590
8b61709d
BP
591static void
592netdev_linux_run(void)
593{
cee87338
BP
594 struct nl_sock *sock;
595 int error;
596
19c8e9c1
JS
597 if (netdev_linux_miimon_enabled()) {
598 netdev_linux_miimon_run();
599 }
cee87338
BP
600
601 sock = netdev_linux_notify_sock();
602 if (!sock) {
603 return;
604 }
605
606 do {
607 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
608 uint64_t buf_stub[4096 / 8];
609 struct ofpbuf buf;
610
611 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
612 error = nl_sock_recv(sock, &buf, false);
613 if (!error) {
7e9dcc0f 614 struct rtnetlink_change change;
cee87338 615
7e9dcc0f 616 if (rtnetlink_parse(&buf, &change)) {
cee87338
BP
617 struct netdev *netdev_ = netdev_from_name(change.ifname);
618 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
619 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
620
621 ovs_mutex_lock(&netdev->mutex);
cee87338 622 netdev_linux_update(netdev, &change);
86383816 623 ovs_mutex_unlock(&netdev->mutex);
cee87338 624 }
38e0065b 625 netdev_close(netdev_);
cee87338
BP
626 }
627 } else if (error == ENOBUFS) {
628 struct shash device_shash;
629 struct shash_node *node;
630
631 nl_sock_drain(sock);
632
633 shash_init(&device_shash);
634 netdev_get_devices(&netdev_linux_class, &device_shash);
635 SHASH_FOR_EACH (node, &device_shash) {
636 struct netdev *netdev_ = node->data;
637 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
638 unsigned int flags;
639
86383816 640 ovs_mutex_lock(&netdev->mutex);
cee87338
BP
641 get_flags(netdev_, &flags);
642 netdev_linux_changed(netdev, flags, 0);
86383816
BP
643 ovs_mutex_unlock(&netdev->mutex);
644
cee87338
BP
645 netdev_close(netdev_);
646 }
647 shash_destroy(&device_shash);
648 } else if (error != EAGAIN) {
649 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
650 ovs_strerror(error));
651 }
652 ofpbuf_uninit(&buf);
653 } while (!error);
8b61709d
BP
654}
655
/* Arranges for the poll loop to wake up when netdev_linux_run() has work to
 * do: miimon polling (if any device has it enabled) or readable data on the
 * rtnetlink notification socket. */
static void
netdev_linux_wait(void)
{
    struct nl_sock *notify_sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    }

    notify_sock = netdev_linux_notify_sock();
    if (!notify_sock) {
        return;
    }
    nl_sock_wait(notify_sock, POLLIN);
}
669
ac4d3bcb 670static void
b5d57fc8
BP
671netdev_linux_changed(struct netdev_linux *dev,
672 unsigned int ifi_flags, unsigned int mask)
86383816 673 OVS_REQUIRES(dev->mutex)
ac4d3bcb 674{
3e912ffc 675 netdev_change_seq_changed(&dev->up);
8aa77183
BP
676
677 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
678 dev->carrier_resets++;
679 }
680 dev->ifi_flags = ifi_flags;
681
4f925bd3
PS
682 dev->cache_valid &= mask;
683}
684
685static void
b5d57fc8 686netdev_linux_update(struct netdev_linux *dev,
7e9dcc0f 687 const struct rtnetlink_change *change)
86383816 688 OVS_REQUIRES(dev->mutex)
4f925bd3 689{
d6384a3a
AW
690 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
691 if (change->nlmsg_type == RTM_NEWLINK) {
692 /* Keep drv-info, in4, in6. */
693 netdev_linux_changed(dev, change->ifi_flags,
694 VALID_DRVINFO | VALID_IN4 | VALID_IN6);
695
696 /* Update netdev from rtnl-change msg. */
697 if (change->mtu) {
698 dev->mtu = change->mtu;
699 dev->cache_valid |= VALID_MTU;
700 dev->netdev_mtu_error = 0;
701 }
90a6637d 702
74ff3298
JR
703 if (!eth_addr_is_zero(change->mac)) {
704 dev->etheraddr = change->mac;
d6384a3a
AW
705 dev->cache_valid |= VALID_ETHERADDR;
706 dev->ether_addr_error = 0;
707 }
44445cac 708
d6384a3a
AW
709 dev->ifindex = change->if_index;
710 dev->cache_valid |= VALID_IFINDEX;
711 dev->get_ifindex_error = 0;
712 } else {
713 netdev_linux_changed(dev, change->ifi_flags, 0);
714 }
715 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
716 /* Invalidates in4, in6. */
717 netdev_linux_changed(dev, dev->ifi_flags,
718 ~(VALID_IN4 | VALID_IN6));
4f925bd3 719 } else {
d6384a3a 720 OVS_NOT_REACHED();
4f925bd3 721 }
ac4d3bcb
EJ
722}
723
9dc63482
BP
724static struct netdev *
725netdev_linux_alloc(void)
726{
727 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
728 return &netdev->up;
729}
730
cee87338 731static void
9dc63482
BP
732netdev_linux_common_construct(struct netdev_linux *netdev)
733{
834d6caf 734 ovs_mutex_init(&netdev->mutex);
9dc63482
BP
735}
736
1f6e0fbd
BP
737/* Creates system and internal devices. */
738static int
9dc63482 739netdev_linux_construct(struct netdev *netdev_)
1f6e0fbd 740{
9dc63482 741 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1f6e0fbd
BP
742 int error;
743
cee87338 744 netdev_linux_common_construct(netdev);
1f6e0fbd 745
b5d57fc8
BP
746 error = get_flags(&netdev->up, &netdev->ifi_flags);
747 if (error == ENODEV) {
9dc63482 748 if (netdev->up.netdev_class != &netdev_internal_class) {
b5d57fc8 749 /* The device does not exist, so don't allow it to be opened. */
b5d57fc8
BP
750 return ENODEV;
751 } else {
752 /* "Internal" netdevs have to be created as netdev objects before
753 * they exist in the kernel, because creating them in the kernel
754 * happens by passing a netdev object to dpif_port_add().
755 * Therefore, ignore the error. */
756 }
757 }
46415c90 758
a740f0de
JG
759 return 0;
760}
761
5b7448ed
JG
762/* For most types of netdevs we open the device for each call of
763 * netdev_open(). However, this is not the case with tap devices,
764 * since it is only possible to open the device once. In this
765 * situation we share a single file descriptor, and consequently
766 * buffers, across all readers. Therefore once data is read it will
767 * be unavailable to other reads for tap devices. */
a740f0de 768static int
9dc63482 769netdev_linux_construct_tap(struct netdev *netdev_)
a740f0de 770{
9dc63482 771 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a740f0de 772 static const char tap_dev[] = "/dev/net/tun";
9dc63482 773 const char *name = netdev_->name;
a740f0de
JG
774 struct ifreq ifr;
775 int error;
776
cee87338 777 netdev_linux_common_construct(netdev);
1f6e0fbd 778
6c88d577 779 /* Open tap device. */
d0d08f8a
BP
780 netdev->tap_fd = open(tap_dev, O_RDWR);
781 if (netdev->tap_fd < 0) {
6c88d577 782 error = errno;
10a89ef0 783 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
cee87338 784 return error;
6c88d577
JP
785 }
786
787 /* Create tap device. */
788 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 789 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
d0d08f8a 790 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
6c88d577 791 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 792 ovs_strerror(errno));
6c88d577 793 error = errno;
f61d8d29 794 goto error_close;
6c88d577
JP
795 }
796
797 /* Make non-blocking. */
d0d08f8a 798 error = set_nonblocking(netdev->tap_fd);
a740f0de 799 if (error) {
f61d8d29 800 goto error_close;
a740f0de
JG
801 }
802
803 return 0;
804
f61d8d29 805error_close:
d0d08f8a 806 close(netdev->tap_fd);
a740f0de
JG
807 return error;
808}
809
6c88d577 810static void
9dc63482 811netdev_linux_destruct(struct netdev *netdev_)
6c88d577 812{
b5d57fc8 813 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6c88d577 814
b5d57fc8
BP
815 if (netdev->tc && netdev->tc->ops->tc_destroy) {
816 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4
BP
817 }
818
d0d08f8a
BP
819 if (netdev_get_class(netdev_) == &netdev_tap_class
820 && netdev->tap_fd >= 0)
821 {
822 close(netdev->tap_fd);
6c88d577 823 }
86383816 824
19c8e9c1 825 if (netdev->miimon_interval > 0) {
812c272c 826 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
827 }
828
86383816 829 ovs_mutex_destroy(&netdev->mutex);
6c88d577
JP
830}
831
9dc63482
BP
/* Frees the 'struct netdev_linux' that embeds 'netdev_'. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *dev = netdev_linux_cast(netdev_);

    free(dev);
}
838
f7791740
PS
839static struct netdev_rxq *
840netdev_linux_rxq_alloc(void)
9dc63482 841{
f7791740 842 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
9dc63482
BP
843 return &rx->up;
844}
845
7b6b0ef4 846static int
f7791740 847netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
7b6b0ef4 848{
f7791740 849 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
9dc63482 850 struct netdev *netdev_ = rx->up.netdev;
7b6b0ef4 851 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7b6b0ef4 852 int error;
7b6b0ef4 853
86383816 854 ovs_mutex_lock(&netdev->mutex);
9dc63482
BP
855 rx->is_tap = is_tap_netdev(netdev_);
856 if (rx->is_tap) {
857 rx->fd = netdev->tap_fd;
796223f5
BP
858 } else {
859 struct sockaddr_ll sll;
b73c8518 860 int ifindex, val;
32383c3b 861 /* Result of tcpdump -dd inbound */
259e0b1a 862 static const struct sock_filter filt[] = {
32383c3b
MM
863 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
864 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
865 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
866 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
867 };
259e0b1a
BP
868 static const struct sock_fprog fprog = {
869 ARRAY_SIZE(filt), (struct sock_filter *) filt
870 };
7b6b0ef4 871
796223f5 872 /* Create file descriptor. */
9dc63482
BP
873 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
874 if (rx->fd < 0) {
796223f5 875 error = errno;
10a89ef0 876 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
877 goto error;
878 }
33d82a56 879
b73c8518
SH
880 val = 1;
881 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
882 error = errno;
883 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
884 netdev_get_name(netdev_), ovs_strerror(error));
885 goto error;
886 }
887
796223f5 888 /* Set non-blocking mode. */
9dc63482 889 error = set_nonblocking(rx->fd);
796223f5
BP
890 if (error) {
891 goto error;
892 }
7b6b0ef4 893
796223f5 894 /* Get ethernet device index. */
180c6d0b 895 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
896 if (error) {
897 goto error;
898 }
7b6b0ef4 899
796223f5
BP
900 /* Bind to specific ethernet device. */
901 memset(&sll, 0, sizeof sll);
902 sll.sll_family = AF_PACKET;
903 sll.sll_ifindex = ifindex;
b73c8518 904 sll.sll_protocol = htons(ETH_P_ALL);
9dc63482 905 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
796223f5
BP
906 error = errno;
907 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 908 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
909 goto error;
910 }
32383c3b
MM
911
912 /* Filter for only inbound packets. */
9dc63482 913 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
32383c3b
MM
914 sizeof fprog);
915 if (error) {
916 error = errno;
259e0b1a 917 VLOG_ERR("%s: failed to attach filter (%s)",
10a89ef0 918 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
919 goto error;
920 }
7b6b0ef4 921 }
86383816 922 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4 923
7b6b0ef4
BP
924 return 0;
925
926error:
9dc63482
BP
927 if (rx->fd >= 0) {
928 close(rx->fd);
7b6b0ef4 929 }
86383816 930 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4
BP
931 return error;
932}
933
796223f5 934static void
f7791740 935netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
8b61709d 936{
f7791740 937 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
8b61709d 938
796223f5
BP
939 if (!rx->is_tap) {
940 close(rx->fd);
8b61709d 941 }
9dc63482
BP
942}
943
/* Frees the memory that holds 'rxq_'. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
8b61709d 951
b73c8518
SH
952static ovs_be16
953auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
954{
955 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
956 return htons(aux->tp_vlan_tpid);
957 } else {
958 return htons(ETH_TYPE_VLAN);
959 }
960}
961
962static bool
963auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
964{
965 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
966}
967
796223f5 968static int
cf62fa4c 969netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
796223f5 970{
b73c8518 971 size_t size;
796223f5 972 ssize_t retval;
b73c8518
SH
973 struct iovec iov;
974 struct cmsghdr *cmsg;
975 union {
976 struct cmsghdr cmsg;
977 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
978 } cmsg_buffer;
979 struct msghdr msgh;
980
981 /* Reserve headroom for a single VLAN tag */
cf62fa4c
PS
982 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
983 size = dp_packet_tailroom(buffer);
b73c8518 984
cf62fa4c 985 iov.iov_base = dp_packet_data(buffer);
b73c8518
SH
986 iov.iov_len = size;
987 msgh.msg_name = NULL;
988 msgh.msg_namelen = 0;
989 msgh.msg_iov = &iov;
990 msgh.msg_iovlen = 1;
991 msgh.msg_control = &cmsg_buffer;
992 msgh.msg_controllen = sizeof cmsg_buffer;
993 msgh.msg_flags = 0;
8e8cddf7 994
796223f5 995 do {
b73c8518 996 retval = recvmsg(fd, &msgh, MSG_TRUNC);
796223f5
BP
997 } while (retval < 0 && errno == EINTR);
998
bfd3367b 999 if (retval < 0) {
b73c8518
SH
1000 return errno;
1001 } else if (retval > size) {
1002 return EMSGSIZE;
1003 }
1004
cf62fa4c 1005 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
b73c8518
SH
1006
1007 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1008 const struct tpacket_auxdata *aux;
1009
1010 if (cmsg->cmsg_level != SOL_PACKET
1011 || cmsg->cmsg_type != PACKET_AUXDATA
1012 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1013 continue;
8b61709d 1014 }
b73c8518
SH
1015
1016 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1017 if (auxdata_has_vlan_tci(aux)) {
1018 if (retval < ETH_HEADER_LEN) {
1019 return EINVAL;
1020 }
1021
1022 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1023 htons(aux->tp_vlan_tci));
1024 break;
1025 }
1026 }
1027
1028 return 0;
1029}
1030
/* Reads one packet from the tap file descriptor 'fd' into the tailroom of
 * 'buffer' and grows 'buffer' by the number of bytes read.
 *
 * Returns 0 if successful, otherwise a positive errno value: the errno from
 * read(), or EMSGSIZE if more bytes were reported than 'buffer' had room
 * for. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t room = dp_packet_tailroom(buffer);
    ssize_t n_read;

    /* Retry for as long as the read is interrupted by a signal. */
    for (;;) {
        n_read = read(fd, dp_packet_data(buffer), room);
        if (n_read >= 0) {
            break;
        }
        if (errno != EINTR) {
            return errno;
        }
    }

    if (n_read > room) {
        return EMSGSIZE;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n_read);
    return 0;
}
1050
1051static int
e14deea0 1052netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
91088554 1053 int *c)
b73c8518 1054{
f7791740 1055 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
df1e5a3b 1056 struct netdev *netdev = rx->up.netdev;
cf62fa4c 1057 struct dp_packet *buffer;
df1e5a3b
PS
1058 ssize_t retval;
1059 int mtu;
1060
1061 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1062 mtu = ETH_PAYLOAD_MAX;
1063 }
1064
cf62fa4c 1065 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
91088554 1066 DP_NETDEV_HEADROOM);
b73c8518 1067 retval = (rx->is_tap
f7791740
PS
1068 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1069 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
df1e5a3b
PS
1070
1071 if (retval) {
1072 if (retval != EAGAIN && retval != EMSGSIZE) {
1073 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
f7791740 1074 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
df1e5a3b 1075 }
cf62fa4c 1076 dp_packet_delete(buffer);
df1e5a3b
PS
1077 } else {
1078 dp_packet_pad(buffer);
2bc1bbd2 1079 dp_packet_set_rss_hash(buffer, 0);
cf62fa4c 1080 packets[0] = buffer;
df1e5a3b 1081 *c = 1;
b73c8518
SH
1082 }
1083
1084 return retval;
8b61709d
BP
1085}
1086
8b61709d 1087static void
f7791740 1088netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
8b61709d 1089{
f7791740 1090 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1091 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
1092}
1093
8b61709d 1094static int
f7791740 1095netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
8b61709d 1096{
f7791740 1097 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1098 if (rx->is_tap) {
8b61709d 1099 struct ifreq ifr;
f7791740 1100 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
259e0b1a 1101 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
8b61709d
BP
1102 if (error) {
1103 return error;
1104 }
796223f5 1105 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
1106 return 0;
1107 } else {
796223f5 1108 return drain_rcvbuf(rx->fd);
8b61709d
BP
1109 }
1110}
1111
1112/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1113 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1114 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1115 * the packet is too big or too small to transmit on the device.
1116 *
1117 * The caller retains ownership of 'buffer' in all cases.
1118 *
1119 * The kernel maintains a packet transmission queue, so the caller is not
1120 * expected to do additional queuing of packets. */
1121static int
f00fa8cb 1122netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
e14deea0 1123 struct dp_packet **pkts, int cnt, bool may_steal)
8b61709d 1124{
f4fd623c
DDP
1125 int i;
1126 int error = 0;
40d26f04 1127
f4fd623c
DDP
1128 /* 'i' is incremented only if there's no error */
1129 for (i = 0; i < cnt;) {
cf62fa4c
PS
1130 const void *data = dp_packet_data(pkts[i]);
1131 size_t size = dp_packet_size(pkts[i]);
f23347ea 1132 ssize_t retval;
8b61709d 1133
796223f5 1134 if (!is_tap_netdev(netdev_)) {
f23347ea
BP
1135 /* Use our AF_PACKET socket to send to this device. */
1136 struct sockaddr_ll sll;
1137 struct msghdr msg;
1138 struct iovec iov;
1139 int ifindex;
488d734d
BP
1140 int sock;
1141
1142 sock = af_packet_sock();
1143 if (sock < 0) {
c4c7a3d7 1144 return -sock;
488d734d 1145 }
f23347ea 1146
86383816
BP
1147 ifindex = netdev_get_ifindex(netdev_);
1148 if (ifindex < 0) {
1149 return -ifindex;
f23347ea 1150 }
8b61709d 1151
f23347ea
BP
1152 /* We don't bother setting most fields in sockaddr_ll because the
1153 * kernel ignores them for SOCK_RAW. */
1154 memset(&sll, 0, sizeof sll);
1155 sll.sll_family = AF_PACKET;
1156 sll.sll_ifindex = ifindex;
76c308b5 1157
ebc56baa 1158 iov.iov_base = CONST_CAST(void *, data);
f23347ea 1159 iov.iov_len = size;
76c308b5 1160
f23347ea
BP
1161 msg.msg_name = &sll;
1162 msg.msg_namelen = sizeof sll;
1163 msg.msg_iov = &iov;
1164 msg.msg_iovlen = 1;
1165 msg.msg_control = NULL;
1166 msg.msg_controllen = 0;
1167 msg.msg_flags = 0;
1168
488d734d 1169 retval = sendmsg(sock, &msg, 0);
f23347ea 1170 } else {
796223f5
BP
1171 /* Use the tap fd to send to this device. This is essential for
1172 * tap devices, because packets sent to a tap device with an
1173 * AF_PACKET socket will loop back to be *received* again on the
32383c3b
MM
1174 * tap device. This doesn't occur on other interface types
1175 * because we attach a socket filter to the rx socket. */
b5d57fc8 1176 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796223f5 1177
d0d08f8a 1178 retval = write(netdev->tap_fd, data, size);
f23347ea 1179 }
76c308b5 1180
8b61709d
BP
1181 if (retval < 0) {
1182 /* The Linux AF_PACKET implementation never blocks waiting for room
1183 * for packets, instead returning ENOBUFS. Translate this into
1184 * EAGAIN for the caller. */
f4fd623c
DDP
1185 error = errno == ENOBUFS ? EAGAIN : errno;
1186 if (error == EINTR) {
1187 /* continue without incrementing 'i', i.e. retry this packet */
8b61709d 1188 continue;
8b61709d 1189 }
f4fd623c 1190 break;
8b61709d 1191 } else if (retval != size) {
f4fd623c
DDP
1192 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1193 " of %"PRIuSIZE") on %s", retval, size,
1194 netdev_get_name(netdev_));
1195 error = EMSGSIZE;
1196 break;
1197 }
1198
1199 /* Process the next packet in the batch */
1200 i++;
1201 }
1202
1203 if (may_steal) {
1204 for (i = 0; i < cnt; i++) {
e14deea0 1205 dp_packet_delete(pkts[i]);
8b61709d
BP
1206 }
1207 }
f4fd623c
DDP
1208
1209 if (error && error != EAGAIN) {
1210 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1211 netdev_get_name(netdev_), ovs_strerror(error));
1212 }
1213
1214 return error;
1215
8b61709d
BP
1216}
1217
1218/* Registers with the poll loop to wake up from the next call to poll_block()
1219 * when the packet transmission queue has sufficient room to transmit a packet
1220 * with netdev_send().
1221 *
1222 * The kernel maintains a packet transmission queue, so the client is not
1223 * expected to do additional queuing of packets. Thus, this function is
1224 * unlikely to ever be used. It is included for completeness. */
1225static void
f00fa8cb 1226netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
8b61709d 1227{
796223f5 1228 if (is_tap_netdev(netdev)) {
8b61709d
BP
1229 /* TAP device always accepts packets.*/
1230 poll_immediate_wake();
1231 }
1232}
1233
1234/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1235 * otherwise a positive errno value. */
1236static int
74ff3298 1237netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
8b61709d 1238{
b5d57fc8 1239 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4f9f3f21 1240 enum netdev_flags old_flags = 0;
eb395f2e
BP
1241 int error;
1242
86383816
BP
1243 ovs_mutex_lock(&netdev->mutex);
1244
b5d57fc8 1245 if (netdev->cache_valid & VALID_ETHERADDR) {
86383816
BP
1246 error = netdev->ether_addr_error;
1247 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1248 goto exit;
44445cac 1249 }
b5d57fc8 1250 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
1251 }
1252
7eb1bd81 1253 /* Tap devices must be brought down before setting the address. */
796223f5 1254 if (is_tap_netdev(netdev_)) {
4f9f3f21 1255 update_flags(netdev, NETDEV_UP, 0, &old_flags);
7eb1bd81 1256 }
44445cac
PS
1257 error = set_etheraddr(netdev_get_name(netdev_), mac);
1258 if (!error || error == ENODEV) {
b5d57fc8
BP
1259 netdev->ether_addr_error = error;
1260 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1261 if (!error) {
74ff3298 1262 netdev->etheraddr = mac;
eb395f2e 1263 }
8b61709d 1264 }
44445cac 1265
4f9f3f21
BP
1266 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1267 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1268 }
7eb1bd81 1269
86383816
BP
1270exit:
1271 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
1272 return error;
1273}
1274
44445cac 1275/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d 1276static int
74ff3298 1277netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
8b61709d 1278{
b5d57fc8 1279 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1280 int error;
44445cac 1281
86383816 1282 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1283 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
86383816 1284 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
74ff3298 1285 &netdev->etheraddr);
b5d57fc8 1286 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1287 }
44445cac 1288
86383816
BP
1289 error = netdev->ether_addr_error;
1290 if (!error) {
74ff3298 1291 *mac = netdev->etheraddr;
44445cac 1292 }
86383816 1293 ovs_mutex_unlock(&netdev->mutex);
44445cac 1294
86383816 1295 return error;
8b61709d
BP
1296}
1297
8b61709d 1298static int
73371c09 1299netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
8b61709d 1300{
86383816
BP
1301 int error;
1302
b5d57fc8 1303 if (!(netdev->cache_valid & VALID_MTU)) {
8b61709d 1304 struct ifreq ifr;
90a6637d 1305
86383816 1306 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
73371c09 1307 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
b5d57fc8
BP
1308 netdev->mtu = ifr.ifr_mtu;
1309 netdev->cache_valid |= VALID_MTU;
8b61709d 1310 }
90a6637d 1311
86383816
BP
1312 error = netdev->netdev_mtu_error;
1313 if (!error) {
b5d57fc8 1314 *mtup = netdev->mtu;
90a6637d 1315 }
73371c09
BP
1316
1317 return error;
1318}
1319
1320/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1321 * in bytes, not including the hardware header; thus, this is typically 1500
1322 * bytes for Ethernet devices. */
1323static int
1324netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1325{
1326 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1327 int error;
1328
1329 ovs_mutex_lock(&netdev->mutex);
1330 error = netdev_linux_get_mtu__(netdev, mtup);
86383816
BP
1331 ovs_mutex_unlock(&netdev->mutex);
1332
1333 return error;
8b61709d
BP
1334}
1335
9b020780
PS
1336/* Sets the maximum size of transmitted (MTU) for given device using linux
1337 * networking ioctl interface.
1338 */
1339static int
1340netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1341{
b5d57fc8 1342 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1343 struct ifreq ifr;
1344 int error;
1345
86383816 1346 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1347 if (netdev->cache_valid & VALID_MTU) {
86383816
BP
1348 error = netdev->netdev_mtu_error;
1349 if (error || netdev->mtu == mtu) {
1350 goto exit;
90a6637d 1351 }
b5d57fc8 1352 netdev->cache_valid &= ~VALID_MTU;
153e5481 1353 }
9b020780 1354 ifr.ifr_mtu = mtu;
259e0b1a
BP
1355 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1356 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1357 if (!error || error == ENODEV) {
b5d57fc8
BP
1358 netdev->netdev_mtu_error = error;
1359 netdev->mtu = ifr.ifr_mtu;
1360 netdev->cache_valid |= VALID_MTU;
9b020780 1361 }
86383816
BP
1362exit:
1363 ovs_mutex_unlock(&netdev->mutex);
90a6637d 1364 return error;
9b020780
PS
1365}
1366
9ab3d9a3
BP
1367/* Returns the ifindex of 'netdev', if successful, as a positive number.
1368 * On failure, returns a negative errno value. */
1369static int
86383816 1370netdev_linux_get_ifindex(const struct netdev *netdev_)
9ab3d9a3 1371{
86383816 1372 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9ab3d9a3
BP
1373 int ifindex, error;
1374
86383816
BP
1375 ovs_mutex_lock(&netdev->mutex);
1376 error = get_ifindex(netdev_, &ifindex);
1377 ovs_mutex_unlock(&netdev->mutex);
1378
9ab3d9a3
BP
1379 return error ? -error : ifindex;
1380}
1381
8b61709d
BP
1382static int
1383netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1384{
b5d57fc8 1385 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1386
86383816 1387 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
1388 if (netdev->miimon_interval > 0) {
1389 *carrier = netdev->miimon;
3a183124 1390 } else {
b5d57fc8 1391 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1392 }
86383816 1393 ovs_mutex_unlock(&netdev->mutex);
8b61709d 1394
3a183124 1395 return 0;
8b61709d
BP
1396}
1397
65c3058c 1398static long long int
86383816 1399netdev_linux_get_carrier_resets(const struct netdev *netdev_)
65c3058c 1400{
86383816
BP
1401 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1402 long long int carrier_resets;
1403
1404 ovs_mutex_lock(&netdev->mutex);
1405 carrier_resets = netdev->carrier_resets;
1406 ovs_mutex_unlock(&netdev->mutex);
1407
1408 return carrier_resets;
65c3058c
EJ
1409}
1410
63331829 1411static int
1670c579
EJ
1412netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1413 struct mii_ioctl_data *data)
63331829 1414{
63331829 1415 struct ifreq ifr;
782e6111 1416 int error;
63331829 1417
63331829 1418 memset(&ifr, 0, sizeof ifr);
782e6111 1419 memcpy(&ifr.ifr_data, data, sizeof *data);
259e0b1a 1420 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1421 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1422
782e6111
EJ
1423 return error;
1424}
1425
1426static int
1670c579 1427netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1428{
782e6111
EJ
1429 struct mii_ioctl_data data;
1430 int error;
63331829 1431
782e6111
EJ
1432 *miimon = false;
1433
1434 memset(&data, 0, sizeof data);
1670c579 1435 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1436 if (!error) {
1437 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1438 data.reg_num = MII_BMSR;
1670c579 1439 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1440 &data);
63331829
EJ
1441
1442 if (!error) {
782e6111 1443 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1444 } else {
1445 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1446 }
1447 } else {
1448 struct ethtool_cmd ecmd;
63331829
EJ
1449
1450 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1451 name);
1452
ab985a77 1453 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1454 memset(&ecmd, 0, sizeof ecmd);
1455 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1456 "ETHTOOL_GLINK");
1457 if (!error) {
782e6111
EJ
1458 struct ethtool_value eval;
1459
1460 memcpy(&eval, &ecmd, sizeof eval);
1461 *miimon = !!eval.data;
63331829
EJ
1462 } else {
1463 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1464 }
1465 }
1466
1467 return error;
1468}
1469
1670c579
EJ
1470static int
1471netdev_linux_set_miimon_interval(struct netdev *netdev_,
1472 long long int interval)
1473{
b5d57fc8 1474 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579 1475
86383816 1476 ovs_mutex_lock(&netdev->mutex);
1670c579 1477 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8 1478 if (netdev->miimon_interval != interval) {
19c8e9c1 1479 if (interval && !netdev->miimon_interval) {
812c272c 1480 atomic_count_inc(&miimon_cnt);
19c8e9c1 1481 } else if (!interval && netdev->miimon_interval) {
812c272c 1482 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
1483 }
1484
b5d57fc8
BP
1485 netdev->miimon_interval = interval;
1486 timer_set_expired(&netdev->miimon_timer);
1670c579 1487 }
86383816 1488 ovs_mutex_unlock(&netdev->mutex);
1670c579
EJ
1489
1490 return 0;
1491}
1492
1493static void
1494netdev_linux_miimon_run(void)
1495{
1496 struct shash device_shash;
1497 struct shash_node *node;
1498
1499 shash_init(&device_shash);
b5d57fc8 1500 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1501 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1502 struct netdev *netdev = node->data;
1503 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
1504 bool miimon;
1505
86383816
BP
1506 ovs_mutex_lock(&dev->mutex);
1507 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1508 netdev_linux_get_miimon(dev->up.name, &miimon);
1509 if (miimon != dev->miimon) {
1510 dev->miimon = miimon;
1511 netdev_linux_changed(dev, dev->ifi_flags, 0);
1512 }
1670c579 1513
86383816 1514 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1670c579 1515 }
86383816 1516 ovs_mutex_unlock(&dev->mutex);
2f980d74 1517 netdev_close(netdev);
1670c579
EJ
1518 }
1519
1520 shash_destroy(&device_shash);
1521}
1522
1523static void
1524netdev_linux_miimon_wait(void)
1525{
1526 struct shash device_shash;
1527 struct shash_node *node;
1528
1529 shash_init(&device_shash);
b5d57fc8 1530 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1531 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1532 struct netdev *netdev = node->data;
1533 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579 1534
86383816 1535 ovs_mutex_lock(&dev->mutex);
1670c579
EJ
1536 if (dev->miimon_interval > 0) {
1537 timer_wait(&dev->miimon_timer);
1538 }
86383816 1539 ovs_mutex_unlock(&dev->mutex);
2f980d74 1540 netdev_close(netdev);
1670c579
EJ
1541 }
1542 shash_destroy(&device_shash);
1543}
1544
92df599c
JG
/* Exchanges the values stored in '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_b = *b;

    *b = *a;
    *a = old_b;
}
1552
c060c4cf
EJ
1553/* Copies 'src' into 'dst', performing format conversion in the process.
1554 *
1555 * 'src' is allowed to be misaligned. */
1556static void
1557netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1558 const struct ovs_vport_stats *src)
1559{
6a54dedc
BP
1560 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1561 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1562 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1563 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1564 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1565 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1566 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1567 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
c060c4cf
EJ
1568 dst->multicast = 0;
1569 dst->collisions = 0;
1570 dst->rx_length_errors = 0;
1571 dst->rx_over_errors = 0;
1572 dst->rx_crc_errors = 0;
1573 dst->rx_frame_errors = 0;
1574 dst->rx_fifo_errors = 0;
1575 dst->rx_missed_errors = 0;
1576 dst->tx_aborted_errors = 0;
1577 dst->tx_carrier_errors = 0;
1578 dst->tx_fifo_errors = 0;
1579 dst->tx_heartbeat_errors = 0;
1580 dst->tx_window_errors = 0;
1581}
1582
1583static int
1584get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1585{
93451a0a 1586 struct dpif_netlink_vport reply;
c060c4cf
EJ
1587 struct ofpbuf *buf;
1588 int error;
1589
93451a0a 1590 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
c060c4cf
EJ
1591 if (error) {
1592 return error;
1593 } else if (!reply.stats) {
1594 ofpbuf_delete(buf);
1595 return EOPNOTSUPP;
1596 }
1597
1598 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1599
1600 ofpbuf_delete(buf);
1601
1602 return 0;
1603}
1604
f613a0d7
PS
1605static void
1606get_stats_via_vport(const struct netdev *netdev_,
1607 struct netdev_stats *stats)
8b61709d 1608{
b5d57fc8 1609 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1610
b5d57fc8
BP
1611 if (!netdev->vport_stats_error ||
1612 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1613 int error;
7fbef77a 1614
c060c4cf 1615 error = get_stats_via_vport__(netdev_, stats);
bb13fe5e 1616 if (error && error != ENOENT && error != ENODEV) {
a57a8488 1617 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
1618 "(%s)",
1619 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 1620 }
b5d57fc8
BP
1621 netdev->vport_stats_error = error;
1622 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1623 }
f613a0d7 1624}
8b61709d 1625
f613a0d7
PS
1626/* Retrieves current device stats for 'netdev-linux'. */
1627static int
1628netdev_linux_get_stats(const struct netdev *netdev_,
1629 struct netdev_stats *stats)
1630{
b5d57fc8 1631 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1632 struct netdev_stats dev_stats;
1633 int error;
1634
86383816 1635 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1636 get_stats_via_vport(netdev_, stats);
35eef899 1637 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1638 if (error) {
86383816
BP
1639 if (!netdev->vport_stats_error) {
1640 error = 0;
f613a0d7 1641 }
86383816 1642 } else if (netdev->vport_stats_error) {
04c881eb 1643 /* stats not available from OVS then use netdev stats. */
f613a0d7
PS
1644 *stats = dev_stats;
1645 } else {
04c881eb
AZ
1646 /* Use kernel netdev's packet and byte counts since vport's counters
1647 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1648 * enabled. */
1649 stats->rx_packets = dev_stats.rx_packets;
1650 stats->rx_bytes = dev_stats.rx_bytes;
1651 stats->tx_packets = dev_stats.tx_packets;
1652 stats->tx_bytes = dev_stats.tx_bytes;
1653
f613a0d7
PS
1654 stats->rx_errors += dev_stats.rx_errors;
1655 stats->tx_errors += dev_stats.tx_errors;
1656 stats->rx_dropped += dev_stats.rx_dropped;
1657 stats->tx_dropped += dev_stats.tx_dropped;
1658 stats->multicast += dev_stats.multicast;
1659 stats->collisions += dev_stats.collisions;
1660 stats->rx_length_errors += dev_stats.rx_length_errors;
1661 stats->rx_over_errors += dev_stats.rx_over_errors;
1662 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1663 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1664 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1665 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1666 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1667 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1668 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1669 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1670 stats->tx_window_errors += dev_stats.tx_window_errors;
1671 }
86383816
BP
1672 ovs_mutex_unlock(&netdev->mutex);
1673
1674 return error;
f613a0d7
PS
1675}
1676
1677/* Retrieves current device stats for 'netdev-tap' netdev or
1678 * netdev-internal. */
1679static int
15aee116 1680netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
f613a0d7 1681{
b5d57fc8 1682 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1683 struct netdev_stats dev_stats;
1684 int error;
1685
86383816 1686 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1687 get_stats_via_vport(netdev_, stats);
35eef899 1688 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1689 if (error) {
86383816
BP
1690 if (!netdev->vport_stats_error) {
1691 error = 0;
8b61709d 1692 }
86383816
BP
1693 } else if (netdev->vport_stats_error) {
1694 /* Transmit and receive stats will appear to be swapped relative to the
1695 * other ports since we are the one sending the data, not a remote
1696 * computer. For consistency, we swap them back here. This does not
1697 * apply if we are getting stats from the vport layer because it always
1698 * tracks stats from the perspective of the switch. */
fe6b0e03 1699
f613a0d7 1700 *stats = dev_stats;
92df599c
JG
1701 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1702 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1703 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1704 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1705 stats->rx_length_errors = 0;
1706 stats->rx_over_errors = 0;
1707 stats->rx_crc_errors = 0;
1708 stats->rx_frame_errors = 0;
1709 stats->rx_fifo_errors = 0;
1710 stats->rx_missed_errors = 0;
1711 stats->tx_aborted_errors = 0;
1712 stats->tx_carrier_errors = 0;
1713 stats->tx_fifo_errors = 0;
1714 stats->tx_heartbeat_errors = 0;
1715 stats->tx_window_errors = 0;
f613a0d7 1716 } else {
04c881eb
AZ
1717 /* Use kernel netdev's packet and byte counts since vport counters
1718 * do not reflect packet counts on the wire when GSO, TSO or GRO
1719 * are enabled. */
1720 stats->rx_packets = dev_stats.tx_packets;
1721 stats->rx_bytes = dev_stats.tx_bytes;
1722 stats->tx_packets = dev_stats.rx_packets;
1723 stats->tx_bytes = dev_stats.rx_bytes;
1724
f613a0d7
PS
1725 stats->rx_dropped += dev_stats.tx_dropped;
1726 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1727
f613a0d7
PS
1728 stats->rx_errors += dev_stats.tx_errors;
1729 stats->tx_errors += dev_stats.rx_errors;
1730
1731 stats->multicast += dev_stats.multicast;
1732 stats->collisions += dev_stats.collisions;
1733 }
86383816
BP
1734 ovs_mutex_unlock(&netdev->mutex);
1735
1736 return error;
8b61709d
BP
1737}
1738
bba1e6f3
PS
1739static int
1740netdev_internal_get_stats(const struct netdev *netdev_,
1741 struct netdev_stats *stats)
1742{
b5d57fc8 1743 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1744 int error;
bba1e6f3 1745
86383816 1746 ovs_mutex_lock(&netdev->mutex);
bba1e6f3 1747 get_stats_via_vport(netdev_, stats);
86383816
BP
1748 error = netdev->vport_stats_error;
1749 ovs_mutex_unlock(&netdev->mutex);
1750
1751 return error;
bba1e6f3
PS
1752}
1753
51f87458 1754static void
b5d57fc8 1755netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
1756{
1757 struct ethtool_cmd ecmd;
6c038611 1758 uint32_t speed;
8b61709d
BP
1759 int error;
1760
b5d57fc8 1761 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
1762 return;
1763 }
1764
ab985a77 1765 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1766 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 1767 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
1768 ETHTOOL_GSET, "ETHTOOL_GSET");
1769 if (error) {
51f87458 1770 goto out;
8b61709d
BP
1771 }
1772
1773 /* Supported features. */
b5d57fc8 1774 netdev->supported = 0;
8b61709d 1775 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 1776 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
1777 }
1778 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 1779 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
1780 }
1781 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 1782 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
1783 }
1784 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 1785 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
1786 }
1787 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 1788 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d
BP
1789 }
1790 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
b5d57fc8 1791 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d
BP
1792 }
1793 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
b5d57fc8 1794 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d
BP
1795 }
1796 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 1797 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
1798 }
1799 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 1800 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
1801 }
1802 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 1803 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
1804 }
1805 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 1806 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
1807 }
1808 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 1809 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1810 }
1811
1812 /* Advertised features. */
b5d57fc8 1813 netdev->advertised = 0;
8b61709d 1814 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 1815 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
1816 }
1817 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 1818 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
1819 }
1820 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 1821 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
1822 }
1823 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 1824 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
1825 }
1826 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 1827 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d
BP
1828 }
1829 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
b5d57fc8 1830 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d
BP
1831 }
1832 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
b5d57fc8 1833 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d
BP
1834 }
1835 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 1836 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
1837 }
1838 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 1839 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
1840 }
1841 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 1842 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
1843 }
1844 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 1845 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
1846 }
1847 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 1848 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1849 }
1850
1851 /* Current settings. */
2a529ead 1852 speed = ecmd.speed;
6c038611 1853 if (speed == SPEED_10) {
b5d57fc8 1854 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 1855 } else if (speed == SPEED_100) {
b5d57fc8 1856 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 1857 } else if (speed == SPEED_1000) {
b5d57fc8 1858 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 1859 } else if (speed == SPEED_10000) {
b5d57fc8 1860 netdev->current = NETDEV_F_10GB_FD;
6c038611 1861 } else if (speed == 40000) {
b5d57fc8 1862 netdev->current = NETDEV_F_40GB_FD;
6c038611 1863 } else if (speed == 100000) {
b5d57fc8 1864 netdev->current = NETDEV_F_100GB_FD;
6c038611 1865 } else if (speed == 1000000) {
b5d57fc8 1866 netdev->current = NETDEV_F_1TB_FD;
8b61709d 1867 } else {
b5d57fc8 1868 netdev->current = 0;
8b61709d
BP
1869 }
1870
1871 if (ecmd.port == PORT_TP) {
b5d57fc8 1872 netdev->current |= NETDEV_F_COPPER;
8b61709d 1873 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 1874 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
1875 }
1876
1877 if (ecmd.autoneg) {
b5d57fc8 1878 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
1879 }
1880
51f87458 1881out:
b5d57fc8
BP
1882 netdev->cache_valid |= VALID_FEATURES;
1883 netdev->get_features_error = error;
51f87458
PS
1884}
1885
887ed8b2
BP
1886/* Stores the features supported by 'netdev' into of '*current', '*advertised',
1887 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1888 * Returns 0 if successful, otherwise a positive errno value. */
51f87458
PS
1889static int
1890netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
1891 enum netdev_features *current,
1892 enum netdev_features *advertised,
1893 enum netdev_features *supported,
1894 enum netdev_features *peer)
51f87458 1895{
b5d57fc8 1896 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1897 int error;
51f87458 1898
86383816 1899 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1900 netdev_linux_read_features(netdev);
b5d57fc8
BP
1901 if (!netdev->get_features_error) {
1902 *current = netdev->current;
1903 *advertised = netdev->advertised;
1904 *supported = netdev->supported;
887ed8b2 1905 *peer = 0; /* XXX */
51f87458 1906 }
86383816
BP
1907 error = netdev->get_features_error;
1908 ovs_mutex_unlock(&netdev->mutex);
1909
1910 return error;
8b61709d
BP
1911}
1912
1913/* Set the features advertised by 'netdev' to 'advertise'. */
1914static int
86383816 1915netdev_linux_set_advertisements(struct netdev *netdev_,
6c038611 1916 enum netdev_features advertise)
8b61709d 1917{
86383816 1918 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
1919 struct ethtool_cmd ecmd;
1920 int error;
1921
86383816
BP
1922 ovs_mutex_lock(&netdev->mutex);
1923
ab985a77 1924 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1925 memset(&ecmd, 0, sizeof ecmd);
86383816 1926 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
8b61709d
BP
1927 ETHTOOL_GSET, "ETHTOOL_GSET");
1928 if (error) {
86383816 1929 goto exit;
8b61709d
BP
1930 }
1931
1932 ecmd.advertising = 0;
6c038611 1933 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
1934 ecmd.advertising |= ADVERTISED_10baseT_Half;
1935 }
6c038611 1936 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
1937 ecmd.advertising |= ADVERTISED_10baseT_Full;
1938 }
6c038611 1939 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
1940 ecmd.advertising |= ADVERTISED_100baseT_Half;
1941 }
6c038611 1942 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
1943 ecmd.advertising |= ADVERTISED_100baseT_Full;
1944 }
6c038611 1945 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
1946 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1947 }
6c038611 1948 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
1949 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1950 }
6c038611 1951 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
1952 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1953 }
6c038611 1954 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
1955 ecmd.advertising |= ADVERTISED_TP;
1956 }
6c038611 1957 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
1958 ecmd.advertising |= ADVERTISED_FIBRE;
1959 }
6c038611 1960 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
1961 ecmd.advertising |= ADVERTISED_Autoneg;
1962 }
6c038611 1963 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
1964 ecmd.advertising |= ADVERTISED_Pause;
1965 }
6c038611 1966 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
1967 ecmd.advertising |= ADVERTISED_Asym_Pause;
1968 }
ab985a77 1969 COVERAGE_INC(netdev_set_ethtool);
86383816
BP
1970 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1971 ETHTOOL_SSET, "ETHTOOL_SSET");
1972
1973exit:
1974 ovs_mutex_unlock(&netdev->mutex);
1975 return error;
8b61709d
BP
1976}
1977
f8500004
JP
1978/* Attempts to set input rate limiting (policing) policy. Returns 0 if
1979 * successful, otherwise a positive errno value. */
8b61709d 1980static int
b5d57fc8 1981netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
1982 uint32_t kbits_rate, uint32_t kbits_burst)
1983{
b5d57fc8
BP
1984 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1985 const char *netdev_name = netdev_get_name(netdev_);
f8500004 1986 int error;
8b61709d 1987
80a86fbe
BP
1988 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1989 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1990 : kbits_burst); /* Stick with user-specified value. */
1991
86383816 1992 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1993 if (netdev->cache_valid & VALID_POLICING) {
86383816
BP
1994 error = netdev->netdev_policing_error;
1995 if (error || (netdev->kbits_rate == kbits_rate &&
1996 netdev->kbits_burst == kbits_burst)) {
c9f71668 1997 /* Assume that settings haven't changed since we last set them. */
86383816 1998 goto out;
c9f71668 1999 }
b5d57fc8 2000 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
2001 }
2002
ac8c3412 2003 COVERAGE_INC(netdev_set_policing);
f8500004 2004 /* Remove any existing ingress qdisc. */
b5d57fc8 2005 error = tc_add_del_ingress_qdisc(netdev_, false);
f8500004
JP
2006 if (error) {
2007 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 2008 netdev_name, ovs_strerror(error));
c9f71668 2009 goto out;
f8500004
JP
2010 }
2011
8b61709d 2012 if (kbits_rate) {
b5d57fc8 2013 error = tc_add_del_ingress_qdisc(netdev_, true);
f8500004
JP
2014 if (error) {
2015 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 2016 netdev_name, ovs_strerror(error));
c9f71668 2017 goto out;
8b61709d
BP
2018 }
2019
b5d57fc8 2020 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
2021 if (error){
2022 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 2023 netdev_name, ovs_strerror(error));
c9f71668 2024 goto out;
8b61709d 2025 }
8b61709d
BP
2026 }
2027
b5d57fc8
BP
2028 netdev->kbits_rate = kbits_rate;
2029 netdev->kbits_burst = kbits_burst;
f8500004 2030
c9f71668
PS
2031out:
2032 if (!error || error == ENODEV) {
b5d57fc8
BP
2033 netdev->netdev_policing_error = error;
2034 netdev->cache_valid |= VALID_POLICING;
c9f71668 2035 }
86383816 2036 ovs_mutex_unlock(&netdev->mutex);
c9f71668 2037 return error;
8b61709d
BP
2038}
2039
c1c9c9c4
BP
2040static int
2041netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 2042 struct sset *types)
c1c9c9c4 2043{
559eb230 2044 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2045
2046 for (opsp = tcs; *opsp != NULL; opsp++) {
2047 const struct tc_ops *ops = *opsp;
2048 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 2049 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
2050 }
2051 }
2052 return 0;
2053}
2054
2055static const struct tc_ops *
2056tc_lookup_ovs_name(const char *name)
2057{
559eb230 2058 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2059
2060 for (opsp = tcs; *opsp != NULL; opsp++) {
2061 const struct tc_ops *ops = *opsp;
2062 if (!strcmp(name, ops->ovs_name)) {
2063 return ops;
2064 }
2065 }
2066 return NULL;
2067}
2068
2069static const struct tc_ops *
2070tc_lookup_linux_name(const char *name)
2071{
559eb230 2072 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2073
2074 for (opsp = tcs; *opsp != NULL; opsp++) {
2075 const struct tc_ops *ops = *opsp;
2076 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2077 return ops;
2078 }
2079 }
2080 return NULL;
2081}
2082
93b13be8 2083static struct tc_queue *
b5d57fc8 2084tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
2085 size_t hash)
2086{
b5d57fc8 2087 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
2088 struct tc_queue *queue;
2089
b5d57fc8 2090 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
2091 if (queue->queue_id == queue_id) {
2092 return queue;
2093 }
2094 }
2095 return NULL;
2096}
2097
/* Returns the queue with ID 'queue_id' on 'netdev', or a null pointer if
 * there is none.  The lookup hash is hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
2103
c1c9c9c4
BP
2104static int
2105netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2106 const char *type,
2107 struct netdev_qos_capabilities *caps)
2108{
2109 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2110 if (!ops) {
2111 return EOPNOTSUPP;
2112 }
2113 caps->n_queues = ops->n_queues;
2114 return 0;
2115}
2116
2117static int
b5d57fc8 2118netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 2119 const char **typep, struct smap *details)
c1c9c9c4 2120{
b5d57fc8 2121 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2122 int error;
2123
86383816 2124 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2125 error = tc_query_qdisc(netdev_);
86383816
BP
2126 if (!error) {
2127 *typep = netdev->tc->ops->ovs_name;
2128 error = (netdev->tc->ops->qdisc_get
2129 ? netdev->tc->ops->qdisc_get(netdev_, details)
2130 : 0);
c1c9c9c4 2131 }
86383816 2132 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2133
86383816 2134 return error;
c1c9c9c4
BP
2135}
2136
2137static int
b5d57fc8 2138netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 2139 const char *type, const struct smap *details)
c1c9c9c4 2140{
b5d57fc8 2141 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2142 const struct tc_ops *new_ops;
2143 int error;
2144
2145 new_ops = tc_lookup_ovs_name(type);
2146 if (!new_ops || !new_ops->tc_install) {
2147 return EOPNOTSUPP;
2148 }
2149
86383816 2150 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2151 error = tc_query_qdisc(netdev_);
c1c9c9c4 2152 if (error) {
86383816 2153 goto exit;
c1c9c9c4
BP
2154 }
2155
b5d57fc8 2156 if (new_ops == netdev->tc->ops) {
86383816 2157 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
2158 } else {
2159 /* Delete existing qdisc. */
b5d57fc8 2160 error = tc_del_qdisc(netdev_);
c1c9c9c4 2161 if (error) {
86383816 2162 goto exit;
c1c9c9c4 2163 }
b5d57fc8 2164 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
2165
2166 /* Install new qdisc. */
b5d57fc8
BP
2167 error = new_ops->tc_install(netdev_, details);
2168 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4 2169 }
86383816
BP
2170
2171exit:
2172 ovs_mutex_unlock(&netdev->mutex);
2173 return error;
c1c9c9c4
BP
2174}
2175
2176static int
b5d57fc8 2177netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 2178 unsigned int queue_id, struct smap *details)
c1c9c9c4 2179{
b5d57fc8 2180 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2181 int error;
2182
86383816 2183 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2184 error = tc_query_qdisc(netdev_);
86383816 2185 if (!error) {
b5d57fc8 2186 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
86383816 2187 error = (queue
b5d57fc8 2188 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 2189 : ENOENT);
c1c9c9c4 2190 }
86383816
BP
2191 ovs_mutex_unlock(&netdev->mutex);
2192
2193 return error;
c1c9c9c4
BP
2194}
2195
2196static int
b5d57fc8 2197netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 2198 unsigned int queue_id, const struct smap *details)
c1c9c9c4 2199{
b5d57fc8 2200 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2201 int error;
2202
86383816 2203 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2204 error = tc_query_qdisc(netdev_);
86383816
BP
2205 if (!error) {
2206 error = (queue_id < netdev->tc->ops->n_queues
2207 && netdev->tc->ops->class_set
2208 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2209 : EINVAL);
c1c9c9c4 2210 }
86383816 2211 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2212
86383816 2213 return error;
c1c9c9c4
BP
2214}
2215
2216static int
b5d57fc8 2217netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 2218{
b5d57fc8 2219 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2220 int error;
2221
86383816 2222 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2223 error = tc_query_qdisc(netdev_);
86383816
BP
2224 if (!error) {
2225 if (netdev->tc->ops->class_delete) {
2226 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2227 error = (queue
2228 ? netdev->tc->ops->class_delete(netdev_, queue)
2229 : ENOENT);
2230 } else {
2231 error = EINVAL;
2232 }
c1c9c9c4 2233 }
86383816
BP
2234 ovs_mutex_unlock(&netdev->mutex);
2235
2236 return error;
c1c9c9c4
BP
2237}
2238
2239static int
b5d57fc8 2240netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2241 unsigned int queue_id,
2242 struct netdev_queue_stats *stats)
2243{
b5d57fc8 2244 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2245 int error;
2246
86383816 2247 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2248 error = tc_query_qdisc(netdev_);
86383816
BP
2249 if (!error) {
2250 if (netdev->tc->ops->class_get_stats) {
2251 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2252 if (queue) {
2253 stats->created = queue->created;
2254 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2255 stats);
2256 } else {
2257 error = ENOENT;
2258 }
2259 } else {
2260 error = EOPNOTSUPP;
6dc34a0d 2261 }
c1c9c9c4 2262 }
86383816
BP
2263 ovs_mutex_unlock(&netdev->mutex);
2264
2265 return error;
c1c9c9c4
BP
2266}
2267
d57695d7
JS
2268struct queue_dump_state {
2269 struct nl_dump dump;
2270 struct ofpbuf buf;
2271};
2272
23a98ffe 2273static bool
d57695d7 2274start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
c1c9c9c4
BP
2275{
2276 struct ofpbuf request;
2277 struct tcmsg *tcmsg;
2278
2279 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2280 if (!tcmsg) {
2281 return false;
2282 }
3c4de644 2283 tcmsg->tcm_parent = 0;
d57695d7 2284 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
c1c9c9c4 2285 ofpbuf_uninit(&request);
d57695d7
JS
2286
2287 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
23a98ffe 2288 return true;
c1c9c9c4
BP
2289}
2290
d57695d7
JS
2291static int
2292finish_queue_dump(struct queue_dump_state *state)
2293{
2294 ofpbuf_uninit(&state->buf);
2295 return nl_dump_done(&state->dump);
2296}
2297
89454bf4
BP
/* Iteration state for the netdev_linux_queue_dump_*() functions: a snapshot
 * of queue IDs taken when the dump starts, plus a cursor into it. */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Array of 'n_queues' queue IDs. */
    size_t cur_queue;           /* Index of the next ID to visit. */
    size_t n_queues;            /* Number of elements in 'queues'. */
};
2303
c1c9c9c4 2304static int
89454bf4 2305netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
c1c9c9c4 2306{
89454bf4 2307 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2308 int error;
2309
86383816 2310 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2311 error = tc_query_qdisc(netdev_);
86383816
BP
2312 if (!error) {
2313 if (netdev->tc->ops->class_get) {
89454bf4
BP
2314 struct netdev_linux_queue_state *state;
2315 struct tc_queue *queue;
2316 size_t i;
2317
2318 *statep = state = xmalloc(sizeof *state);
2319 state->n_queues = hmap_count(&netdev->tc->queues);
2320 state->cur_queue = 0;
2321 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2322
2323 i = 0;
2324 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2325 state->queues[i++] = queue->queue_id;
86383816 2326 }
c1c9c9c4 2327 } else {
86383816 2328 error = EOPNOTSUPP;
c1c9c9c4
BP
2329 }
2330 }
86383816 2331 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2332
86383816 2333 return error;
c1c9c9c4
BP
2334}
2335
89454bf4
BP
2336static int
2337netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2338 unsigned int *queue_idp, struct smap *details)
2339{
2340 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2341 struct netdev_linux_queue_state *state = state_;
2342 int error = EOF;
2343
2344 ovs_mutex_lock(&netdev->mutex);
2345 while (state->cur_queue < state->n_queues) {
2346 unsigned int queue_id = state->queues[state->cur_queue++];
2347 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2348
2349 if (queue) {
2350 *queue_idp = queue_id;
2351 error = netdev->tc->ops->class_get(netdev_, queue, details);
2352 break;
2353 }
2354 }
2355 ovs_mutex_unlock(&netdev->mutex);
2356
2357 return error;
2358}
2359
2360static int
2361netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2362 void *state_)
2363{
2364 struct netdev_linux_queue_state *state = state_;
2365
2366 free(state->queues);
2367 free(state);
2368 return 0;
2369}
2370
c1c9c9c4 2371static int
b5d57fc8 2372netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2373 netdev_dump_queue_stats_cb *cb, void *aux)
2374{
b5d57fc8 2375 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2376 int error;
2377
86383816 2378 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2379 error = tc_query_qdisc(netdev_);
86383816 2380 if (!error) {
d57695d7 2381 struct queue_dump_state state;
c1c9c9c4 2382
86383816
BP
2383 if (!netdev->tc->ops->class_dump_stats) {
2384 error = EOPNOTSUPP;
d57695d7 2385 } else if (!start_queue_dump(netdev_, &state)) {
86383816
BP
2386 error = ENODEV;
2387 } else {
2388 struct ofpbuf msg;
2389 int retval;
2390
d57695d7 2391 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
86383816
BP
2392 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2393 cb, aux);
2394 if (retval) {
2395 error = retval;
2396 }
2397 }
2398
d57695d7 2399 retval = finish_queue_dump(&state);
86383816
BP
2400 if (retval) {
2401 error = retval;
2402 }
c1c9c9c4
BP
2403 }
2404 }
86383816 2405 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2406
86383816 2407 return error;
c1c9c9c4
BP
2408}
2409
8b61709d 2410static int
f1acd62b
BP
2411netdev_linux_get_in4(const struct netdev *netdev_,
2412 struct in_addr *address, struct in_addr *netmask)
8b61709d 2413{
b5d57fc8 2414 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2415 int error;
149f577a 2416
86383816 2417 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2418 if (!(netdev->cache_valid & VALID_IN4)) {
b5d57fc8 2419 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
8b61709d 2420 SIOCGIFADDR, "SIOCGIFADDR");
86383816
BP
2421 if (!error) {
2422 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2423 SIOCGIFNETMASK, "SIOCGIFNETMASK");
8b61709d 2424 }
49af9a3d
AW
2425 netdev->in4_error = error;
2426 netdev->cache_valid |= VALID_IN4;
86383816 2427 } else {
49af9a3d 2428 error = netdev->in4_error;
86383816 2429 }
8b61709d 2430
86383816
BP
2431 if (!error) {
2432 if (netdev->address.s_addr != INADDR_ANY) {
2433 *address = netdev->address;
2434 *netmask = netdev->netmask;
2435 } else {
2436 error = EADDRNOTAVAIL;
f1acd62b 2437 }
8b61709d 2438 }
86383816
BP
2439 ovs_mutex_unlock(&netdev->mutex);
2440
2441 return error;
8b61709d
BP
2442}
2443
8b61709d 2444static int
f1acd62b
BP
2445netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2446 struct in_addr netmask)
8b61709d 2447{
b5d57fc8 2448 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2449 int error;
2450
86383816 2451 ovs_mutex_lock(&netdev->mutex);
f1acd62b 2452 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2453 if (!error) {
b5d57fc8
BP
2454 netdev->address = address;
2455 netdev->netmask = netmask;
f1acd62b 2456 if (address.s_addr != INADDR_ANY) {
8b61709d 2457 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2458 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2459 }
2460 }
49af9a3d
AW
2461
2462 if (!error) {
2463 netdev->cache_valid |= VALID_IN4;
2464 netdev->in4_error = 0;
2465 } else {
2466 netdev->cache_valid &= ~VALID_IN4;
2467 }
86383816
BP
2468 ovs_mutex_unlock(&netdev->mutex);
2469
8b61709d
BP
2470 return error;
2471}
2472
/* Parses 'line' with ovs_scan() as 16 two-digit hex bytes, four further hex
 * fields that are discarded, and an interface name of at most 16 characters.
 * Stores the bytes in 'in6' and the name in 'ifname'.  Returns the result of
 * ovs_scan(). */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *b = in6->s6_addr;
    bool ok;

#define X8 "%2"SCNx8
    ok = ovs_scan(line,
                  " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                  "%*x %*x %*x %*x %16s\n",
                  &b[0], &b[1], &b[2], &b[3],
                  &b[4], &b[5], &b[6], &b[7],
                  &b[8], &b[9], &b[10], &b[11],
                  &b[12], &b[13], &b[14], &b[15],
                  ifname);
    return ok;
}
2488
7df6932e
AW
2489/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2490 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2491 * error. */
8b61709d
BP
2492static int
2493netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2494{
b5d57fc8 2495 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7df6932e 2496 int error;
86383816
BP
2497
2498 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2499 if (!(netdev->cache_valid & VALID_IN6)) {
8b61709d
BP
2500 FILE *file;
2501 char line[128];
2502
b5d57fc8 2503 netdev->in6 = in6addr_any;
7df6932e 2504 netdev->in6_error = EADDRNOTAVAIL;
8b61709d
BP
2505
2506 file = fopen("/proc/net/if_inet6", "r");
2507 if (file != NULL) {
2508 const char *name = netdev_get_name(netdev_);
2509 while (fgets(line, sizeof line, file)) {
2a022368 2510 struct in6_addr in6_tmp;
8b61709d 2511 char ifname[16 + 1];
2a022368 2512 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
2513 && !strcmp(name, ifname))
2514 {
b5d57fc8 2515 netdev->in6 = in6_tmp;
7df6932e 2516 netdev->in6_error = 0;
8b61709d
BP
2517 break;
2518 }
2519 }
2520 fclose(file);
7df6932e
AW
2521 } else {
2522 netdev->in6_error = EOPNOTSUPP;
8b61709d 2523 }
b5d57fc8 2524 netdev->cache_valid |= VALID_IN6;
8b61709d 2525 }
b5d57fc8 2526 *in6 = netdev->in6;
7df6932e 2527 error = netdev->in6_error;
86383816
BP
2528 ovs_mutex_unlock(&netdev->mutex);
2529
7df6932e 2530 return error;
8b61709d
BP
2531}
2532
/* Initializes '*sa' as an AF_INET socket address for 'addr' with port 0.
 * '*sa' is zeroed first, then overwritten with a fully zero-initialized
 * 'struct sockaddr_in' carrying the family and address. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2545
2546static int
2547do_set_addr(struct netdev *netdev,
2548 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2549{
2550 struct ifreq ifr;
149f577a 2551
259e0b1a
BP
2552 make_in4_sockaddr(&ifr.ifr_addr, addr);
2553 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2554 ioctl_name);
8b61709d
BP
2555}
2556
2557/* Adds 'router' as a default IP gateway. */
2558static int
67a4917b 2559netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2560{
2561 struct in_addr any = { INADDR_ANY };
2562 struct rtentry rt;
2563 int error;
2564
2565 memset(&rt, 0, sizeof rt);
2566 make_in4_sockaddr(&rt.rt_dst, any);
2567 make_in4_sockaddr(&rt.rt_gateway, router);
2568 make_in4_sockaddr(&rt.rt_genmask, any);
2569 rt.rt_flags = RTF_UP | RTF_GATEWAY;
259e0b1a 2570 error = af_inet_ioctl(SIOCADDRT, &rt);
8b61709d 2571 if (error) {
10a89ef0 2572 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
2573 }
2574 return error;
2575}
2576
f1acd62b
BP
2577static int
2578netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2579 char **netdev_name)
2580{
2581 static const char fn[] = "/proc/net/route";
2582 FILE *stream;
2583 char line[256];
2584 int ln;
2585
2586 *netdev_name = NULL;
2587 stream = fopen(fn, "r");
2588 if (stream == NULL) {
10a89ef0 2589 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
2590 return errno;
2591 }
2592
2593 ln = 0;
2594 while (fgets(line, sizeof line, stream)) {
2595 if (++ln >= 2) {
2596 char iface[17];
dbba996b 2597 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2598 int refcnt, metric, mtu;
2599 unsigned int flags, use, window, irtt;
2600
c2c28dfd
BP
2601 if (!ovs_scan(line,
2602 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2603 " %d %u %u\n",
2604 iface, &dest, &gateway, &flags, &refcnt,
2605 &use, &metric, &mask, &mtu, &window, &irtt)) {
d295e8e9 2606 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2607 fn, ln, line);
2608 continue;
2609 }
2610 if (!(flags & RTF_UP)) {
2611 /* Skip routes that aren't up. */
2612 continue;
2613 }
2614
2615 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2616 * network byte order, so we don't need need any endian
f1acd62b
BP
2617 * conversions here. */
2618 if ((dest & mask) == (host->s_addr & mask)) {
2619 if (!gateway) {
2620 /* The host is directly reachable. */
2621 next_hop->s_addr = 0;
2622 } else {
2623 /* To reach the host, we must go through a gateway. */
2624 next_hop->s_addr = gateway;
2625 }
2626 *netdev_name = xstrdup(iface);
2627 fclose(stream);
2628 return 0;
2629 }
2630 }
2631 }
2632
2633 fclose(stream);
2634 return ENXIO;
2635}
2636
e210037e 2637static int
b5d57fc8 2638netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 2639{
b5d57fc8 2640 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
2641 int error = 0;
2642
86383816 2643 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
2644 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2645 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
2646
2647 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
2648 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2649 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
2650 cmd,
2651 ETHTOOL_GDRVINFO,
2652 "ETHTOOL_GDRVINFO");
2653 if (!error) {
b5d57fc8 2654 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
2655 }
2656 }
e210037e 2657
e210037e 2658 if (!error) {
b5d57fc8
BP
2659 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2660 smap_add(smap, "driver_version", netdev->drvinfo.version);
2661 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 2662 }
86383816
BP
2663 ovs_mutex_unlock(&netdev->mutex);
2664
e210037e
AE
2665 return error;
2666}
2667
4f925bd3 2668static int
275707c3
EJ
2669netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2670 struct smap *smap)
4f925bd3 2671{
79f1cbe9 2672 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2673 return 0;
2674}
2675
8b61709d
BP
2676/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2677 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2678 * returns 0. Otherwise, it returns a positive errno value; in particular,
2679 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2680static int
2681netdev_linux_arp_lookup(const struct netdev *netdev,
74ff3298 2682 ovs_be32 ip, struct eth_addr *mac)
8b61709d
BP
2683{
2684 struct arpreq r;
c100e025 2685 struct sockaddr_in sin;
8b61709d
BP
2686 int retval;
2687
2688 memset(&r, 0, sizeof r);
f2cc621b 2689 memset(&sin, 0, sizeof sin);
c100e025
BP
2690 sin.sin_family = AF_INET;
2691 sin.sin_addr.s_addr = ip;
2692 sin.sin_port = 0;
2693 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2694 r.arp_ha.sa_family = ARPHRD_ETHER;
2695 r.arp_flags = 0;
71d7c22f 2696 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d 2697 COVERAGE_INC(netdev_arp_lookup);
259e0b1a 2698 retval = af_inet_ioctl(SIOCGARP, &r);
8b61709d
BP
2699 if (!retval) {
2700 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2701 } else if (retval != ENXIO) {
2702 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
2703 netdev_get_name(netdev), IP_ARGS(ip),
2704 ovs_strerror(retval));
8b61709d
BP
2705 }
2706 return retval;
2707}
2708
2709static int
2710nd_to_iff_flags(enum netdev_flags nd)
2711{
2712 int iff = 0;
2713 if (nd & NETDEV_UP) {
2714 iff |= IFF_UP;
2715 }
2716 if (nd & NETDEV_PROMISC) {
2717 iff |= IFF_PROMISC;
2718 }
7ba19d41
AC
2719 if (nd & NETDEV_LOOPBACK) {
2720 iff |= IFF_LOOPBACK;
2721 }
8b61709d
BP
2722 return iff;
2723}
2724
2725static int
2726iff_to_nd_flags(int iff)
2727{
2728 enum netdev_flags nd = 0;
2729 if (iff & IFF_UP) {
2730 nd |= NETDEV_UP;
2731 }
2732 if (iff & IFF_PROMISC) {
2733 nd |= NETDEV_PROMISC;
2734 }
7ba19d41
AC
2735 if (iff & IFF_LOOPBACK) {
2736 nd |= NETDEV_LOOPBACK;
2737 }
8b61709d
BP
2738 return nd;
2739}
2740
2741static int
4f9f3f21
BP
2742update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2743 enum netdev_flags on, enum netdev_flags *old_flagsp)
2744 OVS_REQUIRES(netdev->mutex)
8b61709d
BP
2745{
2746 int old_flags, new_flags;
c37d4da4
EJ
2747 int error = 0;
2748
b5d57fc8 2749 old_flags = netdev->ifi_flags;
c37d4da4
EJ
2750 *old_flagsp = iff_to_nd_flags(old_flags);
2751 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2752 if (new_flags != old_flags) {
4f9f3f21
BP
2753 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2754 get_flags(&netdev->up, &netdev->ifi_flags);
8b61709d 2755 }
4f9f3f21
BP
2756
2757 return error;
2758}
2759
2760static int
2761netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2762 enum netdev_flags on, enum netdev_flags *old_flagsp)
2763{
2764 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2765 int error;
2766
2767 ovs_mutex_lock(&netdev->mutex);
2768 error = update_flags(netdev, off, on, old_flagsp);
86383816
BP
2769 ovs_mutex_unlock(&netdev->mutex);
2770
8b61709d
BP
2771 return error;
2772}
2773
/* Expands to a positional initializer for a 'struct netdev_class' that is
 * shared by the Linux netdev flavors defined below.  Only the type name
 * (NAME) and the CONSTRUCT, GET_STATS, GET_FEATURES and GET_STATUS callbacks
 * vary between flavors; every other slot is fixed here. */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
                                                                \
    NULL,                                                       \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    netdev_linux_alloc,                                         \
    CONSTRUCT,                                                  \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
    NULL,                       /* build header */              \
    NULL,                       /* push header */               \
    NULL,                       /* pop header */                \
    NULL,                       /* get_numa_id */               \
    NULL,                       /* set_multiq */                \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \
}
2844
2845const struct netdev_class netdev_linux_class =
2846 NETDEV_LINUX_CLASS(
2847 "system",
9dc63482 2848 netdev_linux_construct,
f613a0d7 2849 netdev_linux_get_stats,
51f87458 2850 netdev_linux_get_features,
275707c3 2851 netdev_linux_get_status);
c3827f61
BP
2852
2853const struct netdev_class netdev_tap_class =
2854 NETDEV_LINUX_CLASS(
2855 "tap",
9dc63482 2856 netdev_linux_construct_tap,
bba1e6f3 2857 netdev_tap_get_stats,
51f87458 2858 netdev_linux_get_features,
275707c3 2859 netdev_linux_get_status);
c3827f61
BP
2860
2861const struct netdev_class netdev_internal_class =
2862 NETDEV_LINUX_CLASS(
2863 "internal",
9dc63482 2864 netdev_linux_construct,
bba1e6f3 2865 netdev_internal_get_stats,
51f87458 2866 NULL, /* get_features */
275707c3 2867 netdev_internal_get_status);
8b61709d 2868\f
677d9158
JV
2869
2870#define CODEL_N_QUEUES 0x0000
2871
2f4298ce
BP
2872/* In sufficiently new kernel headers these are defined as enums in
2873 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2874 * kernels. (This overrides any enum definition in the header file but that's
2875 * harmless.) */
2876#define TCA_CODEL_TARGET 1
2877#define TCA_CODEL_LIMIT 2
2878#define TCA_CODEL_INTERVAL 3
2879
677d9158
JV
2880struct codel {
2881 struct tc tc;
2882 uint32_t target;
2883 uint32_t limit;
2884 uint32_t interval;
2885};
2886
2887static struct codel *
2888codel_get__(const struct netdev *netdev_)
2889{
2890 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2891 return CONTAINER_OF(netdev->tc, struct codel, tc);
2892}
2893
2894static void
2895codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2896 uint32_t interval)
2897{
2898 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2899 struct codel *codel;
2900
2901 codel = xmalloc(sizeof *codel);
2902 tc_init(&codel->tc, &tc_ops_codel);
2903 codel->target = target;
2904 codel->limit = limit;
2905 codel->interval = interval;
2906
2907 netdev->tc = &codel->tc;
2908}
2909
2910static int
2911codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2912 uint32_t interval)
2913{
2914 size_t opt_offset;
2915 struct ofpbuf request;
2916 struct tcmsg *tcmsg;
2917 uint32_t otarget, olimit, ointerval;
2918 int error;
2919
2920 tc_del_qdisc(netdev);
2921
2922 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2923 NLM_F_EXCL | NLM_F_CREATE, &request);
2924 if (!tcmsg) {
2925 return ENODEV;
2926 }
2927 tcmsg->tcm_handle = tc_make_handle(1, 0);
2928 tcmsg->tcm_parent = TC_H_ROOT;
2929
2930 otarget = target ? target : 5000;
2931 olimit = limit ? limit : 10240;
2932 ointerval = interval ? interval : 100000;
2933
2934 nl_msg_put_string(&request, TCA_KIND, "codel");
2935 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2936 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2937 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2938 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2939 nl_msg_end_nested(&request, opt_offset);
2940
2941 error = tc_transact(&request, NULL);
2942 if (error) {
2943 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2944 "target %u, limit %u, interval %u error %d(%s)",
2945 netdev_get_name(netdev),
2946 otarget, olimit, ointerval,
2947 error, ovs_strerror(error));
2948 }
2949 return error;
2950}
2951
2952static void
2953codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2954 const struct smap *details, struct codel *codel)
2955{
2956 const char *target_s;
2957 const char *limit_s;
2958 const char *interval_s;
2959
2960 target_s = smap_get(details, "target");
2961 limit_s = smap_get(details, "limit");
2962 interval_s = smap_get(details, "interval");
2963
2964 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2965 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2966 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2967
2968 if (!codel->target) {
2969 codel->target = 5000;
2970 }
2971 if (!codel->limit) {
2972 codel->limit = 10240;
2973 }
2974 if (!codel->interval) {
2975 codel->interval = 100000;
2976 }
2977}
2978
2979static int
2980codel_tc_install(struct netdev *netdev, const struct smap *details)
2981{
2982 int error;
2983 struct codel codel;
2984
2985 codel_parse_qdisc_details__(netdev, details, &codel);
2986 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2987 codel.interval);
2988 if (!error) {
2989 codel_install__(netdev, codel.target, codel.limit, codel.interval);
2990 }
2991 return error;
2992}
2993
2994static int
2995codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
2996{
2997 static const struct nl_policy tca_codel_policy[] = {
2998 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
2999 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3000 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3001 };
3002
3003 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3004
3005 if (!nl_parse_nested(nl_options, tca_codel_policy,
3006 attrs, ARRAY_SIZE(tca_codel_policy))) {
3007 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3008 return EPROTO;
3009 }
3010
3011 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3012 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3013 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
3014 return 0;
3015}
3016
3017static int
3018codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3019{
3020 struct nlattr *nlattr;
3021 const char * kind;
3022 int error;
3023 struct codel codel;
3024
3025 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3026 if (error != 0) {
3027 return error;
3028 }
3029
3030 error = codel_parse_tca_options__(nlattr, &codel);
3031 if (error != 0) {
3032 return error;
3033 }
3034
3035 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3036 return 0;
3037}
3038
3039
3040static void
3041codel_tc_destroy(struct tc *tc)
3042{
3043 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3044 tc_destroy(tc);
3045 free(codel);
3046}
3047
3048static int
3049codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3050{
3051 const struct codel *codel = codel_get__(netdev);
3052 smap_add_format(details, "target", "%u", codel->target);
3053 smap_add_format(details, "limit", "%u", codel->limit);
3054 smap_add_format(details, "interval", "%u", codel->interval);
3055 return 0;
3056}
3057
3058static int
3059codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3060{
3061 struct codel codel;
3062
3063 codel_parse_qdisc_details__(netdev, details, &codel);
3064 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3065 codel_get__(netdev)->target = codel.target;
3066 codel_get__(netdev)->limit = codel.limit;
3067 codel_get__(netdev)->interval = codel.interval;
3068 return 0;
3069}
3070
3071static const struct tc_ops tc_ops_codel = {
3072 "codel", /* linux_name */
3073 "linux-codel", /* ovs_name */
3074 CODEL_N_QUEUES, /* n_queues */
3075 codel_tc_install,
3076 codel_tc_load,
3077 codel_tc_destroy,
3078 codel_qdisc_get,
3079 codel_qdisc_set,
3080 NULL,
3081 NULL,
3082 NULL,
3083 NULL,
3084 NULL
3085};
3086\f
3087/* FQ-CoDel traffic control class. */
3088
3089#define FQCODEL_N_QUEUES 0x0000
3090
2f4298ce
BP
3091/* In sufficiently new kernel headers these are defined as enums in
3092 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3093 * kernels. (This overrides any enum definition in the header file but that's
3094 * harmless.) */
3095#define TCA_FQ_CODEL_TARGET 1
3096#define TCA_FQ_CODEL_LIMIT 2
3097#define TCA_FQ_CODEL_INTERVAL 3
3098#define TCA_FQ_CODEL_ECN 4
3099#define TCA_FQ_CODEL_FLOWS 5
3100#define TCA_FQ_CODEL_QUANTUM 6
3101
677d9158
JV
3102struct fqcodel {
3103 struct tc tc;
3104 uint32_t target;
3105 uint32_t limit;
3106 uint32_t interval;
3107 uint32_t flows;
3108 uint32_t quantum;
3109};
3110
3111static struct fqcodel *
3112fqcodel_get__(const struct netdev *netdev_)
3113{
3114 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3115 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3116}
3117
3118static void
3119fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3120 uint32_t interval, uint32_t flows, uint32_t quantum)
3121{
3122 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3123 struct fqcodel *fqcodel;
3124
3125 fqcodel = xmalloc(sizeof *fqcodel);
3126 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3127 fqcodel->target = target;
3128 fqcodel->limit = limit;
3129 fqcodel->interval = interval;
3130 fqcodel->flows = flows;
3131 fqcodel->quantum = quantum;
3132
3133 netdev->tc = &fqcodel->tc;
3134}
3135
3136static int
3137fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3138 uint32_t interval, uint32_t flows, uint32_t quantum)
3139{
3140 size_t opt_offset;
3141 struct ofpbuf request;
3142 struct tcmsg *tcmsg;
3143 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3144 int error;
3145
3146 tc_del_qdisc(netdev);
3147
3148 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3149 NLM_F_EXCL | NLM_F_CREATE, &request);
3150 if (!tcmsg) {
3151 return ENODEV;
3152 }
3153 tcmsg->tcm_handle = tc_make_handle(1, 0);
3154 tcmsg->tcm_parent = TC_H_ROOT;
3155
3156 otarget = target ? target : 5000;
3157 olimit = limit ? limit : 10240;
3158 ointerval = interval ? interval : 100000;
3159 oflows = flows ? flows : 1024;
3160 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3161 not mtu */
3162
3163 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3164 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3165 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3166 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3167 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3168 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3169 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3170 nl_msg_end_nested(&request, opt_offset);
3171
3172 error = tc_transact(&request, NULL);
3173 if (error) {
3174 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3175 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3176 netdev_get_name(netdev),
3177 otarget, olimit, ointerval, oflows, oquantum,
3178 error, ovs_strerror(error));
3179 }
3180 return error;
3181}
3182
3183static void
3184fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3185 const struct smap *details, struct fqcodel *fqcodel)
3186{
3187 const char *target_s;
3188 const char *limit_s;
3189 const char *interval_s;
3190 const char *flows_s;
3191 const char *quantum_s;
3192
3193 target_s = smap_get(details, "target");
3194 limit_s = smap_get(details, "limit");
3195 interval_s = smap_get(details, "interval");
3196 flows_s = smap_get(details, "flows");
3197 quantum_s = smap_get(details, "quantum");
3198 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3199 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3200 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3201 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3202 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3203 if (!fqcodel->target) {
3204 fqcodel->target = 5000;
3205 }
3206 if (!fqcodel->limit) {
3207 fqcodel->limit = 10240;
3208 }
3209 if (!fqcodel->interval) {
3210 fqcodel->interval = 1000000;
3211 }
3212 if (!fqcodel->flows) {
3213 fqcodel->flows = 1024;
3214 }
3215 if (!fqcodel->quantum) {
3216 fqcodel->quantum = 1514;
3217 }
3218}
3219
3220static int
3221fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3222{
3223 int error;
3224 struct fqcodel fqcodel;
3225
3226 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3227 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3228 fqcodel.interval, fqcodel.flows,
3229 fqcodel.quantum);
3230 if (!error) {
3231 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3232 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3233 }
3234 return error;
3235}
3236
3237static int
3238fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3239{
3240 static const struct nl_policy tca_fqcodel_policy[] = {
3241 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3242 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3243 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3244 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3245 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3246 };
3247
3248 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3249
3250 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3251 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3252 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3253 return EPROTO;
3254 }
3255
3256 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3257 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3258 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3259 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3260 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3261 return 0;
3262}
3263
3264static int
3265fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3266{
3267 struct nlattr *nlattr;
3268 const char * kind;
3269 int error;
3270 struct fqcodel fqcodel;
3271
3272 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3273 if (error != 0) {
3274 return error;
3275 }
3276
3277 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3278 if (error != 0) {
3279 return error;
3280 }
3281
3282 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3283 fqcodel.flows, fqcodel.quantum);
3284 return 0;
3285}
3286
3287static void
3288fqcodel_tc_destroy(struct tc *tc)
3289{
3290 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3291 tc_destroy(tc);
3292 free(fqcodel);
3293}
3294
3295static int
3296fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3297{
3298 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3299 smap_add_format(details, "target", "%u", fqcodel->target);
3300 smap_add_format(details, "limit", "%u", fqcodel->limit);
3301 smap_add_format(details, "interval", "%u", fqcodel->interval);
3302 smap_add_format(details, "flows", "%u", fqcodel->flows);
3303 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3304 return 0;
3305}
3306
3307static int
3308fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3309{
3310 struct fqcodel fqcodel;
3311
3312 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3313 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3314 fqcodel.flows, fqcodel.quantum);
3315 fqcodel_get__(netdev)->target = fqcodel.target;
3316 fqcodel_get__(netdev)->limit = fqcodel.limit;
3317 fqcodel_get__(netdev)->interval = fqcodel.interval;
3318 fqcodel_get__(netdev)->flows = fqcodel.flows;
3319 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3320 return 0;
3321}
3322
3323static const struct tc_ops tc_ops_fqcodel = {
3324 "fq_codel", /* linux_name */
3325 "linux-fq_codel", /* ovs_name */
3326 FQCODEL_N_QUEUES, /* n_queues */
3327 fqcodel_tc_install,
3328 fqcodel_tc_load,
3329 fqcodel_tc_destroy,
3330 fqcodel_qdisc_get,
3331 fqcodel_qdisc_set,
3332 NULL,
3333 NULL,
3334 NULL,
3335 NULL,
3336 NULL
3337};
3338\f
3339/* SFQ traffic control class. */
3340
3341#define SFQ_N_QUEUES 0x0000
3342
3343struct sfq {
3344 struct tc tc;
3345 uint32_t quantum;
3346 uint32_t perturb;
3347};
3348
3349static struct sfq *
3350sfq_get__(const struct netdev *netdev_)
3351{
3352 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3353 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3354}
3355
3356static void
3357sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3358{
3359 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3360 struct sfq *sfq;
3361
3362 sfq = xmalloc(sizeof *sfq);
3363 tc_init(&sfq->tc, &tc_ops_sfq);
3364 sfq->perturb = perturb;
3365 sfq->quantum = quantum;
3366
3367 netdev->tc = &sfq->tc;
3368}
3369
3370static int
3371sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3372{
3373 struct tc_sfq_qopt opt;
3374 struct ofpbuf request;
3375 struct tcmsg *tcmsg;
3376 int mtu;
3377 int mtu_error, error;
3378 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3379
3380 tc_del_qdisc(netdev);
3381
3382 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3383 NLM_F_EXCL | NLM_F_CREATE, &request);
3384 if (!tcmsg) {
3385 return ENODEV;
3386 }
3387 tcmsg->tcm_handle = tc_make_handle(1, 0);
3388 tcmsg->tcm_parent = TC_H_ROOT;
3389
3390 memset(&opt, 0, sizeof opt);
3391 if (!quantum) {
3392 if (!mtu_error) {
3393 opt.quantum = mtu; /* if we cannot find mtu, use default */
3394 }
3395 } else {
3396 opt.quantum = quantum;
3397 }
3398
3399 if (!perturb) {
3400 opt.perturb_period = 10;
3401 } else {
3402 opt.perturb_period = perturb;
3403 }
3404
3405 nl_msg_put_string(&request, TCA_KIND, "sfq");
3406 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3407
3408 error = tc_transact(&request, NULL);
3409 if (error) {
3410 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3411 "quantum %u, perturb %u error %d(%s)",
3412 netdev_get_name(netdev),
3413 opt.quantum, opt.perturb_period,
3414 error, ovs_strerror(error));
3415 }
3416 return error;
3417}
3418
3419static void
3420sfq_parse_qdisc_details__(struct netdev *netdev,
3421 const struct smap *details, struct sfq *sfq)
3422{
3423 const char *perturb_s;
3424 const char *quantum_s;
3425 int mtu;
3426 int mtu_error;
3427
3428 perturb_s = smap_get(details, "perturb");
3429 quantum_s = smap_get(details, "quantum");
3430 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3431 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3432 if (!sfq->perturb) {
3433 sfq->perturb = 10;
3434 }
3435
3436 if (!sfq->quantum) {
3437 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3438 if (!mtu_error) {
3439 sfq->quantum = mtu;
3440 } else {
3441 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3442 "device without mtu");
3443 return;
3444 }
3445 }
3446}
3447
3448static int
3449sfq_tc_install(struct netdev *netdev, const struct smap *details)
3450{
3451 int error;
3452 struct sfq sfq;
3453
3454 sfq_parse_qdisc_details__(netdev, details, &sfq);
3455 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3456 if (!error) {
3457 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3458 }
3459 return error;
3460}
3461
3462static int
3463sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3464{
3465 const struct tc_sfq_qopt *sfq;
3466 struct nlattr *nlattr;
3467 const char * kind;
3468 int error;
3469
3470 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3471 if (error == 0) {
3472 sfq = nl_attr_get(nlattr);
3473 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
3474 return 0;
3475 }
3476
3477 return error;
3478}
3479
3480static void
3481sfq_tc_destroy(struct tc *tc)
3482{
3483 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3484 tc_destroy(tc);
3485 free(sfq);
3486}
3487
3488static int
3489sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3490{
3491 const struct sfq *sfq = sfq_get__(netdev);
3492 smap_add_format(details, "quantum", "%u", sfq->quantum);
3493 smap_add_format(details, "perturb", "%u", sfq->perturb);
3494 return 0;
3495}
3496
3497static int
3498sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3499{
3500 struct sfq sfq;
3501
3502 sfq_parse_qdisc_details__(netdev, details, &sfq);
3503 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3504 sfq_get__(netdev)->quantum = sfq.quantum;
3505 sfq_get__(netdev)->perturb = sfq.perturb;
3506 return 0;
3507}
3508
3509static const struct tc_ops tc_ops_sfq = {
3510 "sfq", /* linux_name */
3511 "linux-sfq", /* ovs_name */
3512 SFQ_N_QUEUES, /* n_queues */
3513 sfq_tc_install,
3514 sfq_tc_load,
3515 sfq_tc_destroy,
3516 sfq_qdisc_get,
3517 sfq_qdisc_set,
3518 NULL,
3519 NULL,
3520 NULL,
3521 NULL,
3522 NULL
3523};
3524\f
c1c9c9c4 3525/* HTB traffic control class. */
559843ed 3526
c1c9c9c4 3527#define HTB_N_QUEUES 0xf000
4f631ccd 3528#define HTB_RATE2QUANTUM 10
8b61709d 3529
c1c9c9c4
BP
3530struct htb {
3531 struct tc tc;
3532 unsigned int max_rate; /* In bytes/s. */
3533};
8b61709d 3534
c1c9c9c4 3535struct htb_class {
93b13be8 3536 struct tc_queue tc_queue;
c1c9c9c4
BP
3537 unsigned int min_rate; /* In bytes/s. */
3538 unsigned int max_rate; /* In bytes/s. */
3539 unsigned int burst; /* In bytes. */
3540 unsigned int priority; /* Lower values are higher priorities. */
3541};
8b61709d 3542
c1c9c9c4 3543static struct htb *
b5d57fc8 3544htb_get__(const struct netdev *netdev_)
c1c9c9c4 3545{
b5d57fc8
BP
3546 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3547 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
3548}
3549
24045e35 3550static void
b5d57fc8 3551htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 3552{
b5d57fc8 3553 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3554 struct htb *htb;
3555
3556 htb = xmalloc(sizeof *htb);
3557 tc_init(&htb->tc, &tc_ops_htb);
3558 htb->max_rate = max_rate;
3559
b5d57fc8 3560 netdev->tc = &htb->tc;
c1c9c9c4
BP
3561}
3562
3563/* Create an HTB qdisc.
3564 *
a339aa81 3565 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
3566static int
3567htb_setup_qdisc__(struct netdev *netdev)
3568{
3569 size_t opt_offset;
3570 struct tc_htb_glob opt;
3571 struct ofpbuf request;
3572 struct tcmsg *tcmsg;
3573
3574 tc_del_qdisc(netdev);
3575
3576 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3577 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
3578 if (!tcmsg) {
3579 return ENODEV;
3580 }
c1c9c9c4
BP
3581 tcmsg->tcm_handle = tc_make_handle(1, 0);
3582 tcmsg->tcm_parent = TC_H_ROOT;
3583
3584 nl_msg_put_string(&request, TCA_KIND, "htb");
3585
3586 memset(&opt, 0, sizeof opt);
4f631ccd 3587 opt.rate2quantum = HTB_RATE2QUANTUM;
c1c9c9c4 3588 opt.version = 3;
4ecf12d5 3589 opt.defcls = 1;
c1c9c9c4
BP
3590
3591 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3592 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3593 nl_msg_end_nested(&request, opt_offset);
3594
3595 return tc_transact(&request, NULL);
3596}
3597
3598/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3599 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3600static int
3601htb_setup_class__(struct netdev *netdev, unsigned int handle,
3602 unsigned int parent, struct htb_class *class)
3603{
3604 size_t opt_offset;
3605 struct tc_htb_opt opt;
3606 struct ofpbuf request;
3607 struct tcmsg *tcmsg;
3608 int error;
3609 int mtu;
3610
73371c09 3611 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3612 if (error) {
f915f1a8
BP
3613 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3614 netdev_get_name(netdev));
9b020780 3615 return error;
f915f1a8 3616 }
c1c9c9c4
BP
3617
3618 memset(&opt, 0, sizeof opt);
3619 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3620 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
4f631ccd
AW
3621 /* Makes sure the quantum is at least MTU. Setting quantum will
3622 * make htb ignore the r2q for this class. */
3623 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3624 opt.quantum = mtu;
3625 }
c1c9c9c4
BP
3626 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3627 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3628 opt.prio = class->priority;
3629
3630 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
3631 if (!tcmsg) {
3632 return ENODEV;
3633 }
c1c9c9c4
BP
3634 tcmsg->tcm_handle = handle;
3635 tcmsg->tcm_parent = parent;
3636
3637 nl_msg_put_string(&request, TCA_KIND, "htb");
3638 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3639 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3640 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3641 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3642 nl_msg_end_nested(&request, opt_offset);
3643
3644 error = tc_transact(&request, NULL);
3645 if (error) {
3646 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3647 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3648 netdev_get_name(netdev),
3649 tc_get_major(handle), tc_get_minor(handle),
3650 tc_get_major(parent), tc_get_minor(parent),
3651 class->min_rate, class->max_rate,
10a89ef0 3652 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
3653 }
3654 return error;
3655}
3656
3657/* Parses Netlink attributes in 'options' for HTB parameters and stores a
3658 * description of them into 'details'. The description complies with the
3659 * specification given in the vswitch database documentation for linux-htb
3660 * queue details. */
3661static int
3662htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3663{
3664 static const struct nl_policy tca_htb_policy[] = {
3665 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3666 .min_len = sizeof(struct tc_htb_opt) },
3667 };
3668
3669 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3670 const struct tc_htb_opt *htb;
3671
3672 if (!nl_parse_nested(nl_options, tca_htb_policy,
3673 attrs, ARRAY_SIZE(tca_htb_policy))) {
3674 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3675 return EPROTO;
3676 }
3677
3678 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3679 class->min_rate = htb->rate.rate;
3680 class->max_rate = htb->ceil.rate;
3681 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3682 class->priority = htb->prio;
3683 return 0;
3684}
3685
3686static int
3687htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3688 struct htb_class *options,
3689 struct netdev_queue_stats *stats)
3690{
3691 struct nlattr *nl_options;
3692 unsigned int handle;
3693 int error;
3694
3695 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3696 if (!error && queue_id) {
17ee3c1f
BP
3697 unsigned int major = tc_get_major(handle);
3698 unsigned int minor = tc_get_minor(handle);
3699 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3700 *queue_id = minor - 1;
c1c9c9c4
BP
3701 } else {
3702 error = EPROTO;
3703 }
3704 }
3705 if (!error && options) {
3706 error = htb_parse_tca_options__(nl_options, options);
3707 }
3708 return error;
3709}
3710
3711static void
73371c09 3712htb_parse_qdisc_details__(struct netdev *netdev_,
79f1cbe9 3713 const struct smap *details, struct htb_class *hc)
c1c9c9c4 3714{
73371c09 3715 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3716 const char *max_rate_s;
3717
79f1cbe9 3718 max_rate_s = smap_get(details, "max-rate");
c1c9c9c4
BP
3719 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3720 if (!hc->max_rate) {
a00ca915 3721 enum netdev_features current;
c1c9c9c4 3722
73371c09
BP
3723 netdev_linux_read_features(netdev);
3724 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 3725 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
3726 }
3727 hc->min_rate = hc->max_rate;
3728 hc->burst = 0;
3729 hc->priority = 0;
3730}
3731
3732static int
3733htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 3734 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
3735{
3736 const struct htb *htb = htb_get__(netdev);
79f1cbe9
EJ
3737 const char *min_rate_s = smap_get(details, "min-rate");
3738 const char *max_rate_s = smap_get(details, "max-rate");
3739 const char *burst_s = smap_get(details, "burst");
3740 const char *priority_s = smap_get(details, "priority");
9b020780 3741 int mtu, error;
c1c9c9c4 3742
73371c09 3743 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3744 if (error) {
f915f1a8
BP
3745 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3746 netdev_get_name(netdev));
9b020780 3747 return error;
f915f1a8
BP
3748 }
3749
4f104611
EJ
3750 /* HTB requires at least an mtu sized min-rate to send any traffic even
3751 * on uncongested links. */
c45ab5e9 3752 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4f104611 3753 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
3754 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3755
3756 /* max-rate */
3757 hc->max_rate = (max_rate_s
3758 ? strtoull(max_rate_s, NULL, 10) / 8
3759 : htb->max_rate);
3760 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3761 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3762
3763 /* burst
3764 *
3765 * According to hints in the documentation that I've read, it is important
3766 * that 'burst' be at least as big as the largest frame that might be
3767 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3768 * but having it a bit too small is a problem. Since netdev_get_mtu()
3769 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3770 * the MTU. We actually add 64, instead of 14, as a guard against
3771 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
3772 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3773 hc->burst = MAX(hc->burst, mtu + 64);
3774
3775 /* priority */
3776 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
3777
3778 return 0;
3779}
3780
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and parses
 * the reply into 'options' and 'stats' (the queue id is not extracted).  The
 * reply buffer is freed before returning.
 *
 * Returns 0 on success, otherwise the error from the query or from parsing
 * the reply. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
3796
3797static int
79f1cbe9 3798htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3799{
3800 int error;
3801
3802 error = htb_setup_qdisc__(netdev);
3803 if (!error) {
3804 struct htb_class hc;
3805
3806 htb_parse_qdisc_details__(netdev, details, &hc);
3807 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3808 tc_make_handle(1, 0), &hc);
3809 if (!error) {
3810 htb_install__(netdev, hc.max_rate);
3811 }
3812 }
3813 return error;
3814}
3815
93b13be8
BP
3816static struct htb_class *
3817htb_class_cast__(const struct tc_queue *queue)
3818{
3819 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3820}
3821
c1c9c9c4
BP
3822static void
3823htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3824 const struct htb_class *hc)
3825{
3826 struct htb *htb = htb_get__(netdev);
93b13be8
BP
3827 size_t hash = hash_int(queue_id, 0);
3828 struct tc_queue *queue;
c1c9c9c4
BP
3829 struct htb_class *hcp;
3830
93b13be8
BP
3831 queue = tc_find_queue__(netdev, queue_id, hash);
3832 if (queue) {
3833 hcp = htb_class_cast__(queue);
3834 } else {
c1c9c9c4 3835 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
3836 queue = &hcp->tc_queue;
3837 queue->queue_id = queue_id;
6dc34a0d 3838 queue->created = time_msec();
93b13be8 3839 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 3840 }
93b13be8
BP
3841
3842 hcp->min_rate = hc->min_rate;
3843 hcp->max_rate = hc->max_rate;
3844 hcp->burst = hc->burst;
3845 hcp->priority = hc->priority;
c1c9c9c4
BP
3846}
3847
3848static int
3849htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3850{
c1c9c9c4 3851 struct ofpbuf msg;
d57695d7 3852 struct queue_dump_state state;
c1c9c9c4 3853 struct htb_class hc;
c1c9c9c4
BP
3854
3855 /* Get qdisc options. */
3856 hc.max_rate = 0;
3857 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3858 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
3859
3860 /* Get queues. */
d57695d7 3861 if (!start_queue_dump(netdev, &state)) {
23a98ffe
BP
3862 return ENODEV;
3863 }
d57695d7 3864 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
c1c9c9c4
BP
3865 unsigned int queue_id;
3866
3867 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3868 htb_update_queue__(netdev, queue_id, &hc);
3869 }
3870 }
d57695d7 3871 finish_queue_dump(&state);
c1c9c9c4
BP
3872
3873 return 0;
3874}
3875
3876static void
3877htb_tc_destroy(struct tc *tc)
3878{
3879 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 3880 struct htb_class *hc, *next;
c1c9c9c4 3881
4e8e4213 3882 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 3883 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
3884 free(hc);
3885 }
3886 tc_destroy(tc);
3887 free(htb);
3888}
3889
3890static int
79f1cbe9 3891htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
3892{
3893 const struct htb *htb = htb_get__(netdev);
79f1cbe9 3894 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
3895 return 0;
3896}
3897
3898static int
79f1cbe9 3899htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3900{
3901 struct htb_class hc;
3902 int error;
3903
3904 htb_parse_qdisc_details__(netdev, details, &hc);
3905 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3906 tc_make_handle(1, 0), &hc);
3907 if (!error) {
3908 htb_get__(netdev)->max_rate = hc.max_rate;
3909 }
3910 return error;
3911}
3912
3913static int
93b13be8 3914htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 3915 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 3916{
93b13be8 3917 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3918
79f1cbe9 3919 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 3920 if (hc->min_rate != hc->max_rate) {
79f1cbe9 3921 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 3922 }
79f1cbe9 3923 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 3924 if (hc->priority) {
79f1cbe9 3925 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
3926 }
3927 return 0;
3928}
3929
3930static int
3931htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 3932 const struct smap *details)
c1c9c9c4
BP
3933{
3934 struct htb_class hc;
3935 int error;
3936
3937 error = htb_parse_class_details__(netdev, details, &hc);
3938 if (error) {
3939 return error;
3940 }
3941
17ee3c1f 3942 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
3943 tc_make_handle(1, 0xfffe), &hc);
3944 if (error) {
3945 return error;
3946 }
3947
3948 htb_update_queue__(netdev, queue_id, &hc);
3949 return 0;
3950}
3951
3952static int
93b13be8 3953htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 3954{
93b13be8 3955 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3956 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
3957 int error;
3958
93b13be8 3959 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 3960 if (!error) {
93b13be8 3961 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 3962 free(hc);
c1c9c9c4
BP
3963 }
3964 return error;
3965}
3966
3967static int
93b13be8 3968htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
3969 struct netdev_queue_stats *stats)
3970{
93b13be8 3971 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
3972 tc_make_handle(1, 0xfffe), NULL, stats);
3973}
3974
3975static int
3976htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3977 const struct ofpbuf *nlmsg,
3978 netdev_dump_queue_stats_cb *cb, void *aux)
3979{
3980 struct netdev_queue_stats stats;
17ee3c1f 3981 unsigned int handle, major, minor;
c1c9c9c4
BP
3982 int error;
3983
3984 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3985 if (error) {
3986 return error;
3987 }
3988
17ee3c1f
BP
3989 major = tc_get_major(handle);
3990 minor = tc_get_minor(handle);
3991 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 3992 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
3993 }
3994 return 0;
3995}
3996
3997static const struct tc_ops tc_ops_htb = {
3998 "htb", /* linux_name */
3999 "linux-htb", /* ovs_name */
4000 HTB_N_QUEUES, /* n_queues */
4001 htb_tc_install,
4002 htb_tc_load,
4003 htb_tc_destroy,
4004 htb_qdisc_get,
4005 htb_qdisc_set,
4006 htb_class_get,
4007 htb_class_set,
4008 htb_class_delete,
4009 htb_class_get_stats,
4010 htb_class_dump_stats
4011};
4012\f
a339aa81
EJ
4013/* "linux-hfsc" traffic control class. */
4014
/* Largest class minor number that is mapped to a queue: minors
 * 1...HFSC_N_QUEUES correspond to queue IDs 0...HFSC_N_QUEUES - 1. */
#define HFSC_N_QUEUES 0xf000

/* Per-netdev state for an HFSC qdisc. */
struct hfsc {
    struct tc tc;               /* Generic tc state; must stay a member named
                                 * 'tc' for the CONTAINER_OF() conversions. */
    uint32_t max_rate;          /* Qdisc-wide rate cap.  Reported to the
                                 * database multiplied by 8. */
};

/* Cached configuration of one HFSC class (queue). */
struct hfsc_class {
    struct tc_queue tc_queue;   /* Generic queue state, linked into the
                                 * owning qdisc's queue map. */
    uint32_t min_rate;          /* Reported as "min-rate" multiplied by 8. */
    uint32_t max_rate;          /* Reported as "max-rate" multiplied by 8. */
};
4027
4028static struct hfsc *
b5d57fc8 4029hfsc_get__(const struct netdev *netdev_)
a339aa81 4030{
b5d57fc8
BP
4031 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4032 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
4033}
4034
4035static struct hfsc_class *
4036hfsc_class_cast__(const struct tc_queue *queue)
4037{
4038 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4039}
4040
24045e35 4041static void
b5d57fc8 4042hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 4043{
b5d57fc8 4044 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4045 struct hfsc *hfsc;
4046
a339aa81
EJ
4047 hfsc = xmalloc(sizeof *hfsc);
4048 tc_init(&hfsc->tc, &tc_ops_hfsc);
4049 hfsc->max_rate = max_rate;
b5d57fc8 4050 netdev->tc = &hfsc->tc;
a339aa81
EJ
4051}
4052
4053static void
4054hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4055 const struct hfsc_class *hc)
4056{
4057 size_t hash;
4058 struct hfsc *hfsc;
4059 struct hfsc_class *hcp;
4060 struct tc_queue *queue;
4061
4062 hfsc = hfsc_get__(netdev);
4063 hash = hash_int(queue_id, 0);
4064
4065 queue = tc_find_queue__(netdev, queue_id, hash);
4066 if (queue) {
4067 hcp = hfsc_class_cast__(queue);
4068 } else {
4069 hcp = xmalloc(sizeof *hcp);
4070 queue = &hcp->tc_queue;
4071 queue->queue_id = queue_id;
6dc34a0d 4072 queue->created = time_msec();
a339aa81
EJ
4073 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4074 }
4075
4076 hcp->min_rate = hc->min_rate;
4077 hcp->max_rate = hc->max_rate;
4078}
4079
4080static int
4081hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4082{
4083 const struct tc_service_curve *rsc, *fsc, *usc;
4084 static const struct nl_policy tca_hfsc_policy[] = {
4085 [TCA_HFSC_RSC] = {
4086 .type = NL_A_UNSPEC,
4087 .optional = false,
4088 .min_len = sizeof(struct tc_service_curve),
4089 },
4090 [TCA_HFSC_FSC] = {
4091 .type = NL_A_UNSPEC,
4092 .optional = false,
4093 .min_len = sizeof(struct tc_service_curve),
4094 },
4095 [TCA_HFSC_USC] = {
4096 .type = NL_A_UNSPEC,
4097 .optional = false,
4098 .min_len = sizeof(struct tc_service_curve),
4099 },
4100 };
4101 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4102
4103 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4104 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4105 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4106 return EPROTO;
4107 }
4108
4109 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4110 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4111 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4112
4113 if (rsc->m1 != 0 || rsc->d != 0 ||
4114 fsc->m1 != 0 || fsc->d != 0 ||
4115 usc->m1 != 0 || usc->d != 0) {
4116 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4117 "Non-linear service curves are not supported.");
4118 return EPROTO;
4119 }
4120
4121 if (rsc->m2 != fsc->m2) {
4122 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4123 "Real-time service curves are not supported ");
4124 return EPROTO;
4125 }
4126
4127 if (rsc->m2 > usc->m2) {
4128 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4129 "Min-rate service curve is greater than "
4130 "the max-rate service curve.");
4131 return EPROTO;
4132 }
4133
4134 class->min_rate = fsc->m2;
4135 class->max_rate = usc->m2;
4136 return 0;
4137}
4138
4139static int
4140hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4141 struct hfsc_class *options,
4142 struct netdev_queue_stats *stats)
4143{
4144 int error;
4145 unsigned int handle;
4146 struct nlattr *nl_options;
4147
4148 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4149 if (error) {
4150 return error;
4151 }
4152
4153 if (queue_id) {
4154 unsigned int major, minor;
4155
4156 major = tc_get_major(handle);
4157 minor = tc_get_minor(handle);
4158 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4159 *queue_id = minor - 1;
4160 } else {
4161 return EPROTO;
4162 }
4163 }
4164
4165 if (options) {
4166 error = hfsc_parse_tca_options__(nl_options, options);
4167 }
4168
4169 return error;
4170}
4171
4172static int
4173hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4174 unsigned int parent, struct hfsc_class *options,
4175 struct netdev_queue_stats *stats)
4176{
4177 int error;
4178 struct ofpbuf *reply;
4179
4180 error = tc_query_class(netdev, handle, parent, &reply);
4181 if (error) {
4182 return error;
4183 }
4184
4185 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4186 ofpbuf_delete(reply);
4187 return error;
4188}
4189
4190static void
73371c09 4191hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
a339aa81
EJ
4192 struct hfsc_class *class)
4193{
73371c09 4194 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4195 uint32_t max_rate;
4196 const char *max_rate_s;
4197
79f1cbe9 4198 max_rate_s = smap_get(details, "max-rate");
a339aa81
EJ
4199 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4200
4201 if (!max_rate) {
a00ca915 4202 enum netdev_features current;
a339aa81 4203
73371c09
BP
4204 netdev_linux_read_features(netdev);
4205 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 4206 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
4207 }
4208
4209 class->min_rate = max_rate;
4210 class->max_rate = max_rate;
4211}
4212
4213static int
4214hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 4215 const struct smap *details,
a339aa81
EJ
4216 struct hfsc_class * class)
4217{
4218 const struct hfsc *hfsc;
4219 uint32_t min_rate, max_rate;
4220 const char *min_rate_s, *max_rate_s;
4221
4222 hfsc = hfsc_get__(netdev);
79f1cbe9
EJ
4223 min_rate_s = smap_get(details, "min-rate");
4224 max_rate_s = smap_get(details, "max-rate");
a339aa81 4225
c45ab5e9 4226 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
79398bad 4227 min_rate = MAX(min_rate, 1);
a339aa81
EJ
4228 min_rate = MIN(min_rate, hfsc->max_rate);
4229
4230 max_rate = (max_rate_s
4231 ? strtoull(max_rate_s, NULL, 10) / 8
4232 : hfsc->max_rate);
4233 max_rate = MAX(max_rate, min_rate);
4234 max_rate = MIN(max_rate, hfsc->max_rate);
4235
4236 class->min_rate = min_rate;
4237 class->max_rate = max_rate;
4238
4239 return 0;
4240}
4241
4242/* Create an HFSC qdisc.
4243 *
4244 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4245static int
4246hfsc_setup_qdisc__(struct netdev * netdev)
4247{
4248 struct tcmsg *tcmsg;
4249 struct ofpbuf request;
4250 struct tc_hfsc_qopt opt;
4251
4252 tc_del_qdisc(netdev);
4253
4254 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4255 NLM_F_EXCL | NLM_F_CREATE, &request);
4256
4257 if (!tcmsg) {
4258 return ENODEV;
4259 }
4260
4261 tcmsg->tcm_handle = tc_make_handle(1, 0);
4262 tcmsg->tcm_parent = TC_H_ROOT;
4263
4264 memset(&opt, 0, sizeof opt);
4265 opt.defcls = 1;
4266
4267 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4268 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4269
4270 return tc_transact(&request, NULL);
4271}
4272
4273/* Create an HFSC class.
4274 *
4275 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4276 * sc rate <min_rate> ul rate <max_rate>" */
4277static int
4278hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4279 unsigned int parent, struct hfsc_class *class)
4280{
4281 int error;
4282 size_t opt_offset;
4283 struct tcmsg *tcmsg;
4284 struct ofpbuf request;
4285 struct tc_service_curve min, max;
4286
4287 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4288
4289 if (!tcmsg) {
4290 return ENODEV;
4291 }
4292
4293 tcmsg->tcm_handle = handle;
4294 tcmsg->tcm_parent = parent;
4295
4296 min.m1 = 0;
4297 min.d = 0;
4298 min.m2 = class->min_rate;
4299
4300 max.m1 = 0;
4301 max.d = 0;
4302 max.m2 = class->max_rate;
4303
4304 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4305 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4306 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4307 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4308 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4309 nl_msg_end_nested(&request, opt_offset);
4310
4311 error = tc_transact(&request, NULL);
4312 if (error) {
4313 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4314 "min-rate %ubps, max-rate %ubps (%s)",
4315 netdev_get_name(netdev),
4316 tc_get_major(handle), tc_get_minor(handle),
4317 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 4318 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
4319 }
4320
4321 return error;
4322}
4323
4324static int
79f1cbe9 4325hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4326{
4327 int error;
4328 struct hfsc_class class;
4329
4330 error = hfsc_setup_qdisc__(netdev);
4331
4332 if (error) {
4333 return error;
4334 }
4335
4336 hfsc_parse_qdisc_details__(netdev, details, &class);
4337 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4338 tc_make_handle(1, 0), &class);
4339
4340 if (error) {
4341 return error;
4342 }
4343
4344 hfsc_install__(netdev, class.max_rate);
4345 return 0;
4346}
4347
4348static int
4349hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4350{
4351 struct ofpbuf msg;
d57695d7 4352 struct queue_dump_state state;
a339aa81
EJ
4353 struct hfsc_class hc;
4354
4355 hc.max_rate = 0;
4356 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4357 hfsc_install__(netdev, hc.max_rate);
a339aa81 4358
d57695d7 4359 if (!start_queue_dump(netdev, &state)) {
a339aa81
EJ
4360 return ENODEV;
4361 }
4362
d57695d7 4363 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
a339aa81
EJ
4364 unsigned int queue_id;
4365
4366 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4367 hfsc_update_queue__(netdev, queue_id, &hc);
4368 }
4369 }
4370
d57695d7 4371 finish_queue_dump(&state);
a339aa81
EJ
4372 return 0;
4373}
4374
4375static void
4376hfsc_tc_destroy(struct tc *tc)
4377{
4378 struct hfsc *hfsc;
4379 struct hfsc_class *hc, *next;
4380
4381 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4382
4383 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4384 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4385 free(hc);
4386 }
4387
4388 tc_destroy(tc);
4389 free(hfsc);
4390}
4391
4392static int
79f1cbe9 4393hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
4394{
4395 const struct hfsc *hfsc;
4396 hfsc = hfsc_get__(netdev);
79f1cbe9 4397 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
4398 return 0;
4399}
4400
4401static int
79f1cbe9 4402hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4403{
4404 int error;
4405 struct hfsc_class class;
4406
4407 hfsc_parse_qdisc_details__(netdev, details, &class);
4408 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4409 tc_make_handle(1, 0), &class);
4410
4411 if (!error) {
4412 hfsc_get__(netdev)->max_rate = class.max_rate;
4413 }
4414
4415 return error;
4416}
4417
4418static int
4419hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4420 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
4421{
4422 const struct hfsc_class *hc;
4423
4424 hc = hfsc_class_cast__(queue);
79f1cbe9 4425 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 4426 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4427 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
4428 }
4429 return 0;
4430}
4431
4432static int
4433hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4434 const struct smap *details)
a339aa81
EJ
4435{
4436 int error;
4437 struct hfsc_class class;
4438
4439 error = hfsc_parse_class_details__(netdev, details, &class);
4440 if (error) {
4441 return error;
4442 }
4443
4444 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4445 tc_make_handle(1, 0xfffe), &class);
4446 if (error) {
4447 return error;
4448 }
4449
4450 hfsc_update_queue__(netdev, queue_id, &class);
4451 return 0;
4452}
4453
4454static int
4455hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4456{
4457 int error;
4458 struct hfsc *hfsc;
4459 struct hfsc_class *hc;
4460
4461 hc = hfsc_class_cast__(queue);
4462 hfsc = hfsc_get__(netdev);
4463
4464 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4465 if (!error) {
4466 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4467 free(hc);
4468 }
4469 return error;
4470}
4471
4472static int
4473hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4474 struct netdev_queue_stats *stats)
4475{
4476 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4477 tc_make_handle(1, 0xfffe), NULL, stats);
4478}
4479
4480static int
4481hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4482 const struct ofpbuf *nlmsg,
4483 netdev_dump_queue_stats_cb *cb, void *aux)
4484{
4485 struct netdev_queue_stats stats;
4486 unsigned int handle, major, minor;
4487 int error;
4488
4489 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4490 if (error) {
4491 return error;
4492 }
4493
4494 major = tc_get_major(handle);
4495 minor = tc_get_minor(handle);
4496 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4497 (*cb)(minor - 1, &stats, aux);
4498 }
4499 return 0;
4500}
4501
4502static const struct tc_ops tc_ops_hfsc = {
4503 "hfsc", /* linux_name */
4504 "linux-hfsc", /* ovs_name */
4505 HFSC_N_QUEUES, /* n_queues */
4506 hfsc_tc_install, /* tc_install */
4507 hfsc_tc_load, /* tc_load */
4508 hfsc_tc_destroy, /* tc_destroy */
4509 hfsc_qdisc_get, /* qdisc_get */
4510 hfsc_qdisc_set, /* qdisc_set */
4511 hfsc_class_get, /* class_get */
4512 hfsc_class_set, /* class_set */
4513 hfsc_class_delete, /* class_delete */
4514 hfsc_class_get_stats, /* class_get_stats */
4515 hfsc_class_dump_stats /* class_dump_stats */
4516};
4517\f
c1c9c9c4
BP
4518/* "linux-default" traffic control class.
4519 *
4520 * This class represents the default, unnamed Linux qdisc. It corresponds to
4521 * the "" (empty string) QoS type in the OVS database. */
4522
4523static void
b5d57fc8 4524default_install__(struct netdev *netdev_)
c1c9c9c4 4525{
b5d57fc8 4526 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4527 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 4528
559eb230
BP
4529 /* Nothing but a tc class implementation is allowed to write to a tc. This
4530 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4531 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4532}
4533
/* tc_install implementation for the default class: attaches the shared
 * default tc object to 'netdev'.  'details' is ignored.  Always returns 0. */
static int
default_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
4541
/* tc_load implementation for the default class: attaches the shared default
 * tc object to 'netdev'.  'nlmsg' is ignored.  Always returns 0. */
static int
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
4548
4549static const struct tc_ops tc_ops_default = {
4550 NULL, /* linux_name */
4551 "", /* ovs_name */
4552 0, /* n_queues */
4553 default_tc_install,
4554 default_tc_load,
4555 NULL, /* tc_destroy */
4556 NULL, /* qdisc_get */
4557 NULL, /* qdisc_set */
4558 NULL, /* class_get */
4559 NULL, /* class_set */
4560 NULL, /* class_delete */
4561 NULL, /* class_get_stats */
4562 NULL /* class_dump_stats */
4563};
4564\f
4565/* "linux-other" traffic control class.
4566 *
4567 * */
4568
4569static int
b5d57fc8 4570other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 4571{
b5d57fc8 4572 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4573 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 4574
559eb230
BP
4575 /* Nothing but a tc class implementation is allowed to write to a tc. This
4576 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4577 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4578 return 0;
4579}
4580
4581static const struct tc_ops tc_ops_other = {
4582 NULL, /* linux_name */
4583 "linux-other", /* ovs_name */
4584 0, /* n_queues */
4585 NULL, /* tc_install */
4586 other_tc_load,
4587 NULL, /* tc_destroy */
4588 NULL, /* qdisc_get */
4589 NULL, /* qdisc_set */
4590 NULL, /* class_get */
4591 NULL, /* class_set */
4592 NULL, /* class_delete */
4593 NULL, /* class_get_stats */
4594 NULL /* class_dump_stats */
4595};
4596\f
4597/* Traffic control. */
4598
/* Number of kernel "tc" ticks per second.
 *
 * Initialized by read_psched(), which falls back to 1.0 if /proc/net/psched
 * cannot be read or contains invalid values. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 *
 * Initialized by read_psched(), which falls back to 100 if /proc/net/psched
 * cannot be read or contains unexpected values.
 */
static unsigned int buffer_hz;
4620
/* Returns tc handle 'major':'minor', that is, 'major' in the upper 16 bits
 * and 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
4627
/* Returns the major number (the upper 16 bits) from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
4634
/* Returns the minor number (the lower 16 bits) from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
4641
4642static struct tcmsg *
4643tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4644 struct ofpbuf *request)
4645{
4646 struct tcmsg *tcmsg;
4647 int ifindex;
4648 int error;
4649
4650 error = get_ifindex(netdev, &ifindex);
4651 if (error) {
4652 return NULL;
4653 }
4654
4655 ofpbuf_init(request, 512);
4656 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4657 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4658 tcmsg->tcm_family = AF_UNSPEC;
4659 tcmsg->tcm_ifindex = ifindex;
4660 /* Caller should fill in tcmsg->tcm_handle. */
4661 /* Caller should fill in tcmsg->tcm_parent. */
4662
4663 return tcmsg;
4664}
4665
4666static int
4667tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4668{
a88b4e04 4669 int error = nl_transact(NETLINK_ROUTE, request, replyp);
c1c9c9c4
BP
4670 ofpbuf_uninit(request);
4671 return error;
4672}
4673
f8500004
JP
4674/* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4675 * policing configuration.
4676 *
4677 * This function is equivalent to running the following when 'add' is true:
4678 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4679 *
4680 * This function is equivalent to running the following when 'add' is false:
4681 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4682 *
4683 * The configuration and stats may be seen with the following command:
4684 * /sbin/tc -s qdisc show dev <devname>
4685 *
4686 * Returns 0 if successful, otherwise a positive errno value.
4687 */
4688static int
4689tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4690{
4691 struct ofpbuf request;
4692 struct tcmsg *tcmsg;
4693 int error;
4694 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4695 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4696
4697 tcmsg = tc_make_request(netdev, type, flags, &request);
4698 if (!tcmsg) {
4699 return ENODEV;
4700 }
4701 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4702 tcmsg->tcm_parent = TC_H_INGRESS;
4703 nl_msg_put_string(&request, TCA_KIND, "ingress");
4704 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4705
4706 error = tc_transact(&request, NULL);
4707 if (error) {
4708 /* If we're deleting the qdisc, don't worry about some of the
4709 * error conditions. */
4710 if (!add && (error == ENOENT || error == EINVAL)) {
4711 return 0;
4712 }
4713 return error;
4714 }
4715
4716 return 0;
4717}
4718
4719/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4720 * of 'kbits_burst'.
4721 *
4722 * This function is equivalent to running:
4723 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4724 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4725 * mtu 65535 drop
4726 *
4727 * The configuration and stats may be seen with the following command:
c7952afb 4728 * /sbin/tc -s filter show dev <devname> parent ffff:
f8500004
JP
4729 *
4730 * Returns 0 if successful, otherwise a positive errno value.
4731 */
4732static int
c7952afb
BP
4733tc_add_policer(struct netdev *netdev,
4734 uint32_t kbits_rate, uint32_t kbits_burst)
f8500004
JP
4735{
4736 struct tc_police tc_police;
4737 struct ofpbuf request;
4738 struct tcmsg *tcmsg;
4739 size_t basic_offset;
4740 size_t police_offset;
4741 int error;
4742 int mtu = 65535;
4743
4744 memset(&tc_police, 0, sizeof tc_police);
4745 tc_police.action = TC_POLICE_SHOT;
4746 tc_police.mtu = mtu;
1aca400c 4747 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
c7952afb
BP
4748
4749 /* The following appears wrong in two ways:
4750 *
4751 * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4752 * arguments (or at least consistently "bytes" as both or "bits" as
4753 * both), but this supplies bytes for the first argument and bits for the
4754 * second.
4755 *
4756 * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4757 *
4758 * However if you "fix" those problems then "tc filter show ..." shows
4759 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4760 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4761 * tc's point of view. Whatever. */
4762 tc_police.burst = tc_bytes_to_ticks(
4763 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);
f8500004
JP
4764
4765 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4766 NLM_F_EXCL | NLM_F_CREATE, &request);
4767 if (!tcmsg) {
4768 return ENODEV;
4769 }
4770 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4771 tcmsg->tcm_info = tc_make_handle(49,
4772 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4773
4774 nl_msg_put_string(&request, TCA_KIND, "basic");
4775 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4776 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4777 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4778 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4779 nl_msg_end_nested(&request, police_offset);
4780 nl_msg_end_nested(&request, basic_offset);
4781
4782 error = tc_transact(&request, NULL);
4783 if (error) {
4784 return error;
4785 }
4786
4787 return 0;
4788}
4789
c1c9c9c4
BP
4790static void
4791read_psched(void)
4792{
4793 /* The values in psched are not individually very meaningful, but they are
4794 * important. The tables below show some values seen in the wild.
4795 *
4796 * Some notes:
4797 *
4798 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4799 * (Before that, there are hints that it was 1000000000.)
4800 *
4801 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4802 * above.
4803 *
4804 * /proc/net/psched
4805 * -----------------------------------
4806 * [1] 000c8000 000f4240 000f4240 00000064
4807 * [2] 000003e8 00000400 000f4240 3b9aca00
4808 * [3] 000003e8 00000400 000f4240 3b9aca00
4809 * [4] 000003e8 00000400 000f4240 00000064
4810 * [5] 000003e8 00000040 000f4240 3b9aca00
4811 * [6] 000003e8 00000040 000f4240 000000f9
4812 *
4813 * a b c d ticks_per_s buffer_hz
4814 * ------- --------- ---------- ------------- ----------- -------------
4815 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4816 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4817 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4818 * [4] 1,000 1,024 1,000,000 100 976,562 100
4819 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4820 * [6] 1,000 64 1,000,000 249 15,625,000 249
4821 *
4822 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4823 * [2] 2.6.26-1-686-bigmem from Debian lenny
4824 * [3] 2.6.26-2-sparc64 from Debian lenny
4825 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4826 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4827 * [6] 2.6.34 from kernel.org on KVM
4828 */
23882115 4829 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
4830 static const char fn[] = "/proc/net/psched";
4831 unsigned int a, b, c, d;
4832 FILE *stream;
4833
23882115
BP
4834 if (!ovsthread_once_start(&once)) {
4835 return;
4836 }
4837
c1c9c9c4
BP
4838 ticks_per_s = 1.0;
4839 buffer_hz = 100;
4840
4841 stream = fopen(fn, "r");
4842 if (!stream) {
10a89ef0 4843 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 4844 goto exit;
c1c9c9c4
BP
4845 }
4846
4847 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4848 VLOG_WARN("%s: read failed", fn);
4849 fclose(stream);
23882115 4850 goto exit;
c1c9c9c4
BP
4851 }
4852 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4853 fclose(stream);
4854
4855 if (!a || !c) {
4856 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 4857 goto exit;
c1c9c9c4
BP
4858 }
4859
4860 ticks_per_s = (double) a * c / b;
4861 if (c == 1000000) {
4862 buffer_hz = d;
4863 } else {
4864 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4865 fn, a, b, c, d);
4866 }
4867 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
4868
4869exit:
4870 ovsthread_once_done(&once);
c1c9c9c4
BP
4871}
4872
4873/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4874 * rate of 'rate' bytes per second. */
4875static unsigned int
4876tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4877{
23882115 4878 read_psched();
c1c9c9c4
BP
4879 return (rate * ticks) / ticks_per_s;
4880}
4881
4882/* Returns the number of ticks that it would take to transmit 'size' bytes at a
4883 * rate of 'rate' bytes per second. */
4884static unsigned int
4885tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4886{
23882115 4887 read_psched();
015c93a4 4888 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
4889}
4890
4891/* Returns the number of bytes that need to be reserved for qdisc buffering at
4892 * a transmission rate of 'rate' bytes per second. */
4893static unsigned int
4894tc_buffer_per_jiffy(unsigned int rate)
4895{
23882115 4896 read_psched();
c1c9c9c4
BP
4897 return rate / buffer_hz;
4898}
4899
4900/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4901 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4902 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4903 * stores NULL into it if it is absent.
4904 *
4905 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4906 * 'msg'.
4907 *
4908 * Returns 0 if successful, otherwise a positive errno value. */
4909static int
4910tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4911 struct nlattr **options)
4912{
4913 static const struct nl_policy tca_policy[] = {
4914 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4915 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4916 };
4917 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4918
4919 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4920 tca_policy, ta, ARRAY_SIZE(ta))) {
4921 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4922 goto error;
4923 }
4924
4925 if (kind) {
4926 *kind = nl_attr_get_string(ta[TCA_KIND]);
4927 }
4928
4929 if (options) {
4930 *options = ta[TCA_OPTIONS];
4931 }
4932
4933 return 0;
4934
4935error:
4936 if (kind) {
4937 *kind = NULL;
4938 }
4939 if (options) {
4940 *options = NULL;
4941 }
4942 return EPROTO;
4943}
4944
4945/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4946 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4947 * into '*options', and its queue statistics into '*stats'. Any of the output
4948 * arguments may be null.
4949 *
4950 * Returns 0 if successful, otherwise a positive errno value. */
4951static int
4952tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4953 struct nlattr **options, struct netdev_queue_stats *stats)
4954{
4955 static const struct nl_policy tca_policy[] = {
4956 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4957 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4958 };
4959 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4960
4961 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4962 tca_policy, ta, ARRAY_SIZE(ta))) {
4963 VLOG_WARN_RL(&rl, "failed to parse class message");
4964 goto error;
4965 }
4966
4967 if (handlep) {
4968 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4969 *handlep = tc->tcm_handle;
4970 }
4971
4972 if (options) {
4973 *options = ta[TCA_OPTIONS];
4974 }
4975
4976 if (stats) {
4977 const struct gnet_stats_queue *gsq;
4978 struct gnet_stats_basic gsb;
4979
4980 static const struct nl_policy stats_policy[] = {
4981 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4982 .min_len = sizeof gsb },
4983 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4984 .min_len = sizeof *gsq },
4985 };
4986 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4987
4988 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4989 sa, ARRAY_SIZE(sa))) {
4990 VLOG_WARN_RL(&rl, "failed to parse class stats");
4991 goto error;
4992 }
4993
4994 /* Alignment issues screw up the length of struct gnet_stats_basic on
4995 * some arch/bitsize combinations. Newer versions of Linux have a
4996 * struct gnet_stats_basic_packed, but we can't depend on that. The
4997 * easiest thing to do is just to make a copy. */
4998 memset(&gsb, 0, sizeof gsb);
4999 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5000 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5001 stats->tx_bytes = gsb.bytes;
5002 stats->tx_packets = gsb.packets;
5003
5004 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5005 stats->tx_errors = gsq->drops;
5006 }
5007
5008 return 0;
5009
5010error:
5011 if (options) {
5012 *options = NULL;
5013 }
5014 if (stats) {
5015 memset(stats, 0, sizeof *stats);
5016 }
5017 return EPROTO;
5018}
5019
5020/* Queries the kernel for class with identifier 'handle' and parent 'parent'
5021 * on 'netdev'. */
5022static int
5023tc_query_class(const struct netdev *netdev,
5024 unsigned int handle, unsigned int parent,
5025 struct ofpbuf **replyp)
5026{
5027 struct ofpbuf request;
5028 struct tcmsg *tcmsg;
5029 int error;
5030
5031 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
5032 if (!tcmsg) {
5033 return ENODEV;
5034 }
c1c9c9c4
BP
5035 tcmsg->tcm_handle = handle;
5036 tcmsg->tcm_parent = parent;
5037
5038 error = tc_transact(&request, replyp);
5039 if (error) {
5040 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5041 netdev_get_name(netdev),
5042 tc_get_major(handle), tc_get_minor(handle),
5043 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 5044 ovs_strerror(error));
c1c9c9c4
BP
5045 }
5046 return error;
5047}
5048
5049/* Equivalent to "tc class del dev <name> handle <handle>". */
5050static int
5051tc_delete_class(const struct netdev *netdev, unsigned int handle)
5052{
5053 struct ofpbuf request;
5054 struct tcmsg *tcmsg;
5055 int error;
5056
5057 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
5058 if (!tcmsg) {
5059 return ENODEV;
5060 }
c1c9c9c4
BP
5061 tcmsg->tcm_handle = handle;
5062 tcmsg->tcm_parent = 0;
5063
5064 error = tc_transact(&request, NULL);
5065 if (error) {
5066 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5067 netdev_get_name(netdev),
5068 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 5069 ovs_strerror(error));
c1c9c9c4
BP
5070 }
5071 return error;
5072}
5073
5074/* Equivalent to "tc qdisc del dev <name> root". */
5075static int
b5d57fc8 5076tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 5077{
b5d57fc8 5078 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5079 struct ofpbuf request;
5080 struct tcmsg *tcmsg;
5081 int error;
5082
b5d57fc8 5083 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
5084 if (!tcmsg) {
5085 return ENODEV;
5086 }
c1c9c9c4
BP
5087 tcmsg->tcm_handle = tc_make_handle(1, 0);
5088 tcmsg->tcm_parent = TC_H_ROOT;
5089
5090 error = tc_transact(&request, NULL);
5091 if (error == EINVAL) {
5092 /* EINVAL probably means that the default qdisc was in use, in which
5093 * case we've accomplished our purpose. */
5094 error = 0;
5095 }
b5d57fc8
BP
5096 if (!error && netdev->tc) {
5097 if (netdev->tc->ops->tc_destroy) {
5098 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 5099 }
b5d57fc8 5100 netdev->tc = NULL;
c1c9c9c4
BP
5101 }
5102 return error;
5103}
5104
ac3e3aaa
BP
5105static bool
5106getqdisc_is_safe(void)
5107{
5108 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5109 static bool safe = false;
5110
5111 if (ovsthread_once_start(&once)) {
5112 struct utsname utsname;
5113 int major, minor;
5114
5115 if (uname(&utsname) == -1) {
5116 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5117 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5118 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5119 } else if (major < 2 || (major == 2 && minor < 35)) {
5120 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5121 utsname.release);
5122 } else {
5123 safe = true;
5124 }
5125 ovsthread_once_done(&once);
5126 }
5127 return safe;
5128}
5129
c1c9c9c4
BP
5130/* If 'netdev''s qdisc type and parameters are not yet known, queries the
5131 * kernel to determine what they are. Returns 0 if successful, otherwise a
5132 * positive errno value. */
5133static int
b5d57fc8 5134tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 5135{
b5d57fc8 5136 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5137 struct ofpbuf request, *qdisc;
5138 const struct tc_ops *ops;
5139 struct tcmsg *tcmsg;
5140 int load_error;
5141 int error;
5142
b5d57fc8 5143 if (netdev->tc) {
c1c9c9c4
BP
5144 return 0;
5145 }
5146
5147 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5148 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5149 * 2.6.35 without that fix backported to it.
5150 *
5151 * To avoid the OOPS, we must not make a request that would attempt to dump
5152 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5153 * few others. There are a few ways that I can see to do this, but most of
5154 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5155 * technique chosen here is to assume that any non-default qdisc that we
5156 * create will have a class with handle 1:0. The built-in qdiscs only have
5157 * a class with handle 0:0.
5158 *
ac3e3aaa
BP
5159 * On Linux 2.6.35+ we use the straightforward method because it allows us
5160 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5161 * in such a case we get no response at all from the kernel (!) if a
5162 * builtin qdisc is in use (which is later caught by "!error &&
5163 * !qdisc->size"). */
b5d57fc8 5164 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
5165 if (!tcmsg) {
5166 return ENODEV;
5167 }
ac3e3aaa
BP
5168 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5169 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
c1c9c9c4
BP
5170
5171 /* Figure out what tc class to instantiate. */
5172 error = tc_transact(&request, &qdisc);
ac3e3aaa 5173 if (!error && qdisc->size) {
c1c9c9c4
BP
5174 const char *kind;
5175
5176 error = tc_parse_qdisc(qdisc, &kind, NULL);
5177 if (error) {
5178 ops = &tc_ops_other;
5179 } else {
5180 ops = tc_lookup_linux_name(kind);
5181 if (!ops) {
5182 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
ac3e3aaa 5183 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
c1c9c9c4
BP
5184
5185 ops = &tc_ops_other;
5186 }
5187 }
ac3e3aaa
BP
5188 } else if ((!error && !qdisc->size) || error == ENOENT) {
5189 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5190 * set up by some other entity that doesn't have a handle 1:0. We will
5191 * assume that it's the system default qdisc. */
c1c9c9c4
BP
5192 ops = &tc_ops_default;
5193 error = 0;
5194 } else {
5195 /* Who knows? Maybe the device got deleted. */
5196 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 5197 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
5198 ops = &tc_ops_other;
5199 }
5200
5201 /* Instantiate it. */
b5d57fc8
BP
5202 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5203 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
5204 ofpbuf_delete(qdisc);
5205
5206 return error ? error : load_error;
5207}
5208
5209/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5210 approximate the time to transmit packets of various lengths. For an MTU of
5211 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5212 represents two possible packet lengths; for a MTU of 513 through 1024, four
5213 possible lengths; and so on.
5214
5215 Returns, for the specified 'mtu', the number of bits that packet lengths
5216 need to be shifted right to fit within such a 256-entry table. */
5217static int
5218tc_calc_cell_log(unsigned int mtu)
5219{
5220 int cell_log;
5221
5222 if (!mtu) {
5223 mtu = ETH_PAYLOAD_MAX;
5224 }
5225 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5226
5227 for (cell_log = 0; mtu >= 256; cell_log++) {
5228 mtu >>= 1;
5229 }
5230
5231 return cell_log;
5232}
5233
5234/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5235 * of 'mtu'. */
5236static void
5237tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5238{
5239 memset(rate, 0, sizeof *rate);
5240 rate->cell_log = tc_calc_cell_log(mtu);
5241 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5242 /* rate->cell_align = 0; */ /* distro headers. */
5243 rate->mpu = ETH_TOTAL_MIN;
5244 rate->rate = Bps;
5245}
5246
5247/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5248 * attribute of the specified "type".
5249 *
5250 * See tc_calc_cell_log() above for a description of "rtab"s. */
5251static void
5252tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5253{
5254 uint32_t *rtab;
5255 unsigned int i;
5256
5257 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5258 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5259 unsigned packet_size = (i + 1) << rate->cell_log;
5260 if (packet_size < rate->mpu) {
5261 packet_size = rate->mpu;
5262 }
5263 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5264 }
5265}
5266
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, in
 * ticks, for a rate of 'Bps' bytes per second, the given 'mtu', and a
 * user-requested burst size of 'burst_bytes'.  The burst is raised, if
 * necessary, to at least one jiffy's worth of data plus one 'mtu', so passing
 * 0 for 'burst_bytes' when no value was requested is fine. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t bytes = MAX(burst_bytes, floor_bytes);

    return tc_bytes_to_ticks(Bps, bytes);
}
d3980822 5277\f
aaf2fb1a
BP
5278/* Linux-only functions declared in netdev-linux.h */
5279
5280/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5281 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5282int
5283netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5284 const char *flag_name, bool enable)
5285{
5286 const char *netdev_name = netdev_get_name(netdev);
5287 struct ethtool_value evalue;
5288 uint32_t new_flags;
5289 int error;
5290
ab985a77 5291 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5292 memset(&evalue, 0, sizeof evalue);
5293 error = netdev_linux_do_ethtool(netdev_name,
5294 (struct ethtool_cmd *)&evalue,
5295 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5296 if (error) {
5297 return error;
5298 }
5299
ab985a77 5300 COVERAGE_INC(netdev_set_ethtool);
aaf2fb1a
BP
5301 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5302 error = netdev_linux_do_ethtool(netdev_name,
5303 (struct ethtool_cmd *)&evalue,
5304 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5305 if (error) {
5306 return error;
5307 }
5308
ab985a77 5309 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5310 memset(&evalue, 0, sizeof evalue);
5311 error = netdev_linux_do_ethtool(netdev_name,
5312 (struct ethtool_cmd *)&evalue,
5313 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5314 if (error) {
5315 return error;
5316 }
5317
5318 if (new_flags != evalue.data) {
5319 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5320 "device %s failed", enable ? "enable" : "disable",
5321 flag_name, netdev_name);
5322 return EOPNOTSUPP;
5323 }
5324
5325 return 0;
5326}
5327\f
5328/* Utility functions. */
5329
d3980822 5330/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 5331static void
d3980822
BP
5332netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5333 const struct rtnl_link_stats *src)
5334{
f613a0d7
PS
5335 dst->rx_packets = src->rx_packets;
5336 dst->tx_packets = src->tx_packets;
5337 dst->rx_bytes = src->rx_bytes;
5338 dst->tx_bytes = src->tx_bytes;
5339 dst->rx_errors = src->rx_errors;
5340 dst->tx_errors = src->tx_errors;
5341 dst->rx_dropped = src->rx_dropped;
5342 dst->tx_dropped = src->tx_dropped;
5343 dst->multicast = src->multicast;
5344 dst->collisions = src->collisions;
5345 dst->rx_length_errors = src->rx_length_errors;
5346 dst->rx_over_errors = src->rx_over_errors;
5347 dst->rx_crc_errors = src->rx_crc_errors;
5348 dst->rx_frame_errors = src->rx_frame_errors;
5349 dst->rx_fifo_errors = src->rx_fifo_errors;
5350 dst->rx_missed_errors = src->rx_missed_errors;
5351 dst->tx_aborted_errors = src->tx_aborted_errors;
5352 dst->tx_carrier_errors = src->tx_carrier_errors;
5353 dst->tx_fifo_errors = src->tx_fifo_errors;
5354 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5355 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
5356}
5357
337c9b99
BP
5358/* Copies 'src' into 'dst', performing format conversion in the process. */
5359static void
5360netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5361 const struct rtnl_link_stats64 *src)
5362{
5363 dst->rx_packets = src->rx_packets;
5364 dst->tx_packets = src->tx_packets;
5365 dst->rx_bytes = src->rx_bytes;
5366 dst->tx_bytes = src->tx_bytes;
5367 dst->rx_errors = src->rx_errors;
5368 dst->tx_errors = src->tx_errors;
5369 dst->rx_dropped = src->rx_dropped;
5370 dst->tx_dropped = src->tx_dropped;
5371 dst->multicast = src->multicast;
5372 dst->collisions = src->collisions;
5373 dst->rx_length_errors = src->rx_length_errors;
5374 dst->rx_over_errors = src->rx_over_errors;
5375 dst->rx_crc_errors = src->rx_crc_errors;
5376 dst->rx_frame_errors = src->rx_frame_errors;
5377 dst->rx_fifo_errors = src->rx_fifo_errors;
5378 dst->rx_missed_errors = src->rx_missed_errors;
5379 dst->tx_aborted_errors = src->tx_aborted_errors;
5380 dst->tx_carrier_errors = src->tx_carrier_errors;
5381 dst->tx_fifo_errors = src->tx_fifo_errors;
5382 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5383 dst->tx_window_errors = src->tx_window_errors;
5384}
5385
c1c9c9c4 5386static int
35eef899 5387get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
c1c9c9c4 5388{
c1c9c9c4
BP
5389 struct ofpbuf request;
5390 struct ofpbuf *reply;
c1c9c9c4
BP
5391 int error;
5392
5393 ofpbuf_init(&request, 0);
13a24df8
BP
5394 nl_msg_put_nlmsghdr(&request,
5395 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5396 RTM_GETLINK, NLM_F_REQUEST);
5397 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5398 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
a88b4e04 5399 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
5400 ofpbuf_uninit(&request);
5401 if (error) {
5402 return error;
5403 }
5404
13a24df8 5405 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
337c9b99
BP
5406 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5407 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5408 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
13a24df8
BP
5409 error = 0;
5410 } else {
337c9b99
BP
5411 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5412 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5413 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5414 error = 0;
5415 } else {
5416 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5417 error = EPROTO;
5418 }
13a24df8
BP
5419 }
5420 } else {
5421 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5422 error = EPROTO;
c1c9c9c4 5423 }
8b61709d 5424
8b61709d 5425
576e26d7 5426 ofpbuf_delete(reply);
35eef899 5427 return error;
8b61709d 5428}
c1c9c9c4 5429
3a183124 5430static int
b5d57fc8 5431get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
5432{
5433 struct ifreq ifr;
5434 int error;
5435
755be9ea 5436 *flags = 0;
259e0b1a 5437 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
755be9ea
EJ
5438 if (!error) {
5439 *flags = ifr.ifr_flags;
5440 }
8b61709d
BP
5441 return error;
5442}
5443
5444static int
4b609110 5445set_flags(const char *name, unsigned int flags)
8b61709d
BP
5446{
5447 struct ifreq ifr;
5448
5449 ifr.ifr_flags = flags;
259e0b1a 5450 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
5451}
5452
5453static int
5454do_get_ifindex(const char *netdev_name)
5455{
5456 struct ifreq ifr;
259e0b1a 5457 int error;
8b61709d 5458
71d7c22f 5459 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5460 COVERAGE_INC(netdev_get_ifindex);
259e0b1a
BP
5461
5462 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5463 if (error) {
8b61709d 5464 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
259e0b1a
BP
5465 netdev_name, ovs_strerror(error));
5466 return -error;
8b61709d
BP
5467 }
5468 return ifr.ifr_ifindex;
5469}
5470
5471static int
5472get_ifindex(const struct netdev *netdev_, int *ifindexp)
5473{
b5d57fc8 5474 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 5475
b5d57fc8 5476 if (!(netdev->cache_valid & VALID_IFINDEX)) {
8b61709d 5477 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 5478
8b61709d 5479 if (ifindex < 0) {
b5d57fc8
BP
5480 netdev->get_ifindex_error = -ifindex;
5481 netdev->ifindex = 0;
c7b1b0a5 5482 } else {
b5d57fc8
BP
5483 netdev->get_ifindex_error = 0;
5484 netdev->ifindex = ifindex;
8b61709d 5485 }
b5d57fc8 5486 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 5487 }
c7b1b0a5 5488
b5d57fc8
BP
5489 *ifindexp = netdev->ifindex;
5490 return netdev->get_ifindex_error;
8b61709d
BP
5491}
5492
5493static int
74ff3298 5494get_etheraddr(const char *netdev_name, struct eth_addr *ea)
8b61709d
BP
5495{
5496 struct ifreq ifr;
5497 int hwaddr_family;
259e0b1a 5498 int error;
8b61709d
BP
5499
5500 memset(&ifr, 0, sizeof ifr);
71d7c22f 5501 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5502 COVERAGE_INC(netdev_get_hwaddr);
259e0b1a
BP
5503 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5504 if (error) {
78857dfb
BP
5505 /* ENODEV probably means that a vif disappeared asynchronously and
5506 * hasn't been removed from the database yet, so reduce the log level
5507 * to INFO for that case. */
259e0b1a 5508 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
78857dfb 5509 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
259e0b1a
BP
5510 netdev_name, ovs_strerror(error));
5511 return error;
8b61709d
BP
5512 }
5513 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5514 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5515 VLOG_WARN("%s device has unknown hardware address family %d",
5516 netdev_name, hwaddr_family);
5517 }
5518 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5519 return 0;
5520}
5521
5522static int
74ff3298 5523set_etheraddr(const char *netdev_name, const struct eth_addr mac)
8b61709d
BP
5524{
5525 struct ifreq ifr;
259e0b1a 5526 int error;
8b61709d
BP
5527
5528 memset(&ifr, 0, sizeof ifr);
71d7c22f 5529 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 5530 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
74ff3298 5531 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
8b61709d 5532 COVERAGE_INC(netdev_set_hwaddr);
259e0b1a
BP
5533 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5534 if (error) {
8b61709d 5535 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
259e0b1a 5536 netdev_name, ovs_strerror(error));
8b61709d 5537 }
259e0b1a 5538 return error;
8b61709d
BP
5539}
5540
5541static int
0b0544d7 5542netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
5543 int cmd, const char *cmd_name)
5544{
5545 struct ifreq ifr;
259e0b1a 5546 int error;
8b61709d
BP
5547
5548 memset(&ifr, 0, sizeof ifr);
71d7c22f 5549 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
5550 ifr.ifr_data = (caddr_t) ecmd;
5551
5552 ecmd->cmd = cmd;
259e0b1a
BP
5553 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5554 if (error) {
5555 if (error != EOPNOTSUPP) {
8b61709d 5556 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
259e0b1a 5557 "failed: %s", cmd_name, name, ovs_strerror(error));
8b61709d
BP
5558 } else {
5559 /* The device doesn't support this operation. That's pretty
5560 * common, so there's no point in logging anything. */
5561 }
8b61709d 5562 }
259e0b1a 5563 return error;
8b61709d 5564}
f1acd62b
BP
5565
5566static int
5567netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
5568 int cmd, const char *cmd_name)
5569{
5570 struct ifreq ifr;
5571 int error;
5572
5573 ifr.ifr_addr.sa_family = AF_INET;
259e0b1a 5574 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b 5575 if (!error) {
db5a1019
AW
5576 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
5577 &ifr.ifr_addr);
f1acd62b
BP
5578 *ip = sin->sin_addr;
5579 }
5580 return error;
5581}
488d734d
BP
5582
5583/* Returns an AF_PACKET raw socket or a negative errno value. */
5584static int
5585af_packet_sock(void)
5586{
23882115
BP
5587 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5588 static int sock;
488d734d 5589
23882115 5590 if (ovsthread_once_start(&once)) {
488d734d
BP
5591 sock = socket(AF_PACKET, SOCK_RAW, 0);
5592 if (sock >= 0) {
8450059e
BP
5593 int error = set_nonblocking(sock);
5594 if (error) {
5595 close(sock);
5596 sock = -error;
5597 }
488d734d
BP
5598 } else {
5599 sock = -errno;
10a89ef0
BP
5600 VLOG_ERR("failed to create packet socket: %s",
5601 ovs_strerror(errno));
488d734d 5602 }
23882115 5603 ovsthread_once_done(&once);
488d734d
BP
5604 }
5605
5606 return sock;
5607}