]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
ofproto: Fully construct rules before putting them in the classifier.
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
275707c3 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d
BP
22#include <fcntl.h>
23#include <arpa/inet.h>
24#include <inttypes.h>
32383c3b 25#include <linux/filter.h>
c1c9c9c4 26#include <linux/gen_stats.h>
bb7d0e22 27#include <linux/if_ether.h>
8b61709d
BP
28#include <linux/if_tun.h>
29#include <linux/types.h>
30#include <linux/ethtool.h>
63331829 31#include <linux/mii.h>
f8500004 32#include <linux/pkt_cls.h>
6f42c8ea 33#include <linux/pkt_sched.h>
e9e28be3 34#include <linux/rtnetlink.h>
8b61709d
BP
35#include <linux/sockios.h>
36#include <linux/version.h>
37#include <sys/types.h>
38#include <sys/ioctl.h>
39#include <sys/socket.h>
40#include <netpacket/packet.h>
8b61709d
BP
41#include <net/if.h>
42#include <net/if_arp.h>
43#include <net/if_packet.h>
44#include <net/route.h>
45#include <netinet/in.h>
e9e28be3 46#include <poll.h>
8b61709d
BP
47#include <stdlib.h>
48#include <string.h>
49#include <unistd.h>
e9e28be3
BP
50
51#include "coverage.h"
9fe3b9a2 52#include "dpif-linux.h"
8b61709d
BP
53#include "dynamic-string.h"
54#include "fatal-signal.h"
93b13be8
BP
55#include "hash.h"
56#include "hmap.h"
8b61709d 57#include "netdev-provider.h"
7fbef77a 58#include "netdev-vport.h"
45c8d3a1 59#include "netlink-notifier.h"
2fe27d5a 60#include "netlink-socket.h"
c060c4cf 61#include "netlink.h"
e9e28be3 62#include "ofpbuf.h"
8b61709d
BP
63#include "openflow/openflow.h"
64#include "packets.h"
65#include "poll-loop.h"
21d6e22e 66#include "rtnetlink-link.h"
8b61709d 67#include "shash.h"
c060c4cf 68#include "socket-util.h"
19993ef3 69#include "sset.h"
1670c579 70#include "timer.h"
c060c4cf 71#include "unaligned.h"
e9e28be3 72#include "vlog.h"
5136ce49 73
d98e6007 74VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea 75
d76f09ea
BP
76COVERAGE_DEFINE(netdev_set_policing);
77COVERAGE_DEFINE(netdev_arp_lookup);
78COVERAGE_DEFINE(netdev_get_ifindex);
79COVERAGE_DEFINE(netdev_get_hwaddr);
80COVERAGE_DEFINE(netdev_set_hwaddr);
ab985a77
BP
81COVERAGE_DEFINE(netdev_get_ethtool);
82COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 83
8b61709d
BP
84\f
/* ADVERTISED_Pause and ADVERTISED_Asym_Pause first appeared in Linux 2.6.14.
 * Supply them here in case we are building against older headers. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause (1 << 14)
#endif

/* ETHTOOL_GFLAGS and ETHTOOL_SFLAGS first appeared in Linux 2.6.24.  Supply
 * them here in case we are building against older headers. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
#endif

/* TC_RTAB_SIZE first appeared in Linux 2.6.25.  Supply it here in case we are
 * building against older headers. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif
108
/* Bits for the 'cache_valid' member of struct netdev_linux.  Each bit says
 * that the corresponding cached value in that structure is valid. */
enum {
    VALID_IFINDEX           = 1 << 0,
    VALID_ETHERADDR         = 1 << 1,
    VALID_IN4               = 1 << 2,
    VALID_IN6               = 1 << 3,
    VALID_MTU               = 1 << 4,
    VALID_POLICING          = 1 << 5,
    VALID_VPORT_STAT_ERROR  = 1 << 6,
    VALID_DRVINFO           = 1 << 7,
    VALID_FEATURES          = 1 << 8,
};
c1c9c9c4
BP
120\f
121/* Traffic control. */
122
123/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
124 * network device.
125 *
126 * Each TC implementation subclasses this with whatever additional data it
127 * needs. */
c1c9c9c4
BP
128struct tc {
129 const struct tc_ops *ops;
93b13be8
BP
130 struct hmap queues; /* Contains "struct tc_queue"s.
131 * Read by generic TC layer.
132 * Written only by TC implementation. */
133};
c1c9c9c4 134
559eb230
BP
135#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
136
93b13be8
BP
137/* One traffic control queue.
138 *
139 * Each TC implementation subclasses this with whatever additional data it
140 * needs. */
141struct tc_queue {
142 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
143 unsigned int queue_id; /* OpenFlow queue ID. */
6dc34a0d 144 long long int created; /* Time queue was created, in msecs. */
c1c9c9c4
BP
145};
146
147/* A particular kind of traffic control. Each implementation generally maps to
148 * one particular Linux qdisc class.
149 *
150 * The functions below return 0 if successful or a positive errno value on
151 * failure, except where otherwise noted. All of them must be provided, except
152 * where otherwise noted. */
153struct tc_ops {
154 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
155 * This is null for tc_ops_default and tc_ops_other, for which there are no
156 * appropriate values. */
157 const char *linux_name;
158
159 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
160 const char *ovs_name;
161
162 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
163 * queues. The queues are numbered 0 through n_queues - 1. */
164 unsigned int n_queues;
165
166 /* Called to install this TC class on 'netdev'. The implementation should
167 * make the Netlink calls required to set up 'netdev' with the right qdisc
168 * and configure it according to 'details'. The implementation may assume
169 * that the current qdisc is the default; that is, there is no need for it
170 * to delete the current qdisc before installing itself.
171 *
172 * The contents of 'details' should be documented as valid for 'ovs_name'
173 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
174 * (which is built as ovs-vswitchd.conf.db(8)).
175 *
176 * This function must return 0 if and only if it sets 'netdev->tc' to an
177 * initialized 'struct tc'.
178 *
179 * (This function is null for tc_ops_other, which cannot be installed. For
180 * other TC classes it should always be nonnull.) */
79f1cbe9 181 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
182
183 /* Called when the netdev code determines (through a Netlink query) that
184 * this TC class's qdisc is installed on 'netdev', but we didn't install
185 * it ourselves and so don't know any of the details.
186 *
187 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
188 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
189 * implementation should parse the other attributes of 'nlmsg' as
190 * necessary to determine its configuration. If necessary it should also
191 * use Netlink queries to determine the configuration of queues on
192 * 'netdev'.
193 *
194 * This function must return 0 if and only if it sets 'netdev->tc' to an
195 * initialized 'struct tc'. */
196 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
197
198 /* Destroys the data structures allocated by the implementation as part of
199 * 'tc'. (This includes destroying 'tc->queues' by calling
200 * tc_destroy(tc).
201 *
202 * The implementation should not need to perform any Netlink calls. If
203 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
204 * (But it may not be desirable.)
205 *
206 * This function may be null if 'tc' is trivial. */
207 void (*tc_destroy)(struct tc *tc);
208
209 /* Retrieves details of 'netdev->tc' configuration into 'details'.
210 *
211 * The implementation should not need to perform any Netlink calls, because
212 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
213 * cached the configuration.
214 *
215 * The contents of 'details' should be documented as valid for 'ovs_name'
216 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
217 * (which is built as ovs-vswitchd.conf.db(8)).
218 *
219 * This function may be null if 'tc' is not configurable.
220 */
79f1cbe9 221 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
222
223 /* Reconfigures 'netdev->tc' according to 'details', performing any
224 * required Netlink calls to complete the reconfiguration.
225 *
226 * The contents of 'details' should be documented as valid for 'ovs_name'
227 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
228 * (which is built as ovs-vswitchd.conf.db(8)).
229 *
230 * This function may be null if 'tc' is not configurable.
231 */
79f1cbe9 232 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 233
93b13be8
BP
234 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
235 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
236 *
237 * The contents of 'details' should be documented as valid for 'ovs_name'
238 * in the "other_config" column in the "Queue" table in
239 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
240 *
241 * The implementation should not need to perform any Netlink calls, because
242 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
243 * cached the queue configuration.
244 *
245 * This function may be null if 'tc' does not have queues ('n_queues' is
246 * 0). */
93b13be8 247 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 248 struct smap *details);
c1c9c9c4
BP
249
250 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
251 * 'details', perfoming any required Netlink calls to complete the
252 * reconfiguration. The caller ensures that 'queue_id' is less than
253 * 'n_queues'.
254 *
255 * The contents of 'details' should be documented as valid for 'ovs_name'
256 * in the "other_config" column in the "Queue" table in
257 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
258 *
259 * This function may be null if 'tc' does not have queues or its queues are
260 * not configurable. */
261 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 262 const struct smap *details);
c1c9c9c4 263
93b13be8
BP
264 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
265 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
266 *
267 * This function may be null if 'tc' does not have queues or its queues
268 * cannot be deleted. */
93b13be8 269 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 270
93b13be8
BP
271 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
272 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
273 *
274 * On success, initializes '*stats'.
275 *
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
93b13be8
BP
278 int (*class_get_stats)(const struct netdev *netdev,
279 const struct tc_queue *queue,
c1c9c9c4
BP
280 struct netdev_queue_stats *stats);
281
282 /* Extracts queue stats from 'nlmsg', which is a response to a
283 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
284 *
285 * This function may be null if 'tc' does not have queues or if it cannot
286 * report queue statistics. */
287 int (*class_dump_stats)(const struct netdev *netdev,
288 const struct ofpbuf *nlmsg,
289 netdev_dump_queue_stats_cb *cb, void *aux);
290};
291
292static void
293tc_init(struct tc *tc, const struct tc_ops *ops)
294{
295 tc->ops = ops;
93b13be8 296 hmap_init(&tc->queues);
c1c9c9c4
BP
297}
298
299static void
300tc_destroy(struct tc *tc)
301{
93b13be8 302 hmap_destroy(&tc->queues);
c1c9c9c4
BP
303}
304
305static const struct tc_ops tc_ops_htb;
a339aa81 306static const struct tc_ops tc_ops_hfsc;
c1c9c9c4
BP
307static const struct tc_ops tc_ops_default;
308static const struct tc_ops tc_ops_other;
309
559eb230 310static const struct tc_ops *const tcs[] = {
c1c9c9c4 311 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 312 &tc_ops_hfsc, /* Hierarchical fair service curve. */
c1c9c9c4
BP
313 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
314 &tc_ops_other, /* Some other qdisc. */
315 NULL
316};
149f577a 317
c1c9c9c4
BP
/* Forward declarations of the traffic control helper functions used by this
 * file. */

/* Handle composition and decomposition. */
static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
static unsigned int tc_get_major(unsigned int handle);
static unsigned int tc_get_minor(unsigned int handle);

/* Unit conversions. */
static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
static unsigned int tc_buffer_per_jiffy(unsigned int rate);

/* Requests, transactions, and ingress policing. */
static struct tcmsg *tc_make_request(const struct netdev *, int type,
                                     unsigned int flags, struct ofpbuf *);
static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
static int tc_add_policer(struct netdev *netdev, int kbits_rate,
                          int kbits_burst);

/* Parsing and querying qdiscs and classes. */
static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
                          struct nlattr **options);
static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
                          struct nlattr **options,
                          struct netdev_queue_stats *);
static int tc_query_class(const struct netdev *,
                          unsigned int handle, unsigned int parent,
                          struct ofpbuf **replyp);
static int tc_delete_class(const struct netdev *, unsigned int handle);

static int tc_del_qdisc(struct netdev *netdev);
static int tc_query_qdisc(const struct netdev *netdev);

/* Rate calculations. */
static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static void tc_put_rtab(struct ofpbuf *, uint16_t type,
                        const struct tc_ratespec *rate);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
351\f
b5d57fc8
BP
352struct netdev_linux {
353 struct netdev up;
149f577a 354
86383816
BP
355 /* Protects all members below. */
356 struct ovs_mutex mutex;
357
149f577a 358 unsigned int cache_valid;
ac4d3bcb 359 unsigned int change_seq;
8b61709d 360
1670c579
EJ
361 bool miimon; /* Link status of last poll. */
362 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
363 struct timer miimon_timer;
364
8722022c
BP
365 /* The following are figured out "on demand" only. They are only valid
366 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d
BP
367 int ifindex;
368 uint8_t etheraddr[ETH_ADDR_LEN];
f1acd62b 369 struct in_addr address, netmask;
8b61709d
BP
370 struct in6_addr in6;
371 int mtu;
059e5f4f 372 unsigned int ifi_flags;
65c3058c 373 long long int carrier_resets;
80a86fbe
BP
374 uint32_t kbits_rate; /* Policing data. */
375 uint32_t kbits_burst;
bba1e6f3
PS
376 int vport_stats_error; /* Cached error code from vport_get_stats().
377 0 or an errno value. */
90a6637d 378 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 379 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 380 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 381 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 382 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
51f87458 383
a00ca915
EJ
384 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
385 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
90a6637d 387
4f925bd3 388 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 389 struct tc *tc;
149f577a 390
d0d08f8a
BP
391 /* For devices of class netdev_tap_class only. */
392 int tap_fd;
8b61709d
BP
393};
394
796223f5
BP
395struct netdev_rx_linux {
396 struct netdev_rx up;
397 bool is_tap;
5b7448ed 398 int fd;
149f577a 399};
8b61709d 400
8b61709d
BP
401/* This is set pretty low because we probably won't learn anything from the
402 * additional log messages. */
403static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
404
259e0b1a 405static void netdev_linux_run(void);
6f643e49 406
0b0544d7 407static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 408 int cmd, const char *cmd_name);
f1acd62b
BP
409static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
410 int cmd, const char *cmd_name);
b5d57fc8 411static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 412static int set_flags(const char *, unsigned int flags);
4f9f3f21
BP
413static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
414 enum netdev_flags on, enum netdev_flags *old_flagsp)
415 OVS_REQUIRES(netdev->mutex);
8b61709d
BP
416static int do_get_ifindex(const char *netdev_name);
417static int get_ifindex(const struct netdev *, int *ifindexp);
418static int do_set_addr(struct netdev *netdev,
419 int ioctl_nr, const char *ioctl_name,
420 struct in_addr addr);
421static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
44445cac 422static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
8b61709d
BP
423static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
424static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
488d734d 425static int af_packet_sock(void);
1670c579
EJ
426static void netdev_linux_miimon_run(void);
427static void netdev_linux_miimon_wait(void);
8b61709d 428
15b3596a
JG
429static bool
430is_netdev_linux_class(const struct netdev_class *netdev_class)
431{
259e0b1a 432 return netdev_class->run == netdev_linux_run;
15b3596a
JG
433}
434
796223f5
BP
435static bool
436is_tap_netdev(const struct netdev *netdev)
437{
b5d57fc8 438 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577
JP
439}
440
8b61709d
BP
441static struct netdev_linux *
442netdev_linux_cast(const struct netdev *netdev)
443{
b5d57fc8 444 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
15b3596a 445
180c6d0b 446 return CONTAINER_OF(netdev, struct netdev_linux, up);
8b61709d 447}
796223f5
BP
448
449static struct netdev_rx_linux *
450netdev_rx_linux_cast(const struct netdev_rx *rx)
451{
9dc63482 452 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
796223f5
BP
453 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
454}
ff4ed3c9 455\f
cee87338 456static void netdev_linux_update(struct netdev_linux *netdev,
86383816
BP
457 const struct rtnetlink_link_change *)
458 OVS_REQUIRES(netdev->mutex);
cee87338 459static void netdev_linux_changed(struct netdev_linux *netdev,
86383816
BP
460 unsigned int ifi_flags, unsigned int mask)
461 OVS_REQUIRES(netdev->mutex);
cee87338
BP
462
463/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
464 * if no such socket could be created. */
465static struct nl_sock *
466netdev_linux_notify_sock(void)
467{
468 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
469 static struct nl_sock *sock;
470
471 if (ovsthread_once_start(&once)) {
472 int error;
473
474 error = nl_sock_create(NETLINK_ROUTE, &sock);
475 if (!error) {
476 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
477 if (error) {
478 nl_sock_destroy(sock);
479 sock = NULL;
480 }
481 }
482 ovsthread_once_done(&once);
483 }
484
485 return sock;
486}
487
8b61709d
BP
488static void
489netdev_linux_run(void)
490{
cee87338
BP
491 struct nl_sock *sock;
492 int error;
493
1670c579 494 netdev_linux_miimon_run();
cee87338
BP
495
496 sock = netdev_linux_notify_sock();
497 if (!sock) {
498 return;
499 }
500
501 do {
502 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
503 uint64_t buf_stub[4096 / 8];
504 struct ofpbuf buf;
505
506 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
507 error = nl_sock_recv(sock, &buf, false);
508 if (!error) {
509 struct rtnetlink_link_change change;
510
511 if (rtnetlink_link_parse(&buf, &change)) {
512 struct netdev *netdev_ = netdev_from_name(change.ifname);
513 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
514 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
515
516 ovs_mutex_lock(&netdev->mutex);
cee87338 517 netdev_linux_update(netdev, &change);
86383816 518 ovs_mutex_unlock(&netdev->mutex);
cee87338 519 }
38e0065b 520 netdev_close(netdev_);
cee87338
BP
521 }
522 } else if (error == ENOBUFS) {
523 struct shash device_shash;
524 struct shash_node *node;
525
526 nl_sock_drain(sock);
527
528 shash_init(&device_shash);
529 netdev_get_devices(&netdev_linux_class, &device_shash);
530 SHASH_FOR_EACH (node, &device_shash) {
531 struct netdev *netdev_ = node->data;
532 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
533 unsigned int flags;
534
86383816 535 ovs_mutex_lock(&netdev->mutex);
cee87338
BP
536 get_flags(netdev_, &flags);
537 netdev_linux_changed(netdev, flags, 0);
86383816
BP
538 ovs_mutex_unlock(&netdev->mutex);
539
cee87338
BP
540 netdev_close(netdev_);
541 }
542 shash_destroy(&device_shash);
543 } else if (error != EAGAIN) {
544 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
545 ovs_strerror(error));
546 }
547 ofpbuf_uninit(&buf);
548 } while (!error);
8b61709d
BP
549}
550
/* Arranges for the poll loop to wake up when netdev_linux_run() has work to
 * do: registers the MII monitoring wait and, if the notification socket
 * exists, waits for it to become readable. */
static void
netdev_linux_wait(void)
{
    struct nl_sock *sock;

    netdev_linux_miimon_wait();

    sock = netdev_linux_notify_sock();
    if (!sock) {
        return;
    }
    nl_sock_wait(sock, POLLIN);
}
562
ac4d3bcb 563static void
b5d57fc8
BP
564netdev_linux_changed(struct netdev_linux *dev,
565 unsigned int ifi_flags, unsigned int mask)
86383816 566 OVS_REQUIRES(dev->mutex)
ac4d3bcb
EJ
567{
568 dev->change_seq++;
569 if (!dev->change_seq) {
570 dev->change_seq++;
571 }
8aa77183
BP
572
573 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
574 dev->carrier_resets++;
575 }
576 dev->ifi_flags = ifi_flags;
577
4f925bd3
PS
578 dev->cache_valid &= mask;
579}
580
581static void
b5d57fc8
BP
582netdev_linux_update(struct netdev_linux *dev,
583 const struct rtnetlink_link_change *change)
86383816 584 OVS_REQUIRES(dev->mutex)
4f925bd3
PS
585{
586 if (change->nlmsg_type == RTM_NEWLINK) {
587 /* Keep drv-info */
b5d57fc8 588 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
90a6637d 589
c7b1b0a5 590 /* Update netdev from rtnl-change msg. */
90a6637d
PS
591 if (change->mtu) {
592 dev->mtu = change->mtu;
593 dev->cache_valid |= VALID_MTU;
594 dev->netdev_mtu_error = 0;
595 }
596
44445cac
PS
597 if (!eth_addr_is_zero(change->addr)) {
598 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
599 dev->cache_valid |= VALID_ETHERADDR;
600 dev->ether_addr_error = 0;
601 }
602
c7b1b0a5
PS
603 dev->ifindex = change->ifi_index;
604 dev->cache_valid |= VALID_IFINDEX;
605 dev->get_ifindex_error = 0;
606
4f925bd3 607 } else {
b5d57fc8 608 netdev_linux_changed(dev, change->ifi_flags, 0);
4f925bd3 609 }
ac4d3bcb
EJ
610}
611
9dc63482
BP
612static struct netdev *
613netdev_linux_alloc(void)
614{
615 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
616 return &netdev->up;
617}
618
cee87338 619static void
9dc63482
BP
620netdev_linux_common_construct(struct netdev_linux *netdev)
621{
834d6caf 622 ovs_mutex_init(&netdev->mutex);
9dc63482 623 netdev->change_seq = 1;
9dc63482
BP
624}
625
1f6e0fbd
BP
626/* Creates system and internal devices. */
627static int
9dc63482 628netdev_linux_construct(struct netdev *netdev_)
1f6e0fbd 629{
9dc63482 630 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1f6e0fbd
BP
631 int error;
632
cee87338 633 netdev_linux_common_construct(netdev);
1f6e0fbd 634
b5d57fc8
BP
635 error = get_flags(&netdev->up, &netdev->ifi_flags);
636 if (error == ENODEV) {
9dc63482 637 if (netdev->up.netdev_class != &netdev_internal_class) {
b5d57fc8 638 /* The device does not exist, so don't allow it to be opened. */
b5d57fc8
BP
639 return ENODEV;
640 } else {
641 /* "Internal" netdevs have to be created as netdev objects before
642 * they exist in the kernel, because creating them in the kernel
643 * happens by passing a netdev object to dpif_port_add().
644 * Therefore, ignore the error. */
645 }
646 }
46415c90 647
a740f0de
JG
648 return 0;
649}
650
5b7448ed
JG
651/* For most types of netdevs we open the device for each call of
652 * netdev_open(). However, this is not the case with tap devices,
653 * since it is only possible to open the device once. In this
654 * situation we share a single file descriptor, and consequently
655 * buffers, across all readers. Therefore once data is read it will
656 * be unavailable to other reads for tap devices. */
a740f0de 657static int
9dc63482 658netdev_linux_construct_tap(struct netdev *netdev_)
a740f0de 659{
9dc63482 660 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a740f0de 661 static const char tap_dev[] = "/dev/net/tun";
9dc63482 662 const char *name = netdev_->name;
a740f0de
JG
663 struct ifreq ifr;
664 int error;
665
cee87338 666 netdev_linux_common_construct(netdev);
1f6e0fbd 667
6c88d577 668 /* Open tap device. */
d0d08f8a
BP
669 netdev->tap_fd = open(tap_dev, O_RDWR);
670 if (netdev->tap_fd < 0) {
6c88d577 671 error = errno;
10a89ef0 672 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
cee87338 673 return error;
6c88d577
JP
674 }
675
676 /* Create tap device. */
677 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 678 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
d0d08f8a 679 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
6c88d577 680 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 681 ovs_strerror(errno));
6c88d577 682 error = errno;
f61d8d29 683 goto error_close;
6c88d577
JP
684 }
685
686 /* Make non-blocking. */
d0d08f8a 687 error = set_nonblocking(netdev->tap_fd);
a740f0de 688 if (error) {
f61d8d29 689 goto error_close;
a740f0de
JG
690 }
691
692 return 0;
693
f61d8d29 694error_close:
d0d08f8a 695 close(netdev->tap_fd);
a740f0de
JG
696 return error;
697}
698
6c88d577 699static void
9dc63482 700netdev_linux_destruct(struct netdev *netdev_)
6c88d577 701{
b5d57fc8 702 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6c88d577 703
b5d57fc8
BP
704 if (netdev->tc && netdev->tc->ops->tc_destroy) {
705 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4
BP
706 }
707
d0d08f8a
BP
708 if (netdev_get_class(netdev_) == &netdev_tap_class
709 && netdev->tap_fd >= 0)
710 {
711 close(netdev->tap_fd);
6c88d577 712 }
86383816
BP
713
714 ovs_mutex_destroy(&netdev->mutex);
6c88d577
JP
715}
716
9dc63482
BP
/* Frees the struct netdev_linux that contains 'netdev_'. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
723
724static struct netdev_rx *
725netdev_linux_rx_alloc(void)
726{
727 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
728 return &rx->up;
729}
730
7b6b0ef4 731static int
9dc63482 732netdev_linux_rx_construct(struct netdev_rx *rx_)
7b6b0ef4 733{
9dc63482
BP
734 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
735 struct netdev *netdev_ = rx->up.netdev;
7b6b0ef4 736 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7b6b0ef4 737 int error;
7b6b0ef4 738
86383816 739 ovs_mutex_lock(&netdev->mutex);
9dc63482
BP
740 rx->is_tap = is_tap_netdev(netdev_);
741 if (rx->is_tap) {
742 rx->fd = netdev->tap_fd;
796223f5
BP
743 } else {
744 struct sockaddr_ll sll;
745 int ifindex;
32383c3b 746 /* Result of tcpdump -dd inbound */
259e0b1a 747 static const struct sock_filter filt[] = {
32383c3b
MM
748 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
749 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
750 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
751 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
752 };
259e0b1a
BP
753 static const struct sock_fprog fprog = {
754 ARRAY_SIZE(filt), (struct sock_filter *) filt
755 };
7b6b0ef4 756
796223f5 757 /* Create file descriptor. */
9dc63482
BP
758 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
759 if (rx->fd < 0) {
796223f5 760 error = errno;
10a89ef0 761 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
762 goto error;
763 }
33d82a56 764
796223f5 765 /* Set non-blocking mode. */
9dc63482 766 error = set_nonblocking(rx->fd);
796223f5
BP
767 if (error) {
768 goto error;
769 }
7b6b0ef4 770
796223f5 771 /* Get ethernet device index. */
180c6d0b 772 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
773 if (error) {
774 goto error;
775 }
7b6b0ef4 776
796223f5
BP
777 /* Bind to specific ethernet device. */
778 memset(&sll, 0, sizeof sll);
779 sll.sll_family = AF_PACKET;
780 sll.sll_ifindex = ifindex;
781 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
9dc63482 782 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
796223f5
BP
783 error = errno;
784 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 785 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
786 goto error;
787 }
32383c3b
MM
788
789 /* Filter for only inbound packets. */
9dc63482 790 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
32383c3b
MM
791 sizeof fprog);
792 if (error) {
793 error = errno;
259e0b1a 794 VLOG_ERR("%s: failed to attach filter (%s)",
10a89ef0 795 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
796 goto error;
797 }
7b6b0ef4 798 }
86383816 799 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4 800
7b6b0ef4
BP
801 return 0;
802
803error:
9dc63482
BP
804 if (rx->fd >= 0) {
805 close(rx->fd);
7b6b0ef4 806 }
86383816 807 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4
BP
808 return error;
809}
810
796223f5 811static void
9dc63482 812netdev_linux_rx_destruct(struct netdev_rx *rx_)
8b61709d 813{
796223f5 814 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
8b61709d 815
796223f5
BP
816 if (!rx->is_tap) {
817 close(rx->fd);
8b61709d 818 }
9dc63482
BP
819}
820
/* Frees the struct netdev_rx_linux that contains 'rx_'. */
static void
netdev_linux_rx_dealloc(struct netdev_rx *rx_)
{
    free(netdev_rx_linux_cast(rx_));
}
8b61709d 828
796223f5 829static int
9dc63482 830netdev_linux_rx_recv(struct netdev_rx *rx_, void *data, size_t size)
796223f5
BP
831{
832 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
833 ssize_t retval;
8e8cddf7 834
796223f5
BP
835 do {
836 retval = (rx->is_tap
837 ? read(rx->fd, data, size)
838 : recv(rx->fd, data, size, MSG_TRUNC));
839 } while (retval < 0 && errno == EINTR);
840
bb5c1468
Z
841 if (retval >= 0) {
842 return retval > size ? -EMSGSIZE : retval;
796223f5
BP
843 } else {
844 if (errno != EAGAIN) {
845 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
10a89ef0 846 ovs_strerror(errno), netdev_rx_get_name(rx_));
8b61709d 847 }
796223f5 848 return -errno;
8b61709d
BP
849 }
850}
851
8b61709d 852static void
9dc63482 853netdev_linux_rx_wait(struct netdev_rx *rx_)
8b61709d 854{
796223f5
BP
855 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
856 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
857}
858
8b61709d 859static int
9dc63482 860netdev_linux_rx_drain(struct netdev_rx *rx_)
8b61709d 861{
796223f5
BP
862 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
863 if (rx->is_tap) {
8b61709d 864 struct ifreq ifr;
259e0b1a
BP
865 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
866 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
8b61709d
BP
867 if (error) {
868 return error;
869 }
796223f5 870 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
871 return 0;
872 } else {
796223f5 873 return drain_rcvbuf(rx->fd);
8b61709d
BP
874 }
875}
876
877/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
878 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
879 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
880 * the packet is too big or too small to transmit on the device.
881 *
882 * The caller retains ownership of 'buffer' in all cases.
883 *
884 * The kernel maintains a packet transmission queue, so the caller is not
885 * expected to do additional queuing of packets. */
886static int
887netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
888{
f23347ea
BP
889 for (;;) {
890 ssize_t retval;
8b61709d 891
796223f5 892 if (!is_tap_netdev(netdev_)) {
f23347ea
BP
893 /* Use our AF_PACKET socket to send to this device. */
894 struct sockaddr_ll sll;
895 struct msghdr msg;
896 struct iovec iov;
897 int ifindex;
488d734d
BP
898 int sock;
899
900 sock = af_packet_sock();
901 if (sock < 0) {
c4c7a3d7 902 return -sock;
488d734d 903 }
f23347ea 904
86383816
BP
905 ifindex = netdev_get_ifindex(netdev_);
906 if (ifindex < 0) {
907 return -ifindex;
f23347ea 908 }
8b61709d 909
f23347ea
BP
910 /* We don't bother setting most fields in sockaddr_ll because the
911 * kernel ignores them for SOCK_RAW. */
912 memset(&sll, 0, sizeof sll);
913 sll.sll_family = AF_PACKET;
914 sll.sll_ifindex = ifindex;
76c308b5 915
ebc56baa 916 iov.iov_base = CONST_CAST(void *, data);
f23347ea 917 iov.iov_len = size;
76c308b5 918
f23347ea
BP
919 msg.msg_name = &sll;
920 msg.msg_namelen = sizeof sll;
921 msg.msg_iov = &iov;
922 msg.msg_iovlen = 1;
923 msg.msg_control = NULL;
924 msg.msg_controllen = 0;
925 msg.msg_flags = 0;
926
488d734d 927 retval = sendmsg(sock, &msg, 0);
f23347ea 928 } else {
796223f5
BP
929 /* Use the tap fd to send to this device. This is essential for
930 * tap devices, because packets sent to a tap device with an
931 * AF_PACKET socket will loop back to be *received* again on the
32383c3b
MM
932 * tap device. This doesn't occur on other interface types
933 * because we attach a socket filter to the rx socket. */
b5d57fc8 934 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796223f5 935
d0d08f8a 936 retval = write(netdev->tap_fd, data, size);
f23347ea 937 }
76c308b5 938
8b61709d
BP
939 if (retval < 0) {
940 /* The Linux AF_PACKET implementation never blocks waiting for room
941 * for packets, instead returning ENOBUFS. Translate this into
942 * EAGAIN for the caller. */
943 if (errno == ENOBUFS) {
944 return EAGAIN;
945 } else if (errno == EINTR) {
946 continue;
947 } else if (errno != EAGAIN) {
948 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
10a89ef0 949 netdev_get_name(netdev_), ovs_strerror(errno));
8b61709d
BP
950 }
951 return errno;
952 } else if (retval != size) {
953 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
954 "%zu) on %s", retval, size, netdev_get_name(netdev_));
955 return EMSGSIZE;
956 } else {
957 return 0;
958 }
959 }
960}
961
/* Registers with the poll loop to wake up from the next call to poll_block()
 * when the packet transmission queue has sufficient room to transmit a packet
 * with netdev_send().
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  Thus, this function is
 * unlikely to ever be used.  It is included for completeness. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (!is_tap_netdev(netdev)) {
        /* Nothing is registered for non-tap devices. */
        return;
    }

    /* A tap device always accepts packets, so wake up immediately. */
    poll_immediate_wake();
}
977
978/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
979 * otherwise a positive errno value. */
980static int
981netdev_linux_set_etheraddr(struct netdev *netdev_,
982 const uint8_t mac[ETH_ADDR_LEN])
983{
b5d57fc8 984 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4f9f3f21 985 enum netdev_flags old_flags = 0;
eb395f2e
BP
986 int error;
987
86383816
BP
988 ovs_mutex_lock(&netdev->mutex);
989
b5d57fc8 990 if (netdev->cache_valid & VALID_ETHERADDR) {
86383816
BP
991 error = netdev->ether_addr_error;
992 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
993 goto exit;
44445cac 994 }
b5d57fc8 995 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
996 }
997
7eb1bd81 998 /* Tap devices must be brought down before setting the address. */
796223f5 999 if (is_tap_netdev(netdev_)) {
4f9f3f21 1000 update_flags(netdev, NETDEV_UP, 0, &old_flags);
7eb1bd81 1001 }
44445cac
PS
1002 error = set_etheraddr(netdev_get_name(netdev_), mac);
1003 if (!error || error == ENODEV) {
b5d57fc8
BP
1004 netdev->ether_addr_error = error;
1005 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1006 if (!error) {
b5d57fc8 1007 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e 1008 }
8b61709d 1009 }
44445cac 1010
4f9f3f21
BP
1011 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1012 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1013 }
7eb1bd81 1014
86383816
BP
1015exit:
1016 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
1017 return error;
1018}
1019
44445cac 1020/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d
BP
1021static int
1022netdev_linux_get_etheraddr(const struct netdev *netdev_,
1023 uint8_t mac[ETH_ADDR_LEN])
1024{
b5d57fc8 1025 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1026 int error;
44445cac 1027
86383816 1028 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1029 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
86383816
BP
1030 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1031 netdev->etheraddr);
b5d57fc8 1032 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1033 }
44445cac 1034
86383816
BP
1035 error = netdev->ether_addr_error;
1036 if (!error) {
b5d57fc8 1037 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
44445cac 1038 }
86383816 1039 ovs_mutex_unlock(&netdev->mutex);
44445cac 1040
86383816 1041 return error;
8b61709d
BP
1042}
1043
8b61709d 1044static int
73371c09 1045netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
8b61709d 1046{
86383816
BP
1047 int error;
1048
b5d57fc8 1049 if (!(netdev->cache_valid & VALID_MTU)) {
8b61709d 1050 struct ifreq ifr;
90a6637d 1051
86383816 1052 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
73371c09 1053 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
b5d57fc8
BP
1054 netdev->mtu = ifr.ifr_mtu;
1055 netdev->cache_valid |= VALID_MTU;
8b61709d 1056 }
90a6637d 1057
86383816
BP
1058 error = netdev->netdev_mtu_error;
1059 if (!error) {
b5d57fc8 1060 *mtup = netdev->mtu;
90a6637d 1061 }
73371c09
BP
1062
1063 return error;
1064}
1065
1066/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1067 * in bytes, not including the hardware header; thus, this is typically 1500
1068 * bytes for Ethernet devices. */
1069static int
1070netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1071{
1072 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1073 int error;
1074
1075 ovs_mutex_lock(&netdev->mutex);
1076 error = netdev_linux_get_mtu__(netdev, mtup);
86383816
BP
1077 ovs_mutex_unlock(&netdev->mutex);
1078
1079 return error;
8b61709d
BP
1080}
1081
9b020780
PS
1082/* Sets the maximum size of transmitted (MTU) for given device using linux
1083 * networking ioctl interface.
1084 */
1085static int
1086netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1087{
b5d57fc8 1088 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1089 struct ifreq ifr;
1090 int error;
1091
86383816 1092 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1093 if (netdev->cache_valid & VALID_MTU) {
86383816
BP
1094 error = netdev->netdev_mtu_error;
1095 if (error || netdev->mtu == mtu) {
1096 goto exit;
90a6637d 1097 }
b5d57fc8 1098 netdev->cache_valid &= ~VALID_MTU;
153e5481 1099 }
9b020780 1100 ifr.ifr_mtu = mtu;
259e0b1a
BP
1101 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1102 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1103 if (!error || error == ENODEV) {
b5d57fc8
BP
1104 netdev->netdev_mtu_error = error;
1105 netdev->mtu = ifr.ifr_mtu;
1106 netdev->cache_valid |= VALID_MTU;
9b020780 1107 }
86383816
BP
1108exit:
1109 ovs_mutex_unlock(&netdev->mutex);
90a6637d 1110 return error;
9b020780
PS
1111}
1112
9ab3d9a3
BP
1113/* Returns the ifindex of 'netdev', if successful, as a positive number.
1114 * On failure, returns a negative errno value. */
1115static int
86383816 1116netdev_linux_get_ifindex(const struct netdev *netdev_)
9ab3d9a3 1117{
86383816 1118 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9ab3d9a3
BP
1119 int ifindex, error;
1120
86383816
BP
1121 ovs_mutex_lock(&netdev->mutex);
1122 error = get_ifindex(netdev_, &ifindex);
1123 ovs_mutex_unlock(&netdev->mutex);
1124
9ab3d9a3
BP
1125 return error ? -error : ifindex;
1126}
1127
8b61709d
BP
1128static int
1129netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1130{
b5d57fc8 1131 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1132
86383816 1133 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
1134 if (netdev->miimon_interval > 0) {
1135 *carrier = netdev->miimon;
3a183124 1136 } else {
b5d57fc8 1137 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1138 }
86383816 1139 ovs_mutex_unlock(&netdev->mutex);
8b61709d 1140
3a183124 1141 return 0;
8b61709d
BP
1142}
1143
65c3058c 1144static long long int
86383816 1145netdev_linux_get_carrier_resets(const struct netdev *netdev_)
65c3058c 1146{
86383816
BP
1147 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1148 long long int carrier_resets;
1149
1150 ovs_mutex_lock(&netdev->mutex);
1151 carrier_resets = netdev->carrier_resets;
1152 ovs_mutex_unlock(&netdev->mutex);
1153
1154 return carrier_resets;
65c3058c
EJ
1155}
1156
63331829 1157static int
1670c579
EJ
1158netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1159 struct mii_ioctl_data *data)
63331829 1160{
63331829 1161 struct ifreq ifr;
782e6111 1162 int error;
63331829 1163
63331829 1164 memset(&ifr, 0, sizeof ifr);
782e6111 1165 memcpy(&ifr.ifr_data, data, sizeof *data);
259e0b1a 1166 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1167 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1168
782e6111
EJ
1169 return error;
1170}
1171
1172static int
1670c579 1173netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1174{
782e6111
EJ
1175 struct mii_ioctl_data data;
1176 int error;
63331829 1177
782e6111
EJ
1178 *miimon = false;
1179
1180 memset(&data, 0, sizeof data);
1670c579 1181 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1182 if (!error) {
1183 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1184 data.reg_num = MII_BMSR;
1670c579 1185 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1186 &data);
63331829
EJ
1187
1188 if (!error) {
782e6111 1189 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1190 } else {
1191 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1192 }
1193 } else {
1194 struct ethtool_cmd ecmd;
63331829
EJ
1195
1196 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1197 name);
1198
ab985a77 1199 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1200 memset(&ecmd, 0, sizeof ecmd);
1201 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1202 "ETHTOOL_GLINK");
1203 if (!error) {
782e6111
EJ
1204 struct ethtool_value eval;
1205
1206 memcpy(&eval, &ecmd, sizeof eval);
1207 *miimon = !!eval.data;
63331829
EJ
1208 } else {
1209 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1210 }
1211 }
1212
1213 return error;
1214}
1215
1670c579
EJ
1216static int
1217netdev_linux_set_miimon_interval(struct netdev *netdev_,
1218 long long int interval)
1219{
b5d57fc8 1220 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579 1221
86383816 1222 ovs_mutex_lock(&netdev->mutex);
1670c579 1223 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8
BP
1224 if (netdev->miimon_interval != interval) {
1225 netdev->miimon_interval = interval;
1226 timer_set_expired(&netdev->miimon_timer);
1670c579 1227 }
86383816 1228 ovs_mutex_unlock(&netdev->mutex);
1670c579
EJ
1229
1230 return 0;
1231}
1232
1233static void
1234netdev_linux_miimon_run(void)
1235{
1236 struct shash device_shash;
1237 struct shash_node *node;
1238
1239 shash_init(&device_shash);
b5d57fc8 1240 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1241 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1242 struct netdev *netdev = node->data;
1243 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
1244 bool miimon;
1245
86383816
BP
1246 ovs_mutex_lock(&dev->mutex);
1247 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1248 netdev_linux_get_miimon(dev->up.name, &miimon);
1249 if (miimon != dev->miimon) {
1250 dev->miimon = miimon;
1251 netdev_linux_changed(dev, dev->ifi_flags, 0);
1252 }
1670c579 1253
86383816 1254 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1670c579 1255 }
86383816 1256 ovs_mutex_unlock(&dev->mutex);
2f980d74 1257 netdev_close(netdev);
1670c579
EJ
1258 }
1259
1260 shash_destroy(&device_shash);
1261}
1262
1263static void
1264netdev_linux_miimon_wait(void)
1265{
1266 struct shash device_shash;
1267 struct shash_node *node;
1268
1269 shash_init(&device_shash);
b5d57fc8 1270 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1271 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1272 struct netdev *netdev = node->data;
1273 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579 1274
86383816 1275 ovs_mutex_lock(&dev->mutex);
1670c579
EJ
1276 if (dev->miimon_interval > 0) {
1277 timer_wait(&dev->miimon_timer);
1278 }
86383816 1279 ovs_mutex_unlock(&dev->mutex);
2f980d74 1280 netdev_close(netdev);
1670c579
EJ
1281 }
1282 shash_destroy(&device_shash);
1283}
1284
8b61709d
BP
1285/* Check whether we can we use RTM_GETLINK to get network device statistics.
1286 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1287 * enabled. */
1288static bool
1289check_for_working_netlink_stats(void)
1290{
1291 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1292 * preferable, so if that works, we'll use it. */
1293 int ifindex = do_get_ifindex("lo");
1294 if (ifindex < 0) {
1295 VLOG_WARN("failed to get ifindex for lo, "
1296 "obtaining netdev stats from proc");
1297 return false;
1298 } else {
1299 struct netdev_stats stats;
1300 int error = get_stats_via_netlink(ifindex, &stats);
1301 if (!error) {
1302 VLOG_DBG("obtaining netdev stats via rtnetlink");
1303 return true;
1304 } else {
1305 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1306 "via proc (you are probably running a pre-2.6.19 "
10a89ef0 1307 "kernel)", ovs_strerror(error));
8b61709d
BP
1308 return false;
1309 }
1310 }
1311}
1312
/* Exchanges the values stored in '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_b = *b;

    *b = *a;
    *a = old_b;
}
1320
c060c4cf
EJ
1321/* Copies 'src' into 'dst', performing format conversion in the process.
1322 *
1323 * 'src' is allowed to be misaligned. */
1324static void
1325netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1326 const struct ovs_vport_stats *src)
1327{
1328 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1329 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1330 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1331 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1332 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1333 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1334 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1335 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1336 dst->multicast = 0;
1337 dst->collisions = 0;
1338 dst->rx_length_errors = 0;
1339 dst->rx_over_errors = 0;
1340 dst->rx_crc_errors = 0;
1341 dst->rx_frame_errors = 0;
1342 dst->rx_fifo_errors = 0;
1343 dst->rx_missed_errors = 0;
1344 dst->tx_aborted_errors = 0;
1345 dst->tx_carrier_errors = 0;
1346 dst->tx_fifo_errors = 0;
1347 dst->tx_heartbeat_errors = 0;
1348 dst->tx_window_errors = 0;
1349}
1350
1351static int
1352get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1353{
1354 struct dpif_linux_vport reply;
1355 struct ofpbuf *buf;
1356 int error;
1357
1358 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1359 if (error) {
1360 return error;
1361 } else if (!reply.stats) {
1362 ofpbuf_delete(buf);
1363 return EOPNOTSUPP;
1364 }
1365
1366 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1367
1368 ofpbuf_delete(buf);
1369
1370 return 0;
1371}
1372
f613a0d7
PS
1373static void
1374get_stats_via_vport(const struct netdev *netdev_,
1375 struct netdev_stats *stats)
8b61709d 1376{
b5d57fc8 1377 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1378
b5d57fc8
BP
1379 if (!netdev->vport_stats_error ||
1380 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1381 int error;
7fbef77a 1382
c060c4cf 1383 error = get_stats_via_vport__(netdev_, stats);
bcb1f5a1 1384 if (error && error != ENOENT) {
a57a8488 1385 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
1386 "(%s)",
1387 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 1388 }
b5d57fc8
BP
1389 netdev->vport_stats_error = error;
1390 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1391 }
f613a0d7 1392}
8b61709d 1393
f613a0d7
PS
1394static int
1395netdev_linux_sys_get_stats(const struct netdev *netdev_,
86383816 1396 struct netdev_stats *stats)
f613a0d7 1397{
23882115
BP
1398 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1399 static int use_netlink_stats;
f613a0d7
PS
1400 int error;
1401
23882115 1402 if (ovsthread_once_start(&once)) {
f613a0d7 1403 use_netlink_stats = check_for_working_netlink_stats();
23882115 1404 ovsthread_once_done(&once);
f613a0d7
PS
1405 }
1406
1407 if (use_netlink_stats) {
1408 int ifindex;
1409
1410 error = get_ifindex(netdev_, &ifindex);
1411 if (!error) {
1412 error = get_stats_via_netlink(ifindex, stats);
7fbef77a 1413 }
f613a0d7
PS
1414 } else {
1415 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1416 }
7fbef77a 1417
f613a0d7
PS
1418 if (error) {
1419 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1420 netdev_get_name(netdev_), error);
1421 }
1422 return error;
1423
1424}
1425
1426/* Retrieves current device stats for 'netdev-linux'. */
1427static int
1428netdev_linux_get_stats(const struct netdev *netdev_,
1429 struct netdev_stats *stats)
1430{
b5d57fc8 1431 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1432 struct netdev_stats dev_stats;
1433 int error;
1434
86383816 1435 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1436 get_stats_via_vport(netdev_, stats);
f613a0d7 1437 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
f613a0d7 1438 if (error) {
86383816
BP
1439 if (!netdev->vport_stats_error) {
1440 error = 0;
f613a0d7 1441 }
86383816 1442 } else if (netdev->vport_stats_error) {
f613a0d7
PS
1443 /* stats not available from OVS then use ioctl stats. */
1444 *stats = dev_stats;
1445 } else {
1446 stats->rx_errors += dev_stats.rx_errors;
1447 stats->tx_errors += dev_stats.tx_errors;
1448 stats->rx_dropped += dev_stats.rx_dropped;
1449 stats->tx_dropped += dev_stats.tx_dropped;
1450 stats->multicast += dev_stats.multicast;
1451 stats->collisions += dev_stats.collisions;
1452 stats->rx_length_errors += dev_stats.rx_length_errors;
1453 stats->rx_over_errors += dev_stats.rx_over_errors;
1454 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1455 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1456 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1457 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1458 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1459 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1460 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1461 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1462 stats->tx_window_errors += dev_stats.tx_window_errors;
1463 }
86383816
BP
1464 ovs_mutex_unlock(&netdev->mutex);
1465
1466 return error;
f613a0d7
PS
1467}
1468
1469/* Retrieves current device stats for 'netdev-tap' netdev or
1470 * netdev-internal. */
1471static int
15aee116 1472netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
f613a0d7 1473{
b5d57fc8 1474 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1475 struct netdev_stats dev_stats;
1476 int error;
1477
86383816 1478 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1479 get_stats_via_vport(netdev_, stats);
f613a0d7
PS
1480 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1481 if (error) {
86383816
BP
1482 if (!netdev->vport_stats_error) {
1483 error = 0;
8b61709d 1484 }
86383816
BP
1485 } else if (netdev->vport_stats_error) {
1486 /* Transmit and receive stats will appear to be swapped relative to the
1487 * other ports since we are the one sending the data, not a remote
1488 * computer. For consistency, we swap them back here. This does not
1489 * apply if we are getting stats from the vport layer because it always
1490 * tracks stats from the perspective of the switch. */
fe6b0e03 1491
f613a0d7 1492 *stats = dev_stats;
92df599c
JG
1493 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1494 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1495 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1496 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1497 stats->rx_length_errors = 0;
1498 stats->rx_over_errors = 0;
1499 stats->rx_crc_errors = 0;
1500 stats->rx_frame_errors = 0;
1501 stats->rx_fifo_errors = 0;
1502 stats->rx_missed_errors = 0;
1503 stats->tx_aborted_errors = 0;
1504 stats->tx_carrier_errors = 0;
1505 stats->tx_fifo_errors = 0;
1506 stats->tx_heartbeat_errors = 0;
1507 stats->tx_window_errors = 0;
f613a0d7
PS
1508 } else {
1509 stats->rx_dropped += dev_stats.tx_dropped;
1510 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1511
f613a0d7
PS
1512 stats->rx_errors += dev_stats.tx_errors;
1513 stats->tx_errors += dev_stats.rx_errors;
1514
1515 stats->multicast += dev_stats.multicast;
1516 stats->collisions += dev_stats.collisions;
1517 }
86383816
BP
1518 ovs_mutex_unlock(&netdev->mutex);
1519
1520 return error;
8b61709d
BP
1521}
1522
bba1e6f3
PS
1523static int
1524netdev_internal_get_stats(const struct netdev *netdev_,
1525 struct netdev_stats *stats)
1526{
b5d57fc8 1527 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1528 int error;
bba1e6f3 1529
86383816 1530 ovs_mutex_lock(&netdev->mutex);
bba1e6f3 1531 get_stats_via_vport(netdev_, stats);
86383816
BP
1532 error = netdev->vport_stats_error;
1533 ovs_mutex_unlock(&netdev->mutex);
1534
1535 return error;
bba1e6f3
PS
1536}
1537
2f31a822
EJ
1538static int
1539netdev_internal_set_stats(struct netdev *netdev,
1540 const struct netdev_stats *stats)
1541{
1542 struct ovs_vport_stats vport_stats;
1543 struct dpif_linux_vport vport;
1544 int err;
1545
1546 vport_stats.rx_packets = stats->rx_packets;
1547 vport_stats.tx_packets = stats->tx_packets;
1548 vport_stats.rx_bytes = stats->rx_bytes;
1549 vport_stats.tx_bytes = stats->tx_bytes;
1550 vport_stats.rx_errors = stats->rx_errors;
1551 vport_stats.tx_errors = stats->tx_errors;
1552 vport_stats.rx_dropped = stats->rx_dropped;
1553 vport_stats.tx_dropped = stats->tx_dropped;
1554
1555 dpif_linux_vport_init(&vport);
1556 vport.cmd = OVS_VPORT_CMD_SET;
1557 vport.name = netdev_get_name(netdev);
1558 vport.stats = &vport_stats;
1559
1560 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1561
1562 /* If the vport layer doesn't know about the device, that doesn't mean it
1563 * doesn't exist (after all were able to open it when netdev_open() was
1564 * called), it just means that it isn't attached and we'll be getting
1565 * stats a different way. */
1566 if (err == ENODEV) {
1567 err = EOPNOTSUPP;
1568 }
1569
1570 return err;
1571}
1572
51f87458 1573static void
b5d57fc8 1574netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
1575{
1576 struct ethtool_cmd ecmd;
6c038611 1577 uint32_t speed;
8b61709d
BP
1578 int error;
1579
b5d57fc8 1580 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
1581 return;
1582 }
1583
ab985a77 1584 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1585 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 1586 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
1587 ETHTOOL_GSET, "ETHTOOL_GSET");
1588 if (error) {
51f87458 1589 goto out;
8b61709d
BP
1590 }
1591
1592 /* Supported features. */
b5d57fc8 1593 netdev->supported = 0;
8b61709d 1594 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 1595 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
1596 }
1597 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 1598 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
1599 }
1600 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 1601 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
1602 }
1603 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 1604 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
1605 }
1606 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 1607 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d
BP
1608 }
1609 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
b5d57fc8 1610 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d
BP
1611 }
1612 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
b5d57fc8 1613 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d
BP
1614 }
1615 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 1616 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
1617 }
1618 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 1619 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
1620 }
1621 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 1622 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
1623 }
1624 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 1625 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
1626 }
1627 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 1628 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1629 }
1630
1631 /* Advertised features. */
b5d57fc8 1632 netdev->advertised = 0;
8b61709d 1633 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 1634 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
1635 }
1636 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 1637 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
1638 }
1639 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 1640 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
1641 }
1642 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 1643 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
1644 }
1645 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 1646 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d
BP
1647 }
1648 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
b5d57fc8 1649 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d
BP
1650 }
1651 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
b5d57fc8 1652 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d
BP
1653 }
1654 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 1655 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
1656 }
1657 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 1658 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
1659 }
1660 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 1661 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
1662 }
1663 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 1664 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
1665 }
1666 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 1667 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1668 }
1669
1670 /* Current settings. */
2a529ead 1671 speed = ecmd.speed;
6c038611 1672 if (speed == SPEED_10) {
b5d57fc8 1673 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 1674 } else if (speed == SPEED_100) {
b5d57fc8 1675 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 1676 } else if (speed == SPEED_1000) {
b5d57fc8 1677 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 1678 } else if (speed == SPEED_10000) {
b5d57fc8 1679 netdev->current = NETDEV_F_10GB_FD;
6c038611 1680 } else if (speed == 40000) {
b5d57fc8 1681 netdev->current = NETDEV_F_40GB_FD;
6c038611 1682 } else if (speed == 100000) {
b5d57fc8 1683 netdev->current = NETDEV_F_100GB_FD;
6c038611 1684 } else if (speed == 1000000) {
b5d57fc8 1685 netdev->current = NETDEV_F_1TB_FD;
8b61709d 1686 } else {
b5d57fc8 1687 netdev->current = 0;
8b61709d
BP
1688 }
1689
1690 if (ecmd.port == PORT_TP) {
b5d57fc8 1691 netdev->current |= NETDEV_F_COPPER;
8b61709d 1692 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 1693 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
1694 }
1695
1696 if (ecmd.autoneg) {
b5d57fc8 1697 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
1698 }
1699
51f87458 1700out:
b5d57fc8
BP
1701 netdev->cache_valid |= VALID_FEATURES;
1702 netdev->get_features_error = error;
51f87458
PS
1703}
1704
887ed8b2
BP
1705/* Stores the features supported by 'netdev' into of '*current', '*advertised',
1706 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1707 * Returns 0 if successful, otherwise a positive errno value. */
51f87458
PS
1708static int
1709netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
1710 enum netdev_features *current,
1711 enum netdev_features *advertised,
1712 enum netdev_features *supported,
1713 enum netdev_features *peer)
51f87458 1714{
b5d57fc8 1715 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1716 int error;
51f87458 1717
86383816 1718 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1719 netdev_linux_read_features(netdev);
b5d57fc8
BP
1720 if (!netdev->get_features_error) {
1721 *current = netdev->current;
1722 *advertised = netdev->advertised;
1723 *supported = netdev->supported;
887ed8b2 1724 *peer = 0; /* XXX */
51f87458 1725 }
86383816
BP
1726 error = netdev->get_features_error;
1727 ovs_mutex_unlock(&netdev->mutex);
1728
1729 return error;
8b61709d
BP
1730}
1731
1732/* Set the features advertised by 'netdev' to 'advertise'. */
1733static int
86383816 1734netdev_linux_set_advertisements(struct netdev *netdev_,
6c038611 1735 enum netdev_features advertise)
8b61709d 1736{
86383816 1737 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
1738 struct ethtool_cmd ecmd;
1739 int error;
1740
86383816
BP
1741 ovs_mutex_lock(&netdev->mutex);
1742
ab985a77 1743 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1744 memset(&ecmd, 0, sizeof ecmd);
86383816 1745 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
8b61709d
BP
1746 ETHTOOL_GSET, "ETHTOOL_GSET");
1747 if (error) {
86383816 1748 goto exit;
8b61709d
BP
1749 }
1750
1751 ecmd.advertising = 0;
6c038611 1752 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
1753 ecmd.advertising |= ADVERTISED_10baseT_Half;
1754 }
6c038611 1755 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
1756 ecmd.advertising |= ADVERTISED_10baseT_Full;
1757 }
6c038611 1758 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
1759 ecmd.advertising |= ADVERTISED_100baseT_Half;
1760 }
6c038611 1761 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
1762 ecmd.advertising |= ADVERTISED_100baseT_Full;
1763 }
6c038611 1764 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
1765 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1766 }
6c038611 1767 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
1768 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1769 }
6c038611 1770 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
1771 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1772 }
6c038611 1773 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
1774 ecmd.advertising |= ADVERTISED_TP;
1775 }
6c038611 1776 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
1777 ecmd.advertising |= ADVERTISED_FIBRE;
1778 }
6c038611 1779 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
1780 ecmd.advertising |= ADVERTISED_Autoneg;
1781 }
6c038611 1782 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
1783 ecmd.advertising |= ADVERTISED_Pause;
1784 }
6c038611 1785 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
1786 ecmd.advertising |= ADVERTISED_Asym_Pause;
1787 }
ab985a77 1788 COVERAGE_INC(netdev_set_ethtool);
86383816
BP
1789 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1790 ETHTOOL_SSET, "ETHTOOL_SSET");
1791
1792exit:
1793 ovs_mutex_unlock(&netdev->mutex);
1794 return error;
8b61709d
BP
1795}
1796
f8500004
JP
1797/* Attempts to set input rate limiting (policing) policy. Returns 0 if
1798 * successful, otherwise a positive errno value. */
8b61709d 1799static int
b5d57fc8 1800netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
1801 uint32_t kbits_rate, uint32_t kbits_burst)
1802{
b5d57fc8
BP
1803 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1804 const char *netdev_name = netdev_get_name(netdev_);
f8500004 1805 int error;
8b61709d 1806
80a86fbe
BP
1807 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1808 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1809 : kbits_burst); /* Stick with user-specified value. */
1810
86383816 1811 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1812 if (netdev->cache_valid & VALID_POLICING) {
86383816
BP
1813 error = netdev->netdev_policing_error;
1814 if (error || (netdev->kbits_rate == kbits_rate &&
1815 netdev->kbits_burst == kbits_burst)) {
c9f71668 1816 /* Assume that settings haven't changed since we last set them. */
86383816 1817 goto out;
c9f71668 1818 }
b5d57fc8 1819 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
1820 }
1821
ac8c3412 1822 COVERAGE_INC(netdev_set_policing);
f8500004 1823 /* Remove any existing ingress qdisc. */
b5d57fc8 1824 error = tc_add_del_ingress_qdisc(netdev_, false);
f8500004
JP
1825 if (error) {
1826 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 1827 netdev_name, ovs_strerror(error));
c9f71668 1828 goto out;
f8500004
JP
1829 }
1830
8b61709d 1831 if (kbits_rate) {
b5d57fc8 1832 error = tc_add_del_ingress_qdisc(netdev_, true);
f8500004
JP
1833 if (error) {
1834 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 1835 netdev_name, ovs_strerror(error));
c9f71668 1836 goto out;
8b61709d
BP
1837 }
1838
b5d57fc8 1839 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
1840 if (error){
1841 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 1842 netdev_name, ovs_strerror(error));
c9f71668 1843 goto out;
8b61709d 1844 }
8b61709d
BP
1845 }
1846
b5d57fc8
BP
1847 netdev->kbits_rate = kbits_rate;
1848 netdev->kbits_burst = kbits_burst;
f8500004 1849
c9f71668
PS
1850out:
1851 if (!error || error == ENODEV) {
b5d57fc8
BP
1852 netdev->netdev_policing_error = error;
1853 netdev->cache_valid |= VALID_POLICING;
c9f71668 1854 }
86383816 1855 ovs_mutex_unlock(&netdev->mutex);
c9f71668 1856 return error;
8b61709d
BP
1857}
1858
c1c9c9c4
BP
1859static int
1860netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 1861 struct sset *types)
c1c9c9c4 1862{
559eb230 1863 const struct tc_ops *const *opsp;
c1c9c9c4
BP
1864
1865 for (opsp = tcs; *opsp != NULL; opsp++) {
1866 const struct tc_ops *ops = *opsp;
1867 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 1868 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
1869 }
1870 }
1871 return 0;
1872}
1873
1874static const struct tc_ops *
1875tc_lookup_ovs_name(const char *name)
1876{
559eb230 1877 const struct tc_ops *const *opsp;
c1c9c9c4
BP
1878
1879 for (opsp = tcs; *opsp != NULL; opsp++) {
1880 const struct tc_ops *ops = *opsp;
1881 if (!strcmp(name, ops->ovs_name)) {
1882 return ops;
1883 }
1884 }
1885 return NULL;
1886}
1887
1888static const struct tc_ops *
1889tc_lookup_linux_name(const char *name)
1890{
559eb230 1891 const struct tc_ops *const *opsp;
c1c9c9c4
BP
1892
1893 for (opsp = tcs; *opsp != NULL; opsp++) {
1894 const struct tc_ops *ops = *opsp;
1895 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1896 return ops;
1897 }
1898 }
1899 return NULL;
1900}
1901
93b13be8 1902static struct tc_queue *
b5d57fc8 1903tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
1904 size_t hash)
1905{
b5d57fc8 1906 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
1907 struct tc_queue *queue;
1908
b5d57fc8 1909 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
1910 if (queue->queue_id == queue_id) {
1911 return queue;
1912 }
1913 }
1914 return NULL;
1915}
1916
/* Returns the queue with ID 'queue_id' in 'netdev''s tc, or NULL if there is
 * none.  The bucket is selected with hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1922
c1c9c9c4
BP
1923static int
1924netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1925 const char *type,
1926 struct netdev_qos_capabilities *caps)
1927{
1928 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1929 if (!ops) {
1930 return EOPNOTSUPP;
1931 }
1932 caps->n_queues = ops->n_queues;
1933 return 0;
1934}
1935
1936static int
b5d57fc8 1937netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 1938 const char **typep, struct smap *details)
c1c9c9c4 1939{
b5d57fc8 1940 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
1941 int error;
1942
86383816 1943 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1944 error = tc_query_qdisc(netdev_);
86383816
BP
1945 if (!error) {
1946 *typep = netdev->tc->ops->ovs_name;
1947 error = (netdev->tc->ops->qdisc_get
1948 ? netdev->tc->ops->qdisc_get(netdev_, details)
1949 : 0);
c1c9c9c4 1950 }
86383816 1951 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 1952
86383816 1953 return error;
c1c9c9c4
BP
1954}
1955
1956static int
b5d57fc8 1957netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 1958 const char *type, const struct smap *details)
c1c9c9c4 1959{
b5d57fc8 1960 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
1961 const struct tc_ops *new_ops;
1962 int error;
1963
1964 new_ops = tc_lookup_ovs_name(type);
1965 if (!new_ops || !new_ops->tc_install) {
1966 return EOPNOTSUPP;
1967 }
1968
86383816 1969 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1970 error = tc_query_qdisc(netdev_);
c1c9c9c4 1971 if (error) {
86383816 1972 goto exit;
c1c9c9c4
BP
1973 }
1974
b5d57fc8 1975 if (new_ops == netdev->tc->ops) {
86383816 1976 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
1977 } else {
1978 /* Delete existing qdisc. */
b5d57fc8 1979 error = tc_del_qdisc(netdev_);
c1c9c9c4 1980 if (error) {
86383816 1981 goto exit;
c1c9c9c4 1982 }
b5d57fc8 1983 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
1984
1985 /* Install new qdisc. */
b5d57fc8
BP
1986 error = new_ops->tc_install(netdev_, details);
1987 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4 1988 }
86383816
BP
1989
1990exit:
1991 ovs_mutex_unlock(&netdev->mutex);
1992 return error;
c1c9c9c4
BP
1993}
1994
1995static int
b5d57fc8 1996netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 1997 unsigned int queue_id, struct smap *details)
c1c9c9c4 1998{
b5d57fc8 1999 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2000 int error;
2001
86383816 2002 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2003 error = tc_query_qdisc(netdev_);
86383816 2004 if (!error) {
b5d57fc8 2005 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
86383816 2006 error = (queue
b5d57fc8 2007 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 2008 : ENOENT);
c1c9c9c4 2009 }
86383816
BP
2010 ovs_mutex_unlock(&netdev->mutex);
2011
2012 return error;
c1c9c9c4
BP
2013}
2014
2015static int
b5d57fc8 2016netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 2017 unsigned int queue_id, const struct smap *details)
c1c9c9c4 2018{
b5d57fc8 2019 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2020 int error;
2021
86383816 2022 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2023 error = tc_query_qdisc(netdev_);
86383816
BP
2024 if (!error) {
2025 error = (queue_id < netdev->tc->ops->n_queues
2026 && netdev->tc->ops->class_set
2027 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2028 : EINVAL);
c1c9c9c4 2029 }
86383816 2030 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2031
86383816 2032 return error;
c1c9c9c4
BP
2033}
2034
2035static int
b5d57fc8 2036netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 2037{
b5d57fc8 2038 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2039 int error;
2040
86383816 2041 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2042 error = tc_query_qdisc(netdev_);
86383816
BP
2043 if (!error) {
2044 if (netdev->tc->ops->class_delete) {
2045 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2046 error = (queue
2047 ? netdev->tc->ops->class_delete(netdev_, queue)
2048 : ENOENT);
2049 } else {
2050 error = EINVAL;
2051 }
c1c9c9c4 2052 }
86383816
BP
2053 ovs_mutex_unlock(&netdev->mutex);
2054
2055 return error;
c1c9c9c4
BP
2056}
2057
2058static int
b5d57fc8 2059netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2060 unsigned int queue_id,
2061 struct netdev_queue_stats *stats)
2062{
b5d57fc8 2063 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2064 int error;
2065
86383816 2066 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2067 error = tc_query_qdisc(netdev_);
86383816
BP
2068 if (!error) {
2069 if (netdev->tc->ops->class_get_stats) {
2070 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2071 if (queue) {
2072 stats->created = queue->created;
2073 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2074 stats);
2075 } else {
2076 error = ENOENT;
2077 }
2078 } else {
2079 error = EOPNOTSUPP;
6dc34a0d 2080 }
c1c9c9c4 2081 }
86383816
BP
2082 ovs_mutex_unlock(&netdev->mutex);
2083
2084 return error;
c1c9c9c4
BP
2085}
2086
23a98ffe 2087static bool
c1c9c9c4
BP
2088start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2089{
2090 struct ofpbuf request;
2091 struct tcmsg *tcmsg;
2092
2093 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2094 if (!tcmsg) {
2095 return false;
2096 }
3c4de644 2097 tcmsg->tcm_parent = 0;
a88b4e04 2098 nl_dump_start(dump, NETLINK_ROUTE, &request);
c1c9c9c4 2099 ofpbuf_uninit(&request);
23a98ffe 2100 return true;
c1c9c9c4
BP
2101}
2102
2103static int
b5d57fc8 2104netdev_linux_dump_queues(const struct netdev *netdev_,
c1c9c9c4
BP
2105 netdev_dump_queues_cb *cb, void *aux)
2106{
b5d57fc8 2107 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2108 int error;
2109
86383816 2110 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2111 error = tc_query_qdisc(netdev_);
86383816
BP
2112 if (!error) {
2113 if (netdev->tc->ops->class_get) {
2114 struct tc_queue *queue, *next_queue;
2115 struct smap details;
c1c9c9c4 2116
86383816
BP
2117 smap_init(&details);
2118 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2119 &netdev->tc->queues) {
2120 int retval;
c1c9c9c4 2121
86383816
BP
2122 smap_clear(&details);
2123
2124 retval = netdev->tc->ops->class_get(netdev_, queue, &details);
2125 if (!retval) {
2126 (*cb)(queue->queue_id, &details, aux);
2127 } else {
2128 error = retval;
2129 }
2130 }
2131 smap_destroy(&details);
c1c9c9c4 2132 } else {
86383816 2133 error = EOPNOTSUPP;
c1c9c9c4
BP
2134 }
2135 }
86383816 2136 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2137
86383816 2138 return error;
c1c9c9c4
BP
2139}
2140
2141static int
b5d57fc8 2142netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2143 netdev_dump_queue_stats_cb *cb, void *aux)
2144{
b5d57fc8 2145 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2146 int error;
2147
86383816 2148 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2149 error = tc_query_qdisc(netdev_);
86383816
BP
2150 if (!error) {
2151 struct nl_dump dump;
c1c9c9c4 2152
86383816
BP
2153 if (!netdev->tc->ops->class_dump_stats) {
2154 error = EOPNOTSUPP;
2155 } else if (!start_queue_dump(netdev_, &dump)) {
2156 error = ENODEV;
2157 } else {
2158 struct ofpbuf msg;
2159 int retval;
2160
2161 while (nl_dump_next(&dump, &msg)) {
2162 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2163 cb, aux);
2164 if (retval) {
2165 error = retval;
2166 }
2167 }
2168
2169 retval = nl_dump_done(&dump);
2170 if (retval) {
2171 error = retval;
2172 }
c1c9c9c4
BP
2173 }
2174 }
86383816 2175 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2176
86383816 2177 return error;
c1c9c9c4
BP
2178}
2179
8b61709d 2180static int
f1acd62b
BP
2181netdev_linux_get_in4(const struct netdev *netdev_,
2182 struct in_addr *address, struct in_addr *netmask)
8b61709d 2183{
b5d57fc8 2184 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2185 int error;
149f577a 2186
86383816 2187 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2188 if (!(netdev->cache_valid & VALID_IN4)) {
b5d57fc8 2189 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
8b61709d 2190 SIOCGIFADDR, "SIOCGIFADDR");
86383816
BP
2191 if (!error) {
2192 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2193 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2194 if (!error) {
2195 netdev->cache_valid |= VALID_IN4;
2196 }
8b61709d 2197 }
86383816
BP
2198 } else {
2199 error = 0;
2200 }
8b61709d 2201
86383816
BP
2202 if (!error) {
2203 if (netdev->address.s_addr != INADDR_ANY) {
2204 *address = netdev->address;
2205 *netmask = netdev->netmask;
2206 } else {
2207 error = EADDRNOTAVAIL;
f1acd62b 2208 }
8b61709d 2209 }
86383816
BP
2210 ovs_mutex_unlock(&netdev->mutex);
2211
2212 return error;
8b61709d
BP
2213}
2214
8b61709d 2215static int
f1acd62b
BP
2216netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2217 struct in_addr netmask)
8b61709d 2218{
b5d57fc8 2219 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2220 int error;
2221
86383816 2222 ovs_mutex_lock(&netdev->mutex);
f1acd62b 2223 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2224 if (!error) {
b5d57fc8
BP
2225 netdev->cache_valid |= VALID_IN4;
2226 netdev->address = address;
2227 netdev->netmask = netmask;
f1acd62b 2228 if (address.s_addr != INADDR_ANY) {
8b61709d 2229 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2230 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2231 }
2232 }
86383816
BP
2233 ovs_mutex_unlock(&netdev->mutex);
2234
8b61709d
BP
2235 return error;
2236}
2237
/* Parses 'line' as 32 hex digits (16 address bytes), four hex fields that
 * are skipped, and an interface name of at most 16 characters.  Stores the
 * address bytes in 'in6' and the name in 'ifname'.  Returns true only if all
 * 17 stored fields were converted. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
#define X8 "%2"SCNx8
    uint8_t *bytes = in6->s6_addr;
    int n_converted;

    n_converted = sscanf(line,
                         " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         &bytes[0], &bytes[1], &bytes[2], &bytes[3],
                         &bytes[4], &bytes[5], &bytes[6], &bytes[7],
                         &bytes[8], &bytes[9], &bytes[10], &bytes[11],
                         &bytes[12], &bytes[13], &bytes[14], &bytes[15],
                         ifname);
    return n_converted == 17;
}
2253
2254/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2255 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2256static int
2257netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2258{
b5d57fc8 2259 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
2260
2261 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2262 if (!(netdev->cache_valid & VALID_IN6)) {
8b61709d
BP
2263 FILE *file;
2264 char line[128];
2265
b5d57fc8 2266 netdev->in6 = in6addr_any;
8b61709d
BP
2267
2268 file = fopen("/proc/net/if_inet6", "r");
2269 if (file != NULL) {
2270 const char *name = netdev_get_name(netdev_);
2271 while (fgets(line, sizeof line, file)) {
2a022368 2272 struct in6_addr in6_tmp;
8b61709d 2273 char ifname[16 + 1];
2a022368 2274 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
2275 && !strcmp(name, ifname))
2276 {
b5d57fc8 2277 netdev->in6 = in6_tmp;
8b61709d
BP
2278 break;
2279 }
2280 }
2281 fclose(file);
2282 }
b5d57fc8 2283 netdev->cache_valid |= VALID_IN6;
8b61709d 2284 }
b5d57fc8 2285 *in6 = netdev->in6;
86383816
BP
2286 ovs_mutex_unlock(&netdev->mutex);
2287
8b61709d
BP
2288 return 0;
2289}
2290
/* Fills '*sa' with an AF_INET socket address holding 'addr' and port 0.
 * '*sa' is zeroed first and then overwritten with 'sizeof(struct
 * sockaddr_in)' bytes. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2303
2304static int
2305do_set_addr(struct netdev *netdev,
2306 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2307{
2308 struct ifreq ifr;
149f577a 2309
259e0b1a
BP
2310 make_in4_sockaddr(&ifr.ifr_addr, addr);
2311 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2312 ioctl_name);
8b61709d
BP
2313}
2314
2315/* Adds 'router' as a default IP gateway. */
2316static int
67a4917b 2317netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2318{
2319 struct in_addr any = { INADDR_ANY };
2320 struct rtentry rt;
2321 int error;
2322
2323 memset(&rt, 0, sizeof rt);
2324 make_in4_sockaddr(&rt.rt_dst, any);
2325 make_in4_sockaddr(&rt.rt_gateway, router);
2326 make_in4_sockaddr(&rt.rt_genmask, any);
2327 rt.rt_flags = RTF_UP | RTF_GATEWAY;
259e0b1a 2328 error = af_inet_ioctl(SIOCADDRT, &rt);
8b61709d 2329 if (error) {
10a89ef0 2330 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
2331 }
2332 return error;
2333}
2334
f1acd62b
BP
2335static int
2336netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2337 char **netdev_name)
2338{
2339 static const char fn[] = "/proc/net/route";
2340 FILE *stream;
2341 char line[256];
2342 int ln;
2343
2344 *netdev_name = NULL;
2345 stream = fopen(fn, "r");
2346 if (stream == NULL) {
10a89ef0 2347 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
2348 return errno;
2349 }
2350
2351 ln = 0;
2352 while (fgets(line, sizeof line, stream)) {
2353 if (++ln >= 2) {
2354 char iface[17];
dbba996b 2355 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2356 int refcnt, metric, mtu;
2357 unsigned int flags, use, window, irtt;
2358
2359 if (sscanf(line,
2360 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2361 " %d %u %u\n",
2362 iface, &dest, &gateway, &flags, &refcnt,
2363 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2364
d295e8e9 2365 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2366 fn, ln, line);
2367 continue;
2368 }
2369 if (!(flags & RTF_UP)) {
2370 /* Skip routes that aren't up. */
2371 continue;
2372 }
2373
2374 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2375 * network byte order, so we don't need need any endian
f1acd62b
BP
2376 * conversions here. */
2377 if ((dest & mask) == (host->s_addr & mask)) {
2378 if (!gateway) {
2379 /* The host is directly reachable. */
2380 next_hop->s_addr = 0;
2381 } else {
2382 /* To reach the host, we must go through a gateway. */
2383 next_hop->s_addr = gateway;
2384 }
2385 *netdev_name = xstrdup(iface);
2386 fclose(stream);
2387 return 0;
2388 }
2389 }
2390 }
2391
2392 fclose(stream);
2393 return ENXIO;
2394}
2395
e210037e 2396static int
b5d57fc8 2397netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 2398{
b5d57fc8 2399 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
2400 int error = 0;
2401
86383816 2402 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
2403 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2404 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
2405
2406 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
2407 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2408 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
2409 cmd,
2410 ETHTOOL_GDRVINFO,
2411 "ETHTOOL_GDRVINFO");
2412 if (!error) {
b5d57fc8 2413 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
2414 }
2415 }
e210037e 2416
e210037e 2417 if (!error) {
b5d57fc8
BP
2418 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2419 smap_add(smap, "driver_version", netdev->drvinfo.version);
2420 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 2421 }
86383816
BP
2422 ovs_mutex_unlock(&netdev->mutex);
2423
e210037e
AE
2424 return error;
2425}
2426
4f925bd3 2427static int
275707c3
EJ
2428netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2429 struct smap *smap)
4f925bd3 2430{
79f1cbe9 2431 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2432 return 0;
2433}
2434
8b61709d
BP
2435/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2436 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2437 * returns 0. Otherwise, it returns a positive errno value; in particular,
2438 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2439static int
2440netdev_linux_arp_lookup(const struct netdev *netdev,
dbba996b 2441 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
8b61709d
BP
2442{
2443 struct arpreq r;
c100e025 2444 struct sockaddr_in sin;
8b61709d
BP
2445 int retval;
2446
2447 memset(&r, 0, sizeof r);
f2cc621b 2448 memset(&sin, 0, sizeof sin);
c100e025
BP
2449 sin.sin_family = AF_INET;
2450 sin.sin_addr.s_addr = ip;
2451 sin.sin_port = 0;
2452 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2453 r.arp_ha.sa_family = ARPHRD_ETHER;
2454 r.arp_flags = 0;
71d7c22f 2455 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d 2456 COVERAGE_INC(netdev_arp_lookup);
259e0b1a 2457 retval = af_inet_ioctl(SIOCGARP, &r);
8b61709d
BP
2458 if (!retval) {
2459 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2460 } else if (retval != ENXIO) {
2461 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
2462 netdev_get_name(netdev), IP_ARGS(ip),
2463 ovs_strerror(retval));
8b61709d
BP
2464 }
2465 return retval;
2466}
2467
2468static int
2469nd_to_iff_flags(enum netdev_flags nd)
2470{
2471 int iff = 0;
2472 if (nd & NETDEV_UP) {
2473 iff |= IFF_UP;
2474 }
2475 if (nd & NETDEV_PROMISC) {
2476 iff |= IFF_PROMISC;
2477 }
2478 return iff;
2479}
2480
2481static int
2482iff_to_nd_flags(int iff)
2483{
2484 enum netdev_flags nd = 0;
2485 if (iff & IFF_UP) {
2486 nd |= NETDEV_UP;
2487 }
2488 if (iff & IFF_PROMISC) {
2489 nd |= NETDEV_PROMISC;
2490 }
2491 return nd;
2492}
2493
2494static int
4f9f3f21
BP
2495update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2496 enum netdev_flags on, enum netdev_flags *old_flagsp)
2497 OVS_REQUIRES(netdev->mutex)
8b61709d
BP
2498{
2499 int old_flags, new_flags;
c37d4da4
EJ
2500 int error = 0;
2501
b5d57fc8 2502 old_flags = netdev->ifi_flags;
c37d4da4
EJ
2503 *old_flagsp = iff_to_nd_flags(old_flags);
2504 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2505 if (new_flags != old_flags) {
4f9f3f21
BP
2506 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2507 get_flags(&netdev->up, &netdev->ifi_flags);
8b61709d 2508 }
4f9f3f21
BP
2509
2510 return error;
2511}
2512
2513static int
2514netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2515 enum netdev_flags on, enum netdev_flags *old_flagsp)
2516{
2517 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2518 int error;
2519
2520 ovs_mutex_lock(&netdev->mutex);
2521 error = update_flags(netdev, off, on, old_flagsp);
86383816
BP
2522 ovs_mutex_unlock(&netdev->mutex);
2523
8b61709d
BP
2524 return error;
2525}
2526
ac4d3bcb 2527static unsigned int
86383816 2528netdev_linux_change_seq(const struct netdev *netdev_)
ac4d3bcb 2529{
86383816
BP
2530 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2531 unsigned int change_seq;
2532
2533 ovs_mutex_lock(&netdev->mutex);
2534 change_seq = netdev->change_seq;
2535 ovs_mutex_unlock(&netdev->mutex);
2536
2537 return change_seq;
ac4d3bcb
EJ
2538}
2539
/* Expands to a positional initializer for a 'struct netdev_class'.
 *
 * Only NAME, CONSTRUCT, GET_STATS, SET_STATS, GET_FEATURES and GET_STATUS
 * vary between the classes built from this macro; every other slot is either
 * a shared netdev_linux_* function or NULL.  The initializer is positional,
 * so the order of entries must not be changed. */
9dc63482 2540#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
51f87458 2541 GET_FEATURES, GET_STATUS) \
c3827f61
BP
2542{ \
2543 NAME, \
2544 \
259e0b1a 2545 NULL, \
c3827f61
BP
2546 netdev_linux_run, \
2547 netdev_linux_wait, \
2548 \
9dc63482
BP
2549 netdev_linux_alloc, \
2550 CONSTRUCT, \
2551 netdev_linux_destruct, \
2552 netdev_linux_dealloc, \
de5cdb90 2553 NULL, /* get_config */ \
6d9e6eb4 2554 NULL, /* set_config */ \
f431bf7d 2555 NULL, /* get_tunnel_config */ \
c3827f61 2556 \
c3827f61
BP
2557 netdev_linux_send, \
2558 netdev_linux_send_wait, \
2559 \
2560 netdev_linux_set_etheraddr, \
2561 netdev_linux_get_etheraddr, \
2562 netdev_linux_get_mtu, \
9b020780 2563 netdev_linux_set_mtu, \
c3827f61
BP
2564 netdev_linux_get_ifindex, \
2565 netdev_linux_get_carrier, \
65c3058c 2566 netdev_linux_get_carrier_resets, \
1670c579 2567 netdev_linux_set_miimon_interval, \
f613a0d7 2568 GET_STATS, \
c3827f61
BP
2569 SET_STATS, \
2570 \
51f87458 2571 GET_FEATURES, \
c3827f61 2572 netdev_linux_set_advertisements, \
c3827f61
BP
2573 \
2574 netdev_linux_set_policing, \
2575 netdev_linux_get_qos_types, \
2576 netdev_linux_get_qos_capabilities, \
2577 netdev_linux_get_qos, \
2578 netdev_linux_set_qos, \
2579 netdev_linux_get_queue, \
2580 netdev_linux_set_queue, \
2581 netdev_linux_delete_queue, \
2582 netdev_linux_get_queue_stats, \
2583 netdev_linux_dump_queues, \
2584 netdev_linux_dump_queue_stats, \
2585 \
2586 netdev_linux_get_in4, \
2587 netdev_linux_set_in4, \
2588 netdev_linux_get_in6, \
2589 netdev_linux_add_router, \
2590 netdev_linux_get_next_hop, \
4f925bd3 2591 GET_STATUS, \
c3827f61
BP
2592 netdev_linux_arp_lookup, \
2593 \
2594 netdev_linux_update_flags, \
2595 \
9dc63482
BP
2596 netdev_linux_change_seq, \
2597 \
2598 netdev_linux_rx_alloc, \
2599 netdev_linux_rx_construct, \
2600 netdev_linux_rx_destruct, \
2601 netdev_linux_rx_dealloc, \
2602 netdev_linux_rx_recv, \
2603 netdev_linux_rx_wait, \
2604 netdev_linux_rx_drain, \
c3827f61
BP
2605}
2606
/* The "system" netdev class: constructed with netdev_linux_construct(), with
 * no set_stats implementation. */
2607const struct netdev_class netdev_linux_class =
2608 NETDEV_LINUX_CLASS(
2609 "system",
9dc63482 2610 netdev_linux_construct,
f613a0d7 2611 netdev_linux_get_stats,
4f925bd3 2612 NULL, /* set_stats */
51f87458 2613 netdev_linux_get_features,
275707c3 2614 netdev_linux_get_status);
c3827f61
BP
2615
/* The "tap" netdev class: differs from "system" in its constructor
 * (netdev_linux_construct_tap) and statistics getter
 * (netdev_tap_get_stats). */
2616const struct netdev_class netdev_tap_class =
2617 NETDEV_LINUX_CLASS(
2618 "tap",
9dc63482 2619 netdev_linux_construct_tap,
bba1e6f3 2620 netdev_tap_get_stats,
4f925bd3 2621 NULL, /* set_stats */
51f87458 2622 netdev_linux_get_features,
275707c3 2623 netdev_linux_get_status);
c3827f61
BP
2624
/* The "internal" netdev class: has its own statistics getter and setter and
 * status callback, and provides no get_features implementation. */
2625const struct netdev_class netdev_internal_class =
2626 NETDEV_LINUX_CLASS(
2627 "internal",
9dc63482 2628 netdev_linux_construct,
bba1e6f3 2629 netdev_internal_get_stats,
2f31a822 2630 netdev_internal_set_stats,
51f87458 2631 NULL, /* get_features */
275707c3 2632 netdev_internal_get_status);
8b61709d 2633\f
c1c9c9c4 2634/* HTB traffic control class. */
559843ed 2635
/* Largest HTB class minor number accepted by htb_parse_tcmsg__(), which maps
 * minor numbers 1...HTB_N_QUEUES to queue IDs 0...HTB_N_QUEUES - 1. */
c1c9c9c4 2636#define HTB_N_QUEUES 0xf000
8b61709d 2637
c1c9c9c4
BP
/* State for an HTB qdisc.  'tc' is the embedded generic traffic-control
 * state; htb_get__() recovers the enclosing 'struct htb' from it with
 * CONTAINER_OF. */
2638struct htb {
2639 struct tc tc;
2640 unsigned int max_rate; /* In bytes/s. */
2641};
8b61709d 2642
/* Parameters of one HTB class.  'tc_queue' is the embedded generic queue
 * state; the remaining members are the values that htb_setup_class__()
 * passes to the kernel as rate, ceil, burst and prio. */
c1c9c9c4 2643struct htb_class {
93b13be8 2644 struct tc_queue tc_queue;
c1c9c9c4
BP
2645 unsigned int min_rate; /* In bytes/s. */
2646 unsigned int max_rate; /* In bytes/s. */
2647 unsigned int burst; /* In bytes. */
2648 unsigned int priority; /* Lower values are higher priorities. */
2649};
8b61709d 2650
c1c9c9c4 2651static struct htb *
b5d57fc8 2652htb_get__(const struct netdev *netdev_)
c1c9c9c4 2653{
b5d57fc8
BP
2654 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2655 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
2656}
2657
24045e35 2658static void
b5d57fc8 2659htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 2660{
b5d57fc8 2661 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2662 struct htb *htb;
2663
2664 htb = xmalloc(sizeof *htb);
2665 tc_init(&htb->tc, &tc_ops_htb);
2666 htb->max_rate = max_rate;
2667
b5d57fc8 2668 netdev->tc = &htb->tc;
c1c9c9c4
BP
2669}
2670
2671/* Create an HTB qdisc.
2672 *
a339aa81 2673 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
2674static int
2675htb_setup_qdisc__(struct netdev *netdev)
2676{
2677 size_t opt_offset;
2678 struct tc_htb_glob opt;
2679 struct ofpbuf request;
2680 struct tcmsg *tcmsg;
2681
2682 tc_del_qdisc(netdev);
2683
2684 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2685 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
2686 if (!tcmsg) {
2687 return ENODEV;
2688 }
c1c9c9c4
BP
2689 tcmsg->tcm_handle = tc_make_handle(1, 0);
2690 tcmsg->tcm_parent = TC_H_ROOT;
2691
2692 nl_msg_put_string(&request, TCA_KIND, "htb");
2693
2694 memset(&opt, 0, sizeof opt);
2695 opt.rate2quantum = 10;
2696 opt.version = 3;
4ecf12d5 2697 opt.defcls = 1;
c1c9c9c4
BP
2698
2699 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2700 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2701 nl_msg_end_nested(&request, opt_offset);
2702
2703 return tc_transact(&request, NULL);
2704}
2705
2706/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2707 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2708static int
2709htb_setup_class__(struct netdev *netdev, unsigned int handle,
2710 unsigned int parent, struct htb_class *class)
2711{
2712 size_t opt_offset;
2713 struct tc_htb_opt opt;
2714 struct ofpbuf request;
2715 struct tcmsg *tcmsg;
2716 int error;
2717 int mtu;
2718
73371c09 2719 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 2720 if (error) {
f915f1a8
BP
2721 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2722 netdev_get_name(netdev));
9b020780 2723 return error;
f915f1a8 2724 }
c1c9c9c4
BP
2725
2726 memset(&opt, 0, sizeof opt);
2727 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2728 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2729 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2730 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2731 opt.prio = class->priority;
2732
2733 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
2734 if (!tcmsg) {
2735 return ENODEV;
2736 }
c1c9c9c4
BP
2737 tcmsg->tcm_handle = handle;
2738 tcmsg->tcm_parent = parent;
2739
2740 nl_msg_put_string(&request, TCA_KIND, "htb");
2741 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2742 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2743 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2744 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2745 nl_msg_end_nested(&request, opt_offset);
2746
2747 error = tc_transact(&request, NULL);
2748 if (error) {
2749 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2750 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2751 netdev_get_name(netdev),
2752 tc_get_major(handle), tc_get_minor(handle),
2753 tc_get_major(parent), tc_get_minor(parent),
2754 class->min_rate, class->max_rate,
10a89ef0 2755 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
2756 }
2757 return error;
2758}
2759
2760/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2761 * description of them into 'details'. The description complies with the
2762 * specification given in the vswitch database documentation for linux-htb
2763 * queue details. */
2764static int
2765htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2766{
2767 static const struct nl_policy tca_htb_policy[] = {
2768 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2769 .min_len = sizeof(struct tc_htb_opt) },
2770 };
2771
2772 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2773 const struct tc_htb_opt *htb;
2774
2775 if (!nl_parse_nested(nl_options, tca_htb_policy,
2776 attrs, ARRAY_SIZE(tca_htb_policy))) {
2777 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2778 return EPROTO;
2779 }
2780
2781 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2782 class->min_rate = htb->rate.rate;
2783 class->max_rate = htb->ceil.rate;
2784 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2785 class->priority = htb->prio;
2786 return 0;
2787}
2788
2789static int
2790htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2791 struct htb_class *options,
2792 struct netdev_queue_stats *stats)
2793{
2794 struct nlattr *nl_options;
2795 unsigned int handle;
2796 int error;
2797
2798 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2799 if (!error && queue_id) {
17ee3c1f
BP
2800 unsigned int major = tc_get_major(handle);
2801 unsigned int minor = tc_get_minor(handle);
2802 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2803 *queue_id = minor - 1;
c1c9c9c4
BP
2804 } else {
2805 error = EPROTO;
2806 }
2807 }
2808 if (!error && options) {
2809 error = htb_parse_tca_options__(nl_options, options);
2810 }
2811 return error;
2812}
2813
2814static void
73371c09 2815htb_parse_qdisc_details__(struct netdev *netdev_,
79f1cbe9 2816 const struct smap *details, struct htb_class *hc)
c1c9c9c4 2817{
73371c09 2818 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2819 const char *max_rate_s;
2820
79f1cbe9 2821 max_rate_s = smap_get(details, "max-rate");
c1c9c9c4
BP
2822 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2823 if (!hc->max_rate) {
a00ca915 2824 enum netdev_features current;
c1c9c9c4 2825
73371c09
BP
2826 netdev_linux_read_features(netdev);
2827 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 2828 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
2829 }
2830 hc->min_rate = hc->max_rate;
2831 hc->burst = 0;
2832 hc->priority = 0;
2833}
2834
2835static int
2836htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 2837 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
2838{
2839 const struct htb *htb = htb_get__(netdev);
79f1cbe9
EJ
2840 const char *min_rate_s = smap_get(details, "min-rate");
2841 const char *max_rate_s = smap_get(details, "max-rate");
2842 const char *burst_s = smap_get(details, "burst");
2843 const char *priority_s = smap_get(details, "priority");
9b020780 2844 int mtu, error;
c1c9c9c4 2845
73371c09 2846 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 2847 if (error) {
f915f1a8
BP
2848 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2849 netdev_get_name(netdev));
9b020780 2850 return error;
f915f1a8
BP
2851 }
2852
4f104611
EJ
2853 /* HTB requires at least an mtu sized min-rate to send any traffic even
2854 * on uncongested links. */
c45ab5e9 2855 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4f104611 2856 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
2857 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2858
2859 /* max-rate */
2860 hc->max_rate = (max_rate_s
2861 ? strtoull(max_rate_s, NULL, 10) / 8
2862 : htb->max_rate);
2863 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2864 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2865
2866 /* burst
2867 *
2868 * According to hints in the documentation that I've read, it is important
2869 * that 'burst' be at least as big as the largest frame that might be
2870 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2871 * but having it a bit too small is a problem. Since netdev_get_mtu()
2872 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2873 * the MTU. We actually add 64, instead of 14, as a guard against
2874 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
2875 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2876 hc->burst = MAX(hc->burst, mtu + 64);
2877
2878 /* priority */
2879 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2880
2881 return 0;
2882}
2883
/* Queries the kernel for class 'handle' (child of 'parent') on 'netdev' and
 * parses the reply into 'options' and 'stats' via htb_parse_tcmsg__().  The
 * reply buffer is deleted before returning.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2899
2900static int
79f1cbe9 2901htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
2902{
2903 int error;
2904
2905 error = htb_setup_qdisc__(netdev);
2906 if (!error) {
2907 struct htb_class hc;
2908
2909 htb_parse_qdisc_details__(netdev, details, &hc);
2910 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2911 tc_make_handle(1, 0), &hc);
2912 if (!error) {
2913 htb_install__(netdev, hc.max_rate);
2914 }
2915 }
2916 return error;
2917}
2918
93b13be8
BP
2919static struct htb_class *
2920htb_class_cast__(const struct tc_queue *queue)
2921{
2922 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2923}
2924
c1c9c9c4
BP
2925static void
2926htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2927 const struct htb_class *hc)
2928{
2929 struct htb *htb = htb_get__(netdev);
93b13be8
BP
2930 size_t hash = hash_int(queue_id, 0);
2931 struct tc_queue *queue;
c1c9c9c4
BP
2932 struct htb_class *hcp;
2933
93b13be8
BP
2934 queue = tc_find_queue__(netdev, queue_id, hash);
2935 if (queue) {
2936 hcp = htb_class_cast__(queue);
2937 } else {
c1c9c9c4 2938 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
2939 queue = &hcp->tc_queue;
2940 queue->queue_id = queue_id;
6dc34a0d 2941 queue->created = time_msec();
93b13be8 2942 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 2943 }
93b13be8
BP
2944
2945 hcp->min_rate = hc->min_rate;
2946 hcp->max_rate = hc->max_rate;
2947 hcp->burst = hc->burst;
2948 hcp->priority = hc->priority;
c1c9c9c4
BP
2949}
2950
2951static int
2952htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2953{
c1c9c9c4
BP
2954 struct ofpbuf msg;
2955 struct nl_dump dump;
2956 struct htb_class hc;
c1c9c9c4
BP
2957
2958 /* Get qdisc options. */
2959 hc.max_rate = 0;
2960 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 2961 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
2962
2963 /* Get queues. */
23a98ffe
BP
2964 if (!start_queue_dump(netdev, &dump)) {
2965 return ENODEV;
2966 }
c1c9c9c4
BP
2967 while (nl_dump_next(&dump, &msg)) {
2968 unsigned int queue_id;
2969
2970 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2971 htb_update_queue__(netdev, queue_id, &hc);
2972 }
2973 }
2974 nl_dump_done(&dump);
2975
2976 return 0;
2977}
2978
2979static void
2980htb_tc_destroy(struct tc *tc)
2981{
2982 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 2983 struct htb_class *hc, *next;
c1c9c9c4 2984
4e8e4213 2985 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 2986 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
2987 free(hc);
2988 }
2989 tc_destroy(tc);
2990 free(htb);
2991}
2992
2993static int
79f1cbe9 2994htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
2995{
2996 const struct htb *htb = htb_get__(netdev);
79f1cbe9 2997 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
2998 return 0;
2999}
3000
3001static int
79f1cbe9 3002htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3003{
3004 struct htb_class hc;
3005 int error;
3006
3007 htb_parse_qdisc_details__(netdev, details, &hc);
3008 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3009 tc_make_handle(1, 0), &hc);
3010 if (!error) {
3011 htb_get__(netdev)->max_rate = hc.max_rate;
3012 }
3013 return error;
3014}
3015
3016static int
93b13be8 3017htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 3018 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 3019{
93b13be8 3020 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3021
79f1cbe9 3022 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 3023 if (hc->min_rate != hc->max_rate) {
79f1cbe9 3024 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 3025 }
79f1cbe9 3026 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 3027 if (hc->priority) {
79f1cbe9 3028 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
3029 }
3030 return 0;
3031}
3032
3033static int
3034htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 3035 const struct smap *details)
c1c9c9c4
BP
3036{
3037 struct htb_class hc;
3038 int error;
3039
3040 error = htb_parse_class_details__(netdev, details, &hc);
3041 if (error) {
3042 return error;
3043 }
3044
17ee3c1f 3045 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
3046 tc_make_handle(1, 0xfffe), &hc);
3047 if (error) {
3048 return error;
3049 }
3050
3051 htb_update_queue__(netdev, queue_id, &hc);
3052 return 0;
3053}
3054
3055static int
93b13be8 3056htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 3057{
93b13be8 3058 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3059 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
3060 int error;
3061
93b13be8 3062 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 3063 if (!error) {
93b13be8 3064 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 3065 free(hc);
c1c9c9c4
BP
3066 }
3067 return error;
3068}
3069
3070static int
93b13be8 3071htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
3072 struct netdev_queue_stats *stats)
3073{
93b13be8 3074 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
3075 tc_make_handle(1, 0xfffe), NULL, stats);
3076}
3077
3078static int
3079htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3080 const struct ofpbuf *nlmsg,
3081 netdev_dump_queue_stats_cb *cb, void *aux)
3082{
3083 struct netdev_queue_stats stats;
17ee3c1f 3084 unsigned int handle, major, minor;
c1c9c9c4
BP
3085 int error;
3086
3087 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3088 if (error) {
3089 return error;
3090 }
3091
17ee3c1f
BP
3092 major = tc_get_major(handle);
3093 minor = tc_get_minor(handle);
3094 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 3095 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
3096 }
3097 return 0;
3098}
3099
3100static const struct tc_ops tc_ops_htb = {
3101 "htb", /* linux_name */
3102 "linux-htb", /* ovs_name */
3103 HTB_N_QUEUES, /* n_queues */
3104 htb_tc_install,
3105 htb_tc_load,
3106 htb_tc_destroy,
3107 htb_qdisc_get,
3108 htb_qdisc_set,
3109 htb_class_get,
3110 htb_class_set,
3111 htb_class_delete,
3112 htb_class_get_stats,
3113 htb_class_dump_stats
3114};
3115\f
a339aa81
EJ
3116/* "linux-hfsc" traffic control class. */
3117
3118#define HFSC_N_QUEUES 0xf000
3119
3120struct hfsc {
3121 struct tc tc;
3122 uint32_t max_rate;
3123};
3124
3125struct hfsc_class {
3126 struct tc_queue tc_queue;
3127 uint32_t min_rate;
3128 uint32_t max_rate;
3129};
3130
3131static struct hfsc *
b5d57fc8 3132hfsc_get__(const struct netdev *netdev_)
a339aa81 3133{
b5d57fc8
BP
3134 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3135 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
3136}
3137
3138static struct hfsc_class *
3139hfsc_class_cast__(const struct tc_queue *queue)
3140{
3141 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3142}
3143
24045e35 3144static void
b5d57fc8 3145hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 3146{
b5d57fc8 3147 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
3148 struct hfsc *hfsc;
3149
a339aa81
EJ
3150 hfsc = xmalloc(sizeof *hfsc);
3151 tc_init(&hfsc->tc, &tc_ops_hfsc);
3152 hfsc->max_rate = max_rate;
b5d57fc8 3153 netdev->tc = &hfsc->tc;
a339aa81
EJ
3154}
3155
3156static void
3157hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3158 const struct hfsc_class *hc)
3159{
3160 size_t hash;
3161 struct hfsc *hfsc;
3162 struct hfsc_class *hcp;
3163 struct tc_queue *queue;
3164
3165 hfsc = hfsc_get__(netdev);
3166 hash = hash_int(queue_id, 0);
3167
3168 queue = tc_find_queue__(netdev, queue_id, hash);
3169 if (queue) {
3170 hcp = hfsc_class_cast__(queue);
3171 } else {
3172 hcp = xmalloc(sizeof *hcp);
3173 queue = &hcp->tc_queue;
3174 queue->queue_id = queue_id;
6dc34a0d 3175 queue->created = time_msec();
a339aa81
EJ
3176 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3177 }
3178
3179 hcp->min_rate = hc->min_rate;
3180 hcp->max_rate = hc->max_rate;
3181}
3182
3183static int
3184hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3185{
3186 const struct tc_service_curve *rsc, *fsc, *usc;
3187 static const struct nl_policy tca_hfsc_policy[] = {
3188 [TCA_HFSC_RSC] = {
3189 .type = NL_A_UNSPEC,
3190 .optional = false,
3191 .min_len = sizeof(struct tc_service_curve),
3192 },
3193 [TCA_HFSC_FSC] = {
3194 .type = NL_A_UNSPEC,
3195 .optional = false,
3196 .min_len = sizeof(struct tc_service_curve),
3197 },
3198 [TCA_HFSC_USC] = {
3199 .type = NL_A_UNSPEC,
3200 .optional = false,
3201 .min_len = sizeof(struct tc_service_curve),
3202 },
3203 };
3204 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3205
3206 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3207 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3208 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3209 return EPROTO;
3210 }
3211
3212 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3213 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3214 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3215
3216 if (rsc->m1 != 0 || rsc->d != 0 ||
3217 fsc->m1 != 0 || fsc->d != 0 ||
3218 usc->m1 != 0 || usc->d != 0) {
3219 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3220 "Non-linear service curves are not supported.");
3221 return EPROTO;
3222 }
3223
3224 if (rsc->m2 != fsc->m2) {
3225 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3226 "Real-time service curves are not supported ");
3227 return EPROTO;
3228 }
3229
3230 if (rsc->m2 > usc->m2) {
3231 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3232 "Min-rate service curve is greater than "
3233 "the max-rate service curve.");
3234 return EPROTO;
3235 }
3236
3237 class->min_rate = fsc->m2;
3238 class->max_rate = usc->m2;
3239 return 0;
3240}
3241
3242static int
3243hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3244 struct hfsc_class *options,
3245 struct netdev_queue_stats *stats)
3246{
3247 int error;
3248 unsigned int handle;
3249 struct nlattr *nl_options;
3250
3251 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3252 if (error) {
3253 return error;
3254 }
3255
3256 if (queue_id) {
3257 unsigned int major, minor;
3258
3259 major = tc_get_major(handle);
3260 minor = tc_get_minor(handle);
3261 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3262 *queue_id = minor - 1;
3263 } else {
3264 return EPROTO;
3265 }
3266 }
3267
3268 if (options) {
3269 error = hfsc_parse_tca_options__(nl_options, options);
3270 }
3271
3272 return error;
3273}
3274
/* Queries the kernel for class 'handle' (child of 'parent') on 'netdev' and
 * parses the reply into 'options' and 'stats' via hfsc_parse_tcmsg__().  The
 * reply buffer is deleted before returning.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3292
3293static void
73371c09 3294hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
a339aa81
EJ
3295 struct hfsc_class *class)
3296{
73371c09 3297 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
3298 uint32_t max_rate;
3299 const char *max_rate_s;
3300
79f1cbe9 3301 max_rate_s = smap_get(details, "max-rate");
a339aa81
EJ
3302 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3303
3304 if (!max_rate) {
a00ca915 3305 enum netdev_features current;
a339aa81 3306
73371c09
BP
3307 netdev_linux_read_features(netdev);
3308 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 3309 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
3310 }
3311
3312 class->min_rate = max_rate;
3313 class->max_rate = max_rate;
3314}
3315
3316static int
3317hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 3318 const struct smap *details,
a339aa81
EJ
3319 struct hfsc_class * class)
3320{
3321 const struct hfsc *hfsc;
3322 uint32_t min_rate, max_rate;
3323 const char *min_rate_s, *max_rate_s;
3324
3325 hfsc = hfsc_get__(netdev);
79f1cbe9
EJ
3326 min_rate_s = smap_get(details, "min-rate");
3327 max_rate_s = smap_get(details, "max-rate");
a339aa81 3328
c45ab5e9 3329 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
79398bad 3330 min_rate = MAX(min_rate, 1);
a339aa81
EJ
3331 min_rate = MIN(min_rate, hfsc->max_rate);
3332
3333 max_rate = (max_rate_s
3334 ? strtoull(max_rate_s, NULL, 10) / 8
3335 : hfsc->max_rate);
3336 max_rate = MAX(max_rate, min_rate);
3337 max_rate = MIN(max_rate, hfsc->max_rate);
3338
3339 class->min_rate = min_rate;
3340 class->max_rate = max_rate;
3341
3342 return 0;
3343}
3344
3345/* Create an HFSC qdisc.
3346 *
3347 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3348static int
3349hfsc_setup_qdisc__(struct netdev * netdev)
3350{
3351 struct tcmsg *tcmsg;
3352 struct ofpbuf request;
3353 struct tc_hfsc_qopt opt;
3354
3355 tc_del_qdisc(netdev);
3356
3357 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3358 NLM_F_EXCL | NLM_F_CREATE, &request);
3359
3360 if (!tcmsg) {
3361 return ENODEV;
3362 }
3363
3364 tcmsg->tcm_handle = tc_make_handle(1, 0);
3365 tcmsg->tcm_parent = TC_H_ROOT;
3366
3367 memset(&opt, 0, sizeof opt);
3368 opt.defcls = 1;
3369
3370 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3371 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3372
3373 return tc_transact(&request, NULL);
3374}
3375
3376/* Create an HFSC class.
3377 *
3378 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3379 * sc rate <min_rate> ul rate <max_rate>" */
3380static int
3381hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3382 unsigned int parent, struct hfsc_class *class)
3383{
3384 int error;
3385 size_t opt_offset;
3386 struct tcmsg *tcmsg;
3387 struct ofpbuf request;
3388 struct tc_service_curve min, max;
3389
3390 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3391
3392 if (!tcmsg) {
3393 return ENODEV;
3394 }
3395
3396 tcmsg->tcm_handle = handle;
3397 tcmsg->tcm_parent = parent;
3398
3399 min.m1 = 0;
3400 min.d = 0;
3401 min.m2 = class->min_rate;
3402
3403 max.m1 = 0;
3404 max.d = 0;
3405 max.m2 = class->max_rate;
3406
3407 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3408 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3409 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3410 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3411 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3412 nl_msg_end_nested(&request, opt_offset);
3413
3414 error = tc_transact(&request, NULL);
3415 if (error) {
3416 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3417 "min-rate %ubps, max-rate %ubps (%s)",
3418 netdev_get_name(netdev),
3419 tc_get_major(handle), tc_get_minor(handle),
3420 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 3421 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
3422 }
3423
3424 return error;
3425}
3426
3427static int
79f1cbe9 3428hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
3429{
3430 int error;
3431 struct hfsc_class class;
3432
3433 error = hfsc_setup_qdisc__(netdev);
3434
3435 if (error) {
3436 return error;
3437 }
3438
3439 hfsc_parse_qdisc_details__(netdev, details, &class);
3440 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3441 tc_make_handle(1, 0), &class);
3442
3443 if (error) {
3444 return error;
3445 }
3446
3447 hfsc_install__(netdev, class.max_rate);
3448 return 0;
3449}
3450
3451static int
3452hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3453{
3454 struct ofpbuf msg;
a339aa81
EJ
3455 struct nl_dump dump;
3456 struct hfsc_class hc;
3457
3458 hc.max_rate = 0;
3459 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3460 hfsc_install__(netdev, hc.max_rate);
a339aa81
EJ
3461
3462 if (!start_queue_dump(netdev, &dump)) {
3463 return ENODEV;
3464 }
3465
3466 while (nl_dump_next(&dump, &msg)) {
3467 unsigned int queue_id;
3468
3469 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3470 hfsc_update_queue__(netdev, queue_id, &hc);
3471 }
3472 }
3473
3474 nl_dump_done(&dump);
3475 return 0;
3476}
3477
3478static void
3479hfsc_tc_destroy(struct tc *tc)
3480{
3481 struct hfsc *hfsc;
3482 struct hfsc_class *hc, *next;
3483
3484 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3485
3486 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3487 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3488 free(hc);
3489 }
3490
3491 tc_destroy(tc);
3492 free(hfsc);
3493}
3494
3495static int
79f1cbe9 3496hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
3497{
3498 const struct hfsc *hfsc;
3499 hfsc = hfsc_get__(netdev);
79f1cbe9 3500 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
3501 return 0;
3502}
3503
3504static int
79f1cbe9 3505hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
3506{
3507 int error;
3508 struct hfsc_class class;
3509
3510 hfsc_parse_qdisc_details__(netdev, details, &class);
3511 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3512 tc_make_handle(1, 0), &class);
3513
3514 if (!error) {
3515 hfsc_get__(netdev)->max_rate = class.max_rate;
3516 }
3517
3518 return error;
3519}
3520
3521static int
3522hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 3523 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
3524{
3525 const struct hfsc_class *hc;
3526
3527 hc = hfsc_class_cast__(queue);
79f1cbe9 3528 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 3529 if (hc->min_rate != hc->max_rate) {
79f1cbe9 3530 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
3531 }
3532 return 0;
3533}
3534
3535static int
3536hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 3537 const struct smap *details)
a339aa81
EJ
3538{
3539 int error;
3540 struct hfsc_class class;
3541
3542 error = hfsc_parse_class_details__(netdev, details, &class);
3543 if (error) {
3544 return error;
3545 }
3546
3547 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3548 tc_make_handle(1, 0xfffe), &class);
3549 if (error) {
3550 return error;
3551 }
3552
3553 hfsc_update_queue__(netdev, queue_id, &class);
3554 return 0;
3555}
3556
3557static int
3558hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3559{
3560 int error;
3561 struct hfsc *hfsc;
3562 struct hfsc_class *hc;
3563
3564 hc = hfsc_class_cast__(queue);
3565 hfsc = hfsc_get__(netdev);
3566
3567 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3568 if (!error) {
3569 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3570 free(hc);
3571 }
3572 return error;
3573}
3574
3575static int
3576hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3577 struct netdev_queue_stats *stats)
3578{
3579 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3580 tc_make_handle(1, 0xfffe), NULL, stats);
3581}
3582
3583static int
3584hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3585 const struct ofpbuf *nlmsg,
3586 netdev_dump_queue_stats_cb *cb, void *aux)
3587{
3588 struct netdev_queue_stats stats;
3589 unsigned int handle, major, minor;
3590 int error;
3591
3592 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3593 if (error) {
3594 return error;
3595 }
3596
3597 major = tc_get_major(handle);
3598 minor = tc_get_minor(handle);
3599 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3600 (*cb)(minor - 1, &stats, aux);
3601 }
3602 return 0;
3603}
3604
3605static const struct tc_ops tc_ops_hfsc = {
3606 "hfsc", /* linux_name */
3607 "linux-hfsc", /* ovs_name */
3608 HFSC_N_QUEUES, /* n_queues */
3609 hfsc_tc_install, /* tc_install */
3610 hfsc_tc_load, /* tc_load */
3611 hfsc_tc_destroy, /* tc_destroy */
3612 hfsc_qdisc_get, /* qdisc_get */
3613 hfsc_qdisc_set, /* qdisc_set */
3614 hfsc_class_get, /* class_get */
3615 hfsc_class_set, /* class_set */
3616 hfsc_class_delete, /* class_delete */
3617 hfsc_class_get_stats, /* class_get_stats */
3618 hfsc_class_dump_stats /* class_dump_stats */
3619};
3620\f
c1c9c9c4
BP
3621/* "linux-default" traffic control class.
3622 *
3623 * This class represents the default, unnamed Linux qdisc. It corresponds to
3624 * the "" (empty string) QoS type in the OVS database. */
3625
3626static void
b5d57fc8 3627default_install__(struct netdev *netdev_)
c1c9c9c4 3628{
b5d57fc8 3629 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 3630 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 3631
559eb230
BP
3632 /* Nothing but a tc class implementation is allowed to write to a tc. This
3633 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 3634 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
3635}
3636
3637static int
3638default_tc_install(struct netdev *netdev,
79f1cbe9 3639 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
3640{
3641 default_install__(netdev);
3642 return 0;
3643}
3644
3645static int
3646default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3647{
3648 default_install__(netdev);
3649 return 0;
3650}
3651
3652static const struct tc_ops tc_ops_default = {
3653 NULL, /* linux_name */
3654 "", /* ovs_name */
3655 0, /* n_queues */
3656 default_tc_install,
3657 default_tc_load,
3658 NULL, /* tc_destroy */
3659 NULL, /* qdisc_get */
3660 NULL, /* qdisc_set */
3661 NULL, /* class_get */
3662 NULL, /* class_set */
3663 NULL, /* class_delete */
3664 NULL, /* class_get_stats */
3665 NULL /* class_dump_stats */
3666};
3667\f
3668/* "linux-other" traffic control class.
3669 *
3670 * */
3671
3672static int
b5d57fc8 3673other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 3674{
b5d57fc8 3675 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 3676 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 3677
559eb230
BP
3678 /* Nothing but a tc class implementation is allowed to write to a tc. This
3679 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 3680 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
3681 return 0;
3682}
3683
3684static const struct tc_ops tc_ops_other = {
3685 NULL, /* linux_name */
3686 "linux-other", /* ovs_name */
3687 0, /* n_queues */
3688 NULL, /* tc_install */
3689 other_tc_load,
3690 NULL, /* tc_destroy */
3691 NULL, /* qdisc_get */
3692 NULL, /* qdisc_set */
3693 NULL, /* class_get */
3694 NULL, /* class_set */
3695 NULL, /* class_delete */
3696 NULL, /* class_get_stats */
3697 NULL /* class_dump_stats */
3698};
3699\f
3700/* Traffic control. */
3701
/* How many kernel "tc" ticks make up one second. */
static double ticks_per_s;
3704
/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in
 *      the approximate range of 100 to 1024.  In that case we really do need
 *      to make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there is no need to fudge additional
 *      room for buffers.  (No extra effort is needed to implement that:
 *      the large 'buffer_hz' is used as a divisor, so practically any number
 *      will come out as 0 in the division.  Small integer results in the
 *      case of really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
3723
/* Composes the tc handle 'major':'minor': 'major' occupies the upper 16 bits
 * and 'minor' the lower 16 bits.  Bits of either argument that do not fit
 * are discarded. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
3730
/* Extracts the major number, i.e. the upper 16 bits, from tc 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3737
/* Extracts the minor number, i.e. the lower 16 bits, from tc 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3744
3745static struct tcmsg *
3746tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3747 struct ofpbuf *request)
3748{
3749 struct tcmsg *tcmsg;
3750 int ifindex;
3751 int error;
3752
3753 error = get_ifindex(netdev, &ifindex);
3754 if (error) {
3755 return NULL;
3756 }
3757
3758 ofpbuf_init(request, 512);
3759 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3760 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3761 tcmsg->tcm_family = AF_UNSPEC;
3762 tcmsg->tcm_ifindex = ifindex;
3763 /* Caller should fill in tcmsg->tcm_handle. */
3764 /* Caller should fill in tcmsg->tcm_parent. */
3765
3766 return tcmsg;
3767}
3768
3769static int
3770tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3771{
a88b4e04 3772 int error = nl_transact(NETLINK_ROUTE, request, replyp);
c1c9c9c4
BP
3773 ofpbuf_uninit(request);
3774 return error;
3775}
3776
f8500004
JP
3777/* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3778 * policing configuration.
3779 *
3780 * This function is equivalent to running the following when 'add' is true:
3781 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3782 *
3783 * This function is equivalent to running the following when 'add' is false:
3784 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3785 *
3786 * The configuration and stats may be seen with the following command:
3787 * /sbin/tc -s qdisc show dev <devname>
3788 *
3789 * Returns 0 if successful, otherwise a positive errno value.
3790 */
3791static int
3792tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3793{
3794 struct ofpbuf request;
3795 struct tcmsg *tcmsg;
3796 int error;
3797 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3798 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3799
3800 tcmsg = tc_make_request(netdev, type, flags, &request);
3801 if (!tcmsg) {
3802 return ENODEV;
3803 }
3804 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3805 tcmsg->tcm_parent = TC_H_INGRESS;
3806 nl_msg_put_string(&request, TCA_KIND, "ingress");
3807 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3808
3809 error = tc_transact(&request, NULL);
3810 if (error) {
3811 /* If we're deleting the qdisc, don't worry about some of the
3812 * error conditions. */
3813 if (!add && (error == ENOENT || error == EINVAL)) {
3814 return 0;
3815 }
3816 return error;
3817 }
3818
3819 return 0;
3820}
3821
3822/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3823 * of 'kbits_burst'.
3824 *
3825 * This function is equivalent to running:
3826 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3827 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3828 * mtu 65535 drop
3829 *
3830 * The configuration and stats may be seen with the following command:
3831 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3832 *
3833 * Returns 0 if successful, otherwise a positive errno value.
3834 */
3835static int
3836tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3837{
3838 struct tc_police tc_police;
3839 struct ofpbuf request;
3840 struct tcmsg *tcmsg;
3841 size_t basic_offset;
3842 size_t police_offset;
3843 int error;
3844 int mtu = 65535;
3845
3846 memset(&tc_police, 0, sizeof tc_police);
3847 tc_police.action = TC_POLICE_SHOT;
3848 tc_police.mtu = mtu;
e5c08015 3849 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
f8500004
JP
3850 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3851 kbits_burst * 1024);
3852
3853 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3854 NLM_F_EXCL | NLM_F_CREATE, &request);
3855 if (!tcmsg) {
3856 return ENODEV;
3857 }
3858 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3859 tcmsg->tcm_info = tc_make_handle(49,
3860 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3861
3862 nl_msg_put_string(&request, TCA_KIND, "basic");
3863 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3864 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3865 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3866 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3867 nl_msg_end_nested(&request, police_offset);
3868 nl_msg_end_nested(&request, basic_offset);
3869
3870 error = tc_transact(&request, NULL);
3871 if (error) {
3872 return error;
3873 }
3874
3875 return 0;
3876}
3877
c1c9c9c4
BP
3878static void
3879read_psched(void)
3880{
3881 /* The values in psched are not individually very meaningful, but they are
3882 * important. The tables below show some values seen in the wild.
3883 *
3884 * Some notes:
3885 *
3886 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3887 * (Before that, there are hints that it was 1000000000.)
3888 *
3889 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3890 * above.
3891 *
3892 * /proc/net/psched
3893 * -----------------------------------
3894 * [1] 000c8000 000f4240 000f4240 00000064
3895 * [2] 000003e8 00000400 000f4240 3b9aca00
3896 * [3] 000003e8 00000400 000f4240 3b9aca00
3897 * [4] 000003e8 00000400 000f4240 00000064
3898 * [5] 000003e8 00000040 000f4240 3b9aca00
3899 * [6] 000003e8 00000040 000f4240 000000f9
3900 *
3901 * a b c d ticks_per_s buffer_hz
3902 * ------- --------- ---------- ------------- ----------- -------------
3903 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3904 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3905 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3906 * [4] 1,000 1,024 1,000,000 100 976,562 100
3907 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3908 * [6] 1,000 64 1,000,000 249 15,625,000 249
3909 *
3910 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3911 * [2] 2.6.26-1-686-bigmem from Debian lenny
3912 * [3] 2.6.26-2-sparc64 from Debian lenny
3913 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3914 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3915 * [6] 2.6.34 from kernel.org on KVM
3916 */
23882115 3917 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
3918 static const char fn[] = "/proc/net/psched";
3919 unsigned int a, b, c, d;
3920 FILE *stream;
3921
23882115
BP
3922 if (!ovsthread_once_start(&once)) {
3923 return;
3924 }
3925
c1c9c9c4
BP
3926 ticks_per_s = 1.0;
3927 buffer_hz = 100;
3928
3929 stream = fopen(fn, "r");
3930 if (!stream) {
10a89ef0 3931 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 3932 goto exit;
c1c9c9c4
BP
3933 }
3934
3935 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3936 VLOG_WARN("%s: read failed", fn);
3937 fclose(stream);
23882115 3938 goto exit;
c1c9c9c4
BP
3939 }
3940 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3941 fclose(stream);
3942
3943 if (!a || !c) {
3944 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 3945 goto exit;
c1c9c9c4
BP
3946 }
3947
3948 ticks_per_s = (double) a * c / b;
3949 if (c == 1000000) {
3950 buffer_hz = d;
3951 } else {
3952 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3953 fn, a, b, c, d);
3954 }
3955 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
3956
3957exit:
3958 ovsthread_once_done(&once);
c1c9c9c4
BP
3959}
3960
3961/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3962 * rate of 'rate' bytes per second. */
3963static unsigned int
3964tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3965{
23882115 3966 read_psched();
c1c9c9c4
BP
3967 return (rate * ticks) / ticks_per_s;
3968}
3969
3970/* Returns the number of ticks that it would take to transmit 'size' bytes at a
3971 * rate of 'rate' bytes per second. */
3972static unsigned int
3973tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3974{
23882115 3975 read_psched();
015c93a4 3976 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
3977}
3978
3979/* Returns the number of bytes that need to be reserved for qdisc buffering at
3980 * a transmission rate of 'rate' bytes per second. */
3981static unsigned int
3982tc_buffer_per_jiffy(unsigned int rate)
3983{
23882115 3984 read_psched();
c1c9c9c4
BP
3985 return rate / buffer_hz;
3986}
3987
3988/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3989 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3990 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3991 * stores NULL into it if it is absent.
3992 *
3993 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3994 * 'msg'.
3995 *
3996 * Returns 0 if successful, otherwise a positive errno value. */
3997static int
3998tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3999 struct nlattr **options)
4000{
4001 static const struct nl_policy tca_policy[] = {
4002 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4003 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4004 };
4005 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4006
4007 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4008 tca_policy, ta, ARRAY_SIZE(ta))) {
4009 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4010 goto error;
4011 }
4012
4013 if (kind) {
4014 *kind = nl_attr_get_string(ta[TCA_KIND]);
4015 }
4016
4017 if (options) {
4018 *options = ta[TCA_OPTIONS];
4019 }
4020
4021 return 0;
4022
4023error:
4024 if (kind) {
4025 *kind = NULL;
4026 }
4027 if (options) {
4028 *options = NULL;
4029 }
4030 return EPROTO;
4031}
4032
4033/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4034 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4035 * into '*options', and its queue statistics into '*stats'. Any of the output
4036 * arguments may be null.
4037 *
4038 * Returns 0 if successful, otherwise a positive errno value. */
4039static int
4040tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4041 struct nlattr **options, struct netdev_queue_stats *stats)
4042{
4043 static const struct nl_policy tca_policy[] = {
4044 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4045 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4046 };
4047 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4048
4049 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4050 tca_policy, ta, ARRAY_SIZE(ta))) {
4051 VLOG_WARN_RL(&rl, "failed to parse class message");
4052 goto error;
4053 }
4054
4055 if (handlep) {
4056 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4057 *handlep = tc->tcm_handle;
4058 }
4059
4060 if (options) {
4061 *options = ta[TCA_OPTIONS];
4062 }
4063
4064 if (stats) {
4065 const struct gnet_stats_queue *gsq;
4066 struct gnet_stats_basic gsb;
4067
4068 static const struct nl_policy stats_policy[] = {
4069 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4070 .min_len = sizeof gsb },
4071 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4072 .min_len = sizeof *gsq },
4073 };
4074 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4075
4076 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4077 sa, ARRAY_SIZE(sa))) {
4078 VLOG_WARN_RL(&rl, "failed to parse class stats");
4079 goto error;
4080 }
4081
4082 /* Alignment issues screw up the length of struct gnet_stats_basic on
4083 * some arch/bitsize combinations. Newer versions of Linux have a
4084 * struct gnet_stats_basic_packed, but we can't depend on that. The
4085 * easiest thing to do is just to make a copy. */
4086 memset(&gsb, 0, sizeof gsb);
4087 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4088 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4089 stats->tx_bytes = gsb.bytes;
4090 stats->tx_packets = gsb.packets;
4091
4092 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4093 stats->tx_errors = gsq->drops;
4094 }
4095
4096 return 0;
4097
4098error:
4099 if (options) {
4100 *options = NULL;
4101 }
4102 if (stats) {
4103 memset(stats, 0, sizeof *stats);
4104 }
4105 return EPROTO;
4106}
4107
4108/* Queries the kernel for class with identifier 'handle' and parent 'parent'
4109 * on 'netdev'. */
4110static int
4111tc_query_class(const struct netdev *netdev,
4112 unsigned int handle, unsigned int parent,
4113 struct ofpbuf **replyp)
4114{
4115 struct ofpbuf request;
4116 struct tcmsg *tcmsg;
4117 int error;
4118
4119 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
4120 if (!tcmsg) {
4121 return ENODEV;
4122 }
c1c9c9c4
BP
4123 tcmsg->tcm_handle = handle;
4124 tcmsg->tcm_parent = parent;
4125
4126 error = tc_transact(&request, replyp);
4127 if (error) {
4128 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4129 netdev_get_name(netdev),
4130 tc_get_major(handle), tc_get_minor(handle),
4131 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 4132 ovs_strerror(error));
c1c9c9c4
BP
4133 }
4134 return error;
4135}
4136
4137/* Equivalent to "tc class del dev <name> handle <handle>". */
4138static int
4139tc_delete_class(const struct netdev *netdev, unsigned int handle)
4140{
4141 struct ofpbuf request;
4142 struct tcmsg *tcmsg;
4143 int error;
4144
4145 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
4146 if (!tcmsg) {
4147 return ENODEV;
4148 }
c1c9c9c4
BP
4149 tcmsg->tcm_handle = handle;
4150 tcmsg->tcm_parent = 0;
4151
4152 error = tc_transact(&request, NULL);
4153 if (error) {
4154 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4155 netdev_get_name(netdev),
4156 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 4157 ovs_strerror(error));
c1c9c9c4
BP
4158 }
4159 return error;
4160}
4161
4162/* Equivalent to "tc qdisc del dev <name> root". */
4163static int
b5d57fc8 4164tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 4165{
b5d57fc8 4166 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
4167 struct ofpbuf request;
4168 struct tcmsg *tcmsg;
4169 int error;
4170
b5d57fc8 4171 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
4172 if (!tcmsg) {
4173 return ENODEV;
4174 }
c1c9c9c4
BP
4175 tcmsg->tcm_handle = tc_make_handle(1, 0);
4176 tcmsg->tcm_parent = TC_H_ROOT;
4177
4178 error = tc_transact(&request, NULL);
4179 if (error == EINVAL) {
4180 /* EINVAL probably means that the default qdisc was in use, in which
4181 * case we've accomplished our purpose. */
4182 error = 0;
4183 }
b5d57fc8
BP
4184 if (!error && netdev->tc) {
4185 if (netdev->tc->ops->tc_destroy) {
4186 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 4187 }
b5d57fc8 4188 netdev->tc = NULL;
c1c9c9c4
BP
4189 }
4190 return error;
4191}
4192
4193/* If 'netdev''s qdisc type and parameters are not yet known, queries the
4194 * kernel to determine what they are. Returns 0 if successful, otherwise a
4195 * positive errno value. */
4196static int
b5d57fc8 4197tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 4198{
b5d57fc8 4199 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
4200 struct ofpbuf request, *qdisc;
4201 const struct tc_ops *ops;
4202 struct tcmsg *tcmsg;
4203 int load_error;
4204 int error;
4205
b5d57fc8 4206 if (netdev->tc) {
c1c9c9c4
BP
4207 return 0;
4208 }
4209
4210 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4211 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4212 * 2.6.35 without that fix backported to it.
4213 *
4214 * To avoid the OOPS, we must not make a request that would attempt to dump
4215 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4216 * few others. There are a few ways that I can see to do this, but most of
4217 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4218 * technique chosen here is to assume that any non-default qdisc that we
4219 * create will have a class with handle 1:0. The built-in qdiscs only have
4220 * a class with handle 0:0.
4221 *
4222 * We could check for Linux 2.6.35+ and use a more straightforward method
4223 * there. */
b5d57fc8 4224 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
4225 if (!tcmsg) {
4226 return ENODEV;
4227 }
c1c9c9c4
BP
4228 tcmsg->tcm_handle = tc_make_handle(1, 0);
4229 tcmsg->tcm_parent = 0;
4230
4231 /* Figure out what tc class to instantiate. */
4232 error = tc_transact(&request, &qdisc);
4233 if (!error) {
4234 const char *kind;
4235
4236 error = tc_parse_qdisc(qdisc, &kind, NULL);
4237 if (error) {
4238 ops = &tc_ops_other;
4239 } else {
4240 ops = tc_lookup_linux_name(kind);
4241 if (!ops) {
4242 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4243 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4244
4245 ops = &tc_ops_other;
4246 }
4247 }
4248 } else if (error == ENOENT) {
4249 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4250 * other entity that doesn't have a handle 1:0. We will assume
4251 * that it's the system default qdisc. */
4252 ops = &tc_ops_default;
4253 error = 0;
4254 } else {
4255 /* Who knows? Maybe the device got deleted. */
4256 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 4257 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
4258 ops = &tc_ops_other;
4259 }
4260
4261 /* Instantiate it. */
b5d57fc8
BP
4262 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4263 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
4264 ofpbuf_delete(qdisc);
4265
4266 return error ? error : load_error;
4267}
4268
4269/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4270 approximate the time to transmit packets of various lengths. For an MTU of
4271 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4272 represents two possible packet lengths; for a MTU of 513 through 1024, four
4273 possible lengths; and so on.
4274
4275 Returns, for the specified 'mtu', the number of bits that packet lengths
4276 need to be shifted right to fit within such a 256-entry table. */
4277static int
4278tc_calc_cell_log(unsigned int mtu)
4279{
4280 int cell_log;
4281
4282 if (!mtu) {
4283 mtu = ETH_PAYLOAD_MAX;
4284 }
4285 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4286
4287 for (cell_log = 0; mtu >= 256; cell_log++) {
4288 mtu >>= 1;
4289 }
4290
4291 return cell_log;
4292}
4293
4294/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4295 * of 'mtu'. */
4296static void
4297tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4298{
4299 memset(rate, 0, sizeof *rate);
4300 rate->cell_log = tc_calc_cell_log(mtu);
4301 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4302 /* rate->cell_align = 0; */ /* distro headers. */
4303 rate->mpu = ETH_TOTAL_MIN;
4304 rate->rate = Bps;
4305}
4306
4307/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4308 * attribute of the specified "type".
4309 *
4310 * See tc_calc_cell_log() above for a description of "rtab"s. */
4311static void
4312tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4313{
4314 uint32_t *rtab;
4315 unsigned int i;
4316
4317 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4318 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4319 unsigned packet_size = (i + 1) << rate->cell_log;
4320 if (packet_size < rate->mpu) {
4321 packet_size = rate->mpu;
4322 }
4323 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4324 }
4325}
4326
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  The burst is never allowed to be smaller than
 * one jiffy's worth of buffering plus one MTU, so a 'burst_bytes' of 0 is
 * fine when no value was requested. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int lower_bound = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > lower_bound ? burst_bytes : lower_bound;

    return tc_bytes_to_ticks(Bps, burst);
}
d3980822 4337\f
aaf2fb1a
BP
4338/* Linux-only functions declared in netdev-linux.h */
4339
4340/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4341 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4342int
4343netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4344 const char *flag_name, bool enable)
4345{
4346 const char *netdev_name = netdev_get_name(netdev);
4347 struct ethtool_value evalue;
4348 uint32_t new_flags;
4349 int error;
4350
ab985a77 4351 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
4352 memset(&evalue, 0, sizeof evalue);
4353 error = netdev_linux_do_ethtool(netdev_name,
4354 (struct ethtool_cmd *)&evalue,
4355 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4356 if (error) {
4357 return error;
4358 }
4359
ab985a77 4360 COVERAGE_INC(netdev_set_ethtool);
aaf2fb1a
BP
4361 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4362 error = netdev_linux_do_ethtool(netdev_name,
4363 (struct ethtool_cmd *)&evalue,
4364 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4365 if (error) {
4366 return error;
4367 }
4368
ab985a77 4369 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
4370 memset(&evalue, 0, sizeof evalue);
4371 error = netdev_linux_do_ethtool(netdev_name,
4372 (struct ethtool_cmd *)&evalue,
4373 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4374 if (error) {
4375 return error;
4376 }
4377
4378 if (new_flags != evalue.data) {
4379 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4380 "device %s failed", enable ? "enable" : "disable",
4381 flag_name, netdev_name);
4382 return EOPNOTSUPP;
4383 }
4384
4385 return 0;
4386}
4387\f
4388/* Utility functions. */
4389
d3980822 4390/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 4391static void
d3980822
BP
4392netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4393 const struct rtnl_link_stats *src)
4394{
f613a0d7
PS
4395 dst->rx_packets = src->rx_packets;
4396 dst->tx_packets = src->tx_packets;
4397 dst->rx_bytes = src->rx_bytes;
4398 dst->tx_bytes = src->tx_bytes;
4399 dst->rx_errors = src->rx_errors;
4400 dst->tx_errors = src->tx_errors;
4401 dst->rx_dropped = src->rx_dropped;
4402 dst->tx_dropped = src->tx_dropped;
4403 dst->multicast = src->multicast;
4404 dst->collisions = src->collisions;
4405 dst->rx_length_errors = src->rx_length_errors;
4406 dst->rx_over_errors = src->rx_over_errors;
4407 dst->rx_crc_errors = src->rx_crc_errors;
4408 dst->rx_frame_errors = src->rx_frame_errors;
4409 dst->rx_fifo_errors = src->rx_fifo_errors;
4410 dst->rx_missed_errors = src->rx_missed_errors;
4411 dst->tx_aborted_errors = src->tx_aborted_errors;
4412 dst->tx_carrier_errors = src->tx_carrier_errors;
4413 dst->tx_fifo_errors = src->tx_fifo_errors;
4414 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4415 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
4416}
4417
c1c9c9c4
BP
4418static int
4419get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4420{
4421 /* Policy for RTNLGRP_LINK messages.
4422 *
4423 * There are *many* more fields in these messages, but currently we only
4424 * care about these fields. */
4425 static const struct nl_policy rtnlgrp_link_policy[] = {
4426 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4427 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4428 .min_len = sizeof(struct rtnl_link_stats) },
4429 };
4430
4431 struct ofpbuf request;
4432 struct ofpbuf *reply;
4433 struct ifinfomsg *ifi;
c1c9c9c4
BP
4434 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4435 int error;
4436
4437 ofpbuf_init(&request, 0);
4438 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4439 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4440 ifi->ifi_family = PF_UNSPEC;
4441 ifi->ifi_index = ifindex;
a88b4e04 4442 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
4443 ofpbuf_uninit(&request);
4444 if (error) {
4445 return error;
4446 }
4447
4448 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4449 rtnlgrp_link_policy,
4450 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4451 ofpbuf_delete(reply);
4452 return EPROTO;
4453 }
4454
4455 if (!attrs[IFLA_STATS]) {
4456 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4457 ofpbuf_delete(reply);
4458 return EPROTO;
4459 }
8b61709d 4460
d3980822 4461 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
8b61709d 4462
576e26d7
BP
4463 ofpbuf_delete(reply);
4464
8b61709d
BP
4465 return 0;
4466}
4467
4468static int
4469get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4470{
4471 static const char fn[] = "/proc/net/dev";
4472 char line[1024];
4473 FILE *stream;
4474 int ln;
4475
4476 stream = fopen(fn, "r");
4477 if (!stream) {
10a89ef0 4478 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
8b61709d
BP
4479 return errno;
4480 }
4481
4482 ln = 0;
4483 while (fgets(line, sizeof line, stream)) {
4484 if (++ln >= 3) {
4485 char devname[16];
4486#define X64 "%"SCNu64
4487 if (sscanf(line,
4488 " %15[^:]:"
4489 X64 X64 X64 X64 X64 X64 X64 "%*u"
4490 X64 X64 X64 X64 X64 X64 X64 "%*u",
4491 devname,
4492 &stats->rx_bytes,
4493 &stats->rx_packets,
4494 &stats->rx_errors,
4495 &stats->rx_dropped,
4496 &stats->rx_fifo_errors,
4497 &stats->rx_frame_errors,
4498 &stats->multicast,
4499 &stats->tx_bytes,
4500 &stats->tx_packets,
4501 &stats->tx_errors,
4502 &stats->tx_dropped,
4503 &stats->tx_fifo_errors,
4504 &stats->collisions,
4505 &stats->tx_carrier_errors) != 15) {
4506 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4507 } else if (!strcmp(devname, netdev_name)) {
4508 stats->rx_length_errors = UINT64_MAX;
4509 stats->rx_over_errors = UINT64_MAX;
4510 stats->rx_crc_errors = UINT64_MAX;
4511 stats->rx_missed_errors = UINT64_MAX;
4512 stats->tx_aborted_errors = UINT64_MAX;
4513 stats->tx_heartbeat_errors = UINT64_MAX;
4514 stats->tx_window_errors = UINT64_MAX;
4515 fclose(stream);
4516 return 0;
4517 }
4518 }
4519 }
4520 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4521 fclose(stream);
4522 return ENODEV;
4523}
c1c9c9c4 4524
3a183124 4525static int
b5d57fc8 4526get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
4527{
4528 struct ifreq ifr;
4529 int error;
4530
755be9ea 4531 *flags = 0;
259e0b1a 4532 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
755be9ea
EJ
4533 if (!error) {
4534 *flags = ifr.ifr_flags;
4535 }
8b61709d
BP
4536 return error;
4537}
4538
4539static int
4b609110 4540set_flags(const char *name, unsigned int flags)
8b61709d
BP
4541{
4542 struct ifreq ifr;
4543
4544 ifr.ifr_flags = flags;
259e0b1a 4545 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
4546}
4547
4548static int
4549do_get_ifindex(const char *netdev_name)
4550{
4551 struct ifreq ifr;
259e0b1a 4552 int error;
8b61709d 4553
71d7c22f 4554 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 4555 COVERAGE_INC(netdev_get_ifindex);
259e0b1a
BP
4556
4557 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4558 if (error) {
8b61709d 4559 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
259e0b1a
BP
4560 netdev_name, ovs_strerror(error));
4561 return -error;
8b61709d
BP
4562 }
4563 return ifr.ifr_ifindex;
4564}
4565
4566static int
4567get_ifindex(const struct netdev *netdev_, int *ifindexp)
4568{
b5d57fc8 4569 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 4570
b5d57fc8 4571 if (!(netdev->cache_valid & VALID_IFINDEX)) {
8b61709d 4572 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 4573
8b61709d 4574 if (ifindex < 0) {
b5d57fc8
BP
4575 netdev->get_ifindex_error = -ifindex;
4576 netdev->ifindex = 0;
c7b1b0a5 4577 } else {
b5d57fc8
BP
4578 netdev->get_ifindex_error = 0;
4579 netdev->ifindex = ifindex;
8b61709d 4580 }
b5d57fc8 4581 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 4582 }
c7b1b0a5 4583
b5d57fc8
BP
4584 *ifindexp = netdev->ifindex;
4585 return netdev->get_ifindex_error;
8b61709d
BP
4586}
4587
4588static int
4589get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4590{
4591 struct ifreq ifr;
4592 int hwaddr_family;
259e0b1a 4593 int error;
8b61709d
BP
4594
4595 memset(&ifr, 0, sizeof ifr);
71d7c22f 4596 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 4597 COVERAGE_INC(netdev_get_hwaddr);
259e0b1a
BP
4598 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4599 if (error) {
78857dfb
BP
4600 /* ENODEV probably means that a vif disappeared asynchronously and
4601 * hasn't been removed from the database yet, so reduce the log level
4602 * to INFO for that case. */
259e0b1a 4603 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
78857dfb 4604 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
259e0b1a
BP
4605 netdev_name, ovs_strerror(error));
4606 return error;
8b61709d
BP
4607 }
4608 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4609 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4610 VLOG_WARN("%s device has unknown hardware address family %d",
4611 netdev_name, hwaddr_family);
4612 }
4613 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4614 return 0;
4615}
4616
4617static int
44445cac 4618set_etheraddr(const char *netdev_name,
8b61709d
BP
4619 const uint8_t mac[ETH_ADDR_LEN])
4620{
4621 struct ifreq ifr;
259e0b1a 4622 int error;
8b61709d
BP
4623
4624 memset(&ifr, 0, sizeof ifr);
71d7c22f 4625 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 4626 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
8b61709d
BP
4627 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4628 COVERAGE_INC(netdev_set_hwaddr);
259e0b1a
BP
4629 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4630 if (error) {
8b61709d 4631 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
259e0b1a 4632 netdev_name, ovs_strerror(error));
8b61709d 4633 }
259e0b1a 4634 return error;
8b61709d
BP
4635}
4636
4637static int
0b0544d7 4638netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
4639 int cmd, const char *cmd_name)
4640{
4641 struct ifreq ifr;
259e0b1a 4642 int error;
8b61709d
BP
4643
4644 memset(&ifr, 0, sizeof ifr);
71d7c22f 4645 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
4646 ifr.ifr_data = (caddr_t) ecmd;
4647
4648 ecmd->cmd = cmd;
259e0b1a
BP
4649 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4650 if (error) {
4651 if (error != EOPNOTSUPP) {
8b61709d 4652 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
259e0b1a 4653 "failed: %s", cmd_name, name, ovs_strerror(error));
8b61709d
BP
4654 } else {
4655 /* The device doesn't support this operation. That's pretty
4656 * common, so there's no point in logging anything. */
4657 }
8b61709d 4658 }
259e0b1a 4659 return error;
8b61709d 4660}
f1acd62b
BP
4661
4662static int
4663netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4664 int cmd, const char *cmd_name)
4665{
4666 struct ifreq ifr;
4667 int error;
4668
4669 ifr.ifr_addr.sa_family = AF_INET;
259e0b1a 4670 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b 4671 if (!error) {
db5a1019
AW
4672 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4673 &ifr.ifr_addr);
f1acd62b
BP
4674 *ip = sin->sin_addr;
4675 }
4676 return error;
4677}
488d734d
BP
4678
4679/* Returns an AF_PACKET raw socket or a negative errno value. */
4680static int
4681af_packet_sock(void)
4682{
23882115
BP
4683 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4684 static int sock;
488d734d 4685
23882115 4686 if (ovsthread_once_start(&once)) {
488d734d
BP
4687 sock = socket(AF_PACKET, SOCK_RAW, 0);
4688 if (sock >= 0) {
8450059e
BP
4689 int error = set_nonblocking(sock);
4690 if (error) {
4691 close(sock);
4692 sock = -error;
4693 }
488d734d
BP
4694 } else {
4695 sock = -errno;
10a89ef0
BP
4696 VLOG_ERR("failed to create packet socket: %s",
4697 ovs_strerror(errno));
488d734d 4698 }
23882115 4699 ovsthread_once_done(&once);
488d734d
BP
4700 }
4701
4702 return sock;
4703}