]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
netdev-linux: Don't assume 'struct netdev' has offset 0.
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
275707c3 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d
BP
22#include <fcntl.h>
23#include <arpa/inet.h>
24#include <inttypes.h>
32383c3b 25#include <linux/filter.h>
c1c9c9c4 26#include <linux/gen_stats.h>
bb7d0e22 27#include <linux/if_ether.h>
8b61709d
BP
28#include <linux/if_tun.h>
29#include <linux/types.h>
30#include <linux/ethtool.h>
63331829 31#include <linux/mii.h>
f8500004 32#include <linux/pkt_cls.h>
6f42c8ea 33#include <linux/pkt_sched.h>
e9e28be3 34#include <linux/rtnetlink.h>
8b61709d
BP
35#include <linux/sockios.h>
36#include <linux/version.h>
37#include <sys/types.h>
38#include <sys/ioctl.h>
39#include <sys/socket.h>
40#include <netpacket/packet.h>
8b61709d
BP
41#include <net/if.h>
42#include <net/if_arp.h>
43#include <net/if_packet.h>
44#include <net/route.h>
45#include <netinet/in.h>
e9e28be3 46#include <poll.h>
8b61709d
BP
47#include <stdlib.h>
48#include <string.h>
49#include <unistd.h>
e9e28be3
BP
50
51#include "coverage.h"
9fe3b9a2 52#include "dpif-linux.h"
8b61709d
BP
53#include "dynamic-string.h"
54#include "fatal-signal.h"
93b13be8
BP
55#include "hash.h"
56#include "hmap.h"
8b61709d 57#include "netdev-provider.h"
7fbef77a 58#include "netdev-vport.h"
45c8d3a1 59#include "netlink-notifier.h"
2fe27d5a 60#include "netlink-socket.h"
c060c4cf 61#include "netlink.h"
e9e28be3 62#include "ofpbuf.h"
8b61709d
BP
63#include "openflow/openflow.h"
64#include "packets.h"
65#include "poll-loop.h"
21d6e22e 66#include "rtnetlink-link.h"
8b61709d 67#include "shash.h"
c060c4cf 68#include "socket-util.h"
19993ef3 69#include "sset.h"
1670c579 70#include "timer.h"
c060c4cf 71#include "unaligned.h"
e9e28be3 72#include "vlog.h"
5136ce49 73
/* Logging module used by the VLOG_* calls in this file. */
VLOG_DEFINE_THIS_MODULE(netdev_linux);

/* Coverage counters defined by this file. */
COVERAGE_DEFINE(netdev_set_policing);
COVERAGE_DEFINE(netdev_arp_lookup);
COVERAGE_DEFINE(netdev_get_ifindex);
COVERAGE_DEFINE(netdev_get_hwaddr);
COVERAGE_DEFINE(netdev_set_hwaddr);
COVERAGE_DEFINE(netdev_get_ethtool);
COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 83
8b61709d
BP
84\f
/* Fallbacks for constants that old kernel headers may not provide. */

/* Added in Linux 2.6.14. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause (1 << 14)
#endif

/* Added in Linux 2.6.24. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS 0x25     /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS 0x26     /* Set flags bitmap(ethtool_value) */
#endif

/* Added in Linux 2.6.25. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif
108
/* Link-change notifier shared by every netdev created in this file, plus the
 * number of netdevs currently holding a reference to it.  The notifier exists
 * only while the reference count is nonzero. */
static struct nln_notifier *netdev_linux_cache_notifier;
static int cache_notifier_refcount;
8b61709d
BP
111
/* Bits for 'cache_valid' in struct netdev_linux.  A set bit means that the
 * corresponding piece of cached device state is currently valid. */
enum {
    VALID_IFINDEX          = 0x001,
    VALID_ETHERADDR        = 0x002,
    VALID_IN4              = 0x004,
    VALID_IN6              = 0x008,
    VALID_MTU              = 0x010,
    VALID_POLICING         = 0x020,
    VALID_VPORT_STAT_ERROR = 0x040,
    VALID_DRVINFO          = 0x080,
    VALID_FEATURES         = 0x100
};
123
149f577a
JG
/* Per-device state that only tap devices need. */
struct tap_state {
    int fd;                     /* File descriptor of the opened tap device. */
};
c1c9c9c4
BP
127\f
128/* Traffic control. */
129
130/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
131 * network device.
132 *
133 * Each TC implementation subclasses this with whatever additional data it
134 * needs. */
c1c9c9c4
BP
135struct tc {
136 const struct tc_ops *ops;
93b13be8
BP
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
140};
c1c9c9c4 141
559eb230
BP
/* Static initializer for the 'struct tc' that TC points to, with OPS as its
 * 'ops' member and an empty 'queues' map. */
#define TC_INITIALIZER(TC, OPS)                 \
    { OPS, HMAP_INITIALIZER(&(TC)->queues) }
143
93b13be8
BP
144/* One traffic control queue.
145 *
146 * Each TC implementation subclasses this with whatever additional data it
147 * needs. */
148struct tc_queue {
149 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
150 unsigned int queue_id; /* OpenFlow queue ID. */
6dc34a0d 151 long long int created; /* Time queue was created, in msecs. */
c1c9c9c4
BP
152};
153
154/* A particular kind of traffic control. Each implementation generally maps to
155 * one particular Linux qdisc class.
156 *
157 * The functions below return 0 if successful or a positive errno value on
158 * failure, except where otherwise noted. All of them must be provided, except
159 * where otherwise noted. */
160struct tc_ops {
161 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
162 * This is null for tc_ops_default and tc_ops_other, for which there are no
163 * appropriate values. */
164 const char *linux_name;
165
166 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
167 const char *ovs_name;
168
169 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
170 * queues. The queues are numbered 0 through n_queues - 1. */
171 unsigned int n_queues;
172
173 /* Called to install this TC class on 'netdev'. The implementation should
174 * make the Netlink calls required to set up 'netdev' with the right qdisc
175 * and configure it according to 'details'. The implementation may assume
176 * that the current qdisc is the default; that is, there is no need for it
177 * to delete the current qdisc before installing itself.
178 *
179 * The contents of 'details' should be documented as valid for 'ovs_name'
180 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
181 * (which is built as ovs-vswitchd.conf.db(8)).
182 *
183 * This function must return 0 if and only if it sets 'netdev->tc' to an
184 * initialized 'struct tc'.
185 *
186 * (This function is null for tc_ops_other, which cannot be installed. For
187 * other TC classes it should always be nonnull.) */
79f1cbe9 188 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
189
190 /* Called when the netdev code determines (through a Netlink query) that
191 * this TC class's qdisc is installed on 'netdev', but we didn't install
192 * it ourselves and so don't know any of the details.
193 *
194 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
195 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
196 * implementation should parse the other attributes of 'nlmsg' as
197 * necessary to determine its configuration. If necessary it should also
198 * use Netlink queries to determine the configuration of queues on
199 * 'netdev'.
200 *
201 * This function must return 0 if and only if it sets 'netdev->tc' to an
202 * initialized 'struct tc'. */
203 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
204
205 /* Destroys the data structures allocated by the implementation as part of
206 * 'tc'. (This includes destroying 'tc->queues' by calling
207 * tc_destroy(tc).
208 *
209 * The implementation should not need to perform any Netlink calls. If
210 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
211 * (But it may not be desirable.)
212 *
213 * This function may be null if 'tc' is trivial. */
214 void (*tc_destroy)(struct tc *tc);
215
216 /* Retrieves details of 'netdev->tc' configuration into 'details'.
217 *
218 * The implementation should not need to perform any Netlink calls, because
219 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
220 * cached the configuration.
221 *
222 * The contents of 'details' should be documented as valid for 'ovs_name'
223 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
224 * (which is built as ovs-vswitchd.conf.db(8)).
225 *
226 * This function may be null if 'tc' is not configurable.
227 */
79f1cbe9 228 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
229
230 /* Reconfigures 'netdev->tc' according to 'details', performing any
231 * required Netlink calls to complete the reconfiguration.
232 *
233 * The contents of 'details' should be documented as valid for 'ovs_name'
234 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
235 * (which is built as ovs-vswitchd.conf.db(8)).
236 *
237 * This function may be null if 'tc' is not configurable.
238 */
79f1cbe9 239 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 240
93b13be8
BP
241 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
242 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
243 *
244 * The contents of 'details' should be documented as valid for 'ovs_name'
245 * in the "other_config" column in the "Queue" table in
246 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
247 *
248 * The implementation should not need to perform any Netlink calls, because
249 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
250 * cached the queue configuration.
251 *
252 * This function may be null if 'tc' does not have queues ('n_queues' is
253 * 0). */
93b13be8 254 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 255 struct smap *details);
c1c9c9c4
BP
256
257 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
258 * 'details', perfoming any required Netlink calls to complete the
259 * reconfiguration. The caller ensures that 'queue_id' is less than
260 * 'n_queues'.
261 *
262 * The contents of 'details' should be documented as valid for 'ovs_name'
263 * in the "other_config" column in the "Queue" table in
264 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
265 *
266 * This function may be null if 'tc' does not have queues or its queues are
267 * not configurable. */
268 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 269 const struct smap *details);
c1c9c9c4 270
93b13be8
BP
271 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
272 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
273 *
274 * This function may be null if 'tc' does not have queues or its queues
275 * cannot be deleted. */
93b13be8 276 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 277
93b13be8
BP
278 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
279 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
280 *
281 * On success, initializes '*stats'.
282 *
283 * This function may be null if 'tc' does not have queues or if it cannot
284 * report queue statistics. */
93b13be8
BP
285 int (*class_get_stats)(const struct netdev *netdev,
286 const struct tc_queue *queue,
c1c9c9c4
BP
287 struct netdev_queue_stats *stats);
288
289 /* Extracts queue stats from 'nlmsg', which is a response to a
290 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
291 *
292 * This function may be null if 'tc' does not have queues or if it cannot
293 * report queue statistics. */
294 int (*class_dump_stats)(const struct netdev *netdev,
295 const struct ofpbuf *nlmsg,
296 netdev_dump_queue_stats_cb *cb, void *aux);
297};
298
299static void
300tc_init(struct tc *tc, const struct tc_ops *ops)
301{
302 tc->ops = ops;
93b13be8 303 hmap_init(&tc->queues);
c1c9c9c4
BP
304}
305
306static void
307tc_destroy(struct tc *tc)
308{
93b13be8 309 hmap_destroy(&tc->queues);
c1c9c9c4
BP
310}
311
312static const struct tc_ops tc_ops_htb;
a339aa81 313static const struct tc_ops tc_ops_hfsc;
c1c9c9c4
BP
314static const struct tc_ops tc_ops_default;
315static const struct tc_ops tc_ops_other;
316
559eb230 317static const struct tc_ops *const tcs[] = {
c1c9c9c4 318 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 319 &tc_ops_hfsc, /* Hierarchical fair service curve. */
c1c9c9c4
BP
320 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
321 &tc_ops_other, /* Some other qdisc. */
322 NULL
323};
149f577a 324
c1c9c9c4
BP
/* Forward declarations of the traffic control helper functions. */

/* Handles. */
static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
static unsigned int tc_get_major(unsigned int handle);
static unsigned int tc_get_minor(unsigned int handle);

/* Unit conversions. */
static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
static unsigned int tc_buffer_per_jiffy(unsigned int rate);

/* Netlink requests. */
static struct tcmsg *tc_make_request(const struct netdev *netdev, int type,
                                     unsigned int flags,
                                     struct ofpbuf *request);
static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);

/* Ingress policing. */
static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
static int tc_add_policer(struct netdev *netdev, int kbits_rate,
                          int kbits_burst);

/* Qdiscs and classes. */
static int tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
                          struct nlattr **options);
static int tc_parse_class(const struct ofpbuf *msg, unsigned int *queue_id,
                          struct nlattr **options,
                          struct netdev_queue_stats *stats);
static int tc_query_class(const struct netdev *netdev,
                          unsigned int handle, unsigned int parent,
                          struct ofpbuf **replyp);
static int tc_delete_class(const struct netdev *netdev, unsigned int handle);

static int tc_del_qdisc(struct netdev *netdev);
static int tc_query_qdisc(const struct netdev *netdev);

/* Rate tables and buffers. */
static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static void tc_put_rtab(struct ofpbuf *msg, uint16_t type,
                        const struct tc_ratespec *rate);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
358\f
b5d57fc8
BP
359struct netdev_linux {
360 struct netdev up;
149f577a 361
8b61709d 362 struct shash_node *shash_node;
149f577a 363 unsigned int cache_valid;
ac4d3bcb 364 unsigned int change_seq;
8b61709d 365
1670c579
EJ
366 bool miimon; /* Link status of last poll. */
367 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
368 struct timer miimon_timer;
369
8722022c
BP
370 /* The following are figured out "on demand" only. They are only valid
371 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d
BP
372 int ifindex;
373 uint8_t etheraddr[ETH_ADDR_LEN];
f1acd62b 374 struct in_addr address, netmask;
8b61709d
BP
375 struct in6_addr in6;
376 int mtu;
059e5f4f 377 unsigned int ifi_flags;
65c3058c 378 long long int carrier_resets;
80a86fbe
BP
379 uint32_t kbits_rate; /* Policing data. */
380 uint32_t kbits_burst;
bba1e6f3
PS
381 int vport_stats_error; /* Cached error code from vport_get_stats().
382 0 or an errno value. */
90a6637d 383 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 384 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 385 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 386 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 387 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
51f87458 388
a00ca915
EJ
389 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
391 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
392 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
90a6637d 393
4f925bd3 394 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 395 struct tc *tc;
149f577a
JG
396
397 union {
398 struct tap_state tap;
399 } state;
8b61709d
BP
400};
401
796223f5
BP
402struct netdev_rx_linux {
403 struct netdev_rx up;
404 bool is_tap;
5b7448ed 405 int fd;
149f577a 406};
8b61709d 407
796223f5
BP
408static const struct netdev_rx_class netdev_rx_linux_class;
409
76c308b5
BP
410/* Sockets used for ioctl operations. */
411static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
8b61709d 412
8b61709d
BP
413/* This is set pretty low because we probably won't learn anything from the
414 * additional log messages. */
415static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
416
15b3596a 417static int netdev_linux_init(void);
6f643e49 418
0b0544d7 419static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 420 int cmd, const char *cmd_name);
149f577a
JG
421static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
422 const char *cmd_name);
f1acd62b
BP
423static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
424 int cmd, const char *cmd_name);
b5d57fc8 425static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 426static int set_flags(const char *, unsigned int flags);
8b61709d
BP
427static int do_get_ifindex(const char *netdev_name);
428static int get_ifindex(const struct netdev *, int *ifindexp);
429static int do_set_addr(struct netdev *netdev,
430 int ioctl_nr, const char *ioctl_name,
431 struct in_addr addr);
432static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
44445cac 433static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
8b61709d
BP
434static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
435static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
488d734d 436static int af_packet_sock(void);
1670c579
EJ
437static void netdev_linux_miimon_run(void);
438static void netdev_linux_miimon_wait(void);
8b61709d 439
15b3596a
JG
440static bool
441is_netdev_linux_class(const struct netdev_class *netdev_class)
442{
443 return netdev_class->init == netdev_linux_init;
444}
445
796223f5
BP
446static bool
447is_tap_netdev(const struct netdev *netdev)
448{
b5d57fc8 449 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577
JP
450}
451
8b61709d
BP
452static struct netdev_linux *
453netdev_linux_cast(const struct netdev *netdev)
454{
b5d57fc8 455 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
15b3596a 456
180c6d0b 457 return CONTAINER_OF(netdev, struct netdev_linux, up);
8b61709d 458}
796223f5
BP
459
460static struct netdev_rx_linux *
461netdev_rx_linux_cast(const struct netdev_rx *rx)
462{
463 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
464 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
465}
ff4ed3c9 466\f
8b61709d
BP
467static int
468netdev_linux_init(void)
469{
470 static int status = -1;
471 if (status < 0) {
ff4ed3c9 472 /* Create AF_INET socket. */
8b61709d
BP
473 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
474 status = af_inet_sock >= 0 ? 0 : errno;
475 if (status) {
10a89ef0 476 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
8b61709d
BP
477 }
478 }
479 return status;
480}
481
/* Periodic processing: runs the rtnetlink link notifier, then the MII
 * monitor. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();
    netdev_linux_miimon_run();
}
488
/* Registers the wakeups needed by netdev_linux_run(): first the rtnetlink
 * link notifier's, then the MII monitor's. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();
    netdev_linux_miimon_wait();
}
495
ac4d3bcb 496static void
b5d57fc8
BP
497netdev_linux_changed(struct netdev_linux *dev,
498 unsigned int ifi_flags, unsigned int mask)
ac4d3bcb
EJ
499{
500 dev->change_seq++;
501 if (!dev->change_seq) {
502 dev->change_seq++;
503 }
8aa77183
BP
504
505 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
506 dev->carrier_resets++;
507 }
508 dev->ifi_flags = ifi_flags;
509
4f925bd3
PS
510 dev->cache_valid &= mask;
511}
512
513static void
b5d57fc8
BP
514netdev_linux_update(struct netdev_linux *dev,
515 const struct rtnetlink_link_change *change)
4f925bd3
PS
516{
517 if (change->nlmsg_type == RTM_NEWLINK) {
518 /* Keep drv-info */
b5d57fc8 519 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
90a6637d 520
c7b1b0a5 521 /* Update netdev from rtnl-change msg. */
90a6637d
PS
522 if (change->mtu) {
523 dev->mtu = change->mtu;
524 dev->cache_valid |= VALID_MTU;
525 dev->netdev_mtu_error = 0;
526 }
527
44445cac
PS
528 if (!eth_addr_is_zero(change->addr)) {
529 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
530 dev->cache_valid |= VALID_ETHERADDR;
531 dev->ether_addr_error = 0;
532 }
533
c7b1b0a5
PS
534 dev->ifindex = change->ifi_index;
535 dev->cache_valid |= VALID_IFINDEX;
536 dev->get_ifindex_error = 0;
537
4f925bd3 538 } else {
b5d57fc8 539 netdev_linux_changed(dev, change->ifi_flags, 0);
4f925bd3 540 }
ac4d3bcb
EJ
541}
542
8b61709d 543static void
21d6e22e 544netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
67a4917b 545 void *aux OVS_UNUSED)
8b61709d 546{
b5d57fc8 547 struct netdev_linux *dev;
8b61709d 548 if (change) {
b5d57fc8
BP
549 struct netdev *base_dev = netdev_from_name(change->ifname);
550 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
551 netdev_linux_update(netdev_linux_cast(base_dev), change);
8b61709d
BP
552 }
553 } else {
46415c90 554 struct shash device_shash;
8b61709d 555 struct shash_node *node;
46415c90
JG
556
557 shash_init(&device_shash);
b5d57fc8 558 netdev_get_devices(&netdev_linux_class, &device_shash);
46415c90 559 SHASH_FOR_EACH (node, &device_shash) {
96172faa 560 struct netdev *netdev = node->data;
059e5f4f 561 unsigned int flags;
3a183124 562
96172faa 563 dev = netdev_linux_cast(netdev);
3a183124 564
180c6d0b 565 get_flags(&dev->up, &flags);
b5d57fc8 566 netdev_linux_changed(dev, flags, 0);
8b61709d 567 }
46415c90 568 shash_destroy(&device_shash);
8b61709d
BP
569 }
570}
571
572static int
1f6e0fbd 573cache_notifier_ref(void)
6c88d577 574{
46415c90 575 if (!cache_notifier_refcount) {
cb22974d 576 ovs_assert(!netdev_linux_cache_notifier);
2ee6545f
EJ
577
578 netdev_linux_cache_notifier =
579 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
580
581 if (!netdev_linux_cache_notifier) {
582 return EINVAL;
149f577a
JG
583 }
584 }
46415c90 585 cache_notifier_refcount++;
6c88d577 586
1f6e0fbd
BP
587 return 0;
588}
589
590static void
591cache_notifier_unref(void)
592{
cb22974d 593 ovs_assert(cache_notifier_refcount > 0);
1f6e0fbd 594 if (!--cache_notifier_refcount) {
cb22974d 595 ovs_assert(netdev_linux_cache_notifier);
1f6e0fbd
BP
596 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
597 netdev_linux_cache_notifier = NULL;
598 }
599}
600
601/* Creates system and internal devices. */
602static int
603netdev_linux_create(const struct netdev_class *class, const char *name,
b5d57fc8 604 struct netdev **netdevp)
1f6e0fbd 605{
b5d57fc8 606 struct netdev_linux *netdev;
1f6e0fbd
BP
607 int error;
608
609 error = cache_notifier_ref();
610 if (error) {
611 return error;
612 }
613
b5d57fc8
BP
614 netdev = xzalloc(sizeof *netdev);
615 netdev->change_seq = 1;
616 netdev_init(&netdev->up, name, class);
617 error = get_flags(&netdev->up, &netdev->ifi_flags);
618 if (error == ENODEV) {
619 if (class != &netdev_internal_class) {
620 /* The device does not exist, so don't allow it to be opened. */
621 netdev_uninit(&netdev->up, false);
622 cache_notifier_unref();
623 free(netdev);
624 return ENODEV;
625 } else {
626 /* "Internal" netdevs have to be created as netdev objects before
627 * they exist in the kernel, because creating them in the kernel
628 * happens by passing a netdev object to dpif_port_add().
629 * Therefore, ignore the error. */
630 }
631 }
46415c90 632
b5d57fc8 633 *netdevp = &netdev->up;
a740f0de
JG
634 return 0;
635}
636
5b7448ed
JG
637/* For most types of netdevs we open the device for each call of
638 * netdev_open(). However, this is not the case with tap devices,
639 * since it is only possible to open the device once. In this
640 * situation we share a single file descriptor, and consequently
641 * buffers, across all readers. Therefore once data is read it will
642 * be unavailable to other reads for tap devices. */
a740f0de 643static int
b8dcf5e9 644netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
b5d57fc8 645 const char *name, struct netdev **netdevp)
a740f0de 646{
b5d57fc8 647 struct netdev_linux *netdev;
a740f0de
JG
648 struct tap_state *state;
649 static const char tap_dev[] = "/dev/net/tun";
650 struct ifreq ifr;
651 int error;
652
b5d57fc8 653 netdev = xzalloc(sizeof *netdev);
2e5ae318 654 netdev->change_seq = 1;
b5d57fc8 655 state = &netdev->state.tap;
a740f0de 656
1f6e0fbd
BP
657 error = cache_notifier_ref();
658 if (error) {
659 goto error;
660 }
661
6c88d577 662 /* Open tap device. */
149f577a
JG
663 state->fd = open(tap_dev, O_RDWR);
664 if (state->fd < 0) {
6c88d577 665 error = errno;
10a89ef0 666 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
1f6e0fbd 667 goto error_unref_notifier;
6c88d577
JP
668 }
669
670 /* Create tap device. */
671 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 672 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
149f577a 673 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
6c88d577 674 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 675 ovs_strerror(errno));
6c88d577 676 error = errno;
f61d8d29 677 goto error_close;
6c88d577
JP
678 }
679
680 /* Make non-blocking. */
149f577a 681 error = set_nonblocking(state->fd);
a740f0de 682 if (error) {
f61d8d29 683 goto error_close;
a740f0de
JG
684 }
685
b5d57fc8
BP
686 netdev_init(&netdev->up, name, &netdev_tap_class);
687 *netdevp = &netdev->up;
a740f0de
JG
688 return 0;
689
f61d8d29
BP
690error_close:
691 close(state->fd);
1f6e0fbd
BP
692error_unref_notifier:
693 cache_notifier_unref();
a740f0de 694error:
b5d57fc8 695 free(netdev);
a740f0de
JG
696 return error;
697}
698
a740f0de 699static void
b5d57fc8 700destroy_tap(struct netdev_linux *netdev)
a740f0de 701{
b5d57fc8 702 struct tap_state *state = &netdev->state.tap;
149f577a
JG
703
704 if (state->fd >= 0) {
705 close(state->fd);
a740f0de
JG
706 }
707}
708
b5d57fc8 709/* Destroys the netdev device 'netdev_'. */
6c88d577 710static void
b5d57fc8 711netdev_linux_destroy(struct netdev *netdev_)
6c88d577 712{
b5d57fc8 713 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6c88d577 714
b5d57fc8
BP
715 if (netdev->tc && netdev->tc->ops->tc_destroy) {
716 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4
BP
717 }
718
b5d57fc8
BP
719 if (netdev_get_class(netdev_) == &netdev_tap_class) {
720 destroy_tap(netdev);
6c88d577 721 }
b5d57fc8 722 free(netdev);
1f6e0fbd
BP
723
724 cache_notifier_unref();
6c88d577
JP
725}
726
7b6b0ef4 727static int
796223f5 728netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
7b6b0ef4
BP
729{
730 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796223f5
BP
731 bool is_tap = is_tap_netdev(netdev_);
732 struct netdev_rx_linux *rx;
7b6b0ef4
BP
733 int error;
734 int fd;
735
796223f5 736 if (is_tap) {
b5d57fc8 737 fd = netdev->state.tap.fd;
796223f5
BP
738 } else {
739 struct sockaddr_ll sll;
740 int ifindex;
32383c3b
MM
741 /* Result of tcpdump -dd inbound */
742 static struct sock_filter filt[] = {
743 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
744 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
745 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
746 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
747 };
748 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
7b6b0ef4 749
796223f5
BP
750 /* Create file descriptor. */
751 fd = socket(PF_PACKET, SOCK_RAW, 0);
752 if (fd < 0) {
753 error = errno;
10a89ef0 754 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
755 goto error;
756 }
33d82a56 757
796223f5
BP
758 /* Set non-blocking mode. */
759 error = set_nonblocking(fd);
760 if (error) {
761 goto error;
762 }
7b6b0ef4 763
796223f5 764 /* Get ethernet device index. */
180c6d0b 765 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
766 if (error) {
767 goto error;
768 }
7b6b0ef4 769
796223f5
BP
770 /* Bind to specific ethernet device. */
771 memset(&sll, 0, sizeof sll);
772 sll.sll_family = AF_PACKET;
773 sll.sll_ifindex = ifindex;
774 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
775 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
776 error = errno;
777 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 778 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
779 goto error;
780 }
32383c3b
MM
781
782 /* Filter for only inbound packets. */
783 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
784 sizeof fprog);
785 if (error) {
786 error = errno;
787 VLOG_ERR("%s: failed attach filter (%s)",
10a89ef0 788 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
789 goto error;
790 }
7b6b0ef4
BP
791 }
792
796223f5 793 rx = xmalloc(sizeof *rx);
b5d57fc8 794 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
796223f5
BP
795 rx->is_tap = is_tap;
796 rx->fd = fd;
7b6b0ef4 797
796223f5 798 *rxp = &rx->up;
7b6b0ef4
BP
799 return 0;
800
801error:
802 if (fd >= 0) {
803 close(fd);
804 }
805 return error;
806}
807
796223f5
BP
808static void
809netdev_rx_linux_destroy(struct netdev_rx *rx_)
8b61709d 810{
796223f5 811 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
8b61709d 812
796223f5
BP
813 if (!rx->is_tap) {
814 close(rx->fd);
8b61709d 815 }
796223f5
BP
816 free(rx);
817}
8b61709d 818
796223f5
BP
819static int
820netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
821{
822 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
823 ssize_t retval;
8e8cddf7 824
796223f5
BP
825 do {
826 retval = (rx->is_tap
827 ? read(rx->fd, data, size)
828 : recv(rx->fd, data, size, MSG_TRUNC));
829 } while (retval < 0 && errno == EINTR);
830
bb5c1468
Z
831 if (retval >= 0) {
832 return retval > size ? -EMSGSIZE : retval;
796223f5
BP
833 } else {
834 if (errno != EAGAIN) {
835 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
10a89ef0 836 ovs_strerror(errno), netdev_rx_get_name(rx_));
8b61709d 837 }
796223f5 838 return -errno;
8b61709d
BP
839 }
840}
841
8b61709d 842static void
796223f5 843netdev_rx_linux_wait(struct netdev_rx *rx_)
8b61709d 844{
796223f5
BP
845 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
846 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
847}
848
8b61709d 849static int
796223f5 850netdev_rx_linux_drain(struct netdev_rx *rx_)
8b61709d 851{
796223f5
BP
852 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
853 if (rx->is_tap) {
8b61709d 854 struct ifreq ifr;
796223f5 855 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
8b61709d
BP
856 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
857 if (error) {
858 return error;
859 }
796223f5 860 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
861 return 0;
862 } else {
796223f5 863 return drain_rcvbuf(rx->fd);
8b61709d
BP
864 }
865}
866
867/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
868 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
869 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
870 * the packet is too big or too small to transmit on the device.
871 *
872 * The caller retains ownership of 'buffer' in all cases.
873 *
874 * The kernel maintains a packet transmission queue, so the caller is not
875 * expected to do additional queuing of packets. */
876static int
877netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
878{
f23347ea
BP
879 for (;;) {
880 ssize_t retval;
8b61709d 881
796223f5 882 if (!is_tap_netdev(netdev_)) {
f23347ea
BP
883 /* Use our AF_PACKET socket to send to this device. */
884 struct sockaddr_ll sll;
885 struct msghdr msg;
886 struct iovec iov;
887 int ifindex;
888 int error;
488d734d
BP
889 int sock;
890
891 sock = af_packet_sock();
892 if (sock < 0) {
c4c7a3d7 893 return -sock;
488d734d 894 }
f23347ea
BP
895
896 error = get_ifindex(netdev_, &ifindex);
897 if (error) {
898 return error;
899 }
8b61709d 900
f23347ea
BP
901 /* We don't bother setting most fields in sockaddr_ll because the
902 * kernel ignores them for SOCK_RAW. */
903 memset(&sll, 0, sizeof sll);
904 sll.sll_family = AF_PACKET;
905 sll.sll_ifindex = ifindex;
76c308b5 906
ebc56baa 907 iov.iov_base = CONST_CAST(void *, data);
f23347ea 908 iov.iov_len = size;
76c308b5 909
f23347ea
BP
910 msg.msg_name = &sll;
911 msg.msg_namelen = sizeof sll;
912 msg.msg_iov = &iov;
913 msg.msg_iovlen = 1;
914 msg.msg_control = NULL;
915 msg.msg_controllen = 0;
916 msg.msg_flags = 0;
917
488d734d 918 retval = sendmsg(sock, &msg, 0);
f23347ea 919 } else {
796223f5
BP
920 /* Use the tap fd to send to this device. This is essential for
921 * tap devices, because packets sent to a tap device with an
922 * AF_PACKET socket will loop back to be *received* again on the
32383c3b
MM
923 * tap device. This doesn't occur on other interface types
924 * because we attach a socket filter to the rx socket. */
b5d57fc8 925 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796223f5 926
b5d57fc8 927 retval = write(netdev->state.tap.fd, data, size);
f23347ea 928 }
76c308b5 929
8b61709d
BP
930 if (retval < 0) {
931 /* The Linux AF_PACKET implementation never blocks waiting for room
932 * for packets, instead returning ENOBUFS. Translate this into
933 * EAGAIN for the caller. */
934 if (errno == ENOBUFS) {
935 return EAGAIN;
936 } else if (errno == EINTR) {
937 continue;
938 } else if (errno != EAGAIN) {
939 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
10a89ef0 940 netdev_get_name(netdev_), ovs_strerror(errno));
8b61709d
BP
941 }
942 return errno;
943 } else if (retval != size) {
944 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
945 "%zu) on %s", retval, size, netdev_get_name(netdev_));
946 return EMSGSIZE;
947 } else {
948 return 0;
949 }
950 }
951}
952
/* Registers with the poll loop to wake up from the next call to poll_block()
 * when the packet transmission queue has sufficient room to transmit a packet
 * with netdev_send().
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  Thus, this function is
 * unlikely to ever be used.  It is included for completeness.
 *
 * Only tap devices cause an (immediate) wakeup here; for every other device
 * nothing is registered. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (!is_tap_netdev(netdev)) {
        return;
    }

    /* TAP device always accepts packets. */
    poll_immediate_wake();
}
968
969/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
970 * otherwise a positive errno value. */
971static int
972netdev_linux_set_etheraddr(struct netdev *netdev_,
973 const uint8_t mac[ETH_ADDR_LEN])
974{
b5d57fc8 975 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4b609110 976 struct netdev_saved_flags *sf = NULL;
eb395f2e
BP
977 int error;
978
b5d57fc8
BP
979 if (netdev->cache_valid & VALID_ETHERADDR) {
980 if (netdev->ether_addr_error) {
981 return netdev->ether_addr_error;
44445cac 982 }
b5d57fc8 983 if (eth_addr_equals(netdev->etheraddr, mac)) {
44445cac
PS
984 return 0;
985 }
b5d57fc8 986 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
987 }
988
7eb1bd81 989 /* Tap devices must be brought down before setting the address. */
796223f5 990 if (is_tap_netdev(netdev_)) {
bbd5b6f4 991 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
7eb1bd81 992 }
44445cac
PS
993 error = set_etheraddr(netdev_get_name(netdev_), mac);
994 if (!error || error == ENODEV) {
b5d57fc8
BP
995 netdev->ether_addr_error = error;
996 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 997 if (!error) {
b5d57fc8 998 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e 999 }
8b61709d 1000 }
44445cac 1001
4b609110 1002 netdev_restore_flags(sf);
7eb1bd81 1003
8b61709d
BP
1004 return error;
1005}
1006
44445cac 1007/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d
BP
1008static int
1009netdev_linux_get_etheraddr(const struct netdev *netdev_,
1010 uint8_t mac[ETH_ADDR_LEN])
1011{
b5d57fc8 1012 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
44445cac 1013
b5d57fc8 1014 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
8b61709d 1015 int error = get_etheraddr(netdev_get_name(netdev_),
b5d57fc8 1016 netdev->etheraddr);
44445cac 1017
b5d57fc8
BP
1018 netdev->ether_addr_error = error;
1019 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1020 }
44445cac 1021
b5d57fc8
BP
1022 if (!netdev->ether_addr_error) {
1023 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
44445cac
PS
1024 }
1025
b5d57fc8 1026 return netdev->ether_addr_error;
8b61709d
BP
1027}
1028
1029/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1030 * in bytes, not including the hardware header; thus, this is typically 1500
1031 * bytes for Ethernet devices. */
1032static int
1033netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1034{
b5d57fc8
BP
1035 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1036 if (!(netdev->cache_valid & VALID_MTU)) {
8b61709d
BP
1037 struct ifreq ifr;
1038 int error;
1039
149f577a
JG
1040 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1041 SIOCGIFMTU, "SIOCGIFMTU");
90a6637d 1042
b5d57fc8
BP
1043 netdev->netdev_mtu_error = error;
1044 netdev->mtu = ifr.ifr_mtu;
1045 netdev->cache_valid |= VALID_MTU;
8b61709d 1046 }
90a6637d 1047
b5d57fc8
BP
1048 if (!netdev->netdev_mtu_error) {
1049 *mtup = netdev->mtu;
90a6637d 1050 }
b5d57fc8 1051 return netdev->netdev_mtu_error;
8b61709d
BP
1052}
1053
9b020780
PS
1054/* Sets the maximum size of transmitted (MTU) for given device using linux
1055 * networking ioctl interface.
1056 */
1057static int
1058netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1059{
b5d57fc8 1060 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1061 struct ifreq ifr;
1062 int error;
1063
b5d57fc8
BP
1064 if (netdev->cache_valid & VALID_MTU) {
1065 if (netdev->netdev_mtu_error) {
1066 return netdev->netdev_mtu_error;
90a6637d 1067 }
b5d57fc8 1068 if (netdev->mtu == mtu) {
90a6637d
PS
1069 return 0;
1070 }
b5d57fc8 1071 netdev->cache_valid &= ~VALID_MTU;
153e5481 1072 }
9b020780
PS
1073 ifr.ifr_mtu = mtu;
1074 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1075 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1076 if (!error || error == ENODEV) {
b5d57fc8
BP
1077 netdev->netdev_mtu_error = error;
1078 netdev->mtu = ifr.ifr_mtu;
1079 netdev->cache_valid |= VALID_MTU;
9b020780 1080 }
90a6637d 1081 return error;
9b020780
PS
1082}
1083
/* Returns the ifindex of 'netdev', if successful, as a positive number.
 * On failure, returns a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1094
8b61709d
BP
1095static int
1096netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1097{
b5d57fc8 1098 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1099
b5d57fc8
BP
1100 if (netdev->miimon_interval > 0) {
1101 *carrier = netdev->miimon;
3a183124 1102 } else {
b5d57fc8 1103 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1104 }
8b61709d 1105
3a183124 1106 return 0;
8b61709d
BP
1107}
1108
65c3058c
EJ
1109static long long int
1110netdev_linux_get_carrier_resets(const struct netdev *netdev)
1111{
b5d57fc8 1112 return netdev_linux_cast(netdev)->carrier_resets;
65c3058c
EJ
1113}
1114
63331829 1115static int
1670c579
EJ
1116netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1117 struct mii_ioctl_data *data)
63331829 1118{
63331829 1119 struct ifreq ifr;
782e6111 1120 int error;
63331829 1121
63331829 1122 memset(&ifr, 0, sizeof ifr);
782e6111 1123 memcpy(&ifr.ifr_data, data, sizeof *data);
1670c579 1124 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1125 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1126
782e6111
EJ
1127 return error;
1128}
1129
1130static int
1670c579 1131netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1132{
782e6111
EJ
1133 struct mii_ioctl_data data;
1134 int error;
63331829 1135
782e6111
EJ
1136 *miimon = false;
1137
1138 memset(&data, 0, sizeof data);
1670c579 1139 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1140 if (!error) {
1141 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1142 data.reg_num = MII_BMSR;
1670c579 1143 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1144 &data);
63331829
EJ
1145
1146 if (!error) {
782e6111 1147 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1148 } else {
1149 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1150 }
1151 } else {
1152 struct ethtool_cmd ecmd;
63331829
EJ
1153
1154 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1155 name);
1156
ab985a77 1157 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1158 memset(&ecmd, 0, sizeof ecmd);
1159 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1160 "ETHTOOL_GLINK");
1161 if (!error) {
782e6111
EJ
1162 struct ethtool_value eval;
1163
1164 memcpy(&eval, &ecmd, sizeof eval);
1165 *miimon = !!eval.data;
63331829
EJ
1166 } else {
1167 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1168 }
1169 }
1170
1171 return error;
1172}
1173
1670c579
EJ
1174static int
1175netdev_linux_set_miimon_interval(struct netdev *netdev_,
1176 long long int interval)
1177{
b5d57fc8 1178 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579
EJ
1179
1180 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8
BP
1181 if (netdev->miimon_interval != interval) {
1182 netdev->miimon_interval = interval;
1183 timer_set_expired(&netdev->miimon_timer);
1670c579
EJ
1184 }
1185
1186 return 0;
1187}
1188
1189static void
1190netdev_linux_miimon_run(void)
1191{
1192 struct shash device_shash;
1193 struct shash_node *node;
1194
1195 shash_init(&device_shash);
b5d57fc8 1196 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1197 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1198 struct netdev *netdev = node->data;
1199 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
1200 bool miimon;
1201
1202 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1203 continue;
1204 }
1205
180c6d0b 1206 netdev_linux_get_miimon(dev->up.name, &miimon);
1670c579 1207 if (miimon != dev->miimon) {
1670c579 1208 dev->miimon = miimon;
b5d57fc8 1209 netdev_linux_changed(dev, dev->ifi_flags, 0);
1670c579
EJ
1210 }
1211
1212 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1213 }
1214
1215 shash_destroy(&device_shash);
1216}
1217
1218static void
1219netdev_linux_miimon_wait(void)
1220{
1221 struct shash device_shash;
1222 struct shash_node *node;
1223
1224 shash_init(&device_shash);
b5d57fc8 1225 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1226 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1227 struct netdev *netdev = node->data;
1228 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
1229
1230 if (dev->miimon_interval > 0) {
1231 timer_wait(&dev->miimon_timer);
1232 }
1233 }
1234 shash_destroy(&device_shash);
1235}
1236
8b61709d
BP
1237/* Check whether we can we use RTM_GETLINK to get network device statistics.
1238 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1239 * enabled. */
1240static bool
1241check_for_working_netlink_stats(void)
1242{
1243 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1244 * preferable, so if that works, we'll use it. */
1245 int ifindex = do_get_ifindex("lo");
1246 if (ifindex < 0) {
1247 VLOG_WARN("failed to get ifindex for lo, "
1248 "obtaining netdev stats from proc");
1249 return false;
1250 } else {
1251 struct netdev_stats stats;
1252 int error = get_stats_via_netlink(ifindex, &stats);
1253 if (!error) {
1254 VLOG_DBG("obtaining netdev stats via rtnetlink");
1255 return true;
1256 } else {
1257 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1258 "via proc (you are probably running a pre-2.6.19 "
10a89ef0 1259 "kernel)", ovs_strerror(error));
8b61709d
BP
1260 return false;
1261 }
1262 }
1263}
1264
/* Exchanges the 64-bit values stored at 'a' and 'b'.  Passing the same
 * pointer for both leaves the value unchanged. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;

    *a = *b;
    *b = tmp;
}
1272
c060c4cf
EJ
1273/* Copies 'src' into 'dst', performing format conversion in the process.
1274 *
1275 * 'src' is allowed to be misaligned. */
1276static void
1277netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1278 const struct ovs_vport_stats *src)
1279{
1280 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1281 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1282 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1283 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1284 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1285 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1286 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1287 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1288 dst->multicast = 0;
1289 dst->collisions = 0;
1290 dst->rx_length_errors = 0;
1291 dst->rx_over_errors = 0;
1292 dst->rx_crc_errors = 0;
1293 dst->rx_frame_errors = 0;
1294 dst->rx_fifo_errors = 0;
1295 dst->rx_missed_errors = 0;
1296 dst->tx_aborted_errors = 0;
1297 dst->tx_carrier_errors = 0;
1298 dst->tx_fifo_errors = 0;
1299 dst->tx_heartbeat_errors = 0;
1300 dst->tx_window_errors = 0;
1301}
1302
1303static int
1304get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1305{
1306 struct dpif_linux_vport reply;
1307 struct ofpbuf *buf;
1308 int error;
1309
1310 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1311 if (error) {
1312 return error;
1313 } else if (!reply.stats) {
1314 ofpbuf_delete(buf);
1315 return EOPNOTSUPP;
1316 }
1317
1318 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1319
1320 ofpbuf_delete(buf);
1321
1322 return 0;
1323}
1324
f613a0d7
PS
1325static void
1326get_stats_via_vport(const struct netdev *netdev_,
1327 struct netdev_stats *stats)
8b61709d 1328{
b5d57fc8 1329 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1330
b5d57fc8
BP
1331 if (!netdev->vport_stats_error ||
1332 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1333 int error;
7fbef77a 1334
c060c4cf 1335 error = get_stats_via_vport__(netdev_, stats);
bcb1f5a1 1336 if (error && error != ENOENT) {
a57a8488 1337 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
1338 "(%s)",
1339 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 1340 }
b5d57fc8
BP
1341 netdev->vport_stats_error = error;
1342 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1343 }
f613a0d7 1344}
8b61709d 1345
f613a0d7
PS
1346static int
1347netdev_linux_sys_get_stats(const struct netdev *netdev_,
1348 struct netdev_stats *stats)
1349{
23882115
BP
1350 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1351 static int use_netlink_stats;
f613a0d7
PS
1352 int error;
1353
23882115 1354 if (ovsthread_once_start(&once)) {
f613a0d7 1355 use_netlink_stats = check_for_working_netlink_stats();
23882115 1356 ovsthread_once_done(&once);
f613a0d7
PS
1357 }
1358
1359 if (use_netlink_stats) {
1360 int ifindex;
1361
1362 error = get_ifindex(netdev_, &ifindex);
1363 if (!error) {
1364 error = get_stats_via_netlink(ifindex, stats);
7fbef77a 1365 }
f613a0d7
PS
1366 } else {
1367 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1368 }
7fbef77a 1369
f613a0d7
PS
1370 if (error) {
1371 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1372 netdev_get_name(netdev_), error);
1373 }
1374 return error;
1375
1376}
1377
1378/* Retrieves current device stats for 'netdev-linux'. */
1379static int
1380netdev_linux_get_stats(const struct netdev *netdev_,
1381 struct netdev_stats *stats)
1382{
b5d57fc8 1383 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1384 struct netdev_stats dev_stats;
1385 int error;
1386
1387 get_stats_via_vport(netdev_, stats);
1388
1389 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1390
1391 if (error) {
b5d57fc8 1392 if (netdev->vport_stats_error) {
f613a0d7 1393 return error;
7fbef77a 1394 } else {
f613a0d7
PS
1395 return 0;
1396 }
1397 }
1398
b5d57fc8 1399 if (netdev->vport_stats_error) {
f613a0d7
PS
1400 /* stats not available from OVS then use ioctl stats. */
1401 *stats = dev_stats;
1402 } else {
1403 stats->rx_errors += dev_stats.rx_errors;
1404 stats->tx_errors += dev_stats.tx_errors;
1405 stats->rx_dropped += dev_stats.rx_dropped;
1406 stats->tx_dropped += dev_stats.tx_dropped;
1407 stats->multicast += dev_stats.multicast;
1408 stats->collisions += dev_stats.collisions;
1409 stats->rx_length_errors += dev_stats.rx_length_errors;
1410 stats->rx_over_errors += dev_stats.rx_over_errors;
1411 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1412 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1413 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1414 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1415 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1416 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1417 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1418 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1419 stats->tx_window_errors += dev_stats.tx_window_errors;
1420 }
1421 return 0;
1422}
1423
1424/* Retrieves current device stats for 'netdev-tap' netdev or
1425 * netdev-internal. */
1426static int
bba1e6f3 1427netdev_tap_get_stats(const struct netdev *netdev_,
f613a0d7
PS
1428 struct netdev_stats *stats)
1429{
b5d57fc8 1430 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1431 struct netdev_stats dev_stats;
1432 int error;
1433
1434 get_stats_via_vport(netdev_, stats);
1435
1436 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1437 if (error) {
b5d57fc8 1438 if (netdev->vport_stats_error) {
f613a0d7
PS
1439 return error;
1440 } else {
1441 return 0;
8b61709d 1442 }
8b61709d 1443 }
fe6b0e03
JG
1444
1445 /* If this port is an internal port then the transmit and receive stats
1446 * will appear to be swapped relative to the other ports since we are the
1447 * one sending the data, not a remote computer. For consistency, we swap
7fbef77a
JG
1448 * them back here. This does not apply if we are getting stats from the
1449 * vport layer because it always tracks stats from the perspective of the
1450 * switch. */
b5d57fc8 1451 if (netdev->vport_stats_error) {
f613a0d7 1452 *stats = dev_stats;
92df599c
JG
1453 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1454 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1455 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1456 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1457 stats->rx_length_errors = 0;
1458 stats->rx_over_errors = 0;
1459 stats->rx_crc_errors = 0;
1460 stats->rx_frame_errors = 0;
1461 stats->rx_fifo_errors = 0;
1462 stats->rx_missed_errors = 0;
1463 stats->tx_aborted_errors = 0;
1464 stats->tx_carrier_errors = 0;
1465 stats->tx_fifo_errors = 0;
1466 stats->tx_heartbeat_errors = 0;
1467 stats->tx_window_errors = 0;
f613a0d7
PS
1468 } else {
1469 stats->rx_dropped += dev_stats.tx_dropped;
1470 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1471
f613a0d7
PS
1472 stats->rx_errors += dev_stats.tx_errors;
1473 stats->tx_errors += dev_stats.rx_errors;
1474
1475 stats->multicast += dev_stats.multicast;
1476 stats->collisions += dev_stats.collisions;
1477 }
1478 return 0;
8b61709d
BP
1479}
1480
bba1e6f3
PS
1481static int
1482netdev_internal_get_stats(const struct netdev *netdev_,
1483 struct netdev_stats *stats)
1484{
b5d57fc8 1485 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
bba1e6f3
PS
1486
1487 get_stats_via_vport(netdev_, stats);
b5d57fc8 1488 return netdev->vport_stats_error;
bba1e6f3
PS
1489}
1490
2f31a822
EJ
1491static int
1492netdev_internal_set_stats(struct netdev *netdev,
1493 const struct netdev_stats *stats)
1494{
1495 struct ovs_vport_stats vport_stats;
1496 struct dpif_linux_vport vport;
1497 int err;
1498
1499 vport_stats.rx_packets = stats->rx_packets;
1500 vport_stats.tx_packets = stats->tx_packets;
1501 vport_stats.rx_bytes = stats->rx_bytes;
1502 vport_stats.tx_bytes = stats->tx_bytes;
1503 vport_stats.rx_errors = stats->rx_errors;
1504 vport_stats.tx_errors = stats->tx_errors;
1505 vport_stats.rx_dropped = stats->rx_dropped;
1506 vport_stats.tx_dropped = stats->tx_dropped;
1507
1508 dpif_linux_vport_init(&vport);
1509 vport.cmd = OVS_VPORT_CMD_SET;
1510 vport.name = netdev_get_name(netdev);
1511 vport.stats = &vport_stats;
1512
1513 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1514
1515 /* If the vport layer doesn't know about the device, that doesn't mean it
1516 * doesn't exist (after all were able to open it when netdev_open() was
1517 * called), it just means that it isn't attached and we'll be getting
1518 * stats a different way. */
1519 if (err == ENODEV) {
1520 err = EOPNOTSUPP;
1521 }
1522
1523 return err;
1524}
1525
51f87458 1526static void
b5d57fc8 1527netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
1528{
1529 struct ethtool_cmd ecmd;
6c038611 1530 uint32_t speed;
8b61709d
BP
1531 int error;
1532
b5d57fc8 1533 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
1534 return;
1535 }
1536
ab985a77 1537 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1538 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 1539 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
1540 ETHTOOL_GSET, "ETHTOOL_GSET");
1541 if (error) {
51f87458 1542 goto out;
8b61709d
BP
1543 }
1544
1545 /* Supported features. */
b5d57fc8 1546 netdev->supported = 0;
8b61709d 1547 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 1548 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
1549 }
1550 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 1551 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
1552 }
1553 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 1554 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
1555 }
1556 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 1557 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
1558 }
1559 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 1560 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d
BP
1561 }
1562 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
b5d57fc8 1563 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d
BP
1564 }
1565 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
b5d57fc8 1566 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d
BP
1567 }
1568 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 1569 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
1570 }
1571 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 1572 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
1573 }
1574 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 1575 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
1576 }
1577 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 1578 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
1579 }
1580 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 1581 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1582 }
1583
1584 /* Advertised features. */
b5d57fc8 1585 netdev->advertised = 0;
8b61709d 1586 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 1587 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
1588 }
1589 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 1590 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
1591 }
1592 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 1593 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
1594 }
1595 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 1596 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
1597 }
1598 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 1599 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d
BP
1600 }
1601 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
b5d57fc8 1602 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d
BP
1603 }
1604 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
b5d57fc8 1605 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d
BP
1606 }
1607 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 1608 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
1609 }
1610 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 1611 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
1612 }
1613 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 1614 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
1615 }
1616 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 1617 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
1618 }
1619 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 1620 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1621 }
1622
1623 /* Current settings. */
2a529ead 1624 speed = ecmd.speed;
6c038611 1625 if (speed == SPEED_10) {
b5d57fc8 1626 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 1627 } else if (speed == SPEED_100) {
b5d57fc8 1628 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 1629 } else if (speed == SPEED_1000) {
b5d57fc8 1630 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 1631 } else if (speed == SPEED_10000) {
b5d57fc8 1632 netdev->current = NETDEV_F_10GB_FD;
6c038611 1633 } else if (speed == 40000) {
b5d57fc8 1634 netdev->current = NETDEV_F_40GB_FD;
6c038611 1635 } else if (speed == 100000) {
b5d57fc8 1636 netdev->current = NETDEV_F_100GB_FD;
6c038611 1637 } else if (speed == 1000000) {
b5d57fc8 1638 netdev->current = NETDEV_F_1TB_FD;
8b61709d 1639 } else {
b5d57fc8 1640 netdev->current = 0;
8b61709d
BP
1641 }
1642
1643 if (ecmd.port == PORT_TP) {
b5d57fc8 1644 netdev->current |= NETDEV_F_COPPER;
8b61709d 1645 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 1646 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
1647 }
1648
1649 if (ecmd.autoneg) {
b5d57fc8 1650 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
1651 }
1652
1653 /* Peer advertisements. */
b5d57fc8 1654 netdev->peer = 0; /* XXX */
8b61709d 1655
51f87458 1656out:
b5d57fc8
BP
1657 netdev->cache_valid |= VALID_FEATURES;
1658 netdev->get_features_error = error;
51f87458
PS
1659}
1660
1661/* Stores the features supported by 'netdev' into each of '*current',
1662 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1663 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1664 * errno value. */
1665static int
1666netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
1667 enum netdev_features *current,
1668 enum netdev_features *advertised,
1669 enum netdev_features *supported,
1670 enum netdev_features *peer)
51f87458 1671{
b5d57fc8 1672 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
51f87458 1673
b5d57fc8 1674 netdev_linux_read_features(netdev);
51f87458 1675
b5d57fc8
BP
1676 if (!netdev->get_features_error) {
1677 *current = netdev->current;
1678 *advertised = netdev->advertised;
1679 *supported = netdev->supported;
1680 *peer = netdev->peer;
51f87458 1681 }
b5d57fc8 1682 return netdev->get_features_error;
8b61709d
BP
1683}
1684
1685/* Set the features advertised by 'netdev' to 'advertise'. */
1686static int
6c038611
BP
1687netdev_linux_set_advertisements(struct netdev *netdev,
1688 enum netdev_features advertise)
8b61709d
BP
1689{
1690 struct ethtool_cmd ecmd;
1691 int error;
1692
ab985a77 1693 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1694 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1695 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1696 ETHTOOL_GSET, "ETHTOOL_GSET");
1697 if (error) {
1698 return error;
1699 }
1700
1701 ecmd.advertising = 0;
6c038611 1702 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
1703 ecmd.advertising |= ADVERTISED_10baseT_Half;
1704 }
6c038611 1705 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
1706 ecmd.advertising |= ADVERTISED_10baseT_Full;
1707 }
6c038611 1708 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
1709 ecmd.advertising |= ADVERTISED_100baseT_Half;
1710 }
6c038611 1711 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
1712 ecmd.advertising |= ADVERTISED_100baseT_Full;
1713 }
6c038611 1714 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
1715 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1716 }
6c038611 1717 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
1718 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1719 }
6c038611 1720 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
1721 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1722 }
6c038611 1723 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
1724 ecmd.advertising |= ADVERTISED_TP;
1725 }
6c038611 1726 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
1727 ecmd.advertising |= ADVERTISED_FIBRE;
1728 }
6c038611 1729 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
1730 ecmd.advertising |= ADVERTISED_Autoneg;
1731 }
6c038611 1732 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
1733 ecmd.advertising |= ADVERTISED_Pause;
1734 }
6c038611 1735 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
1736 ecmd.advertising |= ADVERTISED_Asym_Pause;
1737 }
ab985a77 1738 COVERAGE_INC(netdev_set_ethtool);
0b0544d7 1739 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1740 ETHTOOL_SSET, "ETHTOOL_SSET");
1741}
1742
f8500004
JP
1743/* Attempts to set input rate limiting (policing) policy. Returns 0 if
1744 * successful, otherwise a positive errno value. */
8b61709d 1745static int
b5d57fc8 1746netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
1747 uint32_t kbits_rate, uint32_t kbits_burst)
1748{
b5d57fc8
BP
1749 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1750 const char *netdev_name = netdev_get_name(netdev_);
f8500004 1751 int error;
8b61709d 1752
8e460221 1753
80a86fbe
BP
1754 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1755 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1756 : kbits_burst); /* Stick with user-specified value. */
1757
b5d57fc8
BP
1758 if (netdev->cache_valid & VALID_POLICING) {
1759 if (netdev->netdev_policing_error) {
1760 return netdev->netdev_policing_error;
c9f71668
PS
1761 }
1762
b5d57fc8
BP
1763 if (netdev->kbits_rate == kbits_rate &&
1764 netdev->kbits_burst == kbits_burst) {
c9f71668
PS
1765 /* Assume that settings haven't changed since we last set them. */
1766 return 0;
1767 }
b5d57fc8 1768 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
1769 }
1770
ac8c3412 1771 COVERAGE_INC(netdev_set_policing);
f8500004 1772 /* Remove any existing ingress qdisc. */
b5d57fc8 1773 error = tc_add_del_ingress_qdisc(netdev_, false);
f8500004
JP
1774 if (error) {
1775 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 1776 netdev_name, ovs_strerror(error));
c9f71668 1777 goto out;
f8500004
JP
1778 }
1779
8b61709d 1780 if (kbits_rate) {
b5d57fc8 1781 error = tc_add_del_ingress_qdisc(netdev_, true);
f8500004
JP
1782 if (error) {
1783 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 1784 netdev_name, ovs_strerror(error));
c9f71668 1785 goto out;
8b61709d
BP
1786 }
1787
b5d57fc8 1788 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
1789 if (error){
1790 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 1791 netdev_name, ovs_strerror(error));
c9f71668 1792 goto out;
8b61709d 1793 }
8b61709d
BP
1794 }
1795
b5d57fc8
BP
1796 netdev->kbits_rate = kbits_rate;
1797 netdev->kbits_burst = kbits_burst;
f8500004 1798
c9f71668
PS
1799out:
1800 if (!error || error == ENODEV) {
b5d57fc8
BP
1801 netdev->netdev_policing_error = error;
1802 netdev->cache_valid |= VALID_POLICING;
c9f71668
PS
1803 }
1804 return error;
8b61709d
BP
1805}
1806
c1c9c9c4
BP
1807static int
1808netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 1809 struct sset *types)
c1c9c9c4 1810{
559eb230 1811 const struct tc_ops *const *opsp;
c1c9c9c4
BP
1812
1813 for (opsp = tcs; *opsp != NULL; opsp++) {
1814 const struct tc_ops *ops = *opsp;
1815 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 1816 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
1817 }
1818 }
1819 return 0;
1820}
1821
1822static const struct tc_ops *
1823tc_lookup_ovs_name(const char *name)
1824{
559eb230 1825 const struct tc_ops *const *opsp;
c1c9c9c4
BP
1826
1827 for (opsp = tcs; *opsp != NULL; opsp++) {
1828 const struct tc_ops *ops = *opsp;
1829 if (!strcmp(name, ops->ovs_name)) {
1830 return ops;
1831 }
1832 }
1833 return NULL;
1834}
1835
1836static const struct tc_ops *
1837tc_lookup_linux_name(const char *name)
1838{
559eb230 1839 const struct tc_ops *const *opsp;
c1c9c9c4
BP
1840
1841 for (opsp = tcs; *opsp != NULL; opsp++) {
1842 const struct tc_ops *ops = *opsp;
1843 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1844 return ops;
1845 }
1846 }
1847 return NULL;
1848}
1849
93b13be8 1850static struct tc_queue *
b5d57fc8 1851tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
1852 size_t hash)
1853{
b5d57fc8 1854 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
1855 struct tc_queue *queue;
1856
b5d57fc8 1857 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
1858 if (queue->queue_id == queue_id) {
1859 return queue;
1860 }
1861 }
1862 return NULL;
1863}
1864
/* Looks up queue 'queue_id' on 'netdev', hashing the id with hash_int() and
 * basis 0.  Returns the queue, or NULL if there is none. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1870
c1c9c9c4
BP
1871static int
1872netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1873 const char *type,
1874 struct netdev_qos_capabilities *caps)
1875{
1876 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1877 if (!ops) {
1878 return EOPNOTSUPP;
1879 }
1880 caps->n_queues = ops->n_queues;
1881 return 0;
1882}
1883
1884static int
b5d57fc8 1885netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 1886 const char **typep, struct smap *details)
c1c9c9c4 1887{
b5d57fc8 1888 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
1889 int error;
1890
b5d57fc8 1891 error = tc_query_qdisc(netdev_);
c1c9c9c4
BP
1892 if (error) {
1893 return error;
1894 }
1895
b5d57fc8
BP
1896 *typep = netdev->tc->ops->ovs_name;
1897 return (netdev->tc->ops->qdisc_get
1898 ? netdev->tc->ops->qdisc_get(netdev_, details)
c1c9c9c4
BP
1899 : 0);
1900}
1901
1902static int
b5d57fc8 1903netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 1904 const char *type, const struct smap *details)
c1c9c9c4 1905{
b5d57fc8 1906 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
1907 const struct tc_ops *new_ops;
1908 int error;
1909
1910 new_ops = tc_lookup_ovs_name(type);
1911 if (!new_ops || !new_ops->tc_install) {
1912 return EOPNOTSUPP;
1913 }
1914
b5d57fc8 1915 error = tc_query_qdisc(netdev_);
c1c9c9c4
BP
1916 if (error) {
1917 return error;
1918 }
1919
b5d57fc8
BP
1920 if (new_ops == netdev->tc->ops) {
1921 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
1922 } else {
1923 /* Delete existing qdisc. */
b5d57fc8 1924 error = tc_del_qdisc(netdev_);
c1c9c9c4
BP
1925 if (error) {
1926 return error;
1927 }
b5d57fc8 1928 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
1929
1930 /* Install new qdisc. */
b5d57fc8
BP
1931 error = new_ops->tc_install(netdev_, details);
1932 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
1933
1934 return error;
1935 }
1936}
1937
1938static int
b5d57fc8 1939netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 1940 unsigned int queue_id, struct smap *details)
c1c9c9c4 1941{
b5d57fc8 1942 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
1943 int error;
1944
b5d57fc8 1945 error = tc_query_qdisc(netdev_);
c1c9c9c4
BP
1946 if (error) {
1947 return error;
93b13be8 1948 } else {
b5d57fc8 1949 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
93b13be8 1950 return (queue
b5d57fc8 1951 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 1952 : ENOENT);
c1c9c9c4 1953 }
c1c9c9c4
BP
1954}
1955
1956static int
b5d57fc8 1957netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 1958 unsigned int queue_id, const struct smap *details)
c1c9c9c4 1959{
b5d57fc8 1960 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
1961 int error;
1962
b5d57fc8 1963 error = tc_query_qdisc(netdev_);
c1c9c9c4
BP
1964 if (error) {
1965 return error;
b5d57fc8
BP
1966 } else if (queue_id >= netdev->tc->ops->n_queues
1967 || !netdev->tc->ops->class_set) {
c1c9c9c4
BP
1968 return EINVAL;
1969 }
1970
b5d57fc8 1971 return netdev->tc->ops->class_set(netdev_, queue_id, details);
c1c9c9c4
BP
1972}
1973
1974static int
b5d57fc8 1975netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 1976{
b5d57fc8 1977 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
1978 int error;
1979
b5d57fc8 1980 error = tc_query_qdisc(netdev_);
c1c9c9c4
BP
1981 if (error) {
1982 return error;
b5d57fc8 1983 } else if (!netdev->tc->ops->class_delete) {
c1c9c9c4 1984 return EINVAL;
93b13be8 1985 } else {
b5d57fc8 1986 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
93b13be8 1987 return (queue
b5d57fc8 1988 ? netdev->tc->ops->class_delete(netdev_, queue)
93b13be8 1989 : ENOENT);
c1c9c9c4 1990 }
c1c9c9c4
BP
1991}
1992
1993static int
b5d57fc8 1994netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
1995 unsigned int queue_id,
1996 struct netdev_queue_stats *stats)
1997{
b5d57fc8 1998 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
1999 int error;
2000
b5d57fc8 2001 error = tc_query_qdisc(netdev_);
c1c9c9c4
BP
2002 if (error) {
2003 return error;
b5d57fc8 2004 } else if (!netdev->tc->ops->class_get_stats) {
c1c9c9c4 2005 return EOPNOTSUPP;
93b13be8 2006 } else {
b5d57fc8 2007 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
6dc34a0d
BP
2008 if (!queue) {
2009 return ENOENT;
2010 }
2011 stats->created = queue->created;
2012 return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
c1c9c9c4 2013 }
c1c9c9c4
BP
2014}
2015
23a98ffe 2016static bool
c1c9c9c4
BP
2017start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2018{
2019 struct ofpbuf request;
2020 struct tcmsg *tcmsg;
2021
2022 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2023 if (!tcmsg) {
2024 return false;
2025 }
3c4de644 2026 tcmsg->tcm_parent = 0;
a88b4e04 2027 nl_dump_start(dump, NETLINK_ROUTE, &request);
c1c9c9c4 2028 ofpbuf_uninit(&request);
23a98ffe 2029 return true;
c1c9c9c4
BP
2030}
2031
2032static int
b5d57fc8 2033netdev_linux_dump_queues(const struct netdev *netdev_,
c1c9c9c4
BP
2034 netdev_dump_queues_cb *cb, void *aux)
2035{
b5d57fc8 2036 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f486e840 2037 struct tc_queue *queue, *next_queue;
79f1cbe9 2038 struct smap details;
c1c9c9c4 2039 int last_error;
c1c9c9c4
BP
2040 int error;
2041
b5d57fc8 2042 error = tc_query_qdisc(netdev_);
c1c9c9c4
BP
2043 if (error) {
2044 return error;
b5d57fc8 2045 } else if (!netdev->tc->ops->class_get) {
c1c9c9c4
BP
2046 return EOPNOTSUPP;
2047 }
2048
2049 last_error = 0;
79f1cbe9 2050 smap_init(&details);
f486e840 2051 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
b5d57fc8 2052 &netdev->tc->queues) {
79f1cbe9 2053 smap_clear(&details);
c1c9c9c4 2054
b5d57fc8 2055 error = netdev->tc->ops->class_get(netdev_, queue, &details);
c1c9c9c4 2056 if (!error) {
93b13be8 2057 (*cb)(queue->queue_id, &details, aux);
c1c9c9c4
BP
2058 } else {
2059 last_error = error;
2060 }
2061 }
79f1cbe9 2062 smap_destroy(&details);
c1c9c9c4
BP
2063
2064 return last_error;
2065}
2066
2067static int
b5d57fc8 2068netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2069 netdev_dump_queue_stats_cb *cb, void *aux)
2070{
b5d57fc8 2071 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2072 struct nl_dump dump;
2073 struct ofpbuf msg;
2074 int last_error;
2075 int error;
2076
b5d57fc8 2077 error = tc_query_qdisc(netdev_);
c1c9c9c4
BP
2078 if (error) {
2079 return error;
b5d57fc8 2080 } else if (!netdev->tc->ops->class_dump_stats) {
c1c9c9c4
BP
2081 return EOPNOTSUPP;
2082 }
2083
2084 last_error = 0;
b5d57fc8 2085 if (!start_queue_dump(netdev_, &dump)) {
23a98ffe
BP
2086 return ENODEV;
2087 }
c1c9c9c4 2088 while (nl_dump_next(&dump, &msg)) {
b5d57fc8 2089 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
c1c9c9c4
BP
2090 if (error) {
2091 last_error = error;
2092 }
2093 }
2094
2095 error = nl_dump_done(&dump);
2096 return error ? error : last_error;
2097}
2098
8b61709d 2099static int
f1acd62b
BP
2100netdev_linux_get_in4(const struct netdev *netdev_,
2101 struct in_addr *address, struct in_addr *netmask)
8b61709d 2102{
b5d57fc8 2103 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
149f577a 2104
b5d57fc8 2105 if (!(netdev->cache_valid & VALID_IN4)) {
8b61709d
BP
2106 int error;
2107
b5d57fc8 2108 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
8b61709d
BP
2109 SIOCGIFADDR, "SIOCGIFADDR");
2110 if (error) {
2111 return error;
2112 }
2113
b5d57fc8 2114 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
f1acd62b
BP
2115 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2116 if (error) {
2117 return error;
2118 }
2119
b5d57fc8 2120 netdev->cache_valid |= VALID_IN4;
8b61709d 2121 }
b5d57fc8
BP
2122 *address = netdev->address;
2123 *netmask = netdev->netmask;
f1acd62b 2124 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
8b61709d
BP
2125}
2126
8b61709d 2127static int
f1acd62b
BP
2128netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2129 struct in_addr netmask)
8b61709d 2130{
b5d57fc8 2131 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2132 int error;
2133
f1acd62b 2134 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2135 if (!error) {
b5d57fc8
BP
2136 netdev->cache_valid |= VALID_IN4;
2137 netdev->address = address;
2138 netdev->netmask = netmask;
f1acd62b 2139 if (address.s_addr != INADDR_ANY) {
8b61709d 2140 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2141 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2142 }
2143 }
2144 return error;
2145}
2146
/* Parses 'line', formatted like a line of /proc/net/if_inet6: 16 two-digit
 * hex bytes of address, four hex fields that are skipped, and an interface
 * name of at most 16 characters.  Stores the address bytes in '*in6' and the
 * name in 'ifname'.  Returns true only if all 17 stored fields were
 * converted. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *octet = in6->s6_addr;
    int n_converted;

#define X8 "%2"SCNx8
    n_converted = sscanf(line,
                         " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         &octet[0], &octet[1], &octet[2], &octet[3],
                         &octet[4], &octet[5], &octet[6], &octet[7],
                         &octet[8], &octet[9], &octet[10], &octet[11],
                         &octet[12], &octet[13], &octet[14], &octet[15],
                         ifname);
    return n_converted == 17;
}
2162
2163/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2164 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2165static int
2166netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2167{
b5d57fc8
BP
2168 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2169 if (!(netdev->cache_valid & VALID_IN6)) {
8b61709d
BP
2170 FILE *file;
2171 char line[128];
2172
b5d57fc8 2173 netdev->in6 = in6addr_any;
8b61709d
BP
2174
2175 file = fopen("/proc/net/if_inet6", "r");
2176 if (file != NULL) {
2177 const char *name = netdev_get_name(netdev_);
2178 while (fgets(line, sizeof line, file)) {
2a022368 2179 struct in6_addr in6_tmp;
8b61709d 2180 char ifname[16 + 1];
2a022368 2181 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
2182 && !strcmp(name, ifname))
2183 {
b5d57fc8 2184 netdev->in6 = in6_tmp;
8b61709d
BP
2185 break;
2186 }
2187 }
2188 fclose(file);
2189 }
b5d57fc8 2190 netdev->cache_valid |= VALID_IN6;
8b61709d 2191 }
b5d57fc8 2192 *in6 = netdev->in6;
8b61709d
BP
2193 return 0;
2194}
2195
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  All
 * other bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2208
2209static int
2210do_set_addr(struct netdev *netdev,
2211 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2212{
2213 struct ifreq ifr;
71d7c22f 2214 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
8b61709d 2215 make_in4_sockaddr(&ifr.ifr_addr, addr);
149f577a
JG
2216
2217 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2218 ioctl_name);
8b61709d
BP
2219}
2220
2221/* Adds 'router' as a default IP gateway. */
2222static int
67a4917b 2223netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2224{
2225 struct in_addr any = { INADDR_ANY };
2226 struct rtentry rt;
2227 int error;
2228
2229 memset(&rt, 0, sizeof rt);
2230 make_in4_sockaddr(&rt.rt_dst, any);
2231 make_in4_sockaddr(&rt.rt_gateway, router);
2232 make_in4_sockaddr(&rt.rt_genmask, any);
2233 rt.rt_flags = RTF_UP | RTF_GATEWAY;
8b61709d
BP
2234 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2235 if (error) {
10a89ef0 2236 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
2237 }
2238 return error;
2239}
2240
f1acd62b
BP
2241static int
2242netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2243 char **netdev_name)
2244{
2245 static const char fn[] = "/proc/net/route";
2246 FILE *stream;
2247 char line[256];
2248 int ln;
2249
2250 *netdev_name = NULL;
2251 stream = fopen(fn, "r");
2252 if (stream == NULL) {
10a89ef0 2253 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
2254 return errno;
2255 }
2256
2257 ln = 0;
2258 while (fgets(line, sizeof line, stream)) {
2259 if (++ln >= 2) {
2260 char iface[17];
dbba996b 2261 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2262 int refcnt, metric, mtu;
2263 unsigned int flags, use, window, irtt;
2264
2265 if (sscanf(line,
2266 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2267 " %d %u %u\n",
2268 iface, &dest, &gateway, &flags, &refcnt,
2269 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2270
d295e8e9 2271 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2272 fn, ln, line);
2273 continue;
2274 }
2275 if (!(flags & RTF_UP)) {
2276 /* Skip routes that aren't up. */
2277 continue;
2278 }
2279
2280 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2281 * network byte order, so we don't need need any endian
f1acd62b
BP
2282 * conversions here. */
2283 if ((dest & mask) == (host->s_addr & mask)) {
2284 if (!gateway) {
2285 /* The host is directly reachable. */
2286 next_hop->s_addr = 0;
2287 } else {
2288 /* To reach the host, we must go through a gateway. */
2289 next_hop->s_addr = gateway;
2290 }
2291 *netdev_name = xstrdup(iface);
2292 fclose(stream);
2293 return 0;
2294 }
2295 }
2296 }
2297
2298 fclose(stream);
2299 return ENXIO;
2300}
2301
e210037e 2302static int
b5d57fc8 2303netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 2304{
b5d57fc8 2305 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
2306 int error = 0;
2307
b5d57fc8
BP
2308 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2309 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
2310
2311 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
2312 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2313 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
2314 cmd,
2315 ETHTOOL_GDRVINFO,
2316 "ETHTOOL_GDRVINFO");
2317 if (!error) {
b5d57fc8 2318 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
2319 }
2320 }
e210037e 2321
e210037e 2322 if (!error) {
b5d57fc8
BP
2323 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2324 smap_add(smap, "driver_version", netdev->drvinfo.version);
2325 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 2326 }
e210037e
AE
2327 return error;
2328}
2329
4f925bd3 2330static int
275707c3
EJ
2331netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2332 struct smap *smap)
4f925bd3 2333{
79f1cbe9 2334 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2335 return 0;
2336}
2337
8b61709d
BP
2338/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2339 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2340 * returns 0. Otherwise, it returns a positive errno value; in particular,
2341 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2342static int
2343netdev_linux_arp_lookup(const struct netdev *netdev,
dbba996b 2344 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
8b61709d
BP
2345{
2346 struct arpreq r;
c100e025 2347 struct sockaddr_in sin;
8b61709d
BP
2348 int retval;
2349
2350 memset(&r, 0, sizeof r);
f2cc621b 2351 memset(&sin, 0, sizeof sin);
c100e025
BP
2352 sin.sin_family = AF_INET;
2353 sin.sin_addr.s_addr = ip;
2354 sin.sin_port = 0;
2355 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2356 r.arp_ha.sa_family = ARPHRD_ETHER;
2357 r.arp_flags = 0;
71d7c22f 2358 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d
BP
2359 COVERAGE_INC(netdev_arp_lookup);
2360 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2361 if (!retval) {
2362 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2363 } else if (retval != ENXIO) {
2364 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
2365 netdev_get_name(netdev), IP_ARGS(ip),
2366 ovs_strerror(retval));
8b61709d
BP
2367 }
2368 return retval;
2369}
2370
2371static int
2372nd_to_iff_flags(enum netdev_flags nd)
2373{
2374 int iff = 0;
2375 if (nd & NETDEV_UP) {
2376 iff |= IFF_UP;
2377 }
2378 if (nd & NETDEV_PROMISC) {
2379 iff |= IFF_PROMISC;
2380 }
2381 return iff;
2382}
2383
2384static int
2385iff_to_nd_flags(int iff)
2386{
2387 enum netdev_flags nd = 0;
2388 if (iff & IFF_UP) {
2389 nd |= NETDEV_UP;
2390 }
2391 if (iff & IFF_PROMISC) {
2392 nd |= NETDEV_PROMISC;
2393 }
2394 return nd;
2395}
2396
2397static int
b5d57fc8 2398netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
8b61709d
BP
2399 enum netdev_flags on, enum netdev_flags *old_flagsp)
2400{
b5d57fc8 2401 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 2402 int old_flags, new_flags;
c37d4da4
EJ
2403 int error = 0;
2404
b5d57fc8 2405 old_flags = netdev->ifi_flags;
c37d4da4
EJ
2406 *old_flagsp = iff_to_nd_flags(old_flags);
2407 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2408 if (new_flags != old_flags) {
b5d57fc8
BP
2409 error = set_flags(netdev_get_name(netdev_), new_flags);
2410 get_flags(netdev_, &netdev->ifi_flags);
8b61709d
BP
2411 }
2412 return error;
2413}
2414
ac4d3bcb
EJ
2415static unsigned int
2416netdev_linux_change_seq(const struct netdev *netdev)
2417{
b5d57fc8 2418 return netdev_linux_cast(netdev)->change_seq;
ac4d3bcb
EJ
2419}
2420
/* Expands to a positional initializer for a 'struct netdev_class'.
 *
 * NAME, CREATE, GET_STATS, SET_STATS, GET_FEATURES and GET_STATUS are the
 * only slots that differ between the classes defined with this macro; every
 * other slot is filled with the shared netdev_linux_* implementation or
 * NULL.  The entries are positional, so their order must not be changed. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS,  \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
                                                                \
    netdev_linux_rx_open,                                       \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    SET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_change_seq                                     \
}
2479
2480const struct netdev_class netdev_linux_class =
2481 NETDEV_LINUX_CLASS(
2482 "system",
2483 netdev_linux_create,
f613a0d7 2484 netdev_linux_get_stats,
4f925bd3 2485 NULL, /* set_stats */
51f87458 2486 netdev_linux_get_features,
275707c3 2487 netdev_linux_get_status);
c3827f61
BP
2488
2489const struct netdev_class netdev_tap_class =
2490 NETDEV_LINUX_CLASS(
2491 "tap",
2492 netdev_linux_create_tap,
bba1e6f3 2493 netdev_tap_get_stats,
4f925bd3 2494 NULL, /* set_stats */
51f87458 2495 netdev_linux_get_features,
275707c3 2496 netdev_linux_get_status);
c3827f61
BP
2497
2498const struct netdev_class netdev_internal_class =
2499 NETDEV_LINUX_CLASS(
2500 "internal",
2501 netdev_linux_create,
bba1e6f3 2502 netdev_internal_get_stats,
2f31a822 2503 netdev_internal_set_stats,
51f87458 2504 NULL, /* get_features */
275707c3 2505 netdev_internal_get_status);
796223f5
BP
2506
2507static const struct netdev_rx_class netdev_rx_linux_class = {
2508 netdev_rx_linux_destroy,
2509 netdev_rx_linux_recv,
2510 netdev_rx_linux_wait,
2511 netdev_rx_linux_drain,
2512};
8b61709d 2513\f
/* HTB traffic control class. */

/* Number of HTB queue ids.  htb_parse_tcmsg__() accepts class minor numbers
 * 1...HTB_N_QUEUES and maps them to queue ids 0...HTB_N_QUEUES - 1. */
#define HTB_N_QUEUES 0xf000
8b61709d 2517
c1c9c9c4
BP
2518struct htb {
2519 struct tc tc;
2520 unsigned int max_rate; /* In bytes/s. */
2521};
8b61709d 2522
c1c9c9c4 2523struct htb_class {
93b13be8 2524 struct tc_queue tc_queue;
c1c9c9c4
BP
2525 unsigned int min_rate; /* In bytes/s. */
2526 unsigned int max_rate; /* In bytes/s. */
2527 unsigned int burst; /* In bytes. */
2528 unsigned int priority; /* Lower values are higher priorities. */
2529};
8b61709d 2530
c1c9c9c4 2531static struct htb *
b5d57fc8 2532htb_get__(const struct netdev *netdev_)
c1c9c9c4 2533{
b5d57fc8
BP
2534 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2535 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
2536}
2537
24045e35 2538static void
b5d57fc8 2539htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 2540{
b5d57fc8 2541 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2542 struct htb *htb;
2543
2544 htb = xmalloc(sizeof *htb);
2545 tc_init(&htb->tc, &tc_ops_htb);
2546 htb->max_rate = max_rate;
2547
b5d57fc8 2548 netdev->tc = &htb->tc;
c1c9c9c4
BP
2549}
2550
2551/* Create an HTB qdisc.
2552 *
a339aa81 2553 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
2554static int
2555htb_setup_qdisc__(struct netdev *netdev)
2556{
2557 size_t opt_offset;
2558 struct tc_htb_glob opt;
2559 struct ofpbuf request;
2560 struct tcmsg *tcmsg;
2561
2562 tc_del_qdisc(netdev);
2563
2564 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2565 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
2566 if (!tcmsg) {
2567 return ENODEV;
2568 }
c1c9c9c4
BP
2569 tcmsg->tcm_handle = tc_make_handle(1, 0);
2570 tcmsg->tcm_parent = TC_H_ROOT;
2571
2572 nl_msg_put_string(&request, TCA_KIND, "htb");
2573
2574 memset(&opt, 0, sizeof opt);
2575 opt.rate2quantum = 10;
2576 opt.version = 3;
4ecf12d5 2577 opt.defcls = 1;
c1c9c9c4
BP
2578
2579 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2580 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2581 nl_msg_end_nested(&request, opt_offset);
2582
2583 return tc_transact(&request, NULL);
2584}
2585
2586/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2587 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2588static int
2589htb_setup_class__(struct netdev *netdev, unsigned int handle,
2590 unsigned int parent, struct htb_class *class)
2591{
2592 size_t opt_offset;
2593 struct tc_htb_opt opt;
2594 struct ofpbuf request;
2595 struct tcmsg *tcmsg;
2596 int error;
2597 int mtu;
2598
9b020780
PS
2599 error = netdev_get_mtu(netdev, &mtu);
2600 if (error) {
f915f1a8
BP
2601 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2602 netdev_get_name(netdev));
9b020780 2603 return error;
f915f1a8 2604 }
c1c9c9c4
BP
2605
2606 memset(&opt, 0, sizeof opt);
2607 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2608 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2609 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2610 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2611 opt.prio = class->priority;
2612
2613 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
2614 if (!tcmsg) {
2615 return ENODEV;
2616 }
c1c9c9c4
BP
2617 tcmsg->tcm_handle = handle;
2618 tcmsg->tcm_parent = parent;
2619
2620 nl_msg_put_string(&request, TCA_KIND, "htb");
2621 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2622 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2623 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2624 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2625 nl_msg_end_nested(&request, opt_offset);
2626
2627 error = tc_transact(&request, NULL);
2628 if (error) {
2629 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2630 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2631 netdev_get_name(netdev),
2632 tc_get_major(handle), tc_get_minor(handle),
2633 tc_get_major(parent), tc_get_minor(parent),
2634 class->min_rate, class->max_rate,
10a89ef0 2635 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
2636 }
2637 return error;
2638}
2639
2640/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2641 * description of them into 'details'. The description complies with the
2642 * specification given in the vswitch database documentation for linux-htb
2643 * queue details. */
2644static int
2645htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2646{
2647 static const struct nl_policy tca_htb_policy[] = {
2648 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2649 .min_len = sizeof(struct tc_htb_opt) },
2650 };
2651
2652 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2653 const struct tc_htb_opt *htb;
2654
2655 if (!nl_parse_nested(nl_options, tca_htb_policy,
2656 attrs, ARRAY_SIZE(tca_htb_policy))) {
2657 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2658 return EPROTO;
2659 }
2660
2661 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2662 class->min_rate = htb->rate.rate;
2663 class->max_rate = htb->ceil.rate;
2664 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2665 class->priority = htb->prio;
2666 return 0;
2667}
2668
2669static int
2670htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2671 struct htb_class *options,
2672 struct netdev_queue_stats *stats)
2673{
2674 struct nlattr *nl_options;
2675 unsigned int handle;
2676 int error;
2677
2678 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2679 if (!error && queue_id) {
17ee3c1f
BP
2680 unsigned int major = tc_get_major(handle);
2681 unsigned int minor = tc_get_minor(handle);
2682 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2683 *queue_id = minor - 1;
c1c9c9c4
BP
2684 } else {
2685 error = EPROTO;
2686 }
2687 }
2688 if (!error && options) {
2689 error = htb_parse_tca_options__(nl_options, options);
2690 }
2691 return error;
2692}
2693
2694static void
2695htb_parse_qdisc_details__(struct netdev *netdev,
79f1cbe9 2696 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
2697{
2698 const char *max_rate_s;
2699
79f1cbe9 2700 max_rate_s = smap_get(details, "max-rate");
c1c9c9c4
BP
2701 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2702 if (!hc->max_rate) {
a00ca915 2703 enum netdev_features current;
c1c9c9c4
BP
2704
2705 netdev_get_features(netdev, &current, NULL, NULL, NULL);
d02a5f8e 2706 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
2707 }
2708 hc->min_rate = hc->max_rate;
2709 hc->burst = 0;
2710 hc->priority = 0;
2711}
2712
2713static int
2714htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 2715 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
2716{
2717 const struct htb *htb = htb_get__(netdev);
79f1cbe9
EJ
2718 const char *min_rate_s = smap_get(details, "min-rate");
2719 const char *max_rate_s = smap_get(details, "max-rate");
2720 const char *burst_s = smap_get(details, "burst");
2721 const char *priority_s = smap_get(details, "priority");
9b020780 2722 int mtu, error;
c1c9c9c4 2723
9b020780
PS
2724 error = netdev_get_mtu(netdev, &mtu);
2725 if (error) {
f915f1a8
BP
2726 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2727 netdev_get_name(netdev));
9b020780 2728 return error;
f915f1a8
BP
2729 }
2730
4f104611
EJ
2731 /* HTB requires at least an mtu sized min-rate to send any traffic even
2732 * on uncongested links. */
c45ab5e9 2733 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4f104611 2734 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
2735 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2736
2737 /* max-rate */
2738 hc->max_rate = (max_rate_s
2739 ? strtoull(max_rate_s, NULL, 10) / 8
2740 : htb->max_rate);
2741 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2742 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2743
2744 /* burst
2745 *
2746 * According to hints in the documentation that I've read, it is important
2747 * that 'burst' be at least as big as the largest frame that might be
2748 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2749 * but having it a bit too small is a problem. Since netdev_get_mtu()
2750 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2751 * the MTU. We actually add 64, instead of 14, as a guard against
2752 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
2753 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2754 hc->burst = MAX(hc->burst, mtu + 64);
2755
2756 /* priority */
2757 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2758
2759 return 0;
2760}
2761
/* Queries the kernel for the class identified by 'handle' and 'parent' on
 * 'netdev' and parses the reply into 'options' and 'stats' (either may be
 * null, as accepted by htb_parse_tcmsg__()).  Returns 0 on success,
 * otherwise the error from the query or from parsing. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }
    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2777
2778static int
79f1cbe9 2779htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
2780{
2781 int error;
2782
2783 error = htb_setup_qdisc__(netdev);
2784 if (!error) {
2785 struct htb_class hc;
2786
2787 htb_parse_qdisc_details__(netdev, details, &hc);
2788 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2789 tc_make_handle(1, 0), &hc);
2790 if (!error) {
2791 htb_install__(netdev, hc.max_rate);
2792 }
2793 }
2794 return error;
2795}
2796
93b13be8
BP
2797static struct htb_class *
2798htb_class_cast__(const struct tc_queue *queue)
2799{
2800 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2801}
2802
c1c9c9c4
BP
2803static void
2804htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2805 const struct htb_class *hc)
2806{
2807 struct htb *htb = htb_get__(netdev);
93b13be8
BP
2808 size_t hash = hash_int(queue_id, 0);
2809 struct tc_queue *queue;
c1c9c9c4
BP
2810 struct htb_class *hcp;
2811
93b13be8
BP
2812 queue = tc_find_queue__(netdev, queue_id, hash);
2813 if (queue) {
2814 hcp = htb_class_cast__(queue);
2815 } else {
c1c9c9c4 2816 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
2817 queue = &hcp->tc_queue;
2818 queue->queue_id = queue_id;
6dc34a0d 2819 queue->created = time_msec();
93b13be8 2820 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 2821 }
93b13be8
BP
2822
2823 hcp->min_rate = hc->min_rate;
2824 hcp->max_rate = hc->max_rate;
2825 hcp->burst = hc->burst;
2826 hcp->priority = hc->priority;
c1c9c9c4
BP
2827}
2828
2829static int
2830htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2831{
c1c9c9c4
BP
2832 struct ofpbuf msg;
2833 struct nl_dump dump;
2834 struct htb_class hc;
c1c9c9c4
BP
2835
2836 /* Get qdisc options. */
2837 hc.max_rate = 0;
2838 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 2839 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
2840
2841 /* Get queues. */
23a98ffe
BP
2842 if (!start_queue_dump(netdev, &dump)) {
2843 return ENODEV;
2844 }
c1c9c9c4
BP
2845 while (nl_dump_next(&dump, &msg)) {
2846 unsigned int queue_id;
2847
2848 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2849 htb_update_queue__(netdev, queue_id, &hc);
2850 }
2851 }
2852 nl_dump_done(&dump);
2853
2854 return 0;
2855}
2856
2857static void
2858htb_tc_destroy(struct tc *tc)
2859{
2860 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 2861 struct htb_class *hc, *next;
c1c9c9c4 2862
4e8e4213 2863 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 2864 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
2865 free(hc);
2866 }
2867 tc_destroy(tc);
2868 free(htb);
2869}
2870
2871static int
79f1cbe9 2872htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
2873{
2874 const struct htb *htb = htb_get__(netdev);
79f1cbe9 2875 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
2876 return 0;
2877}
2878
2879static int
79f1cbe9 2880htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
2881{
2882 struct htb_class hc;
2883 int error;
2884
2885 htb_parse_qdisc_details__(netdev, details, &hc);
2886 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2887 tc_make_handle(1, 0), &hc);
2888 if (!error) {
2889 htb_get__(netdev)->max_rate = hc.max_rate;
2890 }
2891 return error;
2892}
2893
2894static int
93b13be8 2895htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 2896 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 2897{
93b13be8 2898 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2899
79f1cbe9 2900 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 2901 if (hc->min_rate != hc->max_rate) {
79f1cbe9 2902 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 2903 }
79f1cbe9 2904 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 2905 if (hc->priority) {
79f1cbe9 2906 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
2907 }
2908 return 0;
2909}
2910
2911static int
2912htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 2913 const struct smap *details)
c1c9c9c4
BP
2914{
2915 struct htb_class hc;
2916 int error;
2917
2918 error = htb_parse_class_details__(netdev, details, &hc);
2919 if (error) {
2920 return error;
2921 }
2922
17ee3c1f 2923 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
2924 tc_make_handle(1, 0xfffe), &hc);
2925 if (error) {
2926 return error;
2927 }
2928
2929 htb_update_queue__(netdev, queue_id, &hc);
2930 return 0;
2931}
2932
2933static int
93b13be8 2934htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 2935{
93b13be8 2936 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2937 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
2938 int error;
2939
93b13be8 2940 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 2941 if (!error) {
93b13be8 2942 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 2943 free(hc);
c1c9c9c4
BP
2944 }
2945 return error;
2946}
2947
2948static int
93b13be8 2949htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
2950 struct netdev_queue_stats *stats)
2951{
93b13be8 2952 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
2953 tc_make_handle(1, 0xfffe), NULL, stats);
2954}
2955
2956static int
2957htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2958 const struct ofpbuf *nlmsg,
2959 netdev_dump_queue_stats_cb *cb, void *aux)
2960{
2961 struct netdev_queue_stats stats;
17ee3c1f 2962 unsigned int handle, major, minor;
c1c9c9c4
BP
2963 int error;
2964
2965 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2966 if (error) {
2967 return error;
2968 }
2969
17ee3c1f
BP
2970 major = tc_get_major(handle);
2971 minor = tc_get_minor(handle);
2972 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 2973 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
2974 }
2975 return 0;
2976}
2977
2978static const struct tc_ops tc_ops_htb = {
2979 "htb", /* linux_name */
2980 "linux-htb", /* ovs_name */
2981 HTB_N_QUEUES, /* n_queues */
2982 htb_tc_install,
2983 htb_tc_load,
2984 htb_tc_destroy,
2985 htb_qdisc_get,
2986 htb_qdisc_set,
2987 htb_class_get,
2988 htb_class_set,
2989 htb_class_delete,
2990 htb_class_get_stats,
2991 htb_class_dump_stats
2992};
2993\f
a339aa81
EJ
2994/* "linux-hfsc" traffic control class. */
2995
2996#define HFSC_N_QUEUES 0xf000
2997
2998struct hfsc {
2999 struct tc tc;
3000 uint32_t max_rate;
3001};
3002
3003struct hfsc_class {
3004 struct tc_queue tc_queue;
3005 uint32_t min_rate;
3006 uint32_t max_rate;
3007};
3008
3009static struct hfsc *
b5d57fc8 3010hfsc_get__(const struct netdev *netdev_)
a339aa81 3011{
b5d57fc8
BP
3012 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3013 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
3014}
3015
3016static struct hfsc_class *
3017hfsc_class_cast__(const struct tc_queue *queue)
3018{
3019 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3020}
3021
24045e35 3022static void
b5d57fc8 3023hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 3024{
b5d57fc8 3025 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
3026 struct hfsc *hfsc;
3027
a339aa81
EJ
3028 hfsc = xmalloc(sizeof *hfsc);
3029 tc_init(&hfsc->tc, &tc_ops_hfsc);
3030 hfsc->max_rate = max_rate;
b5d57fc8 3031 netdev->tc = &hfsc->tc;
a339aa81
EJ
3032}
3033
3034static void
3035hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3036 const struct hfsc_class *hc)
3037{
3038 size_t hash;
3039 struct hfsc *hfsc;
3040 struct hfsc_class *hcp;
3041 struct tc_queue *queue;
3042
3043 hfsc = hfsc_get__(netdev);
3044 hash = hash_int(queue_id, 0);
3045
3046 queue = tc_find_queue__(netdev, queue_id, hash);
3047 if (queue) {
3048 hcp = hfsc_class_cast__(queue);
3049 } else {
3050 hcp = xmalloc(sizeof *hcp);
3051 queue = &hcp->tc_queue;
3052 queue->queue_id = queue_id;
6dc34a0d 3053 queue->created = time_msec();
a339aa81
EJ
3054 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3055 }
3056
3057 hcp->min_rate = hc->min_rate;
3058 hcp->max_rate = hc->max_rate;
3059}
3060
3061static int
3062hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3063{
3064 const struct tc_service_curve *rsc, *fsc, *usc;
3065 static const struct nl_policy tca_hfsc_policy[] = {
3066 [TCA_HFSC_RSC] = {
3067 .type = NL_A_UNSPEC,
3068 .optional = false,
3069 .min_len = sizeof(struct tc_service_curve),
3070 },
3071 [TCA_HFSC_FSC] = {
3072 .type = NL_A_UNSPEC,
3073 .optional = false,
3074 .min_len = sizeof(struct tc_service_curve),
3075 },
3076 [TCA_HFSC_USC] = {
3077 .type = NL_A_UNSPEC,
3078 .optional = false,
3079 .min_len = sizeof(struct tc_service_curve),
3080 },
3081 };
3082 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3083
3084 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3085 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3086 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3087 return EPROTO;
3088 }
3089
3090 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3091 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3092 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3093
3094 if (rsc->m1 != 0 || rsc->d != 0 ||
3095 fsc->m1 != 0 || fsc->d != 0 ||
3096 usc->m1 != 0 || usc->d != 0) {
3097 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3098 "Non-linear service curves are not supported.");
3099 return EPROTO;
3100 }
3101
3102 if (rsc->m2 != fsc->m2) {
3103 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3104 "Real-time service curves are not supported ");
3105 return EPROTO;
3106 }
3107
3108 if (rsc->m2 > usc->m2) {
3109 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3110 "Min-rate service curve is greater than "
3111 "the max-rate service curve.");
3112 return EPROTO;
3113 }
3114
3115 class->min_rate = fsc->m2;
3116 class->max_rate = usc->m2;
3117 return 0;
3118}
3119
3120static int
3121hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3122 struct hfsc_class *options,
3123 struct netdev_queue_stats *stats)
3124{
3125 int error;
3126 unsigned int handle;
3127 struct nlattr *nl_options;
3128
3129 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3130 if (error) {
3131 return error;
3132 }
3133
3134 if (queue_id) {
3135 unsigned int major, minor;
3136
3137 major = tc_get_major(handle);
3138 minor = tc_get_minor(handle);
3139 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3140 *queue_id = minor - 1;
3141 } else {
3142 return EPROTO;
3143 }
3144 }
3145
3146 if (options) {
3147 error = hfsc_parse_tca_options__(nl_options, options);
3148 }
3149
3150 return error;
3151}
3152
/* Asks the kernel about class 'handle' (with parent 'parent') on 'netdev' and
 * hands the reply to hfsc_parse_tcmsg__(), passing 'options' and 'stats'
 * through to it unchanged.
 *
 * Returns 0 if successful, otherwise the error returned by tc_query_class()
 * or by hfsc_parse_tcmsg__(). */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3170
3171static void
79f1cbe9 3172hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
a339aa81
EJ
3173 struct hfsc_class *class)
3174{
3175 uint32_t max_rate;
3176 const char *max_rate_s;
3177
79f1cbe9 3178 max_rate_s = smap_get(details, "max-rate");
a339aa81
EJ
3179 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3180
3181 if (!max_rate) {
a00ca915 3182 enum netdev_features current;
a339aa81
EJ
3183
3184 netdev_get_features(netdev, &current, NULL, NULL, NULL);
d02a5f8e 3185 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
3186 }
3187
3188 class->min_rate = max_rate;
3189 class->max_rate = max_rate;
3190}
3191
3192static int
3193hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 3194 const struct smap *details,
a339aa81
EJ
3195 struct hfsc_class * class)
3196{
3197 const struct hfsc *hfsc;
3198 uint32_t min_rate, max_rate;
3199 const char *min_rate_s, *max_rate_s;
3200
3201 hfsc = hfsc_get__(netdev);
79f1cbe9
EJ
3202 min_rate_s = smap_get(details, "min-rate");
3203 max_rate_s = smap_get(details, "max-rate");
a339aa81 3204
c45ab5e9 3205 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
79398bad 3206 min_rate = MAX(min_rate, 1);
a339aa81
EJ
3207 min_rate = MIN(min_rate, hfsc->max_rate);
3208
3209 max_rate = (max_rate_s
3210 ? strtoull(max_rate_s, NULL, 10) / 8
3211 : hfsc->max_rate);
3212 max_rate = MAX(max_rate, min_rate);
3213 max_rate = MIN(max_rate, hfsc->max_rate);
3214
3215 class->min_rate = min_rate;
3216 class->max_rate = max_rate;
3217
3218 return 0;
3219}
3220
3221/* Create an HFSC qdisc.
3222 *
3223 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3224static int
3225hfsc_setup_qdisc__(struct netdev * netdev)
3226{
3227 struct tcmsg *tcmsg;
3228 struct ofpbuf request;
3229 struct tc_hfsc_qopt opt;
3230
3231 tc_del_qdisc(netdev);
3232
3233 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3234 NLM_F_EXCL | NLM_F_CREATE, &request);
3235
3236 if (!tcmsg) {
3237 return ENODEV;
3238 }
3239
3240 tcmsg->tcm_handle = tc_make_handle(1, 0);
3241 tcmsg->tcm_parent = TC_H_ROOT;
3242
3243 memset(&opt, 0, sizeof opt);
3244 opt.defcls = 1;
3245
3246 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3247 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3248
3249 return tc_transact(&request, NULL);
3250}
3251
3252/* Create an HFSC class.
3253 *
3254 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3255 * sc rate <min_rate> ul rate <max_rate>" */
3256static int
3257hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3258 unsigned int parent, struct hfsc_class *class)
3259{
3260 int error;
3261 size_t opt_offset;
3262 struct tcmsg *tcmsg;
3263 struct ofpbuf request;
3264 struct tc_service_curve min, max;
3265
3266 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3267
3268 if (!tcmsg) {
3269 return ENODEV;
3270 }
3271
3272 tcmsg->tcm_handle = handle;
3273 tcmsg->tcm_parent = parent;
3274
3275 min.m1 = 0;
3276 min.d = 0;
3277 min.m2 = class->min_rate;
3278
3279 max.m1 = 0;
3280 max.d = 0;
3281 max.m2 = class->max_rate;
3282
3283 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3284 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3285 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3286 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3287 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3288 nl_msg_end_nested(&request, opt_offset);
3289
3290 error = tc_transact(&request, NULL);
3291 if (error) {
3292 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3293 "min-rate %ubps, max-rate %ubps (%s)",
3294 netdev_get_name(netdev),
3295 tc_get_major(handle), tc_get_minor(handle),
3296 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 3297 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
3298 }
3299
3300 return error;
3301}
3302
3303static int
79f1cbe9 3304hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
3305{
3306 int error;
3307 struct hfsc_class class;
3308
3309 error = hfsc_setup_qdisc__(netdev);
3310
3311 if (error) {
3312 return error;
3313 }
3314
3315 hfsc_parse_qdisc_details__(netdev, details, &class);
3316 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3317 tc_make_handle(1, 0), &class);
3318
3319 if (error) {
3320 return error;
3321 }
3322
3323 hfsc_install__(netdev, class.max_rate);
3324 return 0;
3325}
3326
3327static int
3328hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3329{
3330 struct ofpbuf msg;
a339aa81
EJ
3331 struct nl_dump dump;
3332 struct hfsc_class hc;
3333
3334 hc.max_rate = 0;
3335 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3336 hfsc_install__(netdev, hc.max_rate);
a339aa81
EJ
3337
3338 if (!start_queue_dump(netdev, &dump)) {
3339 return ENODEV;
3340 }
3341
3342 while (nl_dump_next(&dump, &msg)) {
3343 unsigned int queue_id;
3344
3345 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3346 hfsc_update_queue__(netdev, queue_id, &hc);
3347 }
3348 }
3349
3350 nl_dump_done(&dump);
3351 return 0;
3352}
3353
3354static void
3355hfsc_tc_destroy(struct tc *tc)
3356{
3357 struct hfsc *hfsc;
3358 struct hfsc_class *hc, *next;
3359
3360 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3361
3362 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3363 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3364 free(hc);
3365 }
3366
3367 tc_destroy(tc);
3368 free(hfsc);
3369}
3370
3371static int
79f1cbe9 3372hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
3373{
3374 const struct hfsc *hfsc;
3375 hfsc = hfsc_get__(netdev);
79f1cbe9 3376 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
3377 return 0;
3378}
3379
3380static int
79f1cbe9 3381hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
3382{
3383 int error;
3384 struct hfsc_class class;
3385
3386 hfsc_parse_qdisc_details__(netdev, details, &class);
3387 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3388 tc_make_handle(1, 0), &class);
3389
3390 if (!error) {
3391 hfsc_get__(netdev)->max_rate = class.max_rate;
3392 }
3393
3394 return error;
3395}
3396
3397static int
3398hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 3399 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
3400{
3401 const struct hfsc_class *hc;
3402
3403 hc = hfsc_class_cast__(queue);
79f1cbe9 3404 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 3405 if (hc->min_rate != hc->max_rate) {
79f1cbe9 3406 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
3407 }
3408 return 0;
3409}
3410
3411static int
3412hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 3413 const struct smap *details)
a339aa81
EJ
3414{
3415 int error;
3416 struct hfsc_class class;
3417
3418 error = hfsc_parse_class_details__(netdev, details, &class);
3419 if (error) {
3420 return error;
3421 }
3422
3423 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3424 tc_make_handle(1, 0xfffe), &class);
3425 if (error) {
3426 return error;
3427 }
3428
3429 hfsc_update_queue__(netdev, queue_id, &class);
3430 return 0;
3431}
3432
3433static int
3434hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3435{
3436 int error;
3437 struct hfsc *hfsc;
3438 struct hfsc_class *hc;
3439
3440 hc = hfsc_class_cast__(queue);
3441 hfsc = hfsc_get__(netdev);
3442
3443 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3444 if (!error) {
3445 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3446 free(hc);
3447 }
3448 return error;
3449}
3450
3451static int
3452hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3453 struct netdev_queue_stats *stats)
3454{
3455 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3456 tc_make_handle(1, 0xfffe), NULL, stats);
3457}
3458
3459static int
3460hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3461 const struct ofpbuf *nlmsg,
3462 netdev_dump_queue_stats_cb *cb, void *aux)
3463{
3464 struct netdev_queue_stats stats;
3465 unsigned int handle, major, minor;
3466 int error;
3467
3468 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3469 if (error) {
3470 return error;
3471 }
3472
3473 major = tc_get_major(handle);
3474 minor = tc_get_minor(handle);
3475 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3476 (*cb)(minor - 1, &stats, aux);
3477 }
3478 return 0;
3479}
3480
3481static const struct tc_ops tc_ops_hfsc = {
3482 "hfsc", /* linux_name */
3483 "linux-hfsc", /* ovs_name */
3484 HFSC_N_QUEUES, /* n_queues */
3485 hfsc_tc_install, /* tc_install */
3486 hfsc_tc_load, /* tc_load */
3487 hfsc_tc_destroy, /* tc_destroy */
3488 hfsc_qdisc_get, /* qdisc_get */
3489 hfsc_qdisc_set, /* qdisc_set */
3490 hfsc_class_get, /* class_get */
3491 hfsc_class_set, /* class_set */
3492 hfsc_class_delete, /* class_delete */
3493 hfsc_class_get_stats, /* class_get_stats */
3494 hfsc_class_dump_stats /* class_dump_stats */
3495};
3496\f
c1c9c9c4
BP
3497/* "linux-default" traffic control class.
3498 *
3499 * This class represents the default, unnamed Linux qdisc. It corresponds to
3500 * the "" (empty string) QoS type in the OVS database. */
3501
3502static void
b5d57fc8 3503default_install__(struct netdev *netdev_)
c1c9c9c4 3504{
b5d57fc8 3505 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 3506 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 3507
559eb230
BP
3508 /* Nothing but a tc class implementation is allowed to write to a tc. This
3509 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 3510 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
3511}
3512
3513static int
3514default_tc_install(struct netdev *netdev,
79f1cbe9 3515 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
3516{
3517 default_install__(netdev);
3518 return 0;
3519}
3520
3521static int
3522default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3523{
3524 default_install__(netdev);
3525 return 0;
3526}
3527
3528static const struct tc_ops tc_ops_default = {
3529 NULL, /* linux_name */
3530 "", /* ovs_name */
3531 0, /* n_queues */
3532 default_tc_install,
3533 default_tc_load,
3534 NULL, /* tc_destroy */
3535 NULL, /* qdisc_get */
3536 NULL, /* qdisc_set */
3537 NULL, /* class_get */
3538 NULL, /* class_set */
3539 NULL, /* class_delete */
3540 NULL, /* class_get_stats */
3541 NULL /* class_dump_stats */
3542};
3543\f
3544/* "linux-other" traffic control class.
3545 *
3546 * */
3547
3548static int
b5d57fc8 3549other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 3550{
b5d57fc8 3551 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 3552 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 3553
559eb230
BP
3554 /* Nothing but a tc class implementation is allowed to write to a tc. This
3555 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 3556 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
3557 return 0;
3558}
3559
3560static const struct tc_ops tc_ops_other = {
3561 NULL, /* linux_name */
3562 "linux-other", /* ovs_name */
3563 0, /* n_queues */
3564 NULL, /* tc_install */
3565 other_tc_load,
3566 NULL, /* tc_destroy */
3567 NULL, /* qdisc_get */
3568 NULL, /* qdisc_set */
3569 NULL, /* class_get */
3570 NULL, /* class_set */
3571 NULL, /* class_delete */
3572 NULL, /* class_get_stats */
3573 NULL /* class_dump_stats */
3574};
3575\f
3576/* Traffic control. */
3577
/* Number of kernel "tc" ticks per second.  Assigned by read_psched(). */
static double ticks_per_s;

/* Number of kernel "jiffies" per second, used for computing buffer sizes:
 * generally a kernel qdisc needs to be able to buffer one jiffy's worth of
 * data.  Assigned by read_psched().
 *
 * The value falls into one of two categories:
 *
 *   - The kernel's real timer tick rate, a small number in the approximate
 *     range of 100 to 1024.  Then the qdisc really must be able to buffer
 *     that much data.
 *
 *   - An absurdly large number, which means that the kernel has finely
 *     granular timers and no extra buffer room needs to be fudged.  This
 *     case needs no special handling: 'buffer_hz' is only ever used as a
 *     divisor, so practically any dividend comes out as 0, and the small
 *     integer results of really high dividends have no real effect anyhow.
 */
static unsigned int buffer_hz;
3599
/* Combines 'major' and 'minor' into the tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
3606
/* Extracts the major number from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3613
/* Extracts the minor number from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3620
3621static struct tcmsg *
3622tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3623 struct ofpbuf *request)
3624{
3625 struct tcmsg *tcmsg;
3626 int ifindex;
3627 int error;
3628
3629 error = get_ifindex(netdev, &ifindex);
3630 if (error) {
3631 return NULL;
3632 }
3633
3634 ofpbuf_init(request, 512);
3635 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3636 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3637 tcmsg->tcm_family = AF_UNSPEC;
3638 tcmsg->tcm_ifindex = ifindex;
3639 /* Caller should fill in tcmsg->tcm_handle. */
3640 /* Caller should fill in tcmsg->tcm_parent. */
3641
3642 return tcmsg;
3643}
3644
3645static int
3646tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3647{
a88b4e04 3648 int error = nl_transact(NETLINK_ROUTE, request, replyp);
c1c9c9c4
BP
3649 ofpbuf_uninit(request);
3650 return error;
3651}
3652
f8500004
JP
3653/* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3654 * policing configuration.
3655 *
3656 * This function is equivalent to running the following when 'add' is true:
3657 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3658 *
3659 * This function is equivalent to running the following when 'add' is false:
3660 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3661 *
3662 * The configuration and stats may be seen with the following command:
3663 * /sbin/tc -s qdisc show dev <devname>
3664 *
3665 * Returns 0 if successful, otherwise a positive errno value.
3666 */
3667static int
3668tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3669{
3670 struct ofpbuf request;
3671 struct tcmsg *tcmsg;
3672 int error;
3673 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3674 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3675
3676 tcmsg = tc_make_request(netdev, type, flags, &request);
3677 if (!tcmsg) {
3678 return ENODEV;
3679 }
3680 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3681 tcmsg->tcm_parent = TC_H_INGRESS;
3682 nl_msg_put_string(&request, TCA_KIND, "ingress");
3683 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3684
3685 error = tc_transact(&request, NULL);
3686 if (error) {
3687 /* If we're deleting the qdisc, don't worry about some of the
3688 * error conditions. */
3689 if (!add && (error == ENOENT || error == EINVAL)) {
3690 return 0;
3691 }
3692 return error;
3693 }
3694
3695 return 0;
3696}
3697
3698/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3699 * of 'kbits_burst'.
3700 *
3701 * This function is equivalent to running:
3702 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3703 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3704 * mtu 65535 drop
3705 *
3706 * The configuration and stats may be seen with the following command:
3707 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3708 *
3709 * Returns 0 if successful, otherwise a positive errno value.
3710 */
3711static int
3712tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3713{
3714 struct tc_police tc_police;
3715 struct ofpbuf request;
3716 struct tcmsg *tcmsg;
3717 size_t basic_offset;
3718 size_t police_offset;
3719 int error;
3720 int mtu = 65535;
3721
3722 memset(&tc_police, 0, sizeof tc_police);
3723 tc_police.action = TC_POLICE_SHOT;
3724 tc_police.mtu = mtu;
e5c08015 3725 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
f8500004
JP
3726 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3727 kbits_burst * 1024);
3728
3729 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3730 NLM_F_EXCL | NLM_F_CREATE, &request);
3731 if (!tcmsg) {
3732 return ENODEV;
3733 }
3734 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3735 tcmsg->tcm_info = tc_make_handle(49,
3736 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3737
3738 nl_msg_put_string(&request, TCA_KIND, "basic");
3739 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3740 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3741 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3742 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3743 nl_msg_end_nested(&request, police_offset);
3744 nl_msg_end_nested(&request, basic_offset);
3745
3746 error = tc_transact(&request, NULL);
3747 if (error) {
3748 return error;
3749 }
3750
3751 return 0;
3752}
3753
c1c9c9c4
BP
3754static void
3755read_psched(void)
3756{
3757 /* The values in psched are not individually very meaningful, but they are
3758 * important. The tables below show some values seen in the wild.
3759 *
3760 * Some notes:
3761 *
3762 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3763 * (Before that, there are hints that it was 1000000000.)
3764 *
3765 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3766 * above.
3767 *
3768 * /proc/net/psched
3769 * -----------------------------------
3770 * [1] 000c8000 000f4240 000f4240 00000064
3771 * [2] 000003e8 00000400 000f4240 3b9aca00
3772 * [3] 000003e8 00000400 000f4240 3b9aca00
3773 * [4] 000003e8 00000400 000f4240 00000064
3774 * [5] 000003e8 00000040 000f4240 3b9aca00
3775 * [6] 000003e8 00000040 000f4240 000000f9
3776 *
3777 * a b c d ticks_per_s buffer_hz
3778 * ------- --------- ---------- ------------- ----------- -------------
3779 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3780 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3781 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3782 * [4] 1,000 1,024 1,000,000 100 976,562 100
3783 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3784 * [6] 1,000 64 1,000,000 249 15,625,000 249
3785 *
3786 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3787 * [2] 2.6.26-1-686-bigmem from Debian lenny
3788 * [3] 2.6.26-2-sparc64 from Debian lenny
3789 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3790 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3791 * [6] 2.6.34 from kernel.org on KVM
3792 */
23882115 3793 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
3794 static const char fn[] = "/proc/net/psched";
3795 unsigned int a, b, c, d;
3796 FILE *stream;
3797
23882115
BP
3798 if (!ovsthread_once_start(&once)) {
3799 return;
3800 }
3801
c1c9c9c4
BP
3802 ticks_per_s = 1.0;
3803 buffer_hz = 100;
3804
3805 stream = fopen(fn, "r");
3806 if (!stream) {
10a89ef0 3807 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 3808 goto exit;
c1c9c9c4
BP
3809 }
3810
3811 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3812 VLOG_WARN("%s: read failed", fn);
3813 fclose(stream);
23882115 3814 goto exit;
c1c9c9c4
BP
3815 }
3816 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3817 fclose(stream);
3818
3819 if (!a || !c) {
3820 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 3821 goto exit;
c1c9c9c4
BP
3822 }
3823
3824 ticks_per_s = (double) a * c / b;
3825 if (c == 1000000) {
3826 buffer_hz = d;
3827 } else {
3828 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3829 fn, a, b, c, d);
3830 }
3831 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
3832
3833exit:
3834 ovsthread_once_done(&once);
c1c9c9c4
BP
3835}
3836
3837/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3838 * rate of 'rate' bytes per second. */
3839static unsigned int
3840tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3841{
23882115 3842 read_psched();
c1c9c9c4
BP
3843 return (rate * ticks) / ticks_per_s;
3844}
3845
3846/* Returns the number of ticks that it would take to transmit 'size' bytes at a
3847 * rate of 'rate' bytes per second. */
3848static unsigned int
3849tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3850{
23882115 3851 read_psched();
015c93a4 3852 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
3853}
3854
3855/* Returns the number of bytes that need to be reserved for qdisc buffering at
3856 * a transmission rate of 'rate' bytes per second. */
3857static unsigned int
3858tc_buffer_per_jiffy(unsigned int rate)
3859{
23882115 3860 read_psched();
c1c9c9c4
BP
3861 return rate / buffer_hz;
3862}
3863
3864/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3865 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3866 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3867 * stores NULL into it if it is absent.
3868 *
3869 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3870 * 'msg'.
3871 *
3872 * Returns 0 if successful, otherwise a positive errno value. */
3873static int
3874tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3875 struct nlattr **options)
3876{
3877 static const struct nl_policy tca_policy[] = {
3878 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3879 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3880 };
3881 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3882
3883 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3884 tca_policy, ta, ARRAY_SIZE(ta))) {
3885 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3886 goto error;
3887 }
3888
3889 if (kind) {
3890 *kind = nl_attr_get_string(ta[TCA_KIND]);
3891 }
3892
3893 if (options) {
3894 *options = ta[TCA_OPTIONS];
3895 }
3896
3897 return 0;
3898
3899error:
3900 if (kind) {
3901 *kind = NULL;
3902 }
3903 if (options) {
3904 *options = NULL;
3905 }
3906 return EPROTO;
3907}
3908
3909/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3910 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3911 * into '*options', and its queue statistics into '*stats'. Any of the output
3912 * arguments may be null.
3913 *
3914 * Returns 0 if successful, otherwise a positive errno value. */
3915static int
3916tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3917 struct nlattr **options, struct netdev_queue_stats *stats)
3918{
3919 static const struct nl_policy tca_policy[] = {
3920 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3921 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3922 };
3923 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3924
3925 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3926 tca_policy, ta, ARRAY_SIZE(ta))) {
3927 VLOG_WARN_RL(&rl, "failed to parse class message");
3928 goto error;
3929 }
3930
3931 if (handlep) {
3932 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3933 *handlep = tc->tcm_handle;
3934 }
3935
3936 if (options) {
3937 *options = ta[TCA_OPTIONS];
3938 }
3939
3940 if (stats) {
3941 const struct gnet_stats_queue *gsq;
3942 struct gnet_stats_basic gsb;
3943
3944 static const struct nl_policy stats_policy[] = {
3945 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3946 .min_len = sizeof gsb },
3947 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3948 .min_len = sizeof *gsq },
3949 };
3950 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3951
3952 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3953 sa, ARRAY_SIZE(sa))) {
3954 VLOG_WARN_RL(&rl, "failed to parse class stats");
3955 goto error;
3956 }
3957
3958 /* Alignment issues screw up the length of struct gnet_stats_basic on
3959 * some arch/bitsize combinations. Newer versions of Linux have a
3960 * struct gnet_stats_basic_packed, but we can't depend on that. The
3961 * easiest thing to do is just to make a copy. */
3962 memset(&gsb, 0, sizeof gsb);
3963 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3964 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3965 stats->tx_bytes = gsb.bytes;
3966 stats->tx_packets = gsb.packets;
3967
3968 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3969 stats->tx_errors = gsq->drops;
3970 }
3971
3972 return 0;
3973
3974error:
3975 if (options) {
3976 *options = NULL;
3977 }
3978 if (stats) {
3979 memset(stats, 0, sizeof *stats);
3980 }
3981 return EPROTO;
3982}
3983
3984/* Queries the kernel for class with identifier 'handle' and parent 'parent'
3985 * on 'netdev'. */
3986static int
3987tc_query_class(const struct netdev *netdev,
3988 unsigned int handle, unsigned int parent,
3989 struct ofpbuf **replyp)
3990{
3991 struct ofpbuf request;
3992 struct tcmsg *tcmsg;
3993 int error;
3994
3995 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
3996 if (!tcmsg) {
3997 return ENODEV;
3998 }
c1c9c9c4
BP
3999 tcmsg->tcm_handle = handle;
4000 tcmsg->tcm_parent = parent;
4001
4002 error = tc_transact(&request, replyp);
4003 if (error) {
4004 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4005 netdev_get_name(netdev),
4006 tc_get_major(handle), tc_get_minor(handle),
4007 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 4008 ovs_strerror(error));
c1c9c9c4
BP
4009 }
4010 return error;
4011}
4012
4013/* Equivalent to "tc class del dev <name> handle <handle>". */
4014static int
4015tc_delete_class(const struct netdev *netdev, unsigned int handle)
4016{
4017 struct ofpbuf request;
4018 struct tcmsg *tcmsg;
4019 int error;
4020
4021 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
4022 if (!tcmsg) {
4023 return ENODEV;
4024 }
c1c9c9c4
BP
4025 tcmsg->tcm_handle = handle;
4026 tcmsg->tcm_parent = 0;
4027
4028 error = tc_transact(&request, NULL);
4029 if (error) {
4030 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4031 netdev_get_name(netdev),
4032 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 4033 ovs_strerror(error));
c1c9c9c4
BP
4034 }
4035 return error;
4036}
4037
4038/* Equivalent to "tc qdisc del dev <name> root". */
4039static int
b5d57fc8 4040tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 4041{
b5d57fc8 4042 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
4043 struct ofpbuf request;
4044 struct tcmsg *tcmsg;
4045 int error;
4046
b5d57fc8 4047 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
4048 if (!tcmsg) {
4049 return ENODEV;
4050 }
c1c9c9c4
BP
4051 tcmsg->tcm_handle = tc_make_handle(1, 0);
4052 tcmsg->tcm_parent = TC_H_ROOT;
4053
4054 error = tc_transact(&request, NULL);
4055 if (error == EINVAL) {
4056 /* EINVAL probably means that the default qdisc was in use, in which
4057 * case we've accomplished our purpose. */
4058 error = 0;
4059 }
b5d57fc8
BP
4060 if (!error && netdev->tc) {
4061 if (netdev->tc->ops->tc_destroy) {
4062 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 4063 }
b5d57fc8 4064 netdev->tc = NULL;
c1c9c9c4
BP
4065 }
4066 return error;
4067}
4068
4069/* If 'netdev''s qdisc type and parameters are not yet known, queries the
4070 * kernel to determine what they are. Returns 0 if successful, otherwise a
4071 * positive errno value. */
4072static int
b5d57fc8 4073tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 4074{
b5d57fc8 4075 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
4076 struct ofpbuf request, *qdisc;
4077 const struct tc_ops *ops;
4078 struct tcmsg *tcmsg;
4079 int load_error;
4080 int error;
4081
b5d57fc8 4082 if (netdev->tc) {
c1c9c9c4
BP
4083 return 0;
4084 }
4085
4086 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4087 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4088 * 2.6.35 without that fix backported to it.
4089 *
4090 * To avoid the OOPS, we must not make a request that would attempt to dump
4091 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4092 * few others. There are a few ways that I can see to do this, but most of
4093 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4094 * technique chosen here is to assume that any non-default qdisc that we
4095 * create will have a class with handle 1:0. The built-in qdiscs only have
4096 * a class with handle 0:0.
4097 *
4098 * We could check for Linux 2.6.35+ and use a more straightforward method
4099 * there. */
b5d57fc8 4100 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
4101 if (!tcmsg) {
4102 return ENODEV;
4103 }
c1c9c9c4
BP
4104 tcmsg->tcm_handle = tc_make_handle(1, 0);
4105 tcmsg->tcm_parent = 0;
4106
4107 /* Figure out what tc class to instantiate. */
4108 error = tc_transact(&request, &qdisc);
4109 if (!error) {
4110 const char *kind;
4111
4112 error = tc_parse_qdisc(qdisc, &kind, NULL);
4113 if (error) {
4114 ops = &tc_ops_other;
4115 } else {
4116 ops = tc_lookup_linux_name(kind);
4117 if (!ops) {
4118 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4119 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4120
4121 ops = &tc_ops_other;
4122 }
4123 }
4124 } else if (error == ENOENT) {
4125 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4126 * other entity that doesn't have a handle 1:0. We will assume
4127 * that it's the system default qdisc. */
4128 ops = &tc_ops_default;
4129 error = 0;
4130 } else {
4131 /* Who knows? Maybe the device got deleted. */
4132 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 4133 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
4134 ops = &tc_ops_other;
4135 }
4136
4137 /* Instantiate it. */
b5d57fc8
BP
4138 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4139 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
4140 ofpbuf_delete(qdisc);
4141
4142 return error ? error : load_error;
4143}
4144
4145/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4146 approximate the time to transmit packets of various lengths. For an MTU of
4147 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4148 represents two possible packet lengths; for a MTU of 513 through 1024, four
4149 possible lengths; and so on.
4150
4151 Returns, for the specified 'mtu', the number of bits that packet lengths
4152 need to be shifted right to fit within such a 256-entry table. */
4153static int
4154tc_calc_cell_log(unsigned int mtu)
4155{
4156 int cell_log;
4157
4158 if (!mtu) {
4159 mtu = ETH_PAYLOAD_MAX;
4160 }
4161 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4162
4163 for (cell_log = 0; mtu >= 256; cell_log++) {
4164 mtu >>= 1;
4165 }
4166
4167 return cell_log;
4168}
4169
4170/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4171 * of 'mtu'. */
4172static void
4173tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4174{
4175 memset(rate, 0, sizeof *rate);
4176 rate->cell_log = tc_calc_cell_log(mtu);
4177 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4178 /* rate->cell_align = 0; */ /* distro headers. */
4179 rate->mpu = ETH_TOTAL_MIN;
4180 rate->rate = Bps;
4181}
4182
4183/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4184 * attribute of the specified "type".
4185 *
4186 * See tc_calc_cell_log() above for a description of "rtab"s. */
4187static void
4188tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4189{
4190 uint32_t *rtab;
4191 unsigned int i;
4192
4193 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4194 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4195 unsigned packet_size = (i + 1) << rate->cell_log;
4196 if (packet_size < rate->mpu) {
4197 packet_size = rate->mpu;
4198 }
4199 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4200 }
4201}
4202
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus 'mtu'.  A burst too large for tc_bytes_to_ticks()'s unsigned int
 * 'size' argument is clamped to the largest unsigned int instead of being
 * silently truncated. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    const unsigned int size_max = ~0u;  /* Largest value 'size' can hold. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = MAX(burst_bytes, min_burst);
    unsigned int size = burst > size_max ? size_max : (unsigned int) burst;

    return tc_bytes_to_ticks(Bps, size);
}
d3980822 4213\f
aaf2fb1a
BP
4214/* Linux-only functions declared in netdev-linux.h */
4215
025e874a
BP
4216/* Returns a fd for an AF_INET socket or a negative errno value. */
4217int
4218netdev_linux_get_af_inet_sock(void)
4219{
4220 int error = netdev_linux_init();
4221 return error ? -error : af_inet_sock;
4222}
4223
aaf2fb1a
BP
4224/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4225 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4226int
4227netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4228 const char *flag_name, bool enable)
4229{
4230 const char *netdev_name = netdev_get_name(netdev);
4231 struct ethtool_value evalue;
4232 uint32_t new_flags;
4233 int error;
4234
ab985a77 4235 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
4236 memset(&evalue, 0, sizeof evalue);
4237 error = netdev_linux_do_ethtool(netdev_name,
4238 (struct ethtool_cmd *)&evalue,
4239 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4240 if (error) {
4241 return error;
4242 }
4243
ab985a77 4244 COVERAGE_INC(netdev_set_ethtool);
aaf2fb1a
BP
4245 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4246 error = netdev_linux_do_ethtool(netdev_name,
4247 (struct ethtool_cmd *)&evalue,
4248 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4249 if (error) {
4250 return error;
4251 }
4252
ab985a77 4253 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
4254 memset(&evalue, 0, sizeof evalue);
4255 error = netdev_linux_do_ethtool(netdev_name,
4256 (struct ethtool_cmd *)&evalue,
4257 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4258 if (error) {
4259 return error;
4260 }
4261
4262 if (new_flags != evalue.data) {
4263 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4264 "device %s failed", enable ? "enable" : "disable",
4265 flag_name, netdev_name);
4266 return EOPNOTSUPP;
4267 }
4268
4269 return 0;
4270}
4271\f
4272/* Utility functions. */
4273
d3980822 4274/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 4275static void
d3980822
BP
4276netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4277 const struct rtnl_link_stats *src)
4278{
f613a0d7
PS
4279 dst->rx_packets = src->rx_packets;
4280 dst->tx_packets = src->tx_packets;
4281 dst->rx_bytes = src->rx_bytes;
4282 dst->tx_bytes = src->tx_bytes;
4283 dst->rx_errors = src->rx_errors;
4284 dst->tx_errors = src->tx_errors;
4285 dst->rx_dropped = src->rx_dropped;
4286 dst->tx_dropped = src->tx_dropped;
4287 dst->multicast = src->multicast;
4288 dst->collisions = src->collisions;
4289 dst->rx_length_errors = src->rx_length_errors;
4290 dst->rx_over_errors = src->rx_over_errors;
4291 dst->rx_crc_errors = src->rx_crc_errors;
4292 dst->rx_frame_errors = src->rx_frame_errors;
4293 dst->rx_fifo_errors = src->rx_fifo_errors;
4294 dst->rx_missed_errors = src->rx_missed_errors;
4295 dst->tx_aborted_errors = src->tx_aborted_errors;
4296 dst->tx_carrier_errors = src->tx_carrier_errors;
4297 dst->tx_fifo_errors = src->tx_fifo_errors;
4298 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4299 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
4300}
4301
c1c9c9c4
BP
4302static int
4303get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4304{
4305 /* Policy for RTNLGRP_LINK messages.
4306 *
4307 * There are *many* more fields in these messages, but currently we only
4308 * care about these fields. */
4309 static const struct nl_policy rtnlgrp_link_policy[] = {
4310 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4311 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4312 .min_len = sizeof(struct rtnl_link_stats) },
4313 };
4314
4315 struct ofpbuf request;
4316 struct ofpbuf *reply;
4317 struct ifinfomsg *ifi;
c1c9c9c4
BP
4318 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4319 int error;
4320
4321 ofpbuf_init(&request, 0);
4322 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4323 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4324 ifi->ifi_family = PF_UNSPEC;
4325 ifi->ifi_index = ifindex;
a88b4e04 4326 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
4327 ofpbuf_uninit(&request);
4328 if (error) {
4329 return error;
4330 }
4331
4332 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4333 rtnlgrp_link_policy,
4334 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4335 ofpbuf_delete(reply);
4336 return EPROTO;
4337 }
4338
4339 if (!attrs[IFLA_STATS]) {
4340 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4341 ofpbuf_delete(reply);
4342 return EPROTO;
4343 }
8b61709d 4344
d3980822 4345 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
8b61709d 4346
576e26d7
BP
4347 ofpbuf_delete(reply);
4348
8b61709d
BP
4349 return 0;
4350}
4351
4352static int
4353get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4354{
4355 static const char fn[] = "/proc/net/dev";
4356 char line[1024];
4357 FILE *stream;
4358 int ln;
4359
4360 stream = fopen(fn, "r");
4361 if (!stream) {
10a89ef0 4362 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
8b61709d
BP
4363 return errno;
4364 }
4365
4366 ln = 0;
4367 while (fgets(line, sizeof line, stream)) {
4368 if (++ln >= 3) {
4369 char devname[16];
4370#define X64 "%"SCNu64
4371 if (sscanf(line,
4372 " %15[^:]:"
4373 X64 X64 X64 X64 X64 X64 X64 "%*u"
4374 X64 X64 X64 X64 X64 X64 X64 "%*u",
4375 devname,
4376 &stats->rx_bytes,
4377 &stats->rx_packets,
4378 &stats->rx_errors,
4379 &stats->rx_dropped,
4380 &stats->rx_fifo_errors,
4381 &stats->rx_frame_errors,
4382 &stats->multicast,
4383 &stats->tx_bytes,
4384 &stats->tx_packets,
4385 &stats->tx_errors,
4386 &stats->tx_dropped,
4387 &stats->tx_fifo_errors,
4388 &stats->collisions,
4389 &stats->tx_carrier_errors) != 15) {
4390 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4391 } else if (!strcmp(devname, netdev_name)) {
4392 stats->rx_length_errors = UINT64_MAX;
4393 stats->rx_over_errors = UINT64_MAX;
4394 stats->rx_crc_errors = UINT64_MAX;
4395 stats->rx_missed_errors = UINT64_MAX;
4396 stats->tx_aborted_errors = UINT64_MAX;
4397 stats->tx_heartbeat_errors = UINT64_MAX;
4398 stats->tx_window_errors = UINT64_MAX;
4399 fclose(stream);
4400 return 0;
4401 }
4402 }
4403 }
4404 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4405 fclose(stream);
4406 return ENODEV;
4407}
c1c9c9c4 4408
3a183124 4409static int
b5d57fc8 4410get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
4411{
4412 struct ifreq ifr;
4413 int error;
4414
755be9ea
EJ
4415 *flags = 0;
4416 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
149f577a 4417 "SIOCGIFFLAGS");
755be9ea
EJ
4418 if (!error) {
4419 *flags = ifr.ifr_flags;
4420 }
8b61709d
BP
4421 return error;
4422}
4423
4424static int
4b609110 4425set_flags(const char *name, unsigned int flags)
8b61709d
BP
4426{
4427 struct ifreq ifr;
4428
4429 ifr.ifr_flags = flags;
4b609110 4430 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
4431}
4432
4433static int
4434do_get_ifindex(const char *netdev_name)
4435{
4436 struct ifreq ifr;
4437
71d7c22f 4438 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4439 COVERAGE_INC(netdev_get_ifindex);
4440 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4441 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
10a89ef0 4442 netdev_name, ovs_strerror(errno));
8b61709d
BP
4443 return -errno;
4444 }
4445 return ifr.ifr_ifindex;
4446}
4447
4448static int
4449get_ifindex(const struct netdev *netdev_, int *ifindexp)
4450{
b5d57fc8 4451 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 4452
b5d57fc8 4453 if (!(netdev->cache_valid & VALID_IFINDEX)) {
8b61709d 4454 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 4455
8b61709d 4456 if (ifindex < 0) {
b5d57fc8
BP
4457 netdev->get_ifindex_error = -ifindex;
4458 netdev->ifindex = 0;
c7b1b0a5 4459 } else {
b5d57fc8
BP
4460 netdev->get_ifindex_error = 0;
4461 netdev->ifindex = ifindex;
8b61709d 4462 }
b5d57fc8 4463 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 4464 }
c7b1b0a5 4465
b5d57fc8
BP
4466 *ifindexp = netdev->ifindex;
4467 return netdev->get_ifindex_error;
8b61709d
BP
4468}
4469
4470static int
4471get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4472{
4473 struct ifreq ifr;
4474 int hwaddr_family;
4475
4476 memset(&ifr, 0, sizeof ifr);
71d7c22f 4477 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4478 COVERAGE_INC(netdev_get_hwaddr);
4479 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
78857dfb
BP
4480 /* ENODEV probably means that a vif disappeared asynchronously and
4481 * hasn't been removed from the database yet, so reduce the log level
4482 * to INFO for that case. */
4483 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4484 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
10a89ef0 4485 netdev_name, ovs_strerror(errno));
8b61709d
BP
4486 return errno;
4487 }
4488 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4489 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4490 VLOG_WARN("%s device has unknown hardware address family %d",
4491 netdev_name, hwaddr_family);
4492 }
4493 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4494 return 0;
4495}
4496
4497static int
44445cac 4498set_etheraddr(const char *netdev_name,
8b61709d
BP
4499 const uint8_t mac[ETH_ADDR_LEN])
4500{
4501 struct ifreq ifr;
4502
4503 memset(&ifr, 0, sizeof ifr);
71d7c22f 4504 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 4505 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
8b61709d
BP
4506 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4507 COVERAGE_INC(netdev_set_hwaddr);
4508 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4509 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
10a89ef0 4510 netdev_name, ovs_strerror(errno));
8b61709d
BP
4511 return errno;
4512 }
4513 return 0;
4514}
4515
4516static int
0b0544d7 4517netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
4518 int cmd, const char *cmd_name)
4519{
4520 struct ifreq ifr;
4521
4522 memset(&ifr, 0, sizeof ifr);
71d7c22f 4523 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
4524 ifr.ifr_data = (caddr_t) ecmd;
4525
4526 ecmd->cmd = cmd;
8b61709d
BP
4527 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4528 return 0;
4529 } else {
4530 if (errno != EOPNOTSUPP) {
4531 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
10a89ef0 4532 "failed: %s", cmd_name, name, ovs_strerror(errno));
8b61709d
BP
4533 } else {
4534 /* The device doesn't support this operation. That's pretty
4535 * common, so there's no point in logging anything. */
4536 }
4537 return errno;
4538 }
4539}
4540
4541static int
149f577a
JG
4542netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4543 const char *cmd_name)
8b61709d 4544{
71d7c22f 4545 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
8b61709d 4546 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
149f577a 4547 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
10a89ef0 4548 ovs_strerror(errno));
8b61709d
BP
4549 return errno;
4550 }
4551 return 0;
4552}
f1acd62b
BP
4553
4554static int
4555netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4556 int cmd, const char *cmd_name)
4557{
4558 struct ifreq ifr;
4559 int error;
4560
4561 ifr.ifr_addr.sa_family = AF_INET;
149f577a 4562 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b 4563 if (!error) {
db5a1019
AW
4564 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4565 &ifr.ifr_addr);
f1acd62b
BP
4566 *ip = sin->sin_addr;
4567 }
4568 return error;
4569}
488d734d
BP
4570
4571/* Returns an AF_PACKET raw socket or a negative errno value. */
4572static int
4573af_packet_sock(void)
4574{
23882115
BP
4575 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4576 static int sock;
488d734d 4577
23882115 4578 if (ovsthread_once_start(&once)) {
488d734d
BP
4579 sock = socket(AF_PACKET, SOCK_RAW, 0);
4580 if (sock >= 0) {
8450059e
BP
4581 int error = set_nonblocking(sock);
4582 if (error) {
4583 close(sock);
4584 sock = -error;
4585 }
488d734d
BP
4586 } else {
4587 sock = -errno;
10a89ef0
BP
4588 VLOG_ERR("failed to create packet socket: %s",
4589 ovs_strerror(errno));
488d734d 4590 }
23882115 4591 ovsthread_once_done(&once);
488d734d
BP
4592 }
4593
4594 return sock;
4595}