]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
ofproto-dpif: Fix check for 802.1Q header in commit_odp_actions().
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
782e6111 2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
8b61709d 21#include <assert.h>
e9e28be3 22#include <errno.h>
8b61709d
BP
23#include <fcntl.h>
24#include <arpa/inet.h>
25#include <inttypes.h>
c1c9c9c4 26#include <linux/gen_stats.h>
8b61709d 27#include <linux/if_tun.h>
a740f0de 28#include <linux/ip.h>
8b61709d
BP
29#include <linux/types.h>
30#include <linux/ethtool.h>
63331829 31#include <linux/mii.h>
6f42c8ea 32#include <linux/pkt_sched.h>
e9e28be3 33#include <linux/rtnetlink.h>
8b61709d
BP
34#include <linux/sockios.h>
35#include <linux/version.h>
36#include <sys/types.h>
37#include <sys/ioctl.h>
38#include <sys/socket.h>
39#include <netpacket/packet.h>
40#include <net/ethernet.h>
41#include <net/if.h>
a740f0de 42#include <linux/if_tunnel.h>
8b61709d
BP
43#include <net/if_arp.h>
44#include <net/if_packet.h>
45#include <net/route.h>
46#include <netinet/in.h>
e9e28be3 47#include <poll.h>
8b61709d
BP
48#include <stdlib.h>
49#include <string.h>
50#include <unistd.h>
e9e28be3
BP
51
52#include "coverage.h"
9fe3b9a2 53#include "dpif-linux.h"
8b61709d
BP
54#include "dynamic-string.h"
55#include "fatal-signal.h"
93b13be8
BP
56#include "hash.h"
57#include "hmap.h"
8b61709d 58#include "netdev-provider.h"
7fbef77a 59#include "netdev-vport.h"
e9e28be3 60#include "netlink.h"
45c8d3a1 61#include "netlink-notifier.h"
2fe27d5a 62#include "netlink-socket.h"
e9e28be3 63#include "ofpbuf.h"
8b61709d
BP
64#include "openflow/openflow.h"
65#include "packets.h"
66#include "poll-loop.h"
21d6e22e 67#include "rtnetlink-link.h"
8b61709d
BP
68#include "socket-util.h"
69#include "shash.h"
19993ef3 70#include "sset.h"
1670c579 71#include "timer.h"
e9e28be3 72#include "vlog.h"
5136ce49 73
d98e6007 74VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea
BP
75
76COVERAGE_DEFINE(netdev_get_vlan_vid);
77COVERAGE_DEFINE(netdev_set_policing);
78COVERAGE_DEFINE(netdev_arp_lookup);
79COVERAGE_DEFINE(netdev_get_ifindex);
80COVERAGE_DEFINE(netdev_get_hwaddr);
81COVERAGE_DEFINE(netdev_set_hwaddr);
82COVERAGE_DEFINE(netdev_ethtool);
8b61709d
BP
83\f
84/* These were introduced in Linux 2.6.14, so they might be missing if we have
85 * old headers. */
86#ifndef ADVERTISED_Pause
87#define ADVERTISED_Pause (1 << 13)
88#endif
89#ifndef ADVERTISED_Asym_Pause
90#define ADVERTISED_Asym_Pause (1 << 14)
91#endif
92
e47bd51a
JP
93/* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95#ifndef ETHTOOL_GFLAGS
96#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
97#endif
98#ifndef ETHTOOL_SFLAGS
99#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
100#endif
101
c1c9c9c4
BP
102/* This was introduced in Linux 2.6.25, so it might be missing if we have old
103 * headers. */
104#ifndef TC_RTAB_SIZE
105#define TC_RTAB_SIZE 1024
106#endif
107
0a811051 108static struct nln_notifier netdev_linux_cache_notifier;
46415c90 109static int cache_notifier_refcount;
8b61709d
BP
110
/* Bits for the 'cache_valid' member of struct netdev_dev_linux.  A cached
 * member of that structure is meaningful only while its corresponding bit
 * here is set. */
enum {
    VALID_IFINDEX = 1 << 0,
    VALID_ETHERADDR = 1 << 1,
    VALID_IN4 = 1 << 2,
    VALID_IN6 = 1 << 3,
    VALID_MTU = 1 << 4,
    VALID_CARRIER = 1 << 5,
    VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
    VALID_POLICING = 1 << 7,
    VALID_HAVE_VPORT_STATS = 1 << 8
};
122
149f577a
JG
/* State kept for a tap device. */
struct tap_state {
    int fd;         /* Descriptor obtained by opening /dev/net/tun when the
                     * tap device is created. */
    bool opened;    /* Set once the first netdev_linux_open() caller has been
                     * given 'fd'. */
};
c1c9c9c4
BP
127\f
128/* Traffic control. */
129
/* An instance of a traffic control class.  Always associated with a particular
 * network device.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc {
    const struct tc_ops *ops;   /* Implementation; set by tc_init(). */
    struct hmap queues;         /* Contains "struct tc_queue"s.
                                 * Read by generic TC layer.
                                 * Written only by TC implementation. */
};
c1c9c9c4 141
93b13be8
BP
/* One traffic control queue.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs.  Instances are linked into the 'queues' hmap of their owning
 * "struct tc" through 'hmap_node'. */
struct tc_queue {
    struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
    unsigned int queue_id;      /* OpenFlow queue ID. */
};
150
/* A particular kind of traffic control.  Each implementation generally maps to
 * one particular Linux qdisc class.
 *
 * The functions below return 0 if successful or a positive errno value on
 * failure, except where otherwise noted.  All of them must be provided, except
 * where otherwise noted. */
struct tc_ops {
    /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
     * This is null for tc_ops_default and tc_ops_other, for which there are no
     * appropriate values. */
    const char *linux_name;

    /* Name used in OVS database, e.g. "linux-htb".  Must be nonnull. */
    const char *ovs_name;

    /* Number of supported OpenFlow queues, 0 for qdiscs that have no
     * queues.  The queues are numbered 0 through n_queues - 1. */
    unsigned int n_queues;

    /* Called to install this TC class on 'netdev'.  The implementation should
     * make the Netlink calls required to set up 'netdev' with the right qdisc
     * and configure it according to 'details'.  The implementation may assume
     * that the current qdisc is the default; that is, there is no need for it
     * to delete the current qdisc before installing itself.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'.
     *
     * (This function is null for tc_ops_other, which cannot be installed.  For
     * other TC classes it should always be nonnull.) */
    int (*tc_install)(struct netdev *netdev, const struct shash *details);

    /* Called when the netdev code determines (through a Netlink query) that
     * this TC class's qdisc is installed on 'netdev', but we didn't install
     * it ourselves and so don't know any of the details.
     *
     * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
     * 'netdev'.  The TCA_KIND attribute of 'nlmsg' is 'linux_name'.  The
     * implementation should parse the other attributes of 'nlmsg' as
     * necessary to determine its configuration.  If necessary it should also
     * use Netlink queries to determine the configuration of queues on
     * 'netdev'.
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'. */
    int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);

    /* Destroys the data structures allocated by the implementation as part of
     * 'tc'.  (This includes destroying 'tc->queues' by calling
     * tc_destroy(tc).)
     *
     * The implementation should not need to perform any Netlink calls.  If
     * desirable, the caller is responsible for deconfiguring the kernel qdisc.
     * (But it may not be desirable.)
     *
     * This function may be null if 'tc' is trivial. */
    void (*tc_destroy)(struct tc *tc);

    /* Retrieves details of 'netdev->tc' configuration into 'details'.
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the configuration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_get)(const struct netdev *netdev, struct shash *details);

    /* Reconfigures 'netdev->tc' according to 'details', performing any
     * required Netlink calls to complete the reconfiguration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_set)(struct netdev *, const struct shash *details);

    /* Retrieves details of 'queue' on 'netdev->tc' into 'details'.  'queue' is
     * one of the 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the queue configuration.
     *
     * This function may be null if 'tc' does not have queues ('n_queues' is
     * 0). */
    int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
                     struct shash *details);

    /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
     * 'details', performing any required Netlink calls to complete the
     * reconfiguration.  The caller ensures that 'queue_id' is less than
     * 'n_queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' does not have queues or its queues are
     * not configurable. */
    int (*class_set)(struct netdev *, unsigned int queue_id,
                     const struct shash *details);

    /* Deletes 'queue' from 'netdev->tc'.  'queue' is one of the 'struct
     * tc_queue's within 'netdev->tc->queues'.
     *
     * This function may be null if 'tc' does not have queues or its queues
     * cannot be deleted. */
    int (*class_delete)(struct netdev *, struct tc_queue *queue);

    /* Obtains stats for 'queue' from 'netdev->tc'.  'queue' is one of the
     * 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * On success, initializes '*stats'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_get_stats)(const struct netdev *netdev,
                           const struct tc_queue *queue,
                           struct netdev_queue_stats *stats);

    /* Extracts queue stats from 'nlmsg', which is a response to a
     * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_dump_stats)(const struct netdev *netdev,
                            const struct ofpbuf *nlmsg,
                            netdev_dump_queue_stats_cb *cb, void *aux);
};
295
296static void
297tc_init(struct tc *tc, const struct tc_ops *ops)
298{
299 tc->ops = ops;
93b13be8 300 hmap_init(&tc->queues);
c1c9c9c4
BP
301}
302
/* Releases the generic part of 'tc' by destroying its 'queues' hmap.  Does
 * not free 'tc' itself or any queue objects; per the tc_ops documentation
 * that is left to the TC implementation. */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
308
309static const struct tc_ops tc_ops_htb;
a339aa81 310static const struct tc_ops tc_ops_hfsc;
c1c9c9c4
BP
311static const struct tc_ops tc_ops_default;
312static const struct tc_ops tc_ops_other;
313
314static const struct tc_ops *tcs[] = {
315 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 316 &tc_ops_hfsc, /* Hierarchical fair service curve. */
c1c9c9c4
BP
317 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
318 &tc_ops_other, /* Some other qdisc. */
319 NULL
320};
149f577a 321
c1c9c9c4
BP
322static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
323static unsigned int tc_get_major(unsigned int handle);
324static unsigned int tc_get_minor(unsigned int handle);
325
326static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
327static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
328static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329
330static struct tcmsg *tc_make_request(const struct netdev *, int type,
331 unsigned int flags, struct ofpbuf *);
332static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
333
334static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
335 struct nlattr **options);
336static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
337 struct nlattr **options,
338 struct netdev_queue_stats *);
339static int tc_query_class(const struct netdev *,
340 unsigned int handle, unsigned int parent,
341 struct ofpbuf **replyp);
342static int tc_delete_class(const struct netdev *, unsigned int handle);
343
344static int tc_del_qdisc(struct netdev *netdev);
345static int tc_query_qdisc(const struct netdev *netdev);
346
347static int tc_calc_cell_log(unsigned int mtu);
348static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
349static void tc_put_rtab(struct ofpbuf *, uint16_t type,
350 const struct tc_ratespec *rate);
351static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
352\f
149f577a
JG
/* Per-device state shared by every open handle (struct netdev_linux) on the
 * same Linux network device. */
struct netdev_dev_linux {
    struct netdev_dev netdev_dev;       /* Generic part; netdev_dev_linux_cast()
                                         * recovers this structure from it. */

    struct shash_node *shash_node;
    unsigned int cache_valid;           /* Bitmask of VALID_* bits. */
    unsigned int change_seq;            /* Bumped by netdev_dev_linux_changed();
                                         * the value 0 is skipped. */

    bool miimon;                    /* Link status of last poll. */
    long long int miimon_interval;  /* Miimon Poll rate. Disabled if <= 0. */
    struct timer miimon_timer;

    /* The following are figured out "on demand" only.  They are only valid
     * when the corresponding VALID_* bit in 'cache_valid' is set. */
    int ifindex;
    uint8_t etheraddr[ETH_ADDR_LEN];
    struct in_addr address, netmask;
    struct in6_addr in6;
    int mtu;
    int carrier;
    bool is_internal;           /* Is this an openvswitch internal device? */
    bool is_tap;                /* Is this a tuntap device? */
    uint32_t kbits_rate;        /* Policing data. */
    uint32_t kbits_burst;
    bool have_vport_stats;
    struct tc *tc;              /* Traffic control state, if any; destroyed in
                                 * netdev_linux_destroy(). */

    /* Class-specific state.  'tap' is used by devices created through
     * netdev_linux_create_tap(). */
    union {
        struct tap_state tap;
    } state;
};
383
149f577a
JG
384struct netdev_linux {
385 struct netdev netdev;
5b7448ed 386 int fd;
149f577a 387};
8b61709d 388
76c308b5
BP
389/* Sockets used for ioctl operations. */
390static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
8b61709d 391
ff4ed3c9
BP
392/* A Netlink routing socket that is not subscribed to any multicast groups. */
393static struct nl_sock *rtnl_sock;
394
8b61709d
BP
395/* This is set pretty low because we probably won't learn anything from the
396 * additional log messages. */
397static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
398
15b3596a 399static int netdev_linux_init(void);
6f643e49 400
0b0544d7 401static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 402 int cmd, const char *cmd_name);
149f577a
JG
403static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
404 const char *cmd_name);
f1acd62b
BP
405static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
406 int cmd, const char *cmd_name);
8b61709d
BP
407static int get_flags(const struct netdev *, int *flagsp);
408static int set_flags(struct netdev *, int flags);
409static int do_get_ifindex(const char *netdev_name);
410static int get_ifindex(const struct netdev *, int *ifindexp);
411static int do_set_addr(struct netdev *netdev,
412 int ioctl_nr, const char *ioctl_name,
413 struct in_addr addr);
414static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
415static int set_etheraddr(const char *netdev_name, int hwaddr_family,
416 const uint8_t[ETH_ADDR_LEN]);
417static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
418static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
488d734d 419static int af_packet_sock(void);
1670c579
EJ
420static void netdev_linux_miimon_run(void);
421static void netdev_linux_miimon_wait(void);
8b61709d 422
15b3596a
JG
423static bool
424is_netdev_linux_class(const struct netdev_class *netdev_class)
425{
426 return netdev_class->init == netdev_linux_init;
427}
428
149f577a
JG
429static struct netdev_dev_linux *
430netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
6c88d577 431{
15b3596a
JG
432 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
433 assert(is_netdev_linux_class(netdev_class));
434
149f577a 435 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
6c88d577
JP
436}
437
8b61709d
BP
438static struct netdev_linux *
439netdev_linux_cast(const struct netdev *netdev)
440{
15b3596a
JG
441 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
442 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
443 assert(is_netdev_linux_class(netdev_class));
444
8b61709d
BP
445 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
446}
ff4ed3c9 447\f
8b61709d
BP
448static int
449netdev_linux_init(void)
450{
451 static int status = -1;
452 if (status < 0) {
ff4ed3c9 453 /* Create AF_INET socket. */
8b61709d
BP
454 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
455 status = af_inet_sock >= 0 ? 0 : errno;
456 if (status) {
457 VLOG_ERR("failed to create inet socket: %s", strerror(status));
458 }
ff4ed3c9
BP
459
460 /* Create rtnetlink socket. */
461 if (!status) {
cceb11f5 462 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
ff4ed3c9
BP
463 if (status) {
464 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
465 strerror(status));
466 }
467 }
8b61709d
BP
468 }
469 return status;
470}
471
/* Periodic processing for this file: calls rtnetlink_link_notifier_run() and
 * then netdev_linux_miimon_run(). */
static void
netdev_linux_run(void)
{
    rtnetlink_link_notifier_run();
    netdev_linux_miimon_run();
}
478
/* Counterpart of netdev_linux_run(): calls rtnetlink_link_notifier_wait() and
 * then netdev_linux_miimon_wait(). */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_notifier_wait();
    netdev_linux_miimon_wait();
}
485
ac4d3bcb
EJ
486static void
487netdev_dev_linux_changed(struct netdev_dev_linux *dev)
488{
489 dev->change_seq++;
490 if (!dev->change_seq) {
491 dev->change_seq++;
492 }
493 dev->cache_valid = 0;
494}
495
8b61709d 496static void
21d6e22e 497netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
67a4917b 498 void *aux OVS_UNUSED)
8b61709d 499{
149f577a 500 struct netdev_dev_linux *dev;
8b61709d 501 if (change) {
46415c90
JG
502 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
503 if (base_dev) {
15b3596a
JG
504 const struct netdev_class *netdev_class =
505 netdev_dev_get_class(base_dev);
506
507 if (is_netdev_linux_class(netdev_class)) {
508 dev = netdev_dev_linux_cast(base_dev);
ac4d3bcb 509 netdev_dev_linux_changed(dev);
15b3596a 510 }
8b61709d
BP
511 }
512 } else {
46415c90 513 struct shash device_shash;
8b61709d 514 struct shash_node *node;
46415c90
JG
515
516 shash_init(&device_shash);
517 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
518 SHASH_FOR_EACH (node, &device_shash) {
149f577a 519 dev = node->data;
ac4d3bcb 520 netdev_dev_linux_changed(dev);
8b61709d 521 }
46415c90 522 shash_destroy(&device_shash);
8b61709d
BP
523 }
524}
525
c3827f61 526/* Creates system and internal devices. */
8b61709d 527static int
de5cdb90
BP
528netdev_linux_create(const struct netdev_class *class, const char *name,
529 struct netdev_dev **netdev_devp)
6c88d577 530{
149f577a
JG
531 struct netdev_dev_linux *netdev_dev;
532 int error;
6c88d577 533
46415c90 534 if (!cache_notifier_refcount) {
21d6e22e
EJ
535 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
536 netdev_linux_cache_cb, NULL);
149f577a
JG
537 if (error) {
538 return error;
539 }
540 }
46415c90 541 cache_notifier_refcount++;
6c88d577 542
149f577a 543 netdev_dev = xzalloc(sizeof *netdev_dev);
ac4d3bcb 544 netdev_dev->change_seq = 1;
de5cdb90 545 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
46415c90 546
149f577a 547 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
548 return 0;
549}
550
5b7448ed
JG
551/* For most types of netdevs we open the device for each call of
552 * netdev_open(). However, this is not the case with tap devices,
553 * since it is only possible to open the device once. In this
554 * situation we share a single file descriptor, and consequently
555 * buffers, across all readers. Therefore once data is read it will
556 * be unavailable to other reads for tap devices. */
a740f0de 557static int
b8dcf5e9 558netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
de5cdb90 559 const char *name, struct netdev_dev **netdev_devp)
a740f0de 560{
149f577a 561 struct netdev_dev_linux *netdev_dev;
a740f0de
JG
562 struct tap_state *state;
563 static const char tap_dev[] = "/dev/net/tun";
564 struct ifreq ifr;
565 int error;
566
149f577a
JG
567 netdev_dev = xzalloc(sizeof *netdev_dev);
568 state = &netdev_dev->state.tap;
a740f0de 569
6c88d577 570 /* Open tap device. */
149f577a
JG
571 state->fd = open(tap_dev, O_RDWR);
572 if (state->fd < 0) {
6c88d577
JP
573 error = errno;
574 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
575 goto error;
576 }
577
578 /* Create tap device. */
579 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 580 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
149f577a 581 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
6c88d577
JP
582 VLOG_WARN("%s: creating tap device failed: %s", name,
583 strerror(errno));
584 error = errno;
585 goto error;
586 }
587
588 /* Make non-blocking. */
149f577a 589 error = set_nonblocking(state->fd);
a740f0de
JG
590 if (error) {
591 goto error;
592 }
593
de5cdb90 594 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
149f577a 595 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
596 return 0;
597
598error:
149f577a 599 free(netdev_dev);
a740f0de
JG
600 return error;
601}
602
a740f0de 603static void
149f577a 604destroy_tap(struct netdev_dev_linux *netdev_dev)
a740f0de 605{
149f577a
JG
606 struct tap_state *state = &netdev_dev->state.tap;
607
608 if (state->fd >= 0) {
609 close(state->fd);
a740f0de
JG
610 }
611}
612
149f577a 613/* Destroys the netdev device 'netdev_dev_'. */
6c88d577 614static void
149f577a 615netdev_linux_destroy(struct netdev_dev *netdev_dev_)
6c88d577 616{
149f577a 617 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
d2bb2799 618 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
6c88d577 619
c1c9c9c4
BP
620 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
621 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
622 }
623
d2bb2799 624 if (class == &netdev_linux_class || class == &netdev_internal_class) {
46415c90 625 cache_notifier_refcount--;
149f577a 626
46415c90 627 if (!cache_notifier_refcount) {
21d6e22e 628 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
149f577a 629 }
d2bb2799 630 } else if (class == &netdev_tap_class) {
149f577a 631 destroy_tap(netdev_dev);
d2bb2799
BP
632 } else {
633 NOT_REACHED();
6c88d577 634 }
149f577a 635
658797c8 636 free(netdev_dev);
6c88d577
JP
637}
638
8b61709d 639static int
7b6b0ef4 640netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
8b61709d 641{
5b7448ed 642 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
8b61709d
BP
643 struct netdev_linux *netdev;
644 enum netdev_flags flags;
645 int error;
646
647 /* Allocate network device. */
ec6fde61 648 netdev = xzalloc(sizeof *netdev);
49a6a163 649 netdev->fd = -1;
5b7448ed 650 netdev_init(&netdev->netdev, netdev_dev_);
8b61709d 651
c3827f61
BP
652 /* Verify that the device really exists, by attempting to read its flags.
653 * (The flags might be cached, in which case this won't actually do an
654 * ioctl.)
655 *
656 * Don't do this for "internal" netdevs, though, because those have to be
657 * created as netdev objects before they exist in the kernel, because
658 * creating them in the kernel happens by passing a netdev object to
659 * dpif_port_add(). */
660 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
661 error = netdev_get_flags(&netdev->netdev, &flags);
662 if (error == ENODEV) {
663 goto error;
664 }
8b61709d
BP
665 }
666
61b999dd
JG
667 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
668 !netdev_dev->state.tap.opened) {
669
670 /* We assume that the first user of the tap device is the primary user
671 * and give them the tap FD. Subsequent users probably just expect
672 * this to be a system device so open it normally to avoid send/receive
673 * directions appearing to be reversed. */
5b7448ed 674 netdev->fd = netdev_dev->state.tap.fd;
61b999dd 675 netdev_dev->state.tap.opened = true;
8b61709d
BP
676 }
677
678 *netdevp = &netdev->netdev;
679 return 0;
680
681error:
149f577a 682 netdev_uninit(&netdev->netdev, true);
8b61709d
BP
683 return error;
684}
685
686/* Closes and destroys 'netdev'. */
687static void
688netdev_linux_close(struct netdev *netdev_)
689{
690 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
691
49a6a163 692 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
5b7448ed 693 close(netdev->fd);
8b61709d
BP
694 }
695 free(netdev);
696}
e9e28be3 697
/* Adds the names of all network devices reported by if_nameindex() to 'sset'.
 * Returns 0 on success, otherwise a positive errno value. */
static int
netdev_linux_enumerate(struct sset *sset)
{
    struct if_nameindex *names;
    size_t i;

    names = if_nameindex();
    if (!names) {
        /* Capture errno before logging, which might overwrite it. */
        int error = errno;

        VLOG_WARN("could not obtain list of network device names: %s",
                  strerror(error));
        return error;
    }

    for (i = 0; names[i].if_name != NULL; i++) {
        sset_add(sset, names[i].if_name);
    }
    if_freenameindex(names);
    return 0;
}
719
7b6b0ef4
BP
720static int
721netdev_linux_listen(struct netdev *netdev_)
722{
723 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
724 struct sockaddr_ll sll;
725 int ifindex;
726 int error;
727 int fd;
728
729 if (netdev->fd >= 0) {
730 return 0;
731 }
732
733 /* Create file descriptor. */
734 fd = socket(PF_PACKET, SOCK_RAW, 0);
735 if (fd < 0) {
736 error = errno;
737 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
738 goto error;
739 }
740
741 /* Set non-blocking mode. */
742 error = set_nonblocking(fd);
743 if (error) {
744 goto error;
745 }
746
747 /* Get ethernet device index. */
748 error = get_ifindex(&netdev->netdev, &ifindex);
749 if (error) {
750 goto error;
751 }
752
753 /* Bind to specific ethernet device. */
754 memset(&sll, 0, sizeof sll);
755 sll.sll_family = AF_PACKET;
756 sll.sll_ifindex = ifindex;
757 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
758 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
759 error = errno;
760 VLOG_ERR("%s: failed to bind raw socket (%s)",
761 netdev_get_name(netdev_), strerror(error));
762 goto error;
763 }
764
765 netdev->fd = fd;
766 return 0;
767
768error:
769 if (fd >= 0) {
770 close(fd);
771 }
772 return error;
773}
774
8b61709d
BP
775static int
776netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
777{
778 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
779
5b7448ed 780 if (netdev->fd < 0) {
7b6b0ef4 781 /* Device is not listening. */
c0e5f6ca 782 return -EAGAIN;
8b61709d
BP
783 }
784
785 for (;;) {
5b7448ed 786 ssize_t retval = read(netdev->fd, data, size);
8b61709d
BP
787 if (retval >= 0) {
788 return retval;
789 } else if (errno != EINTR) {
790 if (errno != EAGAIN) {
791 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
792 strerror(errno), netdev_get_name(netdev_));
793 }
c0e5f6ca 794 return -errno;
8b61709d
BP
795 }
796 }
797}
798
799/* Registers with the poll loop to wake up from the next call to poll_block()
800 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
801static void
802netdev_linux_recv_wait(struct netdev *netdev_)
803{
804 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed
JG
805 if (netdev->fd >= 0) {
806 poll_fd_wait(netdev->fd, POLLIN);
8b61709d
BP
807 }
808}
809
810/* Discards all packets waiting to be received from 'netdev'. */
811static int
812netdev_linux_drain(struct netdev *netdev_)
813{
814 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 815 if (netdev->fd < 0) {
8b61709d 816 return 0;
5b7448ed 817 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
8b61709d 818 struct ifreq ifr;
149f577a 819 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
8b61709d
BP
820 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
821 if (error) {
822 return error;
823 }
5b7448ed 824 drain_fd(netdev->fd, ifr.ifr_qlen);
8b61709d
BP
825 return 0;
826 } else {
5b7448ed 827 return drain_rcvbuf(netdev->fd);
8b61709d
BP
828 }
829}
830
831/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
832 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
833 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
834 * the packet is too big or too small to transmit on the device.
835 *
836 * The caller retains ownership of 'buffer' in all cases.
837 *
838 * The kernel maintains a packet transmission queue, so the caller is not
839 * expected to do additional queuing of packets. */
840static int
841netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
842{
f23347ea
BP
843 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
844 for (;;) {
845 ssize_t retval;
8b61709d 846
f23347ea
BP
847 if (netdev->fd < 0) {
848 /* Use our AF_PACKET socket to send to this device. */
849 struct sockaddr_ll sll;
850 struct msghdr msg;
851 struct iovec iov;
852 int ifindex;
853 int error;
488d734d
BP
854 int sock;
855
856 sock = af_packet_sock();
857 if (sock < 0) {
858 return sock;
859 }
f23347ea
BP
860
861 error = get_ifindex(netdev_, &ifindex);
862 if (error) {
863 return error;
864 }
8b61709d 865
f23347ea
BP
866 /* We don't bother setting most fields in sockaddr_ll because the
867 * kernel ignores them for SOCK_RAW. */
868 memset(&sll, 0, sizeof sll);
869 sll.sll_family = AF_PACKET;
870 sll.sll_ifindex = ifindex;
76c308b5 871
f23347ea
BP
872 iov.iov_base = (void *) data;
873 iov.iov_len = size;
76c308b5 874
f23347ea
BP
875 msg.msg_name = &sll;
876 msg.msg_namelen = sizeof sll;
877 msg.msg_iov = &iov;
878 msg.msg_iovlen = 1;
879 msg.msg_control = NULL;
880 msg.msg_controllen = 0;
881 msg.msg_flags = 0;
882
488d734d 883 retval = sendmsg(sock, &msg, 0);
f23347ea
BP
884 } else {
885 /* Use the netdev's own fd to send to this device. This is
886 * essential for tap devices, because packets sent to a tap device
887 * with an AF_PACKET socket will loop back to be *received* again
888 * on the tap device. */
889 retval = write(netdev->fd, data, size);
890 }
76c308b5 891
8b61709d
BP
892 if (retval < 0) {
893 /* The Linux AF_PACKET implementation never blocks waiting for room
894 * for packets, instead returning ENOBUFS. Translate this into
895 * EAGAIN for the caller. */
896 if (errno == ENOBUFS) {
897 return EAGAIN;
898 } else if (errno == EINTR) {
899 continue;
900 } else if (errno != EAGAIN) {
901 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
902 netdev_get_name(netdev_), strerror(errno));
903 }
904 return errno;
905 } else if (retval != size) {
906 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
907 "%zu) on %s", retval, size, netdev_get_name(netdev_));
908 return EMSGSIZE;
909 } else {
910 return 0;
911 }
912 }
913}
914
915/* Registers with the poll loop to wake up from the next call to poll_block()
916 * when the packet transmission queue has sufficient room to transmit a packet
917 * with netdev_send().
918 *
919 * The kernel maintains a packet transmission queue, so the client is not
920 * expected to do additional queuing of packets. Thus, this function is
921 * unlikely to ever be used. It is included for completeness. */
922static void
923netdev_linux_send_wait(struct netdev *netdev_)
924{
925 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 926 if (netdev->fd < 0) {
8b61709d 927 /* Nothing to do. */
5b7448ed
JG
928 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
929 poll_fd_wait(netdev->fd, POLLOUT);
8b61709d
BP
930 } else {
931 /* TAP device always accepts packets.*/
932 poll_immediate_wake();
933 }
934}
935
936/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
937 * otherwise a positive errno value. */
938static int
939netdev_linux_set_etheraddr(struct netdev *netdev_,
940 const uint8_t mac[ETH_ADDR_LEN])
941{
149f577a
JG
942 struct netdev_dev_linux *netdev_dev =
943 netdev_dev_linux_cast(netdev_get_dev(netdev_));
eb395f2e
BP
944 int error;
945
149f577a
JG
946 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
947 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
eb395f2e
BP
948 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
949 if (!error) {
149f577a
JG
950 netdev_dev->cache_valid |= VALID_ETHERADDR;
951 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e
BP
952 }
953 } else {
954 error = 0;
8b61709d
BP
955 }
956 return error;
957}
958
959/* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
960 * free the returned buffer. */
961static int
962netdev_linux_get_etheraddr(const struct netdev *netdev_,
963 uint8_t mac[ETH_ADDR_LEN])
964{
149f577a
JG
965 struct netdev_dev_linux *netdev_dev =
966 netdev_dev_linux_cast(netdev_get_dev(netdev_));
967 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
8b61709d 968 int error = get_etheraddr(netdev_get_name(netdev_),
149f577a 969 netdev_dev->etheraddr);
8b61709d
BP
970 if (error) {
971 return error;
972 }
149f577a 973 netdev_dev->cache_valid |= VALID_ETHERADDR;
8b61709d 974 }
149f577a 975 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
8b61709d
BP
976 return 0;
977}
978
979/* Returns the maximum size of transmitted (and received) packets on 'netdev',
980 * in bytes, not including the hardware header; thus, this is typically 1500
981 * bytes for Ethernet devices. */
982static int
983netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
984{
149f577a
JG
985 struct netdev_dev_linux *netdev_dev =
986 netdev_dev_linux_cast(netdev_get_dev(netdev_));
987 if (!(netdev_dev->cache_valid & VALID_MTU)) {
8b61709d
BP
988 struct ifreq ifr;
989 int error;
990
149f577a
JG
991 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
992 SIOCGIFMTU, "SIOCGIFMTU");
8b61709d
BP
993 if (error) {
994 return error;
995 }
149f577a
JG
996 netdev_dev->mtu = ifr.ifr_mtu;
997 netdev_dev->cache_valid |= VALID_MTU;
8b61709d 998 }
149f577a 999 *mtup = netdev_dev->mtu;
8b61709d
BP
1000 return 0;
1001}
1002
9ab3d9a3
BP
/* Returns the ifindex of 'netdev', if successful, as a positive number.
 * On failure, returns a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        /* get_ifindex() reports a positive error; negate it so that it cannot
         * be confused with a valid ifindex. */
        return -error;
    }
    return ifindex;
}
1013
8b61709d
BP
1014static int
1015netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1016{
149f577a
JG
1017 struct netdev_dev_linux *netdev_dev =
1018 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1019 int error = 0;
1020 char *fn = NULL;
1021 int fd = -1;
1022
1670c579
EJ
1023 if (netdev_dev->miimon_interval > 0) {
1024 *carrier = netdev_dev->miimon;
1025 return 0;
1026 }
1027
149f577a 1028 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
8b61709d
BP
1029 char line[8];
1030 int retval;
1031
149f577a
JG
1032 fn = xasprintf("/sys/class/net/%s/carrier",
1033 netdev_get_name(netdev_));
8b61709d
BP
1034 fd = open(fn, O_RDONLY);
1035 if (fd < 0) {
1036 error = errno;
1037 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1038 goto exit;
1039 }
1040
1041 retval = read(fd, line, sizeof line);
1042 if (retval < 0) {
1043 error = errno;
1044 if (error == EINVAL) {
1045 /* This is the normal return value when we try to check carrier
1046 * if the network device is not up. */
1047 } else {
1048 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1049 }
1050 goto exit;
1051 } else if (retval == 0) {
1052 error = EPROTO;
1053 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
1054 goto exit;
1055 }
1056
1057 if (line[0] != '0' && line[0] != '1') {
1058 error = EPROTO;
1059 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1060 fn, line[0]);
1061 goto exit;
1062 }
149f577a
JG
1063 netdev_dev->carrier = line[0] != '0';
1064 netdev_dev->cache_valid |= VALID_CARRIER;
8b61709d 1065 }
149f577a 1066 *carrier = netdev_dev->carrier;
8b61709d
BP
1067 error = 0;
1068
1069exit:
1070 if (fd >= 0) {
1071 close(fd);
1072 }
1073 free(fn);
1074 return error;
1075}
1076
63331829 1077static int
1670c579
EJ
1078netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1079 struct mii_ioctl_data *data)
63331829 1080{
63331829 1081 struct ifreq ifr;
782e6111 1082 int error;
63331829 1083
63331829 1084 memset(&ifr, 0, sizeof ifr);
782e6111 1085 memcpy(&ifr.ifr_data, data, sizeof *data);
1670c579 1086 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1087 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1088
782e6111
EJ
1089 return error;
1090}
1091
1092static int
1670c579 1093netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1094{
782e6111
EJ
1095 struct mii_ioctl_data data;
1096 int error;
63331829 1097
782e6111
EJ
1098 *miimon = false;
1099
1100 memset(&data, 0, sizeof data);
1670c579 1101 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1102 if (!error) {
1103 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1104 data.reg_num = MII_BMSR;
1670c579 1105 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1106 &data);
63331829
EJ
1107
1108 if (!error) {
782e6111 1109 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1110 } else {
1111 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1112 }
1113 } else {
1114 struct ethtool_cmd ecmd;
63331829
EJ
1115
1116 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1117 name);
1118
1119 memset(&ecmd, 0, sizeof ecmd);
1120 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1121 "ETHTOOL_GLINK");
1122 if (!error) {
782e6111
EJ
1123 struct ethtool_value eval;
1124
1125 memcpy(&eval, &ecmd, sizeof eval);
1126 *miimon = !!eval.data;
63331829
EJ
1127 } else {
1128 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1129 }
1130 }
1131
1132 return error;
1133}
1134
1670c579
EJ
1135static int
1136netdev_linux_set_miimon_interval(struct netdev *netdev_,
1137 long long int interval)
1138{
1139 struct netdev_dev_linux *netdev_dev;
1140
1141 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1142
1143 interval = interval > 0 ? MAX(interval, 100) : 0;
1144 if (netdev_dev->miimon_interval != interval) {
1145 netdev_dev->miimon_interval = interval;
1146 timer_set_expired(&netdev_dev->miimon_timer);
1147 }
1148
1149 return 0;
1150}
1151
1152static void
1153netdev_linux_miimon_run(void)
1154{
1155 struct shash device_shash;
1156 struct shash_node *node;
1157
1158 shash_init(&device_shash);
1159 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1160 SHASH_FOR_EACH (node, &device_shash) {
1161 struct netdev_dev_linux *dev = node->data;
1162 bool miimon;
1163
1164 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1165 continue;
1166 }
1167
1168 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1169 if (miimon != dev->miimon) {
1670c579 1170 dev->miimon = miimon;
ac4d3bcb 1171 netdev_dev_linux_changed(dev);
1670c579
EJ
1172 }
1173
1174 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1175 }
1176
1177 shash_destroy(&device_shash);
1178}
1179
1180static void
1181netdev_linux_miimon_wait(void)
1182{
1183 struct shash device_shash;
1184 struct shash_node *node;
1185
1186 shash_init(&device_shash);
1187 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1188 SHASH_FOR_EACH (node, &device_shash) {
1189 struct netdev_dev_linux *dev = node->data;
1190
1191 if (dev->miimon_interval > 0) {
1192 timer_wait(&dev->miimon_timer);
1193 }
1194 }
1195 shash_destroy(&device_shash);
1196}
1197
8b61709d
BP
1198/* Check whether we can we use RTM_GETLINK to get network device statistics.
1199 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1200 * enabled. */
1201static bool
1202check_for_working_netlink_stats(void)
1203{
1204 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1205 * preferable, so if that works, we'll use it. */
1206 int ifindex = do_get_ifindex("lo");
1207 if (ifindex < 0) {
1208 VLOG_WARN("failed to get ifindex for lo, "
1209 "obtaining netdev stats from proc");
1210 return false;
1211 } else {
1212 struct netdev_stats stats;
1213 int error = get_stats_via_netlink(ifindex, &stats);
1214 if (!error) {
1215 VLOG_DBG("obtaining netdev stats via rtnetlink");
1216 return true;
1217 } else {
1218 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1219 "via proc (you are probably running a pre-2.6.19 "
1220 "kernel)", strerror(error));
1221 return false;
1222 }
1223 }
1224}
1225
8722022c
BP
1226/* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1227static void
1228netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1229{
1230 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1231 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1232 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
d295e8e9 1233
8722022c 1234 netdev_dev->is_tap = !strcmp(type, "tap");
9fe3b9a2
BP
1235 netdev_dev->is_internal = (!netdev_dev->is_tap
1236 && dpif_linux_is_internal_device(name));
8722022c
BP
1237 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1238 }
1239}
1240
92df599c
JG
/* Exchanges the values stored in '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_a = *a;
    const uint64_t old_b = *b;

    *a = old_b;
    *b = old_a;
}
1248
7fbef77a 1249/* Retrieves current device stats for 'netdev'. */
8b61709d 1250static int
149f577a
JG
1251netdev_linux_get_stats(const struct netdev *netdev_,
1252 struct netdev_stats *stats)
8b61709d 1253{
149f577a
JG
1254 struct netdev_dev_linux *netdev_dev =
1255 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1256 static int use_netlink_stats = -1;
1257 int error;
1258
7fbef77a
JG
1259 if (netdev_dev->have_vport_stats ||
1260 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1261
1262 error = netdev_vport_get_stats(netdev_, stats);
1263 netdev_dev->have_vport_stats = !error;
1264 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
8b61709d 1265 }
8b61709d 1266
7fbef77a
JG
1267 if (!netdev_dev->have_vport_stats) {
1268 if (use_netlink_stats < 0) {
1269 use_netlink_stats = check_for_working_netlink_stats();
1270 }
1271 if (use_netlink_stats) {
1272 int ifindex;
1273
1274 error = get_ifindex(netdev_, &ifindex);
1275 if (!error) {
1276 error = get_stats_via_netlink(ifindex, stats);
1277 }
1278 } else {
1279 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
8b61709d 1280 }
8b61709d 1281 }
fe6b0e03
JG
1282
1283 /* If this port is an internal port then the transmit and receive stats
1284 * will appear to be swapped relative to the other ports since we are the
1285 * one sending the data, not a remote computer. For consistency, we swap
7fbef77a
JG
1286 * them back here. This does not apply if we are getting stats from the
1287 * vport layer because it always tracks stats from the perspective of the
1288 * switch. */
92df599c 1289 netdev_linux_update_is_pseudo(netdev_dev);
7fbef77a
JG
1290 if (!error && !netdev_dev->have_vport_stats &&
1291 (netdev_dev->is_internal || netdev_dev->is_tap)) {
92df599c
JG
1292 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1293 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1294 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1295 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1296 stats->rx_length_errors = 0;
1297 stats->rx_over_errors = 0;
1298 stats->rx_crc_errors = 0;
1299 stats->rx_frame_errors = 0;
1300 stats->rx_fifo_errors = 0;
1301 stats->rx_missed_errors = 0;
1302 stats->tx_aborted_errors = 0;
1303 stats->tx_carrier_errors = 0;
1304 stats->tx_fifo_errors = 0;
1305 stats->tx_heartbeat_errors = 0;
1306 stats->tx_window_errors = 0;
1307 }
1308
8b61709d
BP
1309 return error;
1310}
1311
1312/* Stores the features supported by 'netdev' into each of '*current',
1313 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1314 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
7671589a 1315 * successful, otherwise a positive errno value. */
8b61709d 1316static int
6f2f5cce 1317netdev_linux_get_features(const struct netdev *netdev,
8b61709d
BP
1318 uint32_t *current, uint32_t *advertised,
1319 uint32_t *supported, uint32_t *peer)
1320{
1321 struct ethtool_cmd ecmd;
1322 int error;
1323
1324 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1325 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1326 ETHTOOL_GSET, "ETHTOOL_GSET");
1327 if (error) {
1328 return error;
1329 }
1330
1331 /* Supported features. */
1332 *supported = 0;
1333 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1334 *supported |= OFPPF_10MB_HD;
1335 }
1336 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1337 *supported |= OFPPF_10MB_FD;
1338 }
1339 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1340 *supported |= OFPPF_100MB_HD;
1341 }
1342 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1343 *supported |= OFPPF_100MB_FD;
1344 }
1345 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1346 *supported |= OFPPF_1GB_HD;
1347 }
1348 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1349 *supported |= OFPPF_1GB_FD;
1350 }
1351 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1352 *supported |= OFPPF_10GB_FD;
1353 }
1354 if (ecmd.supported & SUPPORTED_TP) {
1355 *supported |= OFPPF_COPPER;
1356 }
1357 if (ecmd.supported & SUPPORTED_FIBRE) {
1358 *supported |= OFPPF_FIBER;
1359 }
1360 if (ecmd.supported & SUPPORTED_Autoneg) {
1361 *supported |= OFPPF_AUTONEG;
1362 }
1363 if (ecmd.supported & SUPPORTED_Pause) {
1364 *supported |= OFPPF_PAUSE;
1365 }
1366 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1367 *supported |= OFPPF_PAUSE_ASYM;
1368 }
1369
1370 /* Advertised features. */
1371 *advertised = 0;
1372 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1373 *advertised |= OFPPF_10MB_HD;
1374 }
1375 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1376 *advertised |= OFPPF_10MB_FD;
1377 }
1378 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1379 *advertised |= OFPPF_100MB_HD;
1380 }
1381 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1382 *advertised |= OFPPF_100MB_FD;
1383 }
1384 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1385 *advertised |= OFPPF_1GB_HD;
1386 }
1387 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1388 *advertised |= OFPPF_1GB_FD;
1389 }
1390 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1391 *advertised |= OFPPF_10GB_FD;
1392 }
1393 if (ecmd.advertising & ADVERTISED_TP) {
1394 *advertised |= OFPPF_COPPER;
1395 }
1396 if (ecmd.advertising & ADVERTISED_FIBRE) {
1397 *advertised |= OFPPF_FIBER;
1398 }
1399 if (ecmd.advertising & ADVERTISED_Autoneg) {
1400 *advertised |= OFPPF_AUTONEG;
1401 }
1402 if (ecmd.advertising & ADVERTISED_Pause) {
1403 *advertised |= OFPPF_PAUSE;
1404 }
1405 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1406 *advertised |= OFPPF_PAUSE_ASYM;
1407 }
1408
1409 /* Current settings. */
1410 if (ecmd.speed == SPEED_10) {
1411 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1412 } else if (ecmd.speed == SPEED_100) {
1413 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1414 } else if (ecmd.speed == SPEED_1000) {
1415 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1416 } else if (ecmd.speed == SPEED_10000) {
1417 *current = OFPPF_10GB_FD;
1418 } else {
1419 *current = 0;
1420 }
1421
1422 if (ecmd.port == PORT_TP) {
1423 *current |= OFPPF_COPPER;
1424 } else if (ecmd.port == PORT_FIBRE) {
1425 *current |= OFPPF_FIBER;
1426 }
1427
1428 if (ecmd.autoneg) {
1429 *current |= OFPPF_AUTONEG;
1430 }
1431
1432 /* Peer advertisements. */
1433 *peer = 0; /* XXX */
1434
1435 return 0;
1436}
1437
1438/* Set the features advertised by 'netdev' to 'advertise'. */
1439static int
1440netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1441{
1442 struct ethtool_cmd ecmd;
1443 int error;
1444
1445 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1446 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1447 ETHTOOL_GSET, "ETHTOOL_GSET");
1448 if (error) {
1449 return error;
1450 }
1451
1452 ecmd.advertising = 0;
1453 if (advertise & OFPPF_10MB_HD) {
1454 ecmd.advertising |= ADVERTISED_10baseT_Half;
1455 }
1456 if (advertise & OFPPF_10MB_FD) {
1457 ecmd.advertising |= ADVERTISED_10baseT_Full;
1458 }
1459 if (advertise & OFPPF_100MB_HD) {
1460 ecmd.advertising |= ADVERTISED_100baseT_Half;
1461 }
1462 if (advertise & OFPPF_100MB_FD) {
1463 ecmd.advertising |= ADVERTISED_100baseT_Full;
1464 }
1465 if (advertise & OFPPF_1GB_HD) {
1466 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1467 }
1468 if (advertise & OFPPF_1GB_FD) {
1469 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1470 }
1471 if (advertise & OFPPF_10GB_FD) {
1472 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1473 }
1474 if (advertise & OFPPF_COPPER) {
1475 ecmd.advertising |= ADVERTISED_TP;
1476 }
1477 if (advertise & OFPPF_FIBER) {
1478 ecmd.advertising |= ADVERTISED_FIBRE;
1479 }
1480 if (advertise & OFPPF_AUTONEG) {
1481 ecmd.advertising |= ADVERTISED_Autoneg;
1482 }
1483 if (advertise & OFPPF_PAUSE) {
1484 ecmd.advertising |= ADVERTISED_Pause;
1485 }
1486 if (advertise & OFPPF_PAUSE_ASYM) {
1487 ecmd.advertising |= ADVERTISED_Asym_Pause;
1488 }
0b0544d7 1489 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1490 ETHTOOL_SSET, "ETHTOOL_SSET");
1491}
1492
1493/* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1494 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1495 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1496 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1497 * sets '*vlan_vid' to -1. */
1498static int
1499netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1500{
1501 const char *netdev_name = netdev_get_name(netdev);
1502 struct ds line = DS_EMPTY_INITIALIZER;
1503 FILE *stream = NULL;
1504 int error;
1505 char *fn;
1506
1507 COVERAGE_INC(netdev_get_vlan_vid);
1508 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1509 stream = fopen(fn, "r");
1510 if (!stream) {
1511 error = errno;
1512 goto done;
1513 }
1514
1515 if (ds_get_line(&line, stream)) {
1516 if (ferror(stream)) {
1517 error = errno;
1518 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1519 } else {
1520 error = EPROTO;
1521 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1522 }
1523 goto done;
1524 }
1525
1526 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1527 error = EPROTO;
1528 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1529 fn, ds_cstr(&line));
1530 goto done;
1531 }
1532
1533 error = 0;
1534
1535done:
1536 free(fn);
1537 if (stream) {
1538 fclose(stream);
1539 }
1540 ds_destroy(&line);
1541 if (error) {
1542 *vlan_vid = -1;
1543 }
1544 return error;
1545}
1546
/* Shell command templates for ingress policing.  POLICE_ADD_CMD takes the
 * device name.  POLICE_CONFIG_CMD takes the device name followed by the rate
 * and the burst, both passed as uint32_t, hence the PRIu32 conversions. */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %" PRIu32 "kbit burst %" PRIu32 "k mtu 65535 drop flowid :1"
8b61709d 1549
8e460221 1550/* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
6f42c8ea
BP
1551 * positive errno value.
1552 *
1553 * This function is equivalent to running
1554 * /sbin/tc qdisc del dev %s handle ffff: ingress
1555 * but it is much, much faster.
1556 */
8e460221
BP
1557static int
1558netdev_linux_remove_policing(struct netdev *netdev)
1559{
80a86fbe
BP
1560 struct netdev_dev_linux *netdev_dev =
1561 netdev_dev_linux_cast(netdev_get_dev(netdev));
8e460221 1562 const char *netdev_name = netdev_get_name(netdev);
8e460221 1563
6f42c8ea 1564 struct ofpbuf request;
6f42c8ea 1565 struct tcmsg *tcmsg;
6f42c8ea
BP
1566 int error;
1567
c1c9c9c4 1568 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
1569 if (!tcmsg) {
1570 return ENODEV;
1571 }
c1c9c9c4 1572 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
6f42c8ea
BP
1573 tcmsg->tcm_parent = TC_H_INGRESS;
1574 nl_msg_put_string(&request, TCA_KIND, "ingress");
1575 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
c1c9c9c4
BP
1576
1577 error = tc_transact(&request, NULL);
4d10512c 1578 if (error && error != ENOENT && error != EINVAL) {
6f42c8ea
BP
1579 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1580 netdev_name, strerror(error));
1581 return error;
1582 }
1583
80a86fbe
BP
1584 netdev_dev->kbits_rate = 0;
1585 netdev_dev->kbits_burst = 0;
1586 netdev_dev->cache_valid |= VALID_POLICING;
8e460221
BP
1587 return 0;
1588}
1589
8b61709d
BP
1590/* Attempts to set input rate limiting (policing) policy. */
1591static int
1592netdev_linux_set_policing(struct netdev *netdev,
1593 uint32_t kbits_rate, uint32_t kbits_burst)
1594{
80a86fbe
BP
1595 struct netdev_dev_linux *netdev_dev =
1596 netdev_dev_linux_cast(netdev_get_dev(netdev));
8b61709d
BP
1597 const char *netdev_name = netdev_get_name(netdev);
1598 char command[1024];
1599
1600 COVERAGE_INC(netdev_set_policing);
8e460221 1601
80a86fbe
BP
1602 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1603 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1604 : kbits_burst); /* Stick with user-specified value. */
1605
1606 if (netdev_dev->cache_valid & VALID_POLICING
1607 && netdev_dev->kbits_rate == kbits_rate
1608 && netdev_dev->kbits_burst == kbits_burst) {
1609 /* Assume that settings haven't changed since we last set them. */
1610 return 0;
1611 }
1612
8e460221 1613 netdev_linux_remove_policing(netdev);
8b61709d 1614 if (kbits_rate) {
8b61709d
BP
1615 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1616 if (system(command) != 0) {
1617 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1618 return -1;
1619 }
1620
1621 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1622 kbits_rate, kbits_burst);
1623 if (system(command) != 0) {
1624 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1625 netdev_name);
1626 return -1;
1627 }
80a86fbe
BP
1628
1629 netdev_dev->kbits_rate = kbits_rate;
1630 netdev_dev->kbits_burst = kbits_burst;
1631 netdev_dev->cache_valid |= VALID_POLICING;
8b61709d
BP
1632 }
1633
1634 return 0;
1635}
1636
c1c9c9c4
BP
1637static int
1638netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 1639 struct sset *types)
c1c9c9c4
BP
1640{
1641 const struct tc_ops **opsp;
1642
1643 for (opsp = tcs; *opsp != NULL; opsp++) {
1644 const struct tc_ops *ops = *opsp;
1645 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 1646 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
1647 }
1648 }
1649 return 0;
1650}
1651
1652static const struct tc_ops *
1653tc_lookup_ovs_name(const char *name)
1654{
1655 const struct tc_ops **opsp;
1656
1657 for (opsp = tcs; *opsp != NULL; opsp++) {
1658 const struct tc_ops *ops = *opsp;
1659 if (!strcmp(name, ops->ovs_name)) {
1660 return ops;
1661 }
1662 }
1663 return NULL;
1664}
1665
1666static const struct tc_ops *
1667tc_lookup_linux_name(const char *name)
1668{
1669 const struct tc_ops **opsp;
1670
1671 for (opsp = tcs; *opsp != NULL; opsp++) {
1672 const struct tc_ops *ops = *opsp;
1673 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1674 return ops;
1675 }
1676 }
1677 return NULL;
1678}
1679
93b13be8
BP
1680static struct tc_queue *
1681tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1682 size_t hash)
1683{
1684 struct netdev_dev_linux *netdev_dev =
1685 netdev_dev_linux_cast(netdev_get_dev(netdev));
1686 struct tc_queue *queue;
1687
4e8e4213 1688 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
93b13be8
BP
1689 if (queue->queue_id == queue_id) {
1690 return queue;
1691 }
1692 }
1693 return NULL;
1694}
1695
/* Returns 'netdev''s queue with ID 'queue_id', or NULL if there is none.  The
 * queue map is keyed on hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    const size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1701
c1c9c9c4
BP
1702static int
1703netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1704 const char *type,
1705 struct netdev_qos_capabilities *caps)
1706{
1707 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1708 if (!ops) {
1709 return EOPNOTSUPP;
1710 }
1711 caps->n_queues = ops->n_queues;
1712 return 0;
1713}
1714
1715static int
1716netdev_linux_get_qos(const struct netdev *netdev,
1717 const char **typep, struct shash *details)
1718{
1719 struct netdev_dev_linux *netdev_dev =
1720 netdev_dev_linux_cast(netdev_get_dev(netdev));
1721 int error;
1722
1723 error = tc_query_qdisc(netdev);
1724 if (error) {
1725 return error;
1726 }
1727
1728 *typep = netdev_dev->tc->ops->ovs_name;
1729 return (netdev_dev->tc->ops->qdisc_get
1730 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1731 : 0);
1732}
1733
1734static int
1735netdev_linux_set_qos(struct netdev *netdev,
1736 const char *type, const struct shash *details)
1737{
1738 struct netdev_dev_linux *netdev_dev =
1739 netdev_dev_linux_cast(netdev_get_dev(netdev));
1740 const struct tc_ops *new_ops;
1741 int error;
1742
1743 new_ops = tc_lookup_ovs_name(type);
1744 if (!new_ops || !new_ops->tc_install) {
1745 return EOPNOTSUPP;
1746 }
1747
1748 error = tc_query_qdisc(netdev);
1749 if (error) {
1750 return error;
1751 }
1752
1753 if (new_ops == netdev_dev->tc->ops) {
1754 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1755 } else {
1756 /* Delete existing qdisc. */
1757 error = tc_del_qdisc(netdev);
1758 if (error) {
1759 return error;
1760 }
1761 assert(netdev_dev->tc == NULL);
1762
1763 /* Install new qdisc. */
1764 error = new_ops->tc_install(netdev, details);
1765 assert((error == 0) == (netdev_dev->tc != NULL));
1766
1767 return error;
1768 }
1769}
1770
1771static int
1772netdev_linux_get_queue(const struct netdev *netdev,
1773 unsigned int queue_id, struct shash *details)
1774{
1775 struct netdev_dev_linux *netdev_dev =
1776 netdev_dev_linux_cast(netdev_get_dev(netdev));
1777 int error;
1778
1779 error = tc_query_qdisc(netdev);
1780 if (error) {
1781 return error;
93b13be8
BP
1782 } else {
1783 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1784 return (queue
1785 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1786 : ENOENT);
c1c9c9c4 1787 }
c1c9c9c4
BP
1788}
1789
1790static int
1791netdev_linux_set_queue(struct netdev *netdev,
1792 unsigned int queue_id, const struct shash *details)
1793{
1794 struct netdev_dev_linux *netdev_dev =
1795 netdev_dev_linux_cast(netdev_get_dev(netdev));
1796 int error;
1797
1798 error = tc_query_qdisc(netdev);
1799 if (error) {
1800 return error;
1801 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1802 || !netdev_dev->tc->ops->class_set) {
1803 return EINVAL;
1804 }
1805
1806 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1807}
1808
1809static int
1810netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1811{
1812 struct netdev_dev_linux *netdev_dev =
1813 netdev_dev_linux_cast(netdev_get_dev(netdev));
1814 int error;
1815
1816 error = tc_query_qdisc(netdev);
1817 if (error) {
1818 return error;
1819 } else if (!netdev_dev->tc->ops->class_delete) {
1820 return EINVAL;
93b13be8
BP
1821 } else {
1822 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1823 return (queue
1824 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1825 : ENOENT);
c1c9c9c4 1826 }
c1c9c9c4
BP
1827}
1828
1829static int
1830netdev_linux_get_queue_stats(const struct netdev *netdev,
1831 unsigned int queue_id,
1832 struct netdev_queue_stats *stats)
1833{
1834 struct netdev_dev_linux *netdev_dev =
1835 netdev_dev_linux_cast(netdev_get_dev(netdev));
1836 int error;
1837
1838 error = tc_query_qdisc(netdev);
1839 if (error) {
1840 return error;
c1c9c9c4
BP
1841 } else if (!netdev_dev->tc->ops->class_get_stats) {
1842 return EOPNOTSUPP;
93b13be8
BP
1843 } else {
1844 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1845 return (queue
1846 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1847 : ENOENT);
c1c9c9c4 1848 }
c1c9c9c4
BP
1849}
1850
23a98ffe 1851static bool
c1c9c9c4
BP
1852start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1853{
1854 struct ofpbuf request;
1855 struct tcmsg *tcmsg;
1856
1857 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
1858 if (!tcmsg) {
1859 return false;
1860 }
3c4de644 1861 tcmsg->tcm_parent = 0;
c1c9c9c4
BP
1862 nl_dump_start(dump, rtnl_sock, &request);
1863 ofpbuf_uninit(&request);
23a98ffe 1864 return true;
c1c9c9c4
BP
1865}
1866
1867static int
1868netdev_linux_dump_queues(const struct netdev *netdev,
1869 netdev_dump_queues_cb *cb, void *aux)
1870{
1871 struct netdev_dev_linux *netdev_dev =
1872 netdev_dev_linux_cast(netdev_get_dev(netdev));
93b13be8 1873 struct tc_queue *queue;
c1c9c9c4
BP
1874 struct shash details;
1875 int last_error;
c1c9c9c4
BP
1876 int error;
1877
1878 error = tc_query_qdisc(netdev);
1879 if (error) {
1880 return error;
1881 } else if (!netdev_dev->tc->ops->class_get) {
1882 return EOPNOTSUPP;
1883 }
1884
1885 last_error = 0;
1886 shash_init(&details);
4e8e4213 1887 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
c1c9c9c4
BP
1888 shash_clear(&details);
1889
93b13be8 1890 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
c1c9c9c4 1891 if (!error) {
93b13be8 1892 (*cb)(queue->queue_id, &details, aux);
c1c9c9c4
BP
1893 } else {
1894 last_error = error;
1895 }
1896 }
1897 shash_destroy(&details);
1898
1899 return last_error;
1900}
1901
1902static int
1903netdev_linux_dump_queue_stats(const struct netdev *netdev,
1904 netdev_dump_queue_stats_cb *cb, void *aux)
1905{
1906 struct netdev_dev_linux *netdev_dev =
1907 netdev_dev_linux_cast(netdev_get_dev(netdev));
1908 struct nl_dump dump;
1909 struct ofpbuf msg;
1910 int last_error;
1911 int error;
1912
1913 error = tc_query_qdisc(netdev);
1914 if (error) {
1915 return error;
1916 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1917 return EOPNOTSUPP;
1918 }
1919
1920 last_error = 0;
23a98ffe
BP
1921 if (!start_queue_dump(netdev, &dump)) {
1922 return ENODEV;
1923 }
c1c9c9c4
BP
1924 while (nl_dump_next(&dump, &msg)) {
1925 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1926 if (error) {
1927 last_error = error;
1928 }
1929 }
1930
1931 error = nl_dump_done(&dump);
1932 return error ? error : last_error;
1933}
1934
8b61709d 1935static int
f1acd62b
BP
1936netdev_linux_get_in4(const struct netdev *netdev_,
1937 struct in_addr *address, struct in_addr *netmask)
8b61709d 1938{
149f577a
JG
1939 struct netdev_dev_linux *netdev_dev =
1940 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1941
1942 if (!(netdev_dev->cache_valid & VALID_IN4)) {
8b61709d
BP
1943 int error;
1944
149f577a 1945 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
8b61709d
BP
1946 SIOCGIFADDR, "SIOCGIFADDR");
1947 if (error) {
1948 return error;
1949 }
1950
149f577a 1951 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
f1acd62b
BP
1952 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1953 if (error) {
1954 return error;
1955 }
1956
149f577a 1957 netdev_dev->cache_valid |= VALID_IN4;
8b61709d 1958 }
149f577a
JG
1959 *address = netdev_dev->address;
1960 *netmask = netdev_dev->netmask;
f1acd62b 1961 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
8b61709d
BP
1962}
1963
8b61709d 1964static int
f1acd62b
BP
1965netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1966 struct in_addr netmask)
8b61709d 1967{
149f577a
JG
1968 struct netdev_dev_linux *netdev_dev =
1969 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1970 int error;
1971
f1acd62b 1972 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 1973 if (!error) {
149f577a
JG
1974 netdev_dev->cache_valid |= VALID_IN4;
1975 netdev_dev->address = address;
1976 netdev_dev->netmask = netmask;
f1acd62b 1977 if (address.s_addr != INADDR_ANY) {
8b61709d 1978 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 1979 "SIOCSIFNETMASK", netmask);
8b61709d
BP
1980 }
1981 }
1982 return error;
1983}
1984
/* Parses 'line', which the caller reads from /proc/net/if_inet6: 16
 * two-digit hexadecimal bytes, four hexadecimal fields that are skipped, and
 * a name of at most 16 characters.
 *
 * Stores the 16 bytes into 'in6' and the name into 'ifname'.  Returns true
 * only if all 17 stored fields were converted; on a false return the outputs
 * may be partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *octet = in6->s6_addr;
    int n_fields;

#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &octet[0], &octet[1], &octet[2], &octet[3],
                      &octet[4], &octet[5], &octet[6], &octet[7],
                      &octet[8], &octet[9], &octet[10], &octet[11],
                      &octet[12], &octet[13], &octet[14], &octet[15],
                      ifname);
    return n_fields == 17;
}
2000
2001/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2002 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2003static int
2004netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2005{
149f577a
JG
2006 struct netdev_dev_linux *netdev_dev =
2007 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2008 if (!(netdev_dev->cache_valid & VALID_IN6)) {
8b61709d
BP
2009 FILE *file;
2010 char line[128];
2011
149f577a 2012 netdev_dev->in6 = in6addr_any;
8b61709d
BP
2013
2014 file = fopen("/proc/net/if_inet6", "r");
2015 if (file != NULL) {
2016 const char *name = netdev_get_name(netdev_);
2017 while (fgets(line, sizeof line, file)) {
2a022368 2018 struct in6_addr in6_tmp;
8b61709d 2019 char ifname[16 + 1];
2a022368 2020 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
2021 && !strcmp(name, ifname))
2022 {
2a022368 2023 netdev_dev->in6 = in6_tmp;
8b61709d
BP
2024 break;
2025 }
2026 }
2027 fclose(file);
2028 }
149f577a 2029 netdev_dev->cache_valid |= VALID_IN6;
8b61709d 2030 }
149f577a 2031 *in6 = netdev_dev->in6;
8b61709d
BP
2032 return 0;
2033}
2034
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  '*sa'
 * is zeroed first, then overwritten with a zero-initialized sockaddr_in. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(sa, 0, sizeof *sa);

    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;
    memcpy(sa, &in4, sizeof in4);
}
2047
2048static int
2049do_set_addr(struct netdev *netdev,
2050 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2051{
2052 struct ifreq ifr;
71d7c22f 2053 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
8b61709d 2054 make_in4_sockaddr(&ifr.ifr_addr, addr);
149f577a
JG
2055
2056 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2057 ioctl_name);
8b61709d
BP
2058}
2059
2060/* Adds 'router' as a default IP gateway. */
2061static int
67a4917b 2062netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2063{
2064 struct in_addr any = { INADDR_ANY };
2065 struct rtentry rt;
2066 int error;
2067
2068 memset(&rt, 0, sizeof rt);
2069 make_in4_sockaddr(&rt.rt_dst, any);
2070 make_in4_sockaddr(&rt.rt_gateway, router);
2071 make_in4_sockaddr(&rt.rt_genmask, any);
2072 rt.rt_flags = RTF_UP | RTF_GATEWAY;
8b61709d
BP
2073 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2074 if (error) {
2075 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2076 }
2077 return error;
2078}
2079
f1acd62b
BP
2080static int
2081netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2082 char **netdev_name)
2083{
2084 static const char fn[] = "/proc/net/route";
2085 FILE *stream;
2086 char line[256];
2087 int ln;
2088
2089 *netdev_name = NULL;
2090 stream = fopen(fn, "r");
2091 if (stream == NULL) {
2092 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2093 return errno;
2094 }
2095
2096 ln = 0;
2097 while (fgets(line, sizeof line, stream)) {
2098 if (++ln >= 2) {
2099 char iface[17];
dbba996b 2100 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2101 int refcnt, metric, mtu;
2102 unsigned int flags, use, window, irtt;
2103
2104 if (sscanf(line,
2105 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2106 " %d %u %u\n",
2107 iface, &dest, &gateway, &flags, &refcnt,
2108 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2109
d295e8e9 2110 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2111 fn, ln, line);
2112 continue;
2113 }
2114 if (!(flags & RTF_UP)) {
2115 /* Skip routes that aren't up. */
2116 continue;
2117 }
2118
2119 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2120 * network byte order, so we don't need need any endian
f1acd62b
BP
2121 * conversions here. */
2122 if ((dest & mask) == (host->s_addr & mask)) {
2123 if (!gateway) {
2124 /* The host is directly reachable. */
2125 next_hop->s_addr = 0;
2126 } else {
2127 /* To reach the host, we must go through a gateway. */
2128 next_hop->s_addr = gateway;
2129 }
2130 *netdev_name = xstrdup(iface);
2131 fclose(stream);
2132 return 0;
2133 }
2134 }
2135 }
2136
2137 fclose(stream);
2138 return ENXIO;
2139}
2140
e210037e
AE
/* Queries the driver information of 'netdev' with ETHTOOL_GDRVINFO and, on
 * success, adds copies of the driver name, driver version, and firmware
 * version to 'sh' under the keys "driver_name", "driver_version", and
 * "firmware_version".
 *
 * Returns 0 on success, otherwise the error from netdev_linux_do_ethtool(),
 * in which case 'sh' is left untouched. */
static int
netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
{
    struct ethtool_drvinfo info;
    int error;

    memset(&info, 0, sizeof info);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev),
                                    (struct ethtool_cmd *)&info,
                                    ETHTOOL_GDRVINFO,
                                    "ETHTOOL_GDRVINFO");
    if (error) {
        return error;
    }

    shash_add(sh, "driver_name", xstrdup(info.driver));
    shash_add(sh, "driver_version", xstrdup(info.version));
    shash_add(sh, "firmware_version", xstrdup(info.fw_version));
    return 0;
}
2160
8b61709d
BP
2161/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2162 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2163 * returns 0. Otherwise, it returns a positive errno value; in particular,
2164 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2165static int
2166netdev_linux_arp_lookup(const struct netdev *netdev,
dbba996b 2167 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
8b61709d
BP
2168{
2169 struct arpreq r;
c100e025 2170 struct sockaddr_in sin;
8b61709d
BP
2171 int retval;
2172
2173 memset(&r, 0, sizeof r);
f2cc621b 2174 memset(&sin, 0, sizeof sin);
c100e025
BP
2175 sin.sin_family = AF_INET;
2176 sin.sin_addr.s_addr = ip;
2177 sin.sin_port = 0;
2178 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2179 r.arp_ha.sa_family = ARPHRD_ETHER;
2180 r.arp_flags = 0;
71d7c22f 2181 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d
BP
2182 COVERAGE_INC(netdev_arp_lookup);
2183 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2184 if (!retval) {
2185 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2186 } else if (retval != ENXIO) {
2187 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
149f577a 2188 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
8b61709d
BP
2189 }
2190 return retval;
2191}
2192
2193static int
2194nd_to_iff_flags(enum netdev_flags nd)
2195{
2196 int iff = 0;
2197 if (nd & NETDEV_UP) {
2198 iff |= IFF_UP;
2199 }
2200 if (nd & NETDEV_PROMISC) {
2201 iff |= IFF_PROMISC;
2202 }
2203 return iff;
2204}
2205
2206static int
2207iff_to_nd_flags(int iff)
2208{
2209 enum netdev_flags nd = 0;
2210 if (iff & IFF_UP) {
2211 nd |= NETDEV_UP;
2212 }
2213 if (iff & IFF_PROMISC) {
2214 nd |= NETDEV_PROMISC;
2215 }
2216 return nd;
2217}
2218
2219static int
2220netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2221 enum netdev_flags on, enum netdev_flags *old_flagsp)
2222{
2223 int old_flags, new_flags;
2224 int error;
2225
2226 error = get_flags(netdev, &old_flags);
2227 if (!error) {
2228 *old_flagsp = iff_to_nd_flags(old_flags);
2229 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2230 if (new_flags != old_flags) {
2231 error = set_flags(netdev, new_flags);
2232 }
2233 }
2234 return error;
2235}
2236
ac4d3bcb
EJ
2237static unsigned int
2238netdev_linux_change_seq(const struct netdev *netdev)
2239{
2240 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
2241}
2242
c3827f61
BP
/* Expands to a brace-enclosed positional initializer; it is used below to
 * initialize 'struct netdev_class' objects.  The NAME, CREATE, ENUMERATE, and
 * SET_STATS slots come from the macro arguments, the get_config and
 * set_config slots are NULL, and every other slot is the netdev_linux_*
 * function listed.  The entries are positional, so their order must match
 * the member order of the structure being initialized. */
2243#define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2244{ \
2245 NAME, \
2246 \
2247 netdev_linux_init, \
2248 netdev_linux_run, \
2249 netdev_linux_wait, \
2250 \
2251 CREATE, \
2252 netdev_linux_destroy, \
de5cdb90 2253 NULL, /* get_config */ \
6d9e6eb4 2254 NULL, /* set_config */ \
c3827f61
BP
2255 \
2256 netdev_linux_open, \
2257 netdev_linux_close, \
2258 \
2259 ENUMERATE, \
2260 \
7b6b0ef4 2261 netdev_linux_listen, \
c3827f61
BP
2262 netdev_linux_recv, \
2263 netdev_linux_recv_wait, \
2264 netdev_linux_drain, \
2265 \
2266 netdev_linux_send, \
2267 netdev_linux_send_wait, \
2268 \
2269 netdev_linux_set_etheraddr, \
2270 netdev_linux_get_etheraddr, \
2271 netdev_linux_get_mtu, \
2272 netdev_linux_get_ifindex, \
2273 netdev_linux_get_carrier, \
1670c579 2274 netdev_linux_set_miimon_interval, \
c3827f61
BP
2275 netdev_linux_get_stats, \
2276 SET_STATS, \
2277 \
2278 netdev_linux_get_features, \
2279 netdev_linux_set_advertisements, \
2280 netdev_linux_get_vlan_vid, \
2281 \
2282 netdev_linux_set_policing, \
2283 netdev_linux_get_qos_types, \
2284 netdev_linux_get_qos_capabilities, \
2285 netdev_linux_get_qos, \
2286 netdev_linux_set_qos, \
2287 netdev_linux_get_queue, \
2288 netdev_linux_set_queue, \
2289 netdev_linux_delete_queue, \
2290 netdev_linux_get_queue_stats, \
2291 netdev_linux_dump_queues, \
2292 netdev_linux_dump_queue_stats, \
2293 \
2294 netdev_linux_get_in4, \
2295 netdev_linux_set_in4, \
2296 netdev_linux_get_in6, \
2297 netdev_linux_add_router, \
2298 netdev_linux_get_next_hop, \
e210037e 2299 netdev_linux_get_status, \
c3827f61
BP
2300 netdev_linux_arp_lookup, \
2301 \
2302 netdev_linux_update_flags, \
2303 \
ac4d3bcb 2304 netdev_linux_change_seq \
c3827f61
BP
2305}
2306
/* The "system" class: created with netdev_linux_create(), enumerated with
 * netdev_linux_enumerate(), and without a set_stats hook. */
2307const struct netdev_class netdev_linux_class =
2308 NETDEV_LINUX_CLASS(
2309 "system",
2310 netdev_linux_create,
2311 netdev_linux_enumerate,
98563392 2312 NULL); /* set_stats */
c3827f61
BP
2313
/* The "tap" class: created with netdev_linux_create_tap(), with neither an
 * enumerate nor a set_stats hook. */
2314const struct netdev_class netdev_tap_class =
2315 NETDEV_LINUX_CLASS(
2316 "tap",
2317 netdev_linux_create_tap,
2318 NULL, /* enumerate */
2319 NULL); /* set_stats */
2320
/* The "internal" class: created with netdev_linux_create(), without an
 * enumerate hook, and with netdev_vport_set_stats() as its set_stats hook. */
2321const struct netdev_class netdev_internal_class =
2322 NETDEV_LINUX_CLASS(
2323 "internal",
2324 netdev_linux_create,
2325 NULL, /* enumerate */
2326 netdev_vport_set_stats);
8b61709d 2327\f
c1c9c9c4 2328/* HTB traffic control class. */
559843ed 2329
/* Largest class minor number that the HTB parsers in this file accept:
 * classes 1:1 through 1:HTB_N_QUEUES map to queue IDs 0 through
 * HTB_N_QUEUES - 1.  Also reported as 'n_queues' in tc_ops_htb. */
c1c9c9c4 2330#define HTB_N_QUEUES 0xf000
8b61709d 2331
c1c9c9c4
BP
/* HTB qdisc state.  'tc' is the embedded generic part; htb_get__() recovers
 * the enclosing structure from it with CONTAINER_OF. */
2332struct htb {
2333 struct tc tc;
2334 unsigned int max_rate; /* In bytes/s. */
2335};
8b61709d 2336
/* Settings of one HTB class.  'tc_queue' is the embedded generic part, whose
 * 'hmap_node' is what gets inserted into the qdisc's 'queues' map. */
c1c9c9c4 2337struct htb_class {
93b13be8 2338 struct tc_queue tc_queue;
c1c9c9c4
BP
2339 unsigned int min_rate; /* In bytes/s. */
2340 unsigned int max_rate; /* In bytes/s. */
2341 unsigned int burst; /* In bytes. */
2342 unsigned int priority; /* Lower values are higher priorities. */
2343};
8b61709d 2344
c1c9c9c4
BP
2345static struct htb *
2346htb_get__(const struct netdev *netdev)
2347{
2348 struct netdev_dev_linux *netdev_dev =
2349 netdev_dev_linux_cast(netdev_get_dev(netdev));
2350 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2351}
2352
24045e35 2353static void
c1c9c9c4
BP
2354htb_install__(struct netdev *netdev, uint64_t max_rate)
2355{
2356 struct netdev_dev_linux *netdev_dev =
2357 netdev_dev_linux_cast(netdev_get_dev(netdev));
2358 struct htb *htb;
2359
2360 htb = xmalloc(sizeof *htb);
2361 tc_init(&htb->tc, &tc_ops_htb);
2362 htb->max_rate = max_rate;
2363
2364 netdev_dev->tc = &htb->tc;
c1c9c9c4
BP
2365}
2366
2367/* Create an HTB qdisc.
2368 *
a339aa81 2369 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
2370static int
2371htb_setup_qdisc__(struct netdev *netdev)
2372{
2373 size_t opt_offset;
2374 struct tc_htb_glob opt;
2375 struct ofpbuf request;
2376 struct tcmsg *tcmsg;
2377
2378 tc_del_qdisc(netdev);
2379
2380 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2381 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
2382 if (!tcmsg) {
2383 return ENODEV;
2384 }
c1c9c9c4
BP
2385 tcmsg->tcm_handle = tc_make_handle(1, 0);
2386 tcmsg->tcm_parent = TC_H_ROOT;
2387
2388 nl_msg_put_string(&request, TCA_KIND, "htb");
2389
2390 memset(&opt, 0, sizeof opt);
2391 opt.rate2quantum = 10;
2392 opt.version = 3;
4ecf12d5 2393 opt.defcls = 1;
c1c9c9c4
BP
2394
2395 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2396 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2397 nl_msg_end_nested(&request, opt_offset);
2398
2399 return tc_transact(&request, NULL);
2400}
2401
2402/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2403 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2404static int
2405htb_setup_class__(struct netdev *netdev, unsigned int handle,
2406 unsigned int parent, struct htb_class *class)
2407{
2408 size_t opt_offset;
2409 struct tc_htb_opt opt;
2410 struct ofpbuf request;
2411 struct tcmsg *tcmsg;
2412 int error;
2413 int mtu;
2414
2415 netdev_get_mtu(netdev, &mtu);
f915f1a8
BP
2416 if (mtu == INT_MAX) {
2417 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2418 netdev_get_name(netdev));
2419 return EINVAL;
2420 }
c1c9c9c4
BP
2421
2422 memset(&opt, 0, sizeof opt);
2423 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2424 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2425 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2426 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2427 opt.prio = class->priority;
2428
2429 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
2430 if (!tcmsg) {
2431 return ENODEV;
2432 }
c1c9c9c4
BP
2433 tcmsg->tcm_handle = handle;
2434 tcmsg->tcm_parent = parent;
2435
2436 nl_msg_put_string(&request, TCA_KIND, "htb");
2437 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2438 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2439 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2440 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2441 nl_msg_end_nested(&request, opt_offset);
2442
2443 error = tc_transact(&request, NULL);
2444 if (error) {
2445 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2446 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2447 netdev_get_name(netdev),
2448 tc_get_major(handle), tc_get_minor(handle),
2449 tc_get_major(parent), tc_get_minor(parent),
2450 class->min_rate, class->max_rate,
2451 class->burst, class->priority, strerror(error));
2452 }
2453 return error;
2454}
2455
2456/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2457 * description of them into 'details'. The description complies with the
2458 * specification given in the vswitch database documentation for linux-htb
2459 * queue details. */
2460static int
2461htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2462{
2463 static const struct nl_policy tca_htb_policy[] = {
2464 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2465 .min_len = sizeof(struct tc_htb_opt) },
2466 };
2467
2468 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2469 const struct tc_htb_opt *htb;
2470
2471 if (!nl_parse_nested(nl_options, tca_htb_policy,
2472 attrs, ARRAY_SIZE(tca_htb_policy))) {
2473 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2474 return EPROTO;
2475 }
2476
2477 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2478 class->min_rate = htb->rate.rate;
2479 class->max_rate = htb->ceil.rate;
2480 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2481 class->priority = htb->prio;
2482 return 0;
2483}
2484
2485static int
2486htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2487 struct htb_class *options,
2488 struct netdev_queue_stats *stats)
2489{
2490 struct nlattr *nl_options;
2491 unsigned int handle;
2492 int error;
2493
2494 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2495 if (!error && queue_id) {
17ee3c1f
BP
2496 unsigned int major = tc_get_major(handle);
2497 unsigned int minor = tc_get_minor(handle);
2498 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2499 *queue_id = minor - 1;
c1c9c9c4
BP
2500 } else {
2501 error = EPROTO;
2502 }
2503 }
2504 if (!error && options) {
2505 error = htb_parse_tca_options__(nl_options, options);
2506 }
2507 return error;
2508}
2509
2510static void
2511htb_parse_qdisc_details__(struct netdev *netdev,
2512 const struct shash *details, struct htb_class *hc)
2513{
2514 const char *max_rate_s;
2515
2516 max_rate_s = shash_find_data(details, "max-rate");
2517 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2518 if (!hc->max_rate) {
2519 uint32_t current;
2520
2521 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2522 hc->max_rate = netdev_features_to_bps(current) / 8;
2523 }
2524 hc->min_rate = hc->max_rate;
2525 hc->burst = 0;
2526 hc->priority = 0;
2527}
2528
2529static int
2530htb_parse_class_details__(struct netdev *netdev,
2531 const struct shash *details, struct htb_class *hc)
2532{
2533 const struct htb *htb = htb_get__(netdev);
2534 const char *min_rate_s = shash_find_data(details, "min-rate");
2535 const char *max_rate_s = shash_find_data(details, "max-rate");
2536 const char *burst_s = shash_find_data(details, "burst");
2537 const char *priority_s = shash_find_data(details, "priority");
2538 int mtu;
2539
f915f1a8
BP
2540 netdev_get_mtu(netdev, &mtu);
2541 if (mtu == INT_MAX) {
2542 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2543 netdev_get_name(netdev));
2544 return EINVAL;
2545 }
2546
4f104611
EJ
2547 /* HTB requires at least an mtu sized min-rate to send any traffic even
2548 * on uncongested links. */
c45ab5e9 2549 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4f104611 2550 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
2551 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2552
2553 /* max-rate */
2554 hc->max_rate = (max_rate_s
2555 ? strtoull(max_rate_s, NULL, 10) / 8
2556 : htb->max_rate);
2557 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2558 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2559
2560 /* burst
2561 *
2562 * According to hints in the documentation that I've read, it is important
2563 * that 'burst' be at least as big as the largest frame that might be
2564 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2565 * but having it a bit too small is a problem. Since netdev_get_mtu()
2566 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2567 * the MTU. We actually add 64, instead of 14, as a guard against
2568 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
2569 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2570 hc->burst = MAX(hc->burst, mtu + 64);
2571
2572 /* priority */
2573 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2574
2575 return 0;
2576}
2577
2578static int
2579htb_query_class__(const struct netdev *netdev, unsigned int handle,
2580 unsigned int parent, struct htb_class *options,
2581 struct netdev_queue_stats *stats)
2582{
2583 struct ofpbuf *reply;
2584 int error;
2585
2586 error = tc_query_class(netdev, handle, parent, &reply);
2587 if (!error) {
2588 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2589 ofpbuf_delete(reply);
2590 }
2591 return error;
2592}
2593
2594static int
2595htb_tc_install(struct netdev *netdev, const struct shash *details)
2596{
2597 int error;
2598
2599 error = htb_setup_qdisc__(netdev);
2600 if (!error) {
2601 struct htb_class hc;
2602
2603 htb_parse_qdisc_details__(netdev, details, &hc);
2604 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2605 tc_make_handle(1, 0), &hc);
2606 if (!error) {
2607 htb_install__(netdev, hc.max_rate);
2608 }
2609 }
2610 return error;
2611}
2612
93b13be8
BP
2613static struct htb_class *
2614htb_class_cast__(const struct tc_queue *queue)
2615{
2616 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2617}
2618
c1c9c9c4
BP
2619static void
2620htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2621 const struct htb_class *hc)
2622{
2623 struct htb *htb = htb_get__(netdev);
93b13be8
BP
2624 size_t hash = hash_int(queue_id, 0);
2625 struct tc_queue *queue;
c1c9c9c4
BP
2626 struct htb_class *hcp;
2627
93b13be8
BP
2628 queue = tc_find_queue__(netdev, queue_id, hash);
2629 if (queue) {
2630 hcp = htb_class_cast__(queue);
2631 } else {
c1c9c9c4 2632 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
2633 queue = &hcp->tc_queue;
2634 queue->queue_id = queue_id;
2635 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 2636 }
93b13be8
BP
2637
2638 hcp->min_rate = hc->min_rate;
2639 hcp->max_rate = hc->max_rate;
2640 hcp->burst = hc->burst;
2641 hcp->priority = hc->priority;
c1c9c9c4
BP
2642}
2643
2644static int
2645htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2646{
c1c9c9c4
BP
2647 struct ofpbuf msg;
2648 struct nl_dump dump;
2649 struct htb_class hc;
c1c9c9c4
BP
2650
2651 /* Get qdisc options. */
2652 hc.max_rate = 0;
2653 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 2654 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
2655
2656 /* Get queues. */
23a98ffe
BP
2657 if (!start_queue_dump(netdev, &dump)) {
2658 return ENODEV;
2659 }
c1c9c9c4
BP
2660 while (nl_dump_next(&dump, &msg)) {
2661 unsigned int queue_id;
2662
2663 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2664 htb_update_queue__(netdev, queue_id, &hc);
2665 }
2666 }
2667 nl_dump_done(&dump);
2668
2669 return 0;
2670}
2671
2672static void
2673htb_tc_destroy(struct tc *tc)
2674{
2675 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 2676 struct htb_class *hc, *next;
c1c9c9c4 2677
4e8e4213 2678 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 2679 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
2680 free(hc);
2681 }
2682 tc_destroy(tc);
2683 free(htb);
2684}
2685
2686static int
2687htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2688{
2689 const struct htb *htb = htb_get__(netdev);
2690 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2691 return 0;
2692}
2693
2694static int
2695htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2696{
2697 struct htb_class hc;
2698 int error;
2699
2700 htb_parse_qdisc_details__(netdev, details, &hc);
2701 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2702 tc_make_handle(1, 0), &hc);
2703 if (!error) {
2704 htb_get__(netdev)->max_rate = hc.max_rate;
2705 }
2706 return error;
2707}
2708
2709static int
93b13be8
BP
2710htb_class_get(const struct netdev *netdev OVS_UNUSED,
2711 const struct tc_queue *queue, struct shash *details)
c1c9c9c4 2712{
93b13be8 2713 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4
BP
2714
2715 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2716 if (hc->min_rate != hc->max_rate) {
2717 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2718 }
2719 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2720 if (hc->priority) {
2721 shash_add(details, "priority", xasprintf("%u", hc->priority));
2722 }
2723 return 0;
2724}
2725
2726static int
2727htb_class_set(struct netdev *netdev, unsigned int queue_id,
2728 const struct shash *details)
2729{
2730 struct htb_class hc;
2731 int error;
2732
2733 error = htb_parse_class_details__(netdev, details, &hc);
2734 if (error) {
2735 return error;
2736 }
2737
17ee3c1f 2738 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
2739 tc_make_handle(1, 0xfffe), &hc);
2740 if (error) {
2741 return error;
2742 }
2743
2744 htb_update_queue__(netdev, queue_id, &hc);
2745 return 0;
2746}
2747
2748static int
93b13be8 2749htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 2750{
93b13be8 2751 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2752 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
2753 int error;
2754
93b13be8 2755 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 2756 if (!error) {
93b13be8 2757 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 2758 free(hc);
c1c9c9c4
BP
2759 }
2760 return error;
2761}
2762
2763static int
93b13be8 2764htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
2765 struct netdev_queue_stats *stats)
2766{
93b13be8 2767 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
2768 tc_make_handle(1, 0xfffe), NULL, stats);
2769}
2770
2771static int
2772htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2773 const struct ofpbuf *nlmsg,
2774 netdev_dump_queue_stats_cb *cb, void *aux)
2775{
2776 struct netdev_queue_stats stats;
17ee3c1f 2777 unsigned int handle, major, minor;
c1c9c9c4
BP
2778 int error;
2779
2780 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2781 if (error) {
2782 return error;
2783 }
2784
17ee3c1f
BP
2785 major = tc_get_major(handle);
2786 minor = tc_get_minor(handle);
2787 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 2788 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
2789 }
2790 return 0;
2791}
2792
/* Operations table for the HTB qdisc.  The entries are positional: the two
 * names and the queue count come first, followed by the htb_* hooks defined
 * above. */
2793static const struct tc_ops tc_ops_htb = {
2794 "htb", /* linux_name */
2795 "linux-htb", /* ovs_name */
2796 HTB_N_QUEUES, /* n_queues */
2797 htb_tc_install,
2798 htb_tc_load,
2799 htb_tc_destroy,
2800 htb_qdisc_get,
2801 htb_qdisc_set,
2802 htb_class_get,
2803 htb_class_set,
2804 htb_class_delete,
2805 htb_class_get_stats,
2806 htb_class_dump_stats
2807};
2808\f
a339aa81
EJ
2809/* "linux-hfsc" traffic control class. */
2810
/* Largest class minor number that hfsc_parse_tcmsg__() accepts: classes 1:1
 * through 1:HFSC_N_QUEUES map to queue IDs 0 through HFSC_N_QUEUES - 1. */
2811#define HFSC_N_QUEUES 0xf000
2812
/* HFSC qdisc state.  'tc' is the embedded generic part; hfsc_get__()
 * recovers the enclosing structure from it with CONTAINER_OF.
 * NOTE(review): 'max_rate' is presumably in bytes/s, since the parsers
 * divide the configured value by 8 -- confirm. */
2813struct hfsc {
2814 struct tc tc;
2815 uint32_t max_rate;
2816};
2817
/* Settings of one HFSC class.  'tc_queue' is the embedded generic part,
 * whose 'hmap_node' is what gets inserted into the qdisc's 'queues' map. */
2818struct hfsc_class {
2819 struct tc_queue tc_queue;
2820 uint32_t min_rate;
2821 uint32_t max_rate;
2822};
2823
2824static struct hfsc *
2825hfsc_get__(const struct netdev *netdev)
2826{
2827 struct netdev_dev_linux *netdev_dev;
2828 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2829 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2830}
2831
2832static struct hfsc_class *
2833hfsc_class_cast__(const struct tc_queue *queue)
2834{
2835 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2836}
2837
24045e35 2838static void
a339aa81
EJ
2839hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2840{
2841 struct netdev_dev_linux * netdev_dev;
2842 struct hfsc *hfsc;
2843
2844 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2845 hfsc = xmalloc(sizeof *hfsc);
2846 tc_init(&hfsc->tc, &tc_ops_hfsc);
2847 hfsc->max_rate = max_rate;
2848 netdev_dev->tc = &hfsc->tc;
a339aa81
EJ
2849}
2850
2851static void
2852hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2853 const struct hfsc_class *hc)
2854{
2855 size_t hash;
2856 struct hfsc *hfsc;
2857 struct hfsc_class *hcp;
2858 struct tc_queue *queue;
2859
2860 hfsc = hfsc_get__(netdev);
2861 hash = hash_int(queue_id, 0);
2862
2863 queue = tc_find_queue__(netdev, queue_id, hash);
2864 if (queue) {
2865 hcp = hfsc_class_cast__(queue);
2866 } else {
2867 hcp = xmalloc(sizeof *hcp);
2868 queue = &hcp->tc_queue;
2869 queue->queue_id = queue_id;
2870 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2871 }
2872
2873 hcp->min_rate = hc->min_rate;
2874 hcp->max_rate = hc->max_rate;
2875}
2876
2877static int
2878hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2879{
2880 const struct tc_service_curve *rsc, *fsc, *usc;
2881 static const struct nl_policy tca_hfsc_policy[] = {
2882 [TCA_HFSC_RSC] = {
2883 .type = NL_A_UNSPEC,
2884 .optional = false,
2885 .min_len = sizeof(struct tc_service_curve),
2886 },
2887 [TCA_HFSC_FSC] = {
2888 .type = NL_A_UNSPEC,
2889 .optional = false,
2890 .min_len = sizeof(struct tc_service_curve),
2891 },
2892 [TCA_HFSC_USC] = {
2893 .type = NL_A_UNSPEC,
2894 .optional = false,
2895 .min_len = sizeof(struct tc_service_curve),
2896 },
2897 };
2898 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2899
2900 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2901 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2902 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2903 return EPROTO;
2904 }
2905
2906 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2907 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2908 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2909
2910 if (rsc->m1 != 0 || rsc->d != 0 ||
2911 fsc->m1 != 0 || fsc->d != 0 ||
2912 usc->m1 != 0 || usc->d != 0) {
2913 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2914 "Non-linear service curves are not supported.");
2915 return EPROTO;
2916 }
2917
2918 if (rsc->m2 != fsc->m2) {
2919 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2920 "Real-time service curves are not supported ");
2921 return EPROTO;
2922 }
2923
2924 if (rsc->m2 > usc->m2) {
2925 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2926 "Min-rate service curve is greater than "
2927 "the max-rate service curve.");
2928 return EPROTO;
2929 }
2930
2931 class->min_rate = fsc->m2;
2932 class->max_rate = usc->m2;
2933 return 0;
2934}
2935
2936static int
2937hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2938 struct hfsc_class *options,
2939 struct netdev_queue_stats *stats)
2940{
2941 int error;
2942 unsigned int handle;
2943 struct nlattr *nl_options;
2944
2945 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2946 if (error) {
2947 return error;
2948 }
2949
2950 if (queue_id) {
2951 unsigned int major, minor;
2952
2953 major = tc_get_major(handle);
2954 minor = tc_get_minor(handle);
2955 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2956 *queue_id = minor - 1;
2957 } else {
2958 return EPROTO;
2959 }
2960 }
2961
2962 if (options) {
2963 error = hfsc_parse_tca_options__(nl_options, options);
2964 }
2965
2966 return error;
2967}
2968
2969static int
2970hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2971 unsigned int parent, struct hfsc_class *options,
2972 struct netdev_queue_stats *stats)
2973{
2974 int error;
2975 struct ofpbuf *reply;
2976
2977 error = tc_query_class(netdev, handle, parent, &reply);
2978 if (error) {
2979 return error;
2980 }
2981
2982 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2983 ofpbuf_delete(reply);
2984 return error;
2985}
2986
2987static void
2988hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2989 struct hfsc_class *class)
2990{
2991 uint32_t max_rate;
2992 const char *max_rate_s;
2993
2994 max_rate_s = shash_find_data(details, "max-rate");
2995 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2996
2997 if (!max_rate) {
2998 uint32_t current;
2999
3000 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3001 max_rate = netdev_features_to_bps(current) / 8;
3002 }
3003
3004 class->min_rate = max_rate;
3005 class->max_rate = max_rate;
3006}
3007
3008static int
3009hfsc_parse_class_details__(struct netdev *netdev,
3010 const struct shash *details,
3011 struct hfsc_class * class)
3012{
3013 const struct hfsc *hfsc;
3014 uint32_t min_rate, max_rate;
3015 const char *min_rate_s, *max_rate_s;
3016
3017 hfsc = hfsc_get__(netdev);
3018 min_rate_s = shash_find_data(details, "min-rate");
3019 max_rate_s = shash_find_data(details, "max-rate");
3020
c45ab5e9 3021 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
79398bad 3022 min_rate = MAX(min_rate, 1);
a339aa81
EJ
3023 min_rate = MIN(min_rate, hfsc->max_rate);
3024
3025 max_rate = (max_rate_s
3026 ? strtoull(max_rate_s, NULL, 10) / 8
3027 : hfsc->max_rate);
3028 max_rate = MAX(max_rate, min_rate);
3029 max_rate = MIN(max_rate, hfsc->max_rate);
3030
3031 class->min_rate = min_rate;
3032 class->max_rate = max_rate;
3033
3034 return 0;
3035}
3036
3037/* Create an HFSC qdisc.
3038 *
3039 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3040static int
3041hfsc_setup_qdisc__(struct netdev * netdev)
3042{
3043 struct tcmsg *tcmsg;
3044 struct ofpbuf request;
3045 struct tc_hfsc_qopt opt;
3046
3047 tc_del_qdisc(netdev);
3048
3049 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3050 NLM_F_EXCL | NLM_F_CREATE, &request);
3051
3052 if (!tcmsg) {
3053 return ENODEV;
3054 }
3055
3056 tcmsg->tcm_handle = tc_make_handle(1, 0);
3057 tcmsg->tcm_parent = TC_H_ROOT;
3058
3059 memset(&opt, 0, sizeof opt);
3060 opt.defcls = 1;
3061
3062 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3063 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3064
3065 return tc_transact(&request, NULL);
3066}
3067
3068/* Create an HFSC class.
3069 *
3070 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3071 * sc rate <min_rate> ul rate <max_rate>" */
3072static int
3073hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3074 unsigned int parent, struct hfsc_class *class)
3075{
3076 int error;
3077 size_t opt_offset;
3078 struct tcmsg *tcmsg;
3079 struct ofpbuf request;
3080 struct tc_service_curve min, max;
3081
3082 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3083
3084 if (!tcmsg) {
3085 return ENODEV;
3086 }
3087
3088 tcmsg->tcm_handle = handle;
3089 tcmsg->tcm_parent = parent;
3090
3091 min.m1 = 0;
3092 min.d = 0;
3093 min.m2 = class->min_rate;
3094
3095 max.m1 = 0;
3096 max.d = 0;
3097 max.m2 = class->max_rate;
3098
3099 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3100 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3101 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3102 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3103 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3104 nl_msg_end_nested(&request, opt_offset);
3105
3106 error = tc_transact(&request, NULL);
3107 if (error) {
3108 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3109 "min-rate %ubps, max-rate %ubps (%s)",
3110 netdev_get_name(netdev),
3111 tc_get_major(handle), tc_get_minor(handle),
3112 tc_get_major(parent), tc_get_minor(parent),
3113 class->min_rate, class->max_rate, strerror(error));
3114 }
3115
3116 return error;
3117}
3118
3119static int
3120hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3121{
3122 int error;
3123 struct hfsc_class class;
3124
3125 error = hfsc_setup_qdisc__(netdev);
3126
3127 if (error) {
3128 return error;
3129 }
3130
3131 hfsc_parse_qdisc_details__(netdev, details, &class);
3132 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3133 tc_make_handle(1, 0), &class);
3134
3135 if (error) {
3136 return error;
3137 }
3138
3139 hfsc_install__(netdev, class.max_rate);
3140 return 0;
3141}
3142
3143static int
3144hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3145{
3146 struct ofpbuf msg;
a339aa81
EJ
3147 struct nl_dump dump;
3148 struct hfsc_class hc;
3149
3150 hc.max_rate = 0;
3151 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3152 hfsc_install__(netdev, hc.max_rate);
a339aa81
EJ
3153
3154 if (!start_queue_dump(netdev, &dump)) {
3155 return ENODEV;
3156 }
3157
3158 while (nl_dump_next(&dump, &msg)) {
3159 unsigned int queue_id;
3160
3161 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3162 hfsc_update_queue__(netdev, queue_id, &hc);
3163 }
3164 }
3165
3166 nl_dump_done(&dump);
3167 return 0;
3168}
3169
3170static void
3171hfsc_tc_destroy(struct tc *tc)
3172{
3173 struct hfsc *hfsc;
3174 struct hfsc_class *hc, *next;
3175
3176 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3177
3178 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3179 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3180 free(hc);
3181 }
3182
3183 tc_destroy(tc);
3184 free(hfsc);
3185}
3186
3187static int
3188hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3189{
3190 const struct hfsc *hfsc;
3191 hfsc = hfsc_get__(netdev);
3192 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3193 return 0;
3194}
3195
3196static int
3197hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3198{
3199 int error;
3200 struct hfsc_class class;
3201
3202 hfsc_parse_qdisc_details__(netdev, details, &class);
3203 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3204 tc_make_handle(1, 0), &class);
3205
3206 if (!error) {
3207 hfsc_get__(netdev)->max_rate = class.max_rate;
3208 }
3209
3210 return error;
3211}
3212
3213static int
3214hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3215 const struct tc_queue *queue, struct shash *details)
3216{
3217 const struct hfsc_class *hc;
3218
3219 hc = hfsc_class_cast__(queue);
3220 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3221 if (hc->min_rate != hc->max_rate) {
3222 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3223 }
3224 return 0;
3225}
3226
3227static int
3228hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3229 const struct shash *details)
3230{
3231 int error;
3232 struct hfsc_class class;
3233
3234 error = hfsc_parse_class_details__(netdev, details, &class);
3235 if (error) {
3236 return error;
3237 }
3238
3239 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3240 tc_make_handle(1, 0xfffe), &class);
3241 if (error) {
3242 return error;
3243 }
3244
3245 hfsc_update_queue__(netdev, queue_id, &class);
3246 return 0;
3247}
3248
3249static int
3250hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3251{
3252 int error;
3253 struct hfsc *hfsc;
3254 struct hfsc_class *hc;
3255
3256 hc = hfsc_class_cast__(queue);
3257 hfsc = hfsc_get__(netdev);
3258
3259 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3260 if (!error) {
3261 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3262 free(hc);
3263 }
3264 return error;
3265}
3266
3267static int
3268hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3269 struct netdev_queue_stats *stats)
3270{
3271 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3272 tc_make_handle(1, 0xfffe), NULL, stats);
3273}
3274
3275static int
3276hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3277 const struct ofpbuf *nlmsg,
3278 netdev_dump_queue_stats_cb *cb, void *aux)
3279{
3280 struct netdev_queue_stats stats;
3281 unsigned int handle, major, minor;
3282 int error;
3283
3284 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3285 if (error) {
3286 return error;
3287 }
3288
3289 major = tc_get_major(handle);
3290 minor = tc_get_minor(handle);
3291 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3292 (*cb)(minor - 1, &stats, aux);
3293 }
3294 return 0;
3295}
3296
3297static const struct tc_ops tc_ops_hfsc = {
3298 "hfsc", /* linux_name */
3299 "linux-hfsc", /* ovs_name */
3300 HFSC_N_QUEUES, /* n_queues */
3301 hfsc_tc_install, /* tc_install */
3302 hfsc_tc_load, /* tc_load */
3303 hfsc_tc_destroy, /* tc_destroy */
3304 hfsc_qdisc_get, /* qdisc_get */
3305 hfsc_qdisc_set, /* qdisc_set */
3306 hfsc_class_get, /* class_get */
3307 hfsc_class_set, /* class_set */
3308 hfsc_class_delete, /* class_delete */
3309 hfsc_class_get_stats, /* class_get_stats */
3310 hfsc_class_dump_stats /* class_dump_stats */
3311};
3312\f
c1c9c9c4
BP
3313/* "linux-default" traffic control class.
3314 *
3315 * This class represents the default, unnamed Linux qdisc. It corresponds to
3316 * the "" (empty string) QoS type in the OVS database. */
3317
3318static void
3319default_install__(struct netdev *netdev)
3320{
3321 struct netdev_dev_linux *netdev_dev =
3322 netdev_dev_linux_cast(netdev_get_dev(netdev));
3323 static struct tc *tc;
3324
3325 if (!tc) {
3326 tc = xmalloc(sizeof *tc);
3327 tc_init(tc, &tc_ops_default);
3328 }
3329 netdev_dev->tc = tc;
3330}
3331
3332static int
3333default_tc_install(struct netdev *netdev,
3334 const struct shash *details OVS_UNUSED)
3335{
3336 default_install__(netdev);
3337 return 0;
3338}
3339
3340static int
3341default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3342{
3343 default_install__(netdev);
3344 return 0;
3345}
3346
3347static const struct tc_ops tc_ops_default = {
3348 NULL, /* linux_name */
3349 "", /* ovs_name */
3350 0, /* n_queues */
3351 default_tc_install,
3352 default_tc_load,
3353 NULL, /* tc_destroy */
3354 NULL, /* qdisc_get */
3355 NULL, /* qdisc_set */
3356 NULL, /* class_get */
3357 NULL, /* class_set */
3358 NULL, /* class_delete */
3359 NULL, /* class_get_stats */
3360 NULL /* class_dump_stats */
3361};
3362\f
3363/* "linux-other" traffic control class.
3364 *
3365 * */
3366
3367static int
3368other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3369{
3370 struct netdev_dev_linux *netdev_dev =
3371 netdev_dev_linux_cast(netdev_get_dev(netdev));
3372 static struct tc *tc;
3373
3374 if (!tc) {
3375 tc = xmalloc(sizeof *tc);
3376 tc_init(tc, &tc_ops_other);
3377 }
3378 netdev_dev->tc = tc;
3379 return 0;
3380}
3381
3382static const struct tc_ops tc_ops_other = {
3383 NULL, /* linux_name */
3384 "linux-other", /* ovs_name */
3385 0, /* n_queues */
3386 NULL, /* tc_install */
3387 other_tc_load,
3388 NULL, /* tc_destroy */
3389 NULL, /* qdisc_get */
3390 NULL, /* qdisc_set */
3391 NULL, /* class_get */
3392 NULL, /* class_set */
3393 NULL, /* class_delete */
3394 NULL, /* class_get_stats */
3395 NULL /* class_dump_stats */
3396};
3397\f
3398/* Traffic control. */
3399
/* How many kernel "tc" ticks make up one second. */
static double ticks_per_s;

/* How many kernel "jiffies" make up one second.  It is used to size buffers:
 * as a rule a kernel qdisc must be able to buffer one jiffy's worth of data.
 *
 * The value falls into one of two cases:
 *
 *    - 'buffer_hz' is the kernel's actual timer tick rate, a small number
 *      roughly between 100 and 1024.  Then the qdisc really does need room
 *      for that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  Then the kernel has finely
 *      granular timers and no extra buffer room needs to be fudged in.
 *      (This takes no special code: 'buffer_hz' is only used as a divisor,
 *      so practically any dividend yields 0.  The small integer results that
 *      very large dividends produce have no real effect anyhow.)
 */
static unsigned int buffer_hz;
3421
/* Builds the tc handle 'major':'minor': 'major' shifted into the upper 16
 * bits, combined with 'minor' by TC_H_MAKE. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
3428
/* Extracts the major number (the part selected by TC_H_MAJ, shifted down by
 * 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3435
/* Extracts the minor number (the part selected by TC_H_MIN) from tc handle
 * 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
3442
3443static struct tcmsg *
3444tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3445 struct ofpbuf *request)
3446{
3447 struct tcmsg *tcmsg;
3448 int ifindex;
3449 int error;
3450
3451 error = get_ifindex(netdev, &ifindex);
3452 if (error) {
3453 return NULL;
3454 }
3455
3456 ofpbuf_init(request, 512);
3457 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3458 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3459 tcmsg->tcm_family = AF_UNSPEC;
3460 tcmsg->tcm_ifindex = ifindex;
3461 /* Caller should fill in tcmsg->tcm_handle. */
3462 /* Caller should fill in tcmsg->tcm_parent. */
3463
3464 return tcmsg;
3465}
3466
3467static int
3468tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3469{
3470 int error = nl_sock_transact(rtnl_sock, request, replyp);
3471 ofpbuf_uninit(request);
3472 return error;
3473}
3474
3475static void
3476read_psched(void)
3477{
3478 /* The values in psched are not individually very meaningful, but they are
3479 * important. The tables below show some values seen in the wild.
3480 *
3481 * Some notes:
3482 *
3483 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3484 * (Before that, there are hints that it was 1000000000.)
3485 *
3486 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3487 * above.
3488 *
3489 * /proc/net/psched
3490 * -----------------------------------
3491 * [1] 000c8000 000f4240 000f4240 00000064
3492 * [2] 000003e8 00000400 000f4240 3b9aca00
3493 * [3] 000003e8 00000400 000f4240 3b9aca00
3494 * [4] 000003e8 00000400 000f4240 00000064
3495 * [5] 000003e8 00000040 000f4240 3b9aca00
3496 * [6] 000003e8 00000040 000f4240 000000f9
3497 *
3498 * a b c d ticks_per_s buffer_hz
3499 * ------- --------- ---------- ------------- ----------- -------------
3500 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3501 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3502 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3503 * [4] 1,000 1,024 1,000,000 100 976,562 100
3504 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3505 * [6] 1,000 64 1,000,000 249 15,625,000 249
3506 *
3507 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3508 * [2] 2.6.26-1-686-bigmem from Debian lenny
3509 * [3] 2.6.26-2-sparc64 from Debian lenny
3510 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3511 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3512 * [6] 2.6.34 from kernel.org on KVM
3513 */
3514 static const char fn[] = "/proc/net/psched";
3515 unsigned int a, b, c, d;
3516 FILE *stream;
3517
3518 ticks_per_s = 1.0;
3519 buffer_hz = 100;
3520
3521 stream = fopen(fn, "r");
3522 if (!stream) {
3523 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3524 return;
3525 }
3526
3527 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3528 VLOG_WARN("%s: read failed", fn);
3529 fclose(stream);
3530 return;
3531 }
3532 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3533 fclose(stream);
3534
3535 if (!a || !c) {
3536 VLOG_WARN("%s: invalid scheduler parameters", fn);
3537 return;
3538 }
3539
3540 ticks_per_s = (double) a * c / b;
3541 if (c == 1000000) {
3542 buffer_hz = d;
3543 } else {
3544 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3545 fn, a, b, c, d);
3546 }
3547 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3548}
3549
3550/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3551 * rate of 'rate' bytes per second. */
3552static unsigned int
3553tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3554{
3555 if (!buffer_hz) {
3556 read_psched();
3557 }
3558 return (rate * ticks) / ticks_per_s;
3559}
3560
3561/* Returns the number of ticks that it would take to transmit 'size' bytes at a
3562 * rate of 'rate' bytes per second. */
3563static unsigned int
3564tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3565{
3566 if (!buffer_hz) {
3567 read_psched();
3568 }
015c93a4 3569 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
3570}
3571
3572/* Returns the number of bytes that need to be reserved for qdisc buffering at
3573 * a transmission rate of 'rate' bytes per second. */
3574static unsigned int
3575tc_buffer_per_jiffy(unsigned int rate)
3576{
3577 if (!buffer_hz) {
3578 read_psched();
3579 }
3580 return rate / buffer_hz;
3581}
3582
3583/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3584 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3585 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3586 * stores NULL into it if it is absent.
3587 *
3588 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3589 * 'msg'.
3590 *
3591 * Returns 0 if successful, otherwise a positive errno value. */
3592static int
3593tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3594 struct nlattr **options)
3595{
3596 static const struct nl_policy tca_policy[] = {
3597 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3598 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3599 };
3600 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3601
3602 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3603 tca_policy, ta, ARRAY_SIZE(ta))) {
3604 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3605 goto error;
3606 }
3607
3608 if (kind) {
3609 *kind = nl_attr_get_string(ta[TCA_KIND]);
3610 }
3611
3612 if (options) {
3613 *options = ta[TCA_OPTIONS];
3614 }
3615
3616 return 0;
3617
3618error:
3619 if (kind) {
3620 *kind = NULL;
3621 }
3622 if (options) {
3623 *options = NULL;
3624 }
3625 return EPROTO;
3626}
3627
3628/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3629 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3630 * into '*options', and its queue statistics into '*stats'. Any of the output
3631 * arguments may be null.
3632 *
3633 * Returns 0 if successful, otherwise a positive errno value. */
3634static int
3635tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3636 struct nlattr **options, struct netdev_queue_stats *stats)
3637{
3638 static const struct nl_policy tca_policy[] = {
3639 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3640 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3641 };
3642 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3643
3644 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3645 tca_policy, ta, ARRAY_SIZE(ta))) {
3646 VLOG_WARN_RL(&rl, "failed to parse class message");
3647 goto error;
3648 }
3649
3650 if (handlep) {
3651 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3652 *handlep = tc->tcm_handle;
3653 }
3654
3655 if (options) {
3656 *options = ta[TCA_OPTIONS];
3657 }
3658
3659 if (stats) {
3660 const struct gnet_stats_queue *gsq;
3661 struct gnet_stats_basic gsb;
3662
3663 static const struct nl_policy stats_policy[] = {
3664 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3665 .min_len = sizeof gsb },
3666 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3667 .min_len = sizeof *gsq },
3668 };
3669 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3670
3671 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3672 sa, ARRAY_SIZE(sa))) {
3673 VLOG_WARN_RL(&rl, "failed to parse class stats");
3674 goto error;
3675 }
3676
3677 /* Alignment issues screw up the length of struct gnet_stats_basic on
3678 * some arch/bitsize combinations. Newer versions of Linux have a
3679 * struct gnet_stats_basic_packed, but we can't depend on that. The
3680 * easiest thing to do is just to make a copy. */
3681 memset(&gsb, 0, sizeof gsb);
3682 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3683 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3684 stats->tx_bytes = gsb.bytes;
3685 stats->tx_packets = gsb.packets;
3686
3687 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3688 stats->tx_errors = gsq->drops;
3689 }
3690
3691 return 0;
3692
3693error:
3694 if (options) {
3695 *options = NULL;
3696 }
3697 if (stats) {
3698 memset(stats, 0, sizeof *stats);
3699 }
3700 return EPROTO;
3701}
3702
3703/* Queries the kernel for class with identifier 'handle' and parent 'parent'
3704 * on 'netdev'. */
3705static int
3706tc_query_class(const struct netdev *netdev,
3707 unsigned int handle, unsigned int parent,
3708 struct ofpbuf **replyp)
3709{
3710 struct ofpbuf request;
3711 struct tcmsg *tcmsg;
3712 int error;
3713
3714 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
3715 if (!tcmsg) {
3716 return ENODEV;
3717 }
c1c9c9c4
BP
3718 tcmsg->tcm_handle = handle;
3719 tcmsg->tcm_parent = parent;
3720
3721 error = tc_transact(&request, replyp);
3722 if (error) {
3723 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3724 netdev_get_name(netdev),
3725 tc_get_major(handle), tc_get_minor(handle),
3726 tc_get_major(parent), tc_get_minor(parent),
3727 strerror(error));
3728 }
3729 return error;
3730}
3731
3732/* Equivalent to "tc class del dev <name> handle <handle>". */
3733static int
3734tc_delete_class(const struct netdev *netdev, unsigned int handle)
3735{
3736 struct ofpbuf request;
3737 struct tcmsg *tcmsg;
3738 int error;
3739
3740 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
3741 if (!tcmsg) {
3742 return ENODEV;
3743 }
c1c9c9c4
BP
3744 tcmsg->tcm_handle = handle;
3745 tcmsg->tcm_parent = 0;
3746
3747 error = tc_transact(&request, NULL);
3748 if (error) {
3749 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3750 netdev_get_name(netdev),
3751 tc_get_major(handle), tc_get_minor(handle),
3752 strerror(error));
3753 }
3754 return error;
3755}
3756
3757/* Equivalent to "tc qdisc del dev <name> root". */
3758static int
3759tc_del_qdisc(struct netdev *netdev)
3760{
3761 struct netdev_dev_linux *netdev_dev =
3762 netdev_dev_linux_cast(netdev_get_dev(netdev));
3763 struct ofpbuf request;
3764 struct tcmsg *tcmsg;
3765 int error;
3766
3767 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
3768 if (!tcmsg) {
3769 return ENODEV;
3770 }
c1c9c9c4
BP
3771 tcmsg->tcm_handle = tc_make_handle(1, 0);
3772 tcmsg->tcm_parent = TC_H_ROOT;
3773
3774 error = tc_transact(&request, NULL);
3775 if (error == EINVAL) {
3776 /* EINVAL probably means that the default qdisc was in use, in which
3777 * case we've accomplished our purpose. */
3778 error = 0;
3779 }
3780 if (!error && netdev_dev->tc) {
3781 if (netdev_dev->tc->ops->tc_destroy) {
3782 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3783 }
3784 netdev_dev->tc = NULL;
3785 }
3786 return error;
3787}
3788
3789/* If 'netdev''s qdisc type and parameters are not yet known, queries the
3790 * kernel to determine what they are. Returns 0 if successful, otherwise a
3791 * positive errno value. */
3792static int
3793tc_query_qdisc(const struct netdev *netdev)
3794{
3795 struct netdev_dev_linux *netdev_dev =
3796 netdev_dev_linux_cast(netdev_get_dev(netdev));
3797 struct ofpbuf request, *qdisc;
3798 const struct tc_ops *ops;
3799 struct tcmsg *tcmsg;
3800 int load_error;
3801 int error;
3802
3803 if (netdev_dev->tc) {
3804 return 0;
3805 }
3806
3807 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3808 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3809 * 2.6.35 without that fix backported to it.
3810 *
3811 * To avoid the OOPS, we must not make a request that would attempt to dump
3812 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3813 * few others. There are a few ways that I can see to do this, but most of
3814 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3815 * technique chosen here is to assume that any non-default qdisc that we
3816 * create will have a class with handle 1:0. The built-in qdiscs only have
3817 * a class with handle 0:0.
3818 *
3819 * We could check for Linux 2.6.35+ and use a more straightforward method
3820 * there. */
3821 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
3822 if (!tcmsg) {
3823 return ENODEV;
3824 }
c1c9c9c4
BP
3825 tcmsg->tcm_handle = tc_make_handle(1, 0);
3826 tcmsg->tcm_parent = 0;
3827
3828 /* Figure out what tc class to instantiate. */
3829 error = tc_transact(&request, &qdisc);
3830 if (!error) {
3831 const char *kind;
3832
3833 error = tc_parse_qdisc(qdisc, &kind, NULL);
3834 if (error) {
3835 ops = &tc_ops_other;
3836 } else {
3837 ops = tc_lookup_linux_name(kind);
3838 if (!ops) {
3839 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3840 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3841
3842 ops = &tc_ops_other;
3843 }
3844 }
3845 } else if (error == ENOENT) {
3846 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3847 * other entity that doesn't have a handle 1:0. We will assume
3848 * that it's the system default qdisc. */
3849 ops = &tc_ops_default;
3850 error = 0;
3851 } else {
3852 /* Who knows? Maybe the device got deleted. */
3853 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3854 netdev_get_name(netdev), strerror(error));
3855 ops = &tc_ops_other;
3856 }
3857
3858 /* Instantiate it. */
3859 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3860 assert((load_error == 0) == (netdev_dev->tc != NULL));
3861 ofpbuf_delete(qdisc);
3862
3863 return error ? error : load_error;
3864}
3865
3866/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3867 approximate the time to transmit packets of various lengths. For an MTU of
3868 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3869 represents two possible packet lengths; for a MTU of 513 through 1024, four
3870 possible lengths; and so on.
3871
3872 Returns, for the specified 'mtu', the number of bits that packet lengths
3873 need to be shifted right to fit within such a 256-entry table. */
3874static int
3875tc_calc_cell_log(unsigned int mtu)
3876{
3877 int cell_log;
3878
3879 if (!mtu) {
3880 mtu = ETH_PAYLOAD_MAX;
3881 }
3882 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3883
3884 for (cell_log = 0; mtu >= 256; cell_log++) {
3885 mtu >>= 1;
3886 }
3887
3888 return cell_log;
3889}
3890
3891/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3892 * of 'mtu'. */
3893static void
3894tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3895{
3896 memset(rate, 0, sizeof *rate);
3897 rate->cell_log = tc_calc_cell_log(mtu);
3898 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3899 /* rate->cell_align = 0; */ /* distro headers. */
3900 rate->mpu = ETH_TOTAL_MIN;
3901 rate->rate = Bps;
3902}
3903
3904/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3905 * attribute of the specified "type".
3906 *
3907 * See tc_calc_cell_log() above for a description of "rtab"s. */
3908static void
3909tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3910{
3911 uint32_t *rtab;
3912 unsigned int i;
3913
3914 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3915 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3916 unsigned packet_size = (i + 1) << rate->cell_log;
3917 if (packet_size < rate->mpu) {
3918 packet_size = rate->mpu;
3919 }
3920 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3921 }
3922}
3923
/* Computes the value of 'buffer' or 'cbuffer' in HTB options for a rate of
 * 'Bps' bytes per second, the given 'mtu', and a user-requested burst size of
 * 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus 'mtu'; the result is that burst expressed in ticks. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = MAX(burst_bytes, floor_bytes);

    return tc_bytes_to_ticks(Bps, burst);
}
d3980822
BP
3934\f
3935/* Public utility functions. */
3936
/* Copies every statistics counter that the source and destination structures
 * have in common, member by member, from '*src' to '*dst'.  The invoking
 * scope must define pointers named 'src' and 'dst'.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and behaves correctly under an unbraced 'if' or 'else'. */
#define COPY_NETDEV_STATS                                   \
    do {                                                    \
        dst->rx_packets = src->rx_packets;                  \
        dst->tx_packets = src->tx_packets;                  \
        dst->rx_bytes = src->rx_bytes;                      \
        dst->tx_bytes = src->tx_bytes;                      \
        dst->rx_errors = src->rx_errors;                    \
        dst->tx_errors = src->tx_errors;                    \
        dst->rx_dropped = src->rx_dropped;                  \
        dst->tx_dropped = src->tx_dropped;                  \
        dst->multicast = src->multicast;                    \
        dst->collisions = src->collisions;                  \
        dst->rx_length_errors = src->rx_length_errors;      \
        dst->rx_over_errors = src->rx_over_errors;          \
        dst->rx_crc_errors = src->rx_crc_errors;            \
        dst->rx_frame_errors = src->rx_frame_errors;        \
        dst->rx_fifo_errors = src->rx_fifo_errors;          \
        dst->rx_missed_errors = src->rx_missed_errors;      \
        dst->tx_aborted_errors = src->tx_aborted_errors;    \
        dst->tx_carrier_errors = src->tx_carrier_errors;    \
        dst->tx_fifo_errors = src->tx_fifo_errors;          \
        dst->tx_heartbeat_errors = src->tx_heartbeat_errors; \
        dst->tx_window_errors = src->tx_window_errors;      \
    } while (0)
3959
3960/* Copies 'src' into 'dst', performing format conversion in the process. */
3961void
3962netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
3963 const struct rtnl_link_stats *src)
3964{
3965 COPY_NETDEV_STATS;
3966}
3967
3968/* Copies 'src' into 'dst', performing format conversion in the process. */
3969void
3970netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
3971 const struct rtnl_link_stats64 *src)
3972{
3973 COPY_NETDEV_STATS;
3974}
3975
3976/* Copies 'src' into 'dst', performing format conversion in the process. */
3977void
3978netdev_stats_to_rtnl_link_stats64(struct rtnl_link_stats64 *dst,
3979 const struct netdev_stats *src)
3980{
3981 COPY_NETDEV_STATS;
7afa4f1d
BP
3982 dst->rx_compressed = 0;
3983 dst->tx_compressed = 0;
d3980822 3984}
c1c9c9c4
BP
3985\f
3986/* Utility functions. */
3987
3988static int
3989get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3990{
3991 /* Policy for RTNLGRP_LINK messages.
3992 *
3993 * There are *many* more fields in these messages, but currently we only
3994 * care about these fields. */
3995 static const struct nl_policy rtnlgrp_link_policy[] = {
3996 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3997 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3998 .min_len = sizeof(struct rtnl_link_stats) },
3999 };
4000
4001 struct ofpbuf request;
4002 struct ofpbuf *reply;
4003 struct ifinfomsg *ifi;
c1c9c9c4
BP
4004 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4005 int error;
4006
4007 ofpbuf_init(&request, 0);
4008 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4009 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4010 ifi->ifi_family = PF_UNSPEC;
4011 ifi->ifi_index = ifindex;
4012 error = nl_sock_transact(rtnl_sock, &request, &reply);
4013 ofpbuf_uninit(&request);
4014 if (error) {
4015 return error;
4016 }
4017
4018 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4019 rtnlgrp_link_policy,
4020 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4021 ofpbuf_delete(reply);
4022 return EPROTO;
4023 }
4024
4025 if (!attrs[IFLA_STATS]) {
4026 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4027 ofpbuf_delete(reply);
4028 return EPROTO;
4029 }
8b61709d 4030
d3980822 4031 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
8b61709d 4032
576e26d7
BP
4033 ofpbuf_delete(reply);
4034
8b61709d
BP
4035 return 0;
4036}
4037
4038static int
4039get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4040{
4041 static const char fn[] = "/proc/net/dev";
4042 char line[1024];
4043 FILE *stream;
4044 int ln;
4045
4046 stream = fopen(fn, "r");
4047 if (!stream) {
4048 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4049 return errno;
4050 }
4051
4052 ln = 0;
4053 while (fgets(line, sizeof line, stream)) {
4054 if (++ln >= 3) {
4055 char devname[16];
4056#define X64 "%"SCNu64
4057 if (sscanf(line,
4058 " %15[^:]:"
4059 X64 X64 X64 X64 X64 X64 X64 "%*u"
4060 X64 X64 X64 X64 X64 X64 X64 "%*u",
4061 devname,
4062 &stats->rx_bytes,
4063 &stats->rx_packets,
4064 &stats->rx_errors,
4065 &stats->rx_dropped,
4066 &stats->rx_fifo_errors,
4067 &stats->rx_frame_errors,
4068 &stats->multicast,
4069 &stats->tx_bytes,
4070 &stats->tx_packets,
4071 &stats->tx_errors,
4072 &stats->tx_dropped,
4073 &stats->tx_fifo_errors,
4074 &stats->collisions,
4075 &stats->tx_carrier_errors) != 15) {
4076 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4077 } else if (!strcmp(devname, netdev_name)) {
4078 stats->rx_length_errors = UINT64_MAX;
4079 stats->rx_over_errors = UINT64_MAX;
4080 stats->rx_crc_errors = UINT64_MAX;
4081 stats->rx_missed_errors = UINT64_MAX;
4082 stats->tx_aborted_errors = UINT64_MAX;
4083 stats->tx_heartbeat_errors = UINT64_MAX;
4084 stats->tx_window_errors = UINT64_MAX;
4085 fclose(stream);
4086 return 0;
4087 }
4088 }
4089 }
4090 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4091 fclose(stream);
4092 return ENODEV;
4093}
c1c9c9c4 4094
8b61709d
BP
4095static int
4096get_flags(const struct netdev *netdev, int *flags)
4097{
4098 struct ifreq ifr;
4099 int error;
4100
149f577a
JG
4101 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4102 "SIOCGIFFLAGS");
8b61709d
BP
4103 *flags = ifr.ifr_flags;
4104 return error;
4105}
4106
4107static int
4108set_flags(struct netdev *netdev, int flags)
4109{
4110 struct ifreq ifr;
4111
4112 ifr.ifr_flags = flags;
149f577a
JG
4113 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4114 "SIOCSIFFLAGS");
8b61709d
BP
4115}
4116
4117static int
4118do_get_ifindex(const char *netdev_name)
4119{
4120 struct ifreq ifr;
4121
71d7c22f 4122 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4123 COVERAGE_INC(netdev_get_ifindex);
4124 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4125 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4126 netdev_name, strerror(errno));
4127 return -errno;
4128 }
4129 return ifr.ifr_ifindex;
4130}
4131
4132static int
4133get_ifindex(const struct netdev *netdev_, int *ifindexp)
4134{
149f577a
JG
4135 struct netdev_dev_linux *netdev_dev =
4136 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 4137 *ifindexp = 0;
149f577a 4138 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
8b61709d
BP
4139 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4140 if (ifindex < 0) {
4141 return -ifindex;
4142 }
149f577a
JG
4143 netdev_dev->cache_valid |= VALID_IFINDEX;
4144 netdev_dev->ifindex = ifindex;
8b61709d 4145 }
149f577a 4146 *ifindexp = netdev_dev->ifindex;
8b61709d
BP
4147 return 0;
4148}
4149
4150static int
4151get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4152{
4153 struct ifreq ifr;
4154 int hwaddr_family;
4155
4156 memset(&ifr, 0, sizeof ifr);
71d7c22f 4157 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4158 COVERAGE_INC(netdev_get_hwaddr);
4159 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
78857dfb
BP
4160 /* ENODEV probably means that a vif disappeared asynchronously and
4161 * hasn't been removed from the database yet, so reduce the log level
4162 * to INFO for that case. */
4163 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4164 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4165 netdev_name, strerror(errno));
8b61709d
BP
4166 return errno;
4167 }
4168 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4169 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4170 VLOG_WARN("%s device has unknown hardware address family %d",
4171 netdev_name, hwaddr_family);
4172 }
4173 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4174 return 0;
4175}
4176
4177static int
4178set_etheraddr(const char *netdev_name, int hwaddr_family,
4179 const uint8_t mac[ETH_ADDR_LEN])
4180{
4181 struct ifreq ifr;
4182
4183 memset(&ifr, 0, sizeof ifr);
71d7c22f 4184 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4185 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4186 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4187 COVERAGE_INC(netdev_set_hwaddr);
4188 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4189 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4190 netdev_name, strerror(errno));
4191 return errno;
4192 }
4193 return 0;
4194}
4195
4196static int
0b0544d7 4197netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
4198 int cmd, const char *cmd_name)
4199{
4200 struct ifreq ifr;
4201
4202 memset(&ifr, 0, sizeof ifr);
71d7c22f 4203 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
4204 ifr.ifr_data = (caddr_t) ecmd;
4205
4206 ecmd->cmd = cmd;
4207 COVERAGE_INC(netdev_ethtool);
4208 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4209 return 0;
4210 } else {
4211 if (errno != EOPNOTSUPP) {
4212 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
0b0544d7 4213 "failed: %s", cmd_name, name, strerror(errno));
8b61709d
BP
4214 } else {
4215 /* The device doesn't support this operation. That's pretty
4216 * common, so there's no point in logging anything. */
4217 }
4218 return errno;
4219 }
4220}
4221
e47bd51a
JP
4222/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4223 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4224int
4225netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4226 const char *flag_name, bool enable)
4227{
4228 const char *netdev_name = netdev_get_name(netdev);
4229 struct ethtool_value evalue;
4230 uint32_t new_flags;
4231 int error;
4232
4233 memset(&evalue, 0, sizeof evalue);
4234 error = netdev_linux_do_ethtool(netdev_name,
4235 (struct ethtool_cmd *)&evalue,
4236 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4237 if (error) {
4238 return error;
4239 }
4240
4241 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4242 error = netdev_linux_do_ethtool(netdev_name,
4243 (struct ethtool_cmd *)&evalue,
4244 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4245 if (error) {
4246 return error;
4247 }
4248
4249 memset(&evalue, 0, sizeof evalue);
4250 error = netdev_linux_do_ethtool(netdev_name,
4251 (struct ethtool_cmd *)&evalue,
4252 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4253 if (error) {
4254 return error;
4255 }
4256
4257 if (new_flags != evalue.data) {
4258 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4259 "device %s failed", enable ? "enable" : "disable",
4260 flag_name, netdev_name);
4261 return EOPNOTSUPP;
4262 }
4263
4264 return 0;
4265}
4266
8b61709d 4267static int
149f577a
JG
4268netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4269 const char *cmd_name)
8b61709d 4270{
71d7c22f 4271 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
8b61709d 4272 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
149f577a
JG
4273 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4274 strerror(errno));
8b61709d
BP
4275 return errno;
4276 }
4277 return 0;
4278}
f1acd62b
BP
4279
4280static int
4281netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4282 int cmd, const char *cmd_name)
4283{
4284 struct ifreq ifr;
4285 int error;
4286
4287 ifr.ifr_addr.sa_family = AF_INET;
149f577a 4288 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b
BP
4289 if (!error) {
4290 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4291 *ip = sin->sin_addr;
4292 }
4293 return error;
4294}
488d734d
BP
4295
/* Returns an AF_PACKET raw socket or a negative errno value.
 *
 * The socket is created on the first call and the result, whether a file
 * descriptor or a negative errno, is kept in a function-local static and
 * returned unchanged by every later call. */
static int
af_packet_sock(void)
{
    /* INT_MIN marks "not yet attempted"; it is neither a file descriptor
     * nor a negated errno value. */
    static int cached = INT_MIN;

    if (cached != INT_MIN) {
        return cached;
    }

    cached = socket(AF_PACKET, SOCK_RAW, 0);
    if (cached < 0) {
        cached = -errno;
        VLOG_ERR("failed to create packet socket: %s", strerror(errno));
    } else {
        set_nonblocking(cached);
    }

    return cached;
}