]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
notifiers: Rename run and wait functions.
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
782e6111 2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
8b61709d 21#include <assert.h>
e9e28be3 22#include <errno.h>
8b61709d
BP
23#include <fcntl.h>
24#include <arpa/inet.h>
25#include <inttypes.h>
c1c9c9c4 26#include <linux/gen_stats.h>
8b61709d 27#include <linux/if_tun.h>
a740f0de 28#include <linux/ip.h>
8b61709d
BP
29#include <linux/types.h>
30#include <linux/ethtool.h>
63331829 31#include <linux/mii.h>
6f42c8ea 32#include <linux/pkt_sched.h>
e9e28be3 33#include <linux/rtnetlink.h>
8b61709d
BP
34#include <linux/sockios.h>
35#include <linux/version.h>
36#include <sys/types.h>
37#include <sys/ioctl.h>
38#include <sys/socket.h>
39#include <netpacket/packet.h>
40#include <net/ethernet.h>
41#include <net/if.h>
a740f0de 42#include <linux/if_tunnel.h>
8b61709d
BP
43#include <net/if_arp.h>
44#include <net/if_packet.h>
45#include <net/route.h>
46#include <netinet/in.h>
e9e28be3 47#include <poll.h>
8b61709d
BP
48#include <stdlib.h>
49#include <string.h>
50#include <unistd.h>
e9e28be3
BP
51
52#include "coverage.h"
9fe3b9a2 53#include "dpif-linux.h"
8b61709d
BP
54#include "dynamic-string.h"
55#include "fatal-signal.h"
93b13be8
BP
56#include "hash.h"
57#include "hmap.h"
8b61709d 58#include "netdev-provider.h"
7fbef77a 59#include "netdev-vport.h"
e9e28be3 60#include "netlink.h"
45c8d3a1 61#include "netlink-notifier.h"
2fe27d5a 62#include "netlink-socket.h"
e9e28be3 63#include "ofpbuf.h"
8b61709d
BP
64#include "openflow/openflow.h"
65#include "packets.h"
66#include "poll-loop.h"
21d6e22e 67#include "rtnetlink-link.h"
8b61709d
BP
68#include "socket-util.h"
69#include "shash.h"
19993ef3 70#include "sset.h"
1670c579 71#include "timer.h"
e9e28be3 72#include "vlog.h"
5136ce49 73
d98e6007 74VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea
BP
75
76COVERAGE_DEFINE(netdev_get_vlan_vid);
77COVERAGE_DEFINE(netdev_set_policing);
78COVERAGE_DEFINE(netdev_arp_lookup);
79COVERAGE_DEFINE(netdev_get_ifindex);
80COVERAGE_DEFINE(netdev_get_hwaddr);
81COVERAGE_DEFINE(netdev_set_hwaddr);
82COVERAGE_DEFINE(netdev_ethtool);
8b61709d
BP
83\f
84/* These were introduced in Linux 2.6.14, so they might be missing if we have
85 * old headers. */
86#ifndef ADVERTISED_Pause
87#define ADVERTISED_Pause (1 << 13)
88#endif
89#ifndef ADVERTISED_Asym_Pause
90#define ADVERTISED_Asym_Pause (1 << 14)
91#endif
92
e47bd51a
JP
93/* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95#ifndef ETHTOOL_GFLAGS
96#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
97#endif
98#ifndef ETHTOOL_SFLAGS
99#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
100#endif
101
c1c9c9c4
BP
102/* This was introduced in Linux 2.6.25, so it might be missing if we have old
103 * headers. */
104#ifndef TC_RTAB_SIZE
105#define TC_RTAB_SIZE 1024
106#endif
107
0a811051 108static struct nln_notifier netdev_linux_cache_notifier;
46415c90 109static int cache_notifier_refcount;
8b61709d
BP
110
/* Bits for 'cache_valid' in struct netdev_dev_linux.  Each bit marks the
 * corresponding cached field as up to date. */
enum {
    VALID_IFINDEX          = 1 << 0,
    VALID_ETHERADDR        = 1 << 1,
    VALID_IN4              = 1 << 2,
    VALID_IN6              = 1 << 3,
    VALID_MTU              = 1 << 4,
    VALID_CARRIER          = 1 << 5,
    VALID_POLICING         = 1 << 6,
    VALID_HAVE_VPORT_STATS = 1 << 7
};
121
149f577a
JG
/* Per-device state specific to tap devices. */
struct tap_state {
    int fd;         /* Descriptor obtained by opening "/dev/net/tun". */
    bool opened;    /* True once 'fd' has been handed to a netdev. */
};
c1c9c9c4
BP
126\f
127/* Traffic control. */
128
129/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
130 * network device.
131 *
132 * Each TC implementation subclasses this with whatever additional data it
133 * needs. */
c1c9c9c4
BP
134struct tc {
135 const struct tc_ops *ops;
93b13be8
BP
136 struct hmap queues; /* Contains "struct tc_queue"s.
137 * Read by generic TC layer.
138 * Written only by TC implementation. */
139};
c1c9c9c4 140
93b13be8
BP
141/* One traffic control queue.
142 *
143 * Each TC implementation subclasses this with whatever additional data it
144 * needs. */
145struct tc_queue {
146 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
147 unsigned int queue_id; /* OpenFlow queue ID. */
c1c9c9c4
BP
148};
149
150/* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
152 *
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
156struct tc_ops {
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
161
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
164
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
168
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
174 *
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
178 *
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
181 *
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct shash *details);
185
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
189 *
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
195 * 'netdev'.
196 *
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
200
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
203 * tc_destroy(tc).)
204 *
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
208 *
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
211
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
213 *
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
217 *
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
221 *
222 * This function may be null if 'tc' is not configurable.
223 */
224 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
225
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
228 *
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
232 *
233 * This function may be null if 'tc' is not configurable.
234 */
235 int (*qdisc_set)(struct netdev *, const struct shash *details);
236
93b13be8
BP
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
239 *
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
243 *
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
247 *
248 * This function may be null if 'tc' does not have queues ('n_queues' is
249 * 0). */
93b13be8 250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
251 struct shash *details);
252
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', performing any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
256 * 'n_queues'.
257 *
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
261 *
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct shash *details);
266
93b13be8
BP
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
269 *
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
93b13be8 272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 273
93b13be8
BP
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
276 *
277 * On success, initializes '*stats'.
278 *
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
93b13be8
BP
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
c1c9c9c4
BP
283 struct netdev_queue_stats *stats);
284
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
287 *
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
293};
294
295static void
296tc_init(struct tc *tc, const struct tc_ops *ops)
297{
298 tc->ops = ops;
93b13be8 299 hmap_init(&tc->queues);
c1c9c9c4
BP
300}
301
302static void
303tc_destroy(struct tc *tc)
304{
93b13be8 305 hmap_destroy(&tc->queues);
c1c9c9c4
BP
306}
307
308static const struct tc_ops tc_ops_htb;
a339aa81 309static const struct tc_ops tc_ops_hfsc;
c1c9c9c4
BP
310static const struct tc_ops tc_ops_default;
311static const struct tc_ops tc_ops_other;
312
313static const struct tc_ops *tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
c1c9c9c4
BP
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
318 NULL
319};
149f577a 320
c1c9c9c4
BP
321static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322static unsigned int tc_get_major(unsigned int handle);
323static unsigned int tc_get_minor(unsigned int handle);
324
325static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327static unsigned int tc_buffer_per_jiffy(unsigned int rate);
328
329static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332
333static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
334 struct nlattr **options);
335static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
336 struct nlattr **options,
337 struct netdev_queue_stats *);
338static int tc_query_class(const struct netdev *,
339 unsigned int handle, unsigned int parent,
340 struct ofpbuf **replyp);
341static int tc_delete_class(const struct netdev *, unsigned int handle);
342
343static int tc_del_qdisc(struct netdev *netdev);
344static int tc_query_qdisc(const struct netdev *netdev);
345
346static int tc_calc_cell_log(unsigned int mtu);
347static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
348static void tc_put_rtab(struct ofpbuf *, uint16_t type,
349 const struct tc_ratespec *rate);
350static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
351\f
149f577a
JG
352struct netdev_dev_linux {
353 struct netdev_dev netdev_dev;
354
8b61709d 355 struct shash_node *shash_node;
149f577a 356 unsigned int cache_valid;
ac4d3bcb 357 unsigned int change_seq;
8b61709d 358
1670c579
EJ
359 bool miimon; /* Link status of last poll. */
360 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
361 struct timer miimon_timer;
362
8722022c
BP
363 /* The following are figured out "on demand" only. They are only valid
364 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d
BP
365 int ifindex;
366 uint8_t etheraddr[ETH_ADDR_LEN];
f1acd62b 367 struct in_addr address, netmask;
8b61709d
BP
368 struct in6_addr in6;
369 int mtu;
370 int carrier;
80a86fbe
BP
371 uint32_t kbits_rate; /* Policing data. */
372 uint32_t kbits_burst;
7fbef77a 373 bool have_vport_stats;
c1c9c9c4 374 struct tc *tc;
149f577a
JG
375
376 union {
377 struct tap_state tap;
378 } state;
8b61709d
BP
379};
380
149f577a
JG
381struct netdev_linux {
382 struct netdev netdev;
5b7448ed 383 int fd;
149f577a 384};
8b61709d 385
76c308b5
BP
386/* Sockets used for ioctl operations. */
387static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
8b61709d 388
ff4ed3c9
BP
389/* A Netlink routing socket that is not subscribed to any multicast groups. */
390static struct nl_sock *rtnl_sock;
391
8b61709d
BP
392/* This is set pretty low because we probably won't learn anything from the
393 * additional log messages. */
394static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
395
15b3596a 396static int netdev_linux_init(void);
6f643e49 397
0b0544d7 398static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 399 int cmd, const char *cmd_name);
149f577a
JG
400static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
401 const char *cmd_name);
f1acd62b
BP
402static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
403 int cmd, const char *cmd_name);
8b61709d
BP
404static int get_flags(const struct netdev *, int *flagsp);
405static int set_flags(struct netdev *, int flags);
406static int do_get_ifindex(const char *netdev_name);
407static int get_ifindex(const struct netdev *, int *ifindexp);
408static int do_set_addr(struct netdev *netdev,
409 int ioctl_nr, const char *ioctl_name,
410 struct in_addr addr);
411static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
412static int set_etheraddr(const char *netdev_name, int hwaddr_family,
413 const uint8_t[ETH_ADDR_LEN]);
414static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
415static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
488d734d 416static int af_packet_sock(void);
1670c579
EJ
417static void netdev_linux_miimon_run(void);
418static void netdev_linux_miimon_wait(void);
8b61709d 419
15b3596a
JG
420static bool
421is_netdev_linux_class(const struct netdev_class *netdev_class)
422{
423 return netdev_class->init == netdev_linux_init;
424}
425
149f577a
JG
426static struct netdev_dev_linux *
427netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
6c88d577 428{
15b3596a
JG
429 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
430 assert(is_netdev_linux_class(netdev_class));
431
149f577a 432 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
6c88d577
JP
433}
434
8b61709d
BP
435static struct netdev_linux *
436netdev_linux_cast(const struct netdev *netdev)
437{
15b3596a
JG
438 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
439 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
440 assert(is_netdev_linux_class(netdev_class));
441
8b61709d
BP
442 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
443}
ff4ed3c9 444\f
8b61709d
BP
445static int
446netdev_linux_init(void)
447{
448 static int status = -1;
449 if (status < 0) {
ff4ed3c9 450 /* Create AF_INET socket. */
8b61709d
BP
451 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
452 status = af_inet_sock >= 0 ? 0 : errno;
453 if (status) {
454 VLOG_ERR("failed to create inet socket: %s", strerror(status));
455 }
ff4ed3c9
BP
456
457 /* Create rtnetlink socket. */
458 if (!status) {
cceb11f5 459 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
ff4ed3c9
BP
460 if (status) {
461 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
462 strerror(status));
463 }
464 }
8b61709d
BP
465 }
466 return status;
467}
468
/* Periodic work: runs the rtnetlink link notifier, then MII monitoring. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();
    netdev_linux_miimon_run();
}
475
/* Registers wakeups for the rtnetlink link notifier and for MII
 * monitoring. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();
    netdev_linux_miimon_wait();
}
482
ac4d3bcb
EJ
483static void
484netdev_dev_linux_changed(struct netdev_dev_linux *dev)
485{
486 dev->change_seq++;
487 if (!dev->change_seq) {
488 dev->change_seq++;
489 }
490 dev->cache_valid = 0;
491}
492
8b61709d 493static void
21d6e22e 494netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
67a4917b 495 void *aux OVS_UNUSED)
8b61709d 496{
149f577a 497 struct netdev_dev_linux *dev;
8b61709d 498 if (change) {
46415c90
JG
499 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
500 if (base_dev) {
15b3596a
JG
501 const struct netdev_class *netdev_class =
502 netdev_dev_get_class(base_dev);
503
504 if (is_netdev_linux_class(netdev_class)) {
505 dev = netdev_dev_linux_cast(base_dev);
ac4d3bcb 506 netdev_dev_linux_changed(dev);
15b3596a 507 }
8b61709d
BP
508 }
509 } else {
46415c90 510 struct shash device_shash;
8b61709d 511 struct shash_node *node;
46415c90
JG
512
513 shash_init(&device_shash);
514 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
515 SHASH_FOR_EACH (node, &device_shash) {
149f577a 516 dev = node->data;
ac4d3bcb 517 netdev_dev_linux_changed(dev);
8b61709d 518 }
46415c90 519 shash_destroy(&device_shash);
8b61709d
BP
520 }
521}
522
c3827f61 523/* Creates system and internal devices. */
8b61709d 524static int
de5cdb90
BP
525netdev_linux_create(const struct netdev_class *class, const char *name,
526 struct netdev_dev **netdev_devp)
6c88d577 527{
149f577a
JG
528 struct netdev_dev_linux *netdev_dev;
529 int error;
6c88d577 530
46415c90 531 if (!cache_notifier_refcount) {
21d6e22e
EJ
532 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
533 netdev_linux_cache_cb, NULL);
149f577a
JG
534 if (error) {
535 return error;
536 }
537 }
46415c90 538 cache_notifier_refcount++;
6c88d577 539
149f577a 540 netdev_dev = xzalloc(sizeof *netdev_dev);
ac4d3bcb 541 netdev_dev->change_seq = 1;
de5cdb90 542 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
46415c90 543
149f577a 544 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
545 return 0;
546}
547
5b7448ed
JG
548/* For most types of netdevs we open the device for each call of
549 * netdev_open(). However, this is not the case with tap devices,
550 * since it is only possible to open the device once. In this
551 * situation we share a single file descriptor, and consequently
552 * buffers, across all readers. Therefore once data is read it will
553 * be unavailable to other reads for tap devices. */
a740f0de 554static int
b8dcf5e9 555netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
de5cdb90 556 const char *name, struct netdev_dev **netdev_devp)
a740f0de 557{
149f577a 558 struct netdev_dev_linux *netdev_dev;
a740f0de
JG
559 struct tap_state *state;
560 static const char tap_dev[] = "/dev/net/tun";
561 struct ifreq ifr;
562 int error;
563
149f577a
JG
564 netdev_dev = xzalloc(sizeof *netdev_dev);
565 state = &netdev_dev->state.tap;
a740f0de 566
6c88d577 567 /* Open tap device. */
149f577a
JG
568 state->fd = open(tap_dev, O_RDWR);
569 if (state->fd < 0) {
6c88d577
JP
570 error = errno;
571 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
572 goto error;
573 }
574
575 /* Create tap device. */
576 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 577 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
149f577a 578 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
6c88d577
JP
579 VLOG_WARN("%s: creating tap device failed: %s", name,
580 strerror(errno));
581 error = errno;
582 goto error;
583 }
584
585 /* Make non-blocking. */
149f577a 586 error = set_nonblocking(state->fd);
a740f0de
JG
587 if (error) {
588 goto error;
589 }
590
de5cdb90 591 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
149f577a 592 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
593 return 0;
594
595error:
149f577a 596 free(netdev_dev);
a740f0de
JG
597 return error;
598}
599
a740f0de 600static void
149f577a 601destroy_tap(struct netdev_dev_linux *netdev_dev)
a740f0de 602{
149f577a
JG
603 struct tap_state *state = &netdev_dev->state.tap;
604
605 if (state->fd >= 0) {
606 close(state->fd);
a740f0de
JG
607 }
608}
609
149f577a 610/* Destroys the netdev device 'netdev_dev_'. */
6c88d577 611static void
149f577a 612netdev_linux_destroy(struct netdev_dev *netdev_dev_)
6c88d577 613{
149f577a 614 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
d2bb2799 615 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
6c88d577 616
c1c9c9c4
BP
617 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
618 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
619 }
620
d2bb2799 621 if (class == &netdev_linux_class || class == &netdev_internal_class) {
46415c90 622 cache_notifier_refcount--;
149f577a 623
46415c90 624 if (!cache_notifier_refcount) {
21d6e22e 625 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
149f577a 626 }
d2bb2799 627 } else if (class == &netdev_tap_class) {
149f577a 628 destroy_tap(netdev_dev);
d2bb2799
BP
629 } else {
630 NOT_REACHED();
6c88d577 631 }
149f577a 632
658797c8 633 free(netdev_dev);
6c88d577
JP
634}
635
8b61709d 636static int
7b6b0ef4 637netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
8b61709d 638{
5b7448ed 639 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
8b61709d
BP
640 struct netdev_linux *netdev;
641 enum netdev_flags flags;
642 int error;
643
644 /* Allocate network device. */
ec6fde61 645 netdev = xzalloc(sizeof *netdev);
49a6a163 646 netdev->fd = -1;
5b7448ed 647 netdev_init(&netdev->netdev, netdev_dev_);
8b61709d 648
c3827f61
BP
649 /* Verify that the device really exists, by attempting to read its flags.
650 * (The flags might be cached, in which case this won't actually do an
651 * ioctl.)
652 *
653 * Don't do this for "internal" netdevs, though, because those have to be
654 * created as netdev objects before they exist in the kernel, because
655 * creating them in the kernel happens by passing a netdev object to
656 * dpif_port_add(). */
657 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
658 error = netdev_get_flags(&netdev->netdev, &flags);
659 if (error == ENODEV) {
660 goto error;
661 }
8b61709d
BP
662 }
663
61b999dd
JG
664 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
665 !netdev_dev->state.tap.opened) {
666
667 /* We assume that the first user of the tap device is the primary user
668 * and give them the tap FD. Subsequent users probably just expect
669 * this to be a system device so open it normally to avoid send/receive
670 * directions appearing to be reversed. */
5b7448ed 671 netdev->fd = netdev_dev->state.tap.fd;
61b999dd 672 netdev_dev->state.tap.opened = true;
8b61709d
BP
673 }
674
675 *netdevp = &netdev->netdev;
676 return 0;
677
678error:
149f577a 679 netdev_uninit(&netdev->netdev, true);
8b61709d
BP
680 return error;
681}
682
683/* Closes and destroys 'netdev'. */
684static void
685netdev_linux_close(struct netdev *netdev_)
686{
687 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
688
49a6a163 689 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
5b7448ed 690 close(netdev->fd);
8b61709d
BP
691 }
692 free(netdev);
693}
e9e28be3 694
/* Adds the names of all known network devices to 'sset'.
 *
 * Returns 0 on success, otherwise the positive errno value left by
 * if_nameindex(). */
static int
netdev_linux_enumerate(struct sset *sset)
{
    struct if_nameindex *names;
    size_t i;

    names = if_nameindex();
    if (!names) {
        /* Save errno before logging, which may overwrite it. */
        int error = errno;

        VLOG_WARN("could not obtain list of network device names: %s",
                  strerror(error));
        return error;
    }

    for (i = 0; names[i].if_name != NULL; i++) {
        sset_add(sset, names[i].if_name);
    }
    if_freenameindex(names);
    return 0;
}
716
7b6b0ef4
BP
717static int
718netdev_linux_listen(struct netdev *netdev_)
719{
720 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
721 struct sockaddr_ll sll;
722 int ifindex;
723 int error;
724 int fd;
725
726 if (netdev->fd >= 0) {
727 return 0;
728 }
729
730 /* Create file descriptor. */
731 fd = socket(PF_PACKET, SOCK_RAW, 0);
732 if (fd < 0) {
733 error = errno;
734 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
735 goto error;
736 }
737
738 /* Set non-blocking mode. */
739 error = set_nonblocking(fd);
740 if (error) {
741 goto error;
742 }
743
744 /* Get ethernet device index. */
745 error = get_ifindex(&netdev->netdev, &ifindex);
746 if (error) {
747 goto error;
748 }
749
750 /* Bind to specific ethernet device. */
751 memset(&sll, 0, sizeof sll);
752 sll.sll_family = AF_PACKET;
753 sll.sll_ifindex = ifindex;
754 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
755 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
756 error = errno;
757 VLOG_ERR("%s: failed to bind raw socket (%s)",
758 netdev_get_name(netdev_), strerror(error));
759 goto error;
760 }
761
762 netdev->fd = fd;
763 return 0;
764
765error:
766 if (fd >= 0) {
767 close(fd);
768 }
769 return error;
770}
771
8b61709d
BP
772static int
773netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
774{
775 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
776
5b7448ed 777 if (netdev->fd < 0) {
7b6b0ef4 778 /* Device is not listening. */
c0e5f6ca 779 return -EAGAIN;
8b61709d
BP
780 }
781
782 for (;;) {
5b7448ed 783 ssize_t retval = read(netdev->fd, data, size);
8b61709d
BP
784 if (retval >= 0) {
785 return retval;
786 } else if (errno != EINTR) {
787 if (errno != EAGAIN) {
788 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
789 strerror(errno), netdev_get_name(netdev_));
790 }
c0e5f6ca 791 return -errno;
8b61709d
BP
792 }
793 }
794}
795
796/* Registers with the poll loop to wake up from the next call to poll_block()
797 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
798static void
799netdev_linux_recv_wait(struct netdev *netdev_)
800{
801 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed
JG
802 if (netdev->fd >= 0) {
803 poll_fd_wait(netdev->fd, POLLIN);
8b61709d
BP
804 }
805}
806
807/* Discards all packets waiting to be received from 'netdev'. */
808static int
809netdev_linux_drain(struct netdev *netdev_)
810{
811 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 812 if (netdev->fd < 0) {
8b61709d 813 return 0;
5b7448ed 814 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
8b61709d 815 struct ifreq ifr;
149f577a 816 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
8b61709d
BP
817 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
818 if (error) {
819 return error;
820 }
5b7448ed 821 drain_fd(netdev->fd, ifr.ifr_qlen);
8b61709d
BP
822 return 0;
823 } else {
5b7448ed 824 return drain_rcvbuf(netdev->fd);
8b61709d
BP
825 }
826}
827
828/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
829 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
830 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
831 * the packet is too big or too small to transmit on the device.
832 *
833 * The caller retains ownership of 'buffer' in all cases.
834 *
835 * The kernel maintains a packet transmission queue, so the caller is not
836 * expected to do additional queuing of packets. */
837static int
838netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
839{
f23347ea
BP
840 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
841 for (;;) {
842 ssize_t retval;
8b61709d 843
f23347ea
BP
844 if (netdev->fd < 0) {
845 /* Use our AF_PACKET socket to send to this device. */
846 struct sockaddr_ll sll;
847 struct msghdr msg;
848 struct iovec iov;
849 int ifindex;
850 int error;
488d734d
BP
851 int sock;
852
853 sock = af_packet_sock();
854 if (sock < 0) {
855 return sock;
856 }
f23347ea
BP
857
858 error = get_ifindex(netdev_, &ifindex);
859 if (error) {
860 return error;
861 }
8b61709d 862
f23347ea
BP
863 /* We don't bother setting most fields in sockaddr_ll because the
864 * kernel ignores them for SOCK_RAW. */
865 memset(&sll, 0, sizeof sll);
866 sll.sll_family = AF_PACKET;
867 sll.sll_ifindex = ifindex;
76c308b5 868
f23347ea
BP
869 iov.iov_base = (void *) data;
870 iov.iov_len = size;
76c308b5 871
f23347ea
BP
872 msg.msg_name = &sll;
873 msg.msg_namelen = sizeof sll;
874 msg.msg_iov = &iov;
875 msg.msg_iovlen = 1;
876 msg.msg_control = NULL;
877 msg.msg_controllen = 0;
878 msg.msg_flags = 0;
879
488d734d 880 retval = sendmsg(sock, &msg, 0);
f23347ea
BP
881 } else {
882 /* Use the netdev's own fd to send to this device. This is
883 * essential for tap devices, because packets sent to a tap device
884 * with an AF_PACKET socket will loop back to be *received* again
885 * on the tap device. */
886 retval = write(netdev->fd, data, size);
887 }
76c308b5 888
8b61709d
BP
889 if (retval < 0) {
890 /* The Linux AF_PACKET implementation never blocks waiting for room
891 * for packets, instead returning ENOBUFS. Translate this into
892 * EAGAIN for the caller. */
893 if (errno == ENOBUFS) {
894 return EAGAIN;
895 } else if (errno == EINTR) {
896 continue;
897 } else if (errno != EAGAIN) {
898 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
899 netdev_get_name(netdev_), strerror(errno));
900 }
901 return errno;
902 } else if (retval != size) {
903 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
904 "%zu) on %s", retval, size, netdev_get_name(netdev_));
905 return EMSGSIZE;
906 } else {
907 return 0;
908 }
909 }
910}
911
912/* Registers with the poll loop to wake up from the next call to poll_block()
913 * when the packet transmission queue has sufficient room to transmit a packet
914 * with netdev_send().
915 *
916 * The kernel maintains a packet transmission queue, so the client is not
917 * expected to do additional queuing of packets. Thus, this function is
918 * unlikely to ever be used. It is included for completeness. */
919static void
920netdev_linux_send_wait(struct netdev *netdev_)
921{
922 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 923 if (netdev->fd < 0) {
8b61709d 924 /* Nothing to do. */
5b7448ed
JG
925 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
926 poll_fd_wait(netdev->fd, POLLOUT);
8b61709d
BP
927 } else {
928 /* TAP device always accepts packets.*/
929 poll_immediate_wake();
930 }
931}
932
933/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
934 * otherwise a positive errno value. */
935static int
936netdev_linux_set_etheraddr(struct netdev *netdev_,
937 const uint8_t mac[ETH_ADDR_LEN])
938{
149f577a
JG
939 struct netdev_dev_linux *netdev_dev =
940 netdev_dev_linux_cast(netdev_get_dev(netdev_));
eb395f2e
BP
941 int error;
942
149f577a
JG
943 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
944 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
eb395f2e
BP
945 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
946 if (!error) {
149f577a
JG
947 netdev_dev->cache_valid |= VALID_ETHERADDR;
948 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e
BP
949 }
950 } else {
951 error = 0;
8b61709d
BP
952 }
953 return error;
954}
955
956/* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
957 * free the returned buffer. */
958static int
959netdev_linux_get_etheraddr(const struct netdev *netdev_,
960 uint8_t mac[ETH_ADDR_LEN])
961{
149f577a
JG
962 struct netdev_dev_linux *netdev_dev =
963 netdev_dev_linux_cast(netdev_get_dev(netdev_));
964 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
8b61709d 965 int error = get_etheraddr(netdev_get_name(netdev_),
149f577a 966 netdev_dev->etheraddr);
8b61709d
BP
967 if (error) {
968 return error;
969 }
149f577a 970 netdev_dev->cache_valid |= VALID_ETHERADDR;
8b61709d 971 }
149f577a 972 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
8b61709d
BP
973 return 0;
974}
975
976/* Returns the maximum size of transmitted (and received) packets on 'netdev',
977 * in bytes, not including the hardware header; thus, this is typically 1500
978 * bytes for Ethernet devices. */
979static int
980netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
981{
149f577a
JG
982 struct netdev_dev_linux *netdev_dev =
983 netdev_dev_linux_cast(netdev_get_dev(netdev_));
984 if (!(netdev_dev->cache_valid & VALID_MTU)) {
8b61709d
BP
985 struct ifreq ifr;
986 int error;
987
149f577a
JG
988 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
989 SIOCGIFMTU, "SIOCGIFMTU");
8b61709d
BP
990 if (error) {
991 return error;
992 }
149f577a
JG
993 netdev_dev->mtu = ifr.ifr_mtu;
994 netdev_dev->cache_valid |= VALID_MTU;
8b61709d 995 }
149f577a 996 *mtup = netdev_dev->mtu;
8b61709d
BP
997 return 0;
998}
999
9b020780
PS
1000/* Sets the maximum size of transmitted (MTU) for given device using linux
1001 * networking ioctl interface.
1002 */
1003static int
1004netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1005{
1006 struct netdev_dev_linux *netdev_dev =
1007 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1008 struct ifreq ifr;
1009 int error;
1010
1011 ifr.ifr_mtu = mtu;
1012 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1013 SIOCSIFMTU, "SIOCSIFMTU");
1014 if (error) {
1015 return error;
1016 }
1017
1018 netdev_dev->mtu = ifr.ifr_mtu;
1019 netdev_dev->cache_valid |= VALID_MTU;
1020 return 0;
1021}
1022
9ab3d9a3
BP
/* Returns the ifindex of 'netdev' on success.  On failure, returns the
 * negation of the error code reported by get_ifindex(). */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1033
8b61709d
BP
1034static int
1035netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1036{
149f577a
JG
1037 struct netdev_dev_linux *netdev_dev =
1038 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1039 int error = 0;
1040 char *fn = NULL;
1041 int fd = -1;
1042
1670c579
EJ
1043 if (netdev_dev->miimon_interval > 0) {
1044 *carrier = netdev_dev->miimon;
1045 return 0;
1046 }
1047
149f577a 1048 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
8b61709d
BP
1049 char line[8];
1050 int retval;
1051
149f577a
JG
1052 fn = xasprintf("/sys/class/net/%s/carrier",
1053 netdev_get_name(netdev_));
8b61709d
BP
1054 fd = open(fn, O_RDONLY);
1055 if (fd < 0) {
1056 error = errno;
1057 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1058 goto exit;
1059 }
1060
1061 retval = read(fd, line, sizeof line);
1062 if (retval < 0) {
1063 error = errno;
1064 if (error == EINVAL) {
1065 /* This is the normal return value when we try to check carrier
1066 * if the network device is not up. */
1067 } else {
1068 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1069 }
1070 goto exit;
1071 } else if (retval == 0) {
1072 error = EPROTO;
1073 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
1074 goto exit;
1075 }
1076
1077 if (line[0] != '0' && line[0] != '1') {
1078 error = EPROTO;
1079 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1080 fn, line[0]);
1081 goto exit;
1082 }
149f577a
JG
1083 netdev_dev->carrier = line[0] != '0';
1084 netdev_dev->cache_valid |= VALID_CARRIER;
8b61709d 1085 }
149f577a 1086 *carrier = netdev_dev->carrier;
8b61709d
BP
1087 error = 0;
1088
1089exit:
1090 if (fd >= 0) {
1091 close(fd);
1092 }
1093 free(fn);
1094 return error;
1095}
1096
63331829 1097static int
1670c579
EJ
1098netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1099 struct mii_ioctl_data *data)
63331829 1100{
63331829 1101 struct ifreq ifr;
782e6111 1102 int error;
63331829 1103
63331829 1104 memset(&ifr, 0, sizeof ifr);
782e6111 1105 memcpy(&ifr.ifr_data, data, sizeof *data);
1670c579 1106 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1107 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1108
782e6111
EJ
1109 return error;
1110}
1111
1112static int
1670c579 1113netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1114{
782e6111
EJ
1115 struct mii_ioctl_data data;
1116 int error;
63331829 1117
782e6111
EJ
1118 *miimon = false;
1119
1120 memset(&data, 0, sizeof data);
1670c579 1121 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1122 if (!error) {
1123 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1124 data.reg_num = MII_BMSR;
1670c579 1125 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1126 &data);
63331829
EJ
1127
1128 if (!error) {
782e6111 1129 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1130 } else {
1131 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1132 }
1133 } else {
1134 struct ethtool_cmd ecmd;
63331829
EJ
1135
1136 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1137 name);
1138
1139 memset(&ecmd, 0, sizeof ecmd);
1140 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1141 "ETHTOOL_GLINK");
1142 if (!error) {
782e6111
EJ
1143 struct ethtool_value eval;
1144
1145 memcpy(&eval, &ecmd, sizeof eval);
1146 *miimon = !!eval.data;
63331829
EJ
1147 } else {
1148 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1149 }
1150 }
1151
1152 return error;
1153}
1154
1670c579
EJ
1155static int
1156netdev_linux_set_miimon_interval(struct netdev *netdev_,
1157 long long int interval)
1158{
1159 struct netdev_dev_linux *netdev_dev;
1160
1161 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1162
1163 interval = interval > 0 ? MAX(interval, 100) : 0;
1164 if (netdev_dev->miimon_interval != interval) {
1165 netdev_dev->miimon_interval = interval;
1166 timer_set_expired(&netdev_dev->miimon_timer);
1167 }
1168
1169 return 0;
1170}
1171
1172static void
1173netdev_linux_miimon_run(void)
1174{
1175 struct shash device_shash;
1176 struct shash_node *node;
1177
1178 shash_init(&device_shash);
1179 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1180 SHASH_FOR_EACH (node, &device_shash) {
1181 struct netdev_dev_linux *dev = node->data;
1182 bool miimon;
1183
1184 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1185 continue;
1186 }
1187
1188 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1189 if (miimon != dev->miimon) {
1670c579 1190 dev->miimon = miimon;
ac4d3bcb 1191 netdev_dev_linux_changed(dev);
1670c579
EJ
1192 }
1193
1194 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1195 }
1196
1197 shash_destroy(&device_shash);
1198}
1199
1200static void
1201netdev_linux_miimon_wait(void)
1202{
1203 struct shash device_shash;
1204 struct shash_node *node;
1205
1206 shash_init(&device_shash);
1207 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1208 SHASH_FOR_EACH (node, &device_shash) {
1209 struct netdev_dev_linux *dev = node->data;
1210
1211 if (dev->miimon_interval > 0) {
1212 timer_wait(&dev->miimon_timer);
1213 }
1214 }
1215 shash_destroy(&device_shash);
1216}
1217
8b61709d
BP
1218/* Check whether we can we use RTM_GETLINK to get network device statistics.
1219 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1220 * enabled. */
1221static bool
1222check_for_working_netlink_stats(void)
1223{
1224 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1225 * preferable, so if that works, we'll use it. */
1226 int ifindex = do_get_ifindex("lo");
1227 if (ifindex < 0) {
1228 VLOG_WARN("failed to get ifindex for lo, "
1229 "obtaining netdev stats from proc");
1230 return false;
1231 } else {
1232 struct netdev_stats stats;
1233 int error = get_stats_via_netlink(ifindex, &stats);
1234 if (!error) {
1235 VLOG_DBG("obtaining netdev stats via rtnetlink");
1236 return true;
1237 } else {
1238 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1239 "via proc (you are probably running a pre-2.6.19 "
1240 "kernel)", strerror(error));
1241 return false;
1242 }
1243 }
1244}
1245
92df599c
JG
/* Exchanges the values stored at '*a' and '*b'.  Passing the same pointer for
 * both leaves the value unchanged. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved_b = *b;

    *b = *a;
    *a = saved_b;
}
1253
f613a0d7
PS
1254static void
1255get_stats_via_vport(const struct netdev *netdev_,
1256 struct netdev_stats *stats)
8b61709d 1257{
149f577a
JG
1258 struct netdev_dev_linux *netdev_dev =
1259 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 1260
7fbef77a
JG
1261 if (netdev_dev->have_vport_stats ||
1262 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
f613a0d7 1263 int error;
7fbef77a
JG
1264
1265 error = netdev_vport_get_stats(netdev_, stats);
f613a0d7
PS
1266 if (error) {
1267 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed %d",
1268 netdev_get_name(netdev_), error);
1269 }
7fbef77a
JG
1270 netdev_dev->have_vport_stats = !error;
1271 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
8b61709d 1272 }
f613a0d7 1273}
8b61709d 1274
f613a0d7
PS
1275static int
1276netdev_linux_sys_get_stats(const struct netdev *netdev_,
1277 struct netdev_stats *stats)
1278{
1279 static int use_netlink_stats = -1;
1280 int error;
1281
1282 if (use_netlink_stats < 0) {
1283 use_netlink_stats = check_for_working_netlink_stats();
1284 }
1285
1286 if (use_netlink_stats) {
1287 int ifindex;
1288
1289 error = get_ifindex(netdev_, &ifindex);
1290 if (!error) {
1291 error = get_stats_via_netlink(ifindex, stats);
7fbef77a 1292 }
f613a0d7
PS
1293 } else {
1294 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1295 }
7fbef77a 1296
f613a0d7
PS
1297 if (error) {
1298 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1299 netdev_get_name(netdev_), error);
1300 }
1301 return error;
1302
1303}
1304
1305/* Retrieves current device stats for 'netdev-linux'. */
1306static int
1307netdev_linux_get_stats(const struct netdev *netdev_,
1308 struct netdev_stats *stats)
1309{
1310 struct netdev_dev_linux *netdev_dev =
1311 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1312 struct netdev_stats dev_stats;
1313 int error;
1314
1315 get_stats_via_vport(netdev_, stats);
1316
1317 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1318
1319 if (error) {
1320 if (!netdev_dev->have_vport_stats) {
1321 return error;
7fbef77a 1322 } else {
f613a0d7
PS
1323 return 0;
1324 }
1325 }
1326
1327 if (!netdev_dev->have_vport_stats) {
1328 /* stats not available from OVS then use ioctl stats. */
1329 *stats = dev_stats;
1330 } else {
1331 stats->rx_errors += dev_stats.rx_errors;
1332 stats->tx_errors += dev_stats.tx_errors;
1333 stats->rx_dropped += dev_stats.rx_dropped;
1334 stats->tx_dropped += dev_stats.tx_dropped;
1335 stats->multicast += dev_stats.multicast;
1336 stats->collisions += dev_stats.collisions;
1337 stats->rx_length_errors += dev_stats.rx_length_errors;
1338 stats->rx_over_errors += dev_stats.rx_over_errors;
1339 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1340 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1341 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1342 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1343 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1344 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1345 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1346 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1347 stats->tx_window_errors += dev_stats.tx_window_errors;
1348 }
1349 return 0;
1350}
1351
1352/* Retrieves current device stats for 'netdev-tap' netdev or
1353 * netdev-internal. */
1354static int
1355netdev_pseudo_get_stats(const struct netdev *netdev_,
1356 struct netdev_stats *stats)
1357{
1358 struct netdev_dev_linux *netdev_dev =
1359 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1360 struct netdev_stats dev_stats;
1361 int error;
1362
1363 get_stats_via_vport(netdev_, stats);
1364
1365 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1366 if (error) {
1367 if (!netdev_dev->have_vport_stats) {
1368 return error;
1369 } else {
1370 return 0;
8b61709d 1371 }
8b61709d 1372 }
fe6b0e03
JG
1373
1374 /* If this port is an internal port then the transmit and receive stats
1375 * will appear to be swapped relative to the other ports since we are the
1376 * one sending the data, not a remote computer. For consistency, we swap
7fbef77a
JG
1377 * them back here. This does not apply if we are getting stats from the
1378 * vport layer because it always tracks stats from the perspective of the
1379 * switch. */
f613a0d7
PS
1380 if (!netdev_dev->have_vport_stats) {
1381 *stats = dev_stats;
92df599c
JG
1382 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1383 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1384 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1385 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1386 stats->rx_length_errors = 0;
1387 stats->rx_over_errors = 0;
1388 stats->rx_crc_errors = 0;
1389 stats->rx_frame_errors = 0;
1390 stats->rx_fifo_errors = 0;
1391 stats->rx_missed_errors = 0;
1392 stats->tx_aborted_errors = 0;
1393 stats->tx_carrier_errors = 0;
1394 stats->tx_fifo_errors = 0;
1395 stats->tx_heartbeat_errors = 0;
1396 stats->tx_window_errors = 0;
f613a0d7
PS
1397 } else {
1398 stats->rx_dropped += dev_stats.tx_dropped;
1399 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1400
f613a0d7
PS
1401 stats->rx_errors += dev_stats.tx_errors;
1402 stats->tx_errors += dev_stats.rx_errors;
1403
1404 stats->multicast += dev_stats.multicast;
1405 stats->collisions += dev_stats.collisions;
1406 }
1407 return 0;
8b61709d
BP
1408}
1409
1410/* Stores the features supported by 'netdev' into each of '*current',
1411 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1412 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
7671589a 1413 * successful, otherwise a positive errno value. */
8b61709d 1414static int
6f2f5cce 1415netdev_linux_get_features(const struct netdev *netdev,
8b61709d
BP
1416 uint32_t *current, uint32_t *advertised,
1417 uint32_t *supported, uint32_t *peer)
1418{
1419 struct ethtool_cmd ecmd;
1420 int error;
1421
1422 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1423 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1424 ETHTOOL_GSET, "ETHTOOL_GSET");
1425 if (error) {
1426 return error;
1427 }
1428
1429 /* Supported features. */
1430 *supported = 0;
1431 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1432 *supported |= OFPPF_10MB_HD;
1433 }
1434 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1435 *supported |= OFPPF_10MB_FD;
1436 }
1437 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1438 *supported |= OFPPF_100MB_HD;
1439 }
1440 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1441 *supported |= OFPPF_100MB_FD;
1442 }
1443 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1444 *supported |= OFPPF_1GB_HD;
1445 }
1446 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1447 *supported |= OFPPF_1GB_FD;
1448 }
1449 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1450 *supported |= OFPPF_10GB_FD;
1451 }
1452 if (ecmd.supported & SUPPORTED_TP) {
1453 *supported |= OFPPF_COPPER;
1454 }
1455 if (ecmd.supported & SUPPORTED_FIBRE) {
1456 *supported |= OFPPF_FIBER;
1457 }
1458 if (ecmd.supported & SUPPORTED_Autoneg) {
1459 *supported |= OFPPF_AUTONEG;
1460 }
1461 if (ecmd.supported & SUPPORTED_Pause) {
1462 *supported |= OFPPF_PAUSE;
1463 }
1464 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1465 *supported |= OFPPF_PAUSE_ASYM;
1466 }
1467
1468 /* Advertised features. */
1469 *advertised = 0;
1470 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1471 *advertised |= OFPPF_10MB_HD;
1472 }
1473 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1474 *advertised |= OFPPF_10MB_FD;
1475 }
1476 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1477 *advertised |= OFPPF_100MB_HD;
1478 }
1479 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1480 *advertised |= OFPPF_100MB_FD;
1481 }
1482 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1483 *advertised |= OFPPF_1GB_HD;
1484 }
1485 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1486 *advertised |= OFPPF_1GB_FD;
1487 }
1488 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1489 *advertised |= OFPPF_10GB_FD;
1490 }
1491 if (ecmd.advertising & ADVERTISED_TP) {
1492 *advertised |= OFPPF_COPPER;
1493 }
1494 if (ecmd.advertising & ADVERTISED_FIBRE) {
1495 *advertised |= OFPPF_FIBER;
1496 }
1497 if (ecmd.advertising & ADVERTISED_Autoneg) {
1498 *advertised |= OFPPF_AUTONEG;
1499 }
1500 if (ecmd.advertising & ADVERTISED_Pause) {
1501 *advertised |= OFPPF_PAUSE;
1502 }
1503 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1504 *advertised |= OFPPF_PAUSE_ASYM;
1505 }
1506
1507 /* Current settings. */
1508 if (ecmd.speed == SPEED_10) {
1509 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1510 } else if (ecmd.speed == SPEED_100) {
1511 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1512 } else if (ecmd.speed == SPEED_1000) {
1513 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1514 } else if (ecmd.speed == SPEED_10000) {
1515 *current = OFPPF_10GB_FD;
1516 } else {
1517 *current = 0;
1518 }
1519
1520 if (ecmd.port == PORT_TP) {
1521 *current |= OFPPF_COPPER;
1522 } else if (ecmd.port == PORT_FIBRE) {
1523 *current |= OFPPF_FIBER;
1524 }
1525
1526 if (ecmd.autoneg) {
1527 *current |= OFPPF_AUTONEG;
1528 }
1529
1530 /* Peer advertisements. */
1531 *peer = 0; /* XXX */
1532
1533 return 0;
1534}
1535
1536/* Set the features advertised by 'netdev' to 'advertise'. */
1537static int
1538netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1539{
1540 struct ethtool_cmd ecmd;
1541 int error;
1542
1543 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1544 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1545 ETHTOOL_GSET, "ETHTOOL_GSET");
1546 if (error) {
1547 return error;
1548 }
1549
1550 ecmd.advertising = 0;
1551 if (advertise & OFPPF_10MB_HD) {
1552 ecmd.advertising |= ADVERTISED_10baseT_Half;
1553 }
1554 if (advertise & OFPPF_10MB_FD) {
1555 ecmd.advertising |= ADVERTISED_10baseT_Full;
1556 }
1557 if (advertise & OFPPF_100MB_HD) {
1558 ecmd.advertising |= ADVERTISED_100baseT_Half;
1559 }
1560 if (advertise & OFPPF_100MB_FD) {
1561 ecmd.advertising |= ADVERTISED_100baseT_Full;
1562 }
1563 if (advertise & OFPPF_1GB_HD) {
1564 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1565 }
1566 if (advertise & OFPPF_1GB_FD) {
1567 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1568 }
1569 if (advertise & OFPPF_10GB_FD) {
1570 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1571 }
1572 if (advertise & OFPPF_COPPER) {
1573 ecmd.advertising |= ADVERTISED_TP;
1574 }
1575 if (advertise & OFPPF_FIBER) {
1576 ecmd.advertising |= ADVERTISED_FIBRE;
1577 }
1578 if (advertise & OFPPF_AUTONEG) {
1579 ecmd.advertising |= ADVERTISED_Autoneg;
1580 }
1581 if (advertise & OFPPF_PAUSE) {
1582 ecmd.advertising |= ADVERTISED_Pause;
1583 }
1584 if (advertise & OFPPF_PAUSE_ASYM) {
1585 ecmd.advertising |= ADVERTISED_Asym_Pause;
1586 }
0b0544d7 1587 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1588 ETHTOOL_SSET, "ETHTOOL_SSET");
1589}
1590
1591/* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1592 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1593 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1594 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1595 * sets '*vlan_vid' to -1. */
1596static int
1597netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1598{
1599 const char *netdev_name = netdev_get_name(netdev);
1600 struct ds line = DS_EMPTY_INITIALIZER;
1601 FILE *stream = NULL;
1602 int error;
1603 char *fn;
1604
1605 COVERAGE_INC(netdev_get_vlan_vid);
1606 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1607 stream = fopen(fn, "r");
1608 if (!stream) {
1609 error = errno;
1610 goto done;
1611 }
1612
1613 if (ds_get_line(&line, stream)) {
1614 if (ferror(stream)) {
1615 error = errno;
1616 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1617 } else {
1618 error = EPROTO;
1619 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1620 }
1621 goto done;
1622 }
1623
1624 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1625 error = EPROTO;
1626 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1627 fn, ds_cstr(&line));
1628 goto done;
1629 }
1630
1631 error = 0;
1632
1633done:
1634 free(fn);
1635 if (stream) {
1636 fclose(stream);
1637 }
1638 ds_destroy(&line);
1639 if (error) {
1640 *vlan_vid = -1;
1641 }
1642 return error;
1643}
1644
/* printf-style templates for the tc(8) commands that install ingress
 * policing.  POLICE_ADD_CMD takes the device name.  POLICE_CONFIG_CMD takes
 * the device name followed by the rate and the burst, both as uint32_t
 * (hence PRIu32, so values above INT_MAX are not printed as negative). */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %" PRIu32 "kbit burst %" PRIu32 "k mtu 65535 drop flowid :1"
8b61709d 1647
8e460221 1648/* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
6f42c8ea
BP
1649 * positive errno value.
1650 *
1651 * This function is equivalent to running
1652 * /sbin/tc qdisc del dev %s handle ffff: ingress
1653 * but it is much, much faster.
1654 */
8e460221
BP
1655static int
1656netdev_linux_remove_policing(struct netdev *netdev)
1657{
80a86fbe
BP
1658 struct netdev_dev_linux *netdev_dev =
1659 netdev_dev_linux_cast(netdev_get_dev(netdev));
8e460221 1660 const char *netdev_name = netdev_get_name(netdev);
8e460221 1661
6f42c8ea 1662 struct ofpbuf request;
6f42c8ea 1663 struct tcmsg *tcmsg;
6f42c8ea
BP
1664 int error;
1665
c1c9c9c4 1666 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
1667 if (!tcmsg) {
1668 return ENODEV;
1669 }
c1c9c9c4 1670 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
6f42c8ea
BP
1671 tcmsg->tcm_parent = TC_H_INGRESS;
1672 nl_msg_put_string(&request, TCA_KIND, "ingress");
1673 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
c1c9c9c4
BP
1674
1675 error = tc_transact(&request, NULL);
4d10512c 1676 if (error && error != ENOENT && error != EINVAL) {
6f42c8ea
BP
1677 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1678 netdev_name, strerror(error));
1679 return error;
1680 }
1681
80a86fbe
BP
1682 netdev_dev->kbits_rate = 0;
1683 netdev_dev->kbits_burst = 0;
1684 netdev_dev->cache_valid |= VALID_POLICING;
8e460221
BP
1685 return 0;
1686}
1687
8b61709d
BP
1688/* Attempts to set input rate limiting (policing) policy. */
1689static int
1690netdev_linux_set_policing(struct netdev *netdev,
1691 uint32_t kbits_rate, uint32_t kbits_burst)
1692{
80a86fbe
BP
1693 struct netdev_dev_linux *netdev_dev =
1694 netdev_dev_linux_cast(netdev_get_dev(netdev));
8b61709d
BP
1695 const char *netdev_name = netdev_get_name(netdev);
1696 char command[1024];
1697
1698 COVERAGE_INC(netdev_set_policing);
8e460221 1699
80a86fbe
BP
1700 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1701 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1702 : kbits_burst); /* Stick with user-specified value. */
1703
1704 if (netdev_dev->cache_valid & VALID_POLICING
1705 && netdev_dev->kbits_rate == kbits_rate
1706 && netdev_dev->kbits_burst == kbits_burst) {
1707 /* Assume that settings haven't changed since we last set them. */
1708 return 0;
1709 }
1710
8e460221 1711 netdev_linux_remove_policing(netdev);
8b61709d 1712 if (kbits_rate) {
8b61709d
BP
1713 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1714 if (system(command) != 0) {
1715 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1716 return -1;
1717 }
1718
1719 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1720 kbits_rate, kbits_burst);
1721 if (system(command) != 0) {
1722 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1723 netdev_name);
1724 return -1;
1725 }
80a86fbe
BP
1726
1727 netdev_dev->kbits_rate = kbits_rate;
1728 netdev_dev->kbits_burst = kbits_burst;
1729 netdev_dev->cache_valid |= VALID_POLICING;
8b61709d
BP
1730 }
1731
1732 return 0;
1733}
1734
c1c9c9c4
BP
1735static int
1736netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 1737 struct sset *types)
c1c9c9c4
BP
1738{
1739 const struct tc_ops **opsp;
1740
1741 for (opsp = tcs; *opsp != NULL; opsp++) {
1742 const struct tc_ops *ops = *opsp;
1743 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 1744 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
1745 }
1746 }
1747 return 0;
1748}
1749
1750static const struct tc_ops *
1751tc_lookup_ovs_name(const char *name)
1752{
1753 const struct tc_ops **opsp;
1754
1755 for (opsp = tcs; *opsp != NULL; opsp++) {
1756 const struct tc_ops *ops = *opsp;
1757 if (!strcmp(name, ops->ovs_name)) {
1758 return ops;
1759 }
1760 }
1761 return NULL;
1762}
1763
1764static const struct tc_ops *
1765tc_lookup_linux_name(const char *name)
1766{
1767 const struct tc_ops **opsp;
1768
1769 for (opsp = tcs; *opsp != NULL; opsp++) {
1770 const struct tc_ops *ops = *opsp;
1771 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1772 return ops;
1773 }
1774 }
1775 return NULL;
1776}
1777
93b13be8
BP
1778static struct tc_queue *
1779tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1780 size_t hash)
1781{
1782 struct netdev_dev_linux *netdev_dev =
1783 netdev_dev_linux_cast(netdev_get_dev(netdev));
1784 struct tc_queue *queue;
1785
4e8e4213 1786 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
93b13be8
BP
1787 if (queue->queue_id == queue_id) {
1788 return queue;
1789 }
1790 }
1791 return NULL;
1792}
1793
/* Returns the queue with ID 'queue_id' in 'netdev''s tc, or NULL if there is
 * none.  The lookup hash is hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1799
c1c9c9c4
BP
1800static int
1801netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1802 const char *type,
1803 struct netdev_qos_capabilities *caps)
1804{
1805 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1806 if (!ops) {
1807 return EOPNOTSUPP;
1808 }
1809 caps->n_queues = ops->n_queues;
1810 return 0;
1811}
1812
1813static int
1814netdev_linux_get_qos(const struct netdev *netdev,
1815 const char **typep, struct shash *details)
1816{
1817 struct netdev_dev_linux *netdev_dev =
1818 netdev_dev_linux_cast(netdev_get_dev(netdev));
1819 int error;
1820
1821 error = tc_query_qdisc(netdev);
1822 if (error) {
1823 return error;
1824 }
1825
1826 *typep = netdev_dev->tc->ops->ovs_name;
1827 return (netdev_dev->tc->ops->qdisc_get
1828 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1829 : 0);
1830}
1831
1832static int
1833netdev_linux_set_qos(struct netdev *netdev,
1834 const char *type, const struct shash *details)
1835{
1836 struct netdev_dev_linux *netdev_dev =
1837 netdev_dev_linux_cast(netdev_get_dev(netdev));
1838 const struct tc_ops *new_ops;
1839 int error;
1840
1841 new_ops = tc_lookup_ovs_name(type);
1842 if (!new_ops || !new_ops->tc_install) {
1843 return EOPNOTSUPP;
1844 }
1845
1846 error = tc_query_qdisc(netdev);
1847 if (error) {
1848 return error;
1849 }
1850
1851 if (new_ops == netdev_dev->tc->ops) {
1852 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1853 } else {
1854 /* Delete existing qdisc. */
1855 error = tc_del_qdisc(netdev);
1856 if (error) {
1857 return error;
1858 }
1859 assert(netdev_dev->tc == NULL);
1860
1861 /* Install new qdisc. */
1862 error = new_ops->tc_install(netdev, details);
1863 assert((error == 0) == (netdev_dev->tc != NULL));
1864
1865 return error;
1866 }
1867}
1868
1869static int
1870netdev_linux_get_queue(const struct netdev *netdev,
1871 unsigned int queue_id, struct shash *details)
1872{
1873 struct netdev_dev_linux *netdev_dev =
1874 netdev_dev_linux_cast(netdev_get_dev(netdev));
1875 int error;
1876
1877 error = tc_query_qdisc(netdev);
1878 if (error) {
1879 return error;
93b13be8
BP
1880 } else {
1881 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1882 return (queue
1883 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1884 : ENOENT);
c1c9c9c4 1885 }
c1c9c9c4
BP
1886}
1887
1888static int
1889netdev_linux_set_queue(struct netdev *netdev,
1890 unsigned int queue_id, const struct shash *details)
1891{
1892 struct netdev_dev_linux *netdev_dev =
1893 netdev_dev_linux_cast(netdev_get_dev(netdev));
1894 int error;
1895
1896 error = tc_query_qdisc(netdev);
1897 if (error) {
1898 return error;
1899 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1900 || !netdev_dev->tc->ops->class_set) {
1901 return EINVAL;
1902 }
1903
1904 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1905}
1906
1907static int
1908netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1909{
1910 struct netdev_dev_linux *netdev_dev =
1911 netdev_dev_linux_cast(netdev_get_dev(netdev));
1912 int error;
1913
1914 error = tc_query_qdisc(netdev);
1915 if (error) {
1916 return error;
1917 } else if (!netdev_dev->tc->ops->class_delete) {
1918 return EINVAL;
93b13be8
BP
1919 } else {
1920 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1921 return (queue
1922 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1923 : ENOENT);
c1c9c9c4 1924 }
c1c9c9c4
BP
1925}
1926
1927static int
1928netdev_linux_get_queue_stats(const struct netdev *netdev,
1929 unsigned int queue_id,
1930 struct netdev_queue_stats *stats)
1931{
1932 struct netdev_dev_linux *netdev_dev =
1933 netdev_dev_linux_cast(netdev_get_dev(netdev));
1934 int error;
1935
1936 error = tc_query_qdisc(netdev);
1937 if (error) {
1938 return error;
c1c9c9c4
BP
1939 } else if (!netdev_dev->tc->ops->class_get_stats) {
1940 return EOPNOTSUPP;
93b13be8
BP
1941 } else {
1942 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1943 return (queue
1944 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1945 : ENOENT);
c1c9c9c4 1946 }
c1c9c9c4
BP
1947}
1948
23a98ffe 1949static bool
c1c9c9c4
BP
1950start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1951{
1952 struct ofpbuf request;
1953 struct tcmsg *tcmsg;
1954
1955 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
1956 if (!tcmsg) {
1957 return false;
1958 }
3c4de644 1959 tcmsg->tcm_parent = 0;
c1c9c9c4
BP
1960 nl_dump_start(dump, rtnl_sock, &request);
1961 ofpbuf_uninit(&request);
23a98ffe 1962 return true;
c1c9c9c4
BP
1963}
1964
1965static int
1966netdev_linux_dump_queues(const struct netdev *netdev,
1967 netdev_dump_queues_cb *cb, void *aux)
1968{
1969 struct netdev_dev_linux *netdev_dev =
1970 netdev_dev_linux_cast(netdev_get_dev(netdev));
93b13be8 1971 struct tc_queue *queue;
c1c9c9c4
BP
1972 struct shash details;
1973 int last_error;
c1c9c9c4
BP
1974 int error;
1975
1976 error = tc_query_qdisc(netdev);
1977 if (error) {
1978 return error;
1979 } else if (!netdev_dev->tc->ops->class_get) {
1980 return EOPNOTSUPP;
1981 }
1982
1983 last_error = 0;
1984 shash_init(&details);
4e8e4213 1985 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
c1c9c9c4
BP
1986 shash_clear(&details);
1987
93b13be8 1988 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
c1c9c9c4 1989 if (!error) {
93b13be8 1990 (*cb)(queue->queue_id, &details, aux);
c1c9c9c4
BP
1991 } else {
1992 last_error = error;
1993 }
1994 }
1995 shash_destroy(&details);
1996
1997 return last_error;
1998}
1999
2000static int
2001netdev_linux_dump_queue_stats(const struct netdev *netdev,
2002 netdev_dump_queue_stats_cb *cb, void *aux)
2003{
2004 struct netdev_dev_linux *netdev_dev =
2005 netdev_dev_linux_cast(netdev_get_dev(netdev));
2006 struct nl_dump dump;
2007 struct ofpbuf msg;
2008 int last_error;
2009 int error;
2010
2011 error = tc_query_qdisc(netdev);
2012 if (error) {
2013 return error;
2014 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2015 return EOPNOTSUPP;
2016 }
2017
2018 last_error = 0;
23a98ffe
BP
2019 if (!start_queue_dump(netdev, &dump)) {
2020 return ENODEV;
2021 }
c1c9c9c4
BP
2022 while (nl_dump_next(&dump, &msg)) {
2023 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2024 if (error) {
2025 last_error = error;
2026 }
2027 }
2028
2029 error = nl_dump_done(&dump);
2030 return error ? error : last_error;
2031}
2032
8b61709d 2033static int
f1acd62b
BP
2034netdev_linux_get_in4(const struct netdev *netdev_,
2035 struct in_addr *address, struct in_addr *netmask)
8b61709d 2036{
149f577a
JG
2037 struct netdev_dev_linux *netdev_dev =
2038 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2039
2040 if (!(netdev_dev->cache_valid & VALID_IN4)) {
8b61709d
BP
2041 int error;
2042
149f577a 2043 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
8b61709d
BP
2044 SIOCGIFADDR, "SIOCGIFADDR");
2045 if (error) {
2046 return error;
2047 }
2048
149f577a 2049 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
f1acd62b
BP
2050 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2051 if (error) {
2052 return error;
2053 }
2054
149f577a 2055 netdev_dev->cache_valid |= VALID_IN4;
8b61709d 2056 }
149f577a
JG
2057 *address = netdev_dev->address;
2058 *netmask = netdev_dev->netmask;
f1acd62b 2059 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
8b61709d
BP
2060}
2061
8b61709d 2062static int
f1acd62b
BP
2063netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2064 struct in_addr netmask)
8b61709d 2065{
149f577a
JG
2066 struct netdev_dev_linux *netdev_dev =
2067 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
2068 int error;
2069
f1acd62b 2070 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2071 if (!error) {
149f577a
JG
2072 netdev_dev->cache_valid |= VALID_IN4;
2073 netdev_dev->address = address;
2074 netdev_dev->netmask = netmask;
f1acd62b 2075 if (address.s_addr != INADDR_ANY) {
8b61709d 2076 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2077 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2078 }
2079 }
2080 return error;
2081}
2082
/* Parses 'line', one line in the format of /proc/net/if_inet6: 32 hex digits
 * of IPv6 address, four hexadecimal fields that are skipped, and an interface
 * name of at most 16 characters.
 *
 * On success stores the 16 address bytes in '*in6' and the NUL-terminated
 * interface name in 'ifname', and returns true.  Returns false if 'line' does
 * not match; '*in6' and 'ifname' may then be partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    int n_fields;

    /* One two-digit hex byte.  Local to this function; undefined below so it
     * does not leak into the rest of the translation unit. */
#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &s6[0], &s6[1], &s6[2], &s6[3],
                      &s6[4], &s6[5], &s6[6], &s6[7],
                      &s6[8], &s6[9], &s6[10], &s6[11],
                      &s6[12], &s6[13], &s6[14], &s6[15],
                      ifname);
#undef X8

    /* 16 address bytes plus the interface name. */
    return n_fields == 17;
}
2098
2099/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2100 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2101static int
2102netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2103{
149f577a
JG
2104 struct netdev_dev_linux *netdev_dev =
2105 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2106 if (!(netdev_dev->cache_valid & VALID_IN6)) {
8b61709d
BP
2107 FILE *file;
2108 char line[128];
2109
149f577a 2110 netdev_dev->in6 = in6addr_any;
8b61709d
BP
2111
2112 file = fopen("/proc/net/if_inet6", "r");
2113 if (file != NULL) {
2114 const char *name = netdev_get_name(netdev_);
2115 while (fgets(line, sizeof line, file)) {
2a022368 2116 struct in6_addr in6_tmp;
8b61709d 2117 char ifname[16 + 1];
2a022368 2118 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
2119 && !strcmp(name, ifname))
2120 {
2a022368 2121 netdev_dev->in6 = in6_tmp;
8b61709d
BP
2122 break;
2123 }
2124 }
2125 fclose(file);
2126 }
149f577a 2127 netdev_dev->cache_valid |= VALID_IN6;
8b61709d 2128 }
149f577a 2129 *in6 = netdev_dev->in6;
8b61709d
BP
2130 return 0;
2131}
2132
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  All
 * bytes of '*sa' not covered by the sockaddr_in image are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2145
2146static int
2147do_set_addr(struct netdev *netdev,
2148 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2149{
2150 struct ifreq ifr;
71d7c22f 2151 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
8b61709d 2152 make_in4_sockaddr(&ifr.ifr_addr, addr);
149f577a
JG
2153
2154 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2155 ioctl_name);
8b61709d
BP
2156}
2157
2158/* Adds 'router' as a default IP gateway. */
2159static int
67a4917b 2160netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2161{
2162 struct in_addr any = { INADDR_ANY };
2163 struct rtentry rt;
2164 int error;
2165
2166 memset(&rt, 0, sizeof rt);
2167 make_in4_sockaddr(&rt.rt_dst, any);
2168 make_in4_sockaddr(&rt.rt_gateway, router);
2169 make_in4_sockaddr(&rt.rt_genmask, any);
2170 rt.rt_flags = RTF_UP | RTF_GATEWAY;
8b61709d
BP
2171 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2172 if (error) {
2173 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2174 }
2175 return error;
2176}
2177
f1acd62b
BP
2178static int
2179netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2180 char **netdev_name)
2181{
2182 static const char fn[] = "/proc/net/route";
2183 FILE *stream;
2184 char line[256];
2185 int ln;
2186
2187 *netdev_name = NULL;
2188 stream = fopen(fn, "r");
2189 if (stream == NULL) {
2190 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2191 return errno;
2192 }
2193
2194 ln = 0;
2195 while (fgets(line, sizeof line, stream)) {
2196 if (++ln >= 2) {
2197 char iface[17];
dbba996b 2198 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2199 int refcnt, metric, mtu;
2200 unsigned int flags, use, window, irtt;
2201
2202 if (sscanf(line,
2203 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2204 " %d %u %u\n",
2205 iface, &dest, &gateway, &flags, &refcnt,
2206 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2207
d295e8e9 2208 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2209 fn, ln, line);
2210 continue;
2211 }
2212 if (!(flags & RTF_UP)) {
2213 /* Skip routes that aren't up. */
2214 continue;
2215 }
2216
2217 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2218 * network byte order, so we don't need need any endian
f1acd62b
BP
2219 * conversions here. */
2220 if ((dest & mask) == (host->s_addr & mask)) {
2221 if (!gateway) {
2222 /* The host is directly reachable. */
2223 next_hop->s_addr = 0;
2224 } else {
2225 /* To reach the host, we must go through a gateway. */
2226 next_hop->s_addr = gateway;
2227 }
2228 *netdev_name = xstrdup(iface);
2229 fclose(stream);
2230 return 0;
2231 }
2232 }
2233 }
2234
2235 fclose(stream);
2236 return ENXIO;
2237}
2238
e210037e
AE
2239static int
2240netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2241{
2242 struct ethtool_drvinfo drvinfo;
2243 int error;
2244
2245 memset(&drvinfo, 0, sizeof drvinfo);
2246 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2247 (struct ethtool_cmd *)&drvinfo,
2248 ETHTOOL_GDRVINFO,
2249 "ETHTOOL_GDRVINFO");
2250 if (!error) {
2251 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2252 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2253 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2254 }
2255
2256 return error;
2257}
2258
8b61709d
BP
2259/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2260 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2261 * returns 0. Otherwise, it returns a positive errno value; in particular,
2262 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2263static int
2264netdev_linux_arp_lookup(const struct netdev *netdev,
dbba996b 2265 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
8b61709d
BP
2266{
2267 struct arpreq r;
c100e025 2268 struct sockaddr_in sin;
8b61709d
BP
2269 int retval;
2270
2271 memset(&r, 0, sizeof r);
f2cc621b 2272 memset(&sin, 0, sizeof sin);
c100e025
BP
2273 sin.sin_family = AF_INET;
2274 sin.sin_addr.s_addr = ip;
2275 sin.sin_port = 0;
2276 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2277 r.arp_ha.sa_family = ARPHRD_ETHER;
2278 r.arp_flags = 0;
71d7c22f 2279 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d
BP
2280 COVERAGE_INC(netdev_arp_lookup);
2281 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2282 if (!retval) {
2283 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2284 } else if (retval != ENXIO) {
2285 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
149f577a 2286 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
8b61709d
BP
2287 }
2288 return retval;
2289}
2290
2291static int
2292nd_to_iff_flags(enum netdev_flags nd)
2293{
2294 int iff = 0;
2295 if (nd & NETDEV_UP) {
2296 iff |= IFF_UP;
2297 }
2298 if (nd & NETDEV_PROMISC) {
2299 iff |= IFF_PROMISC;
2300 }
2301 return iff;
2302}
2303
2304static int
2305iff_to_nd_flags(int iff)
2306{
2307 enum netdev_flags nd = 0;
2308 if (iff & IFF_UP) {
2309 nd |= NETDEV_UP;
2310 }
2311 if (iff & IFF_PROMISC) {
2312 nd |= NETDEV_PROMISC;
2313 }
2314 return nd;
2315}
2316
2317static int
2318netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2319 enum netdev_flags on, enum netdev_flags *old_flagsp)
2320{
2321 int old_flags, new_flags;
2322 int error;
2323
2324 error = get_flags(netdev, &old_flags);
2325 if (!error) {
2326 *old_flagsp = iff_to_nd_flags(old_flags);
2327 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2328 if (new_flags != old_flags) {
2329 error = set_flags(netdev, new_flags);
2330 }
2331 }
2332 return error;
2333}
2334
ac4d3bcb
EJ
2335static unsigned int
2336netdev_linux_change_seq(const struct netdev *netdev)
2337{
2338 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
2339}
2340
/* Expands to a positional initializer for a 'struct netdev_class' that shares
 * the netdev_linux_* implementations.  Only the class name and the create,
 * enumerate, get_stats and set_stats members differ between users.  The order
 * of the members below must match the structure declaration. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, GET_STATS, SET_STATS) \
{                                                                       \
    NAME,                                                               \
                                                                        \
    /* Class-wide hooks. */                                             \
    netdev_linux_init,                                                  \
    netdev_linux_run,                                                   \
    netdev_linux_wait,                                                  \
                                                                        \
    /* Device lifetime and configuration. */                            \
    CREATE,                                                             \
    netdev_linux_destroy,                                               \
    NULL,                       /* get_config */                        \
    NULL,                       /* set_config */                        \
                                                                        \
    netdev_linux_open,                                                  \
    netdev_linux_close,                                                 \
                                                                        \
    ENUMERATE,                                                          \
                                                                        \
    /* Receive path. */                                                 \
    netdev_linux_listen,                                                \
    netdev_linux_recv,                                                  \
    netdev_linux_recv_wait,                                             \
    netdev_linux_drain,                                                 \
                                                                        \
    /* Transmit path. */                                                \
    netdev_linux_send,                                                  \
    netdev_linux_send_wait,                                             \
                                                                        \
    /* Link-level properties and statistics. */                         \
    netdev_linux_set_etheraddr,                                         \
    netdev_linux_get_etheraddr,                                         \
    netdev_linux_get_mtu,                                               \
    netdev_linux_set_mtu,                                               \
    netdev_linux_get_ifindex,                                           \
    netdev_linux_get_carrier,                                           \
    netdev_linux_set_miimon_interval,                                   \
    GET_STATS,                                                          \
    SET_STATS,                                                          \
                                                                        \
    netdev_linux_get_features,                                          \
    netdev_linux_set_advertisements,                                    \
    netdev_linux_get_vlan_vid,                                          \
                                                                        \
    /* Policing, QoS and queues. */                                     \
    netdev_linux_set_policing,                                          \
    netdev_linux_get_qos_types,                                         \
    netdev_linux_get_qos_capabilities,                                  \
    netdev_linux_get_qos,                                               \
    netdev_linux_set_qos,                                               \
    netdev_linux_get_queue,                                             \
    netdev_linux_set_queue,                                             \
    netdev_linux_delete_queue,                                          \
    netdev_linux_get_queue_stats,                                       \
    netdev_linux_dump_queues,                                           \
    netdev_linux_dump_queue_stats,                                      \
                                                                        \
    /* IP addressing, routing and ARP. */                               \
    netdev_linux_get_in4,                                               \
    netdev_linux_set_in4,                                               \
    netdev_linux_get_in6,                                               \
    netdev_linux_add_router,                                            \
    netdev_linux_get_next_hop,                                          \
    netdev_linux_get_status,                                            \
    netdev_linux_arp_lookup,                                            \
                                                                        \
    netdev_linux_update_flags,                                          \
                                                                        \
    netdev_linux_change_seq                                             \
}
2405
2406const struct netdev_class netdev_linux_class =
2407 NETDEV_LINUX_CLASS(
2408 "system",
2409 netdev_linux_create,
2410 netdev_linux_enumerate,
f613a0d7 2411 netdev_linux_get_stats,
98563392 2412 NULL); /* set_stats */
c3827f61
BP
2413
2414const struct netdev_class netdev_tap_class =
2415 NETDEV_LINUX_CLASS(
2416 "tap",
2417 netdev_linux_create_tap,
2418 NULL, /* enumerate */
f613a0d7 2419 netdev_pseudo_get_stats,
c3827f61
BP
2420 NULL); /* set_stats */
2421
2422const struct netdev_class netdev_internal_class =
2423 NETDEV_LINUX_CLASS(
2424 "internal",
2425 netdev_linux_create,
2426 NULL, /* enumerate */
f613a0d7 2427 netdev_pseudo_get_stats,
c3827f61 2428 netdev_vport_set_stats);
8b61709d 2429\f
/* HTB traffic control class. */

/* Number of queues offered by the HTB implementation.  Queue N corresponds to
 * HTB class 1:(N + 1), so valid class minor numbers run from 1 through
 * HTB_N_QUEUES. */
#define HTB_N_QUEUES 0xf000
8b61709d 2433
c1c9c9c4
BP
2434struct htb {
2435 struct tc tc;
2436 unsigned int max_rate; /* In bytes/s. */
2437};
8b61709d 2438
c1c9c9c4 2439struct htb_class {
93b13be8 2440 struct tc_queue tc_queue;
c1c9c9c4
BP
2441 unsigned int min_rate; /* In bytes/s. */
2442 unsigned int max_rate; /* In bytes/s. */
2443 unsigned int burst; /* In bytes. */
2444 unsigned int priority; /* Lower values are higher priorities. */
2445};
8b61709d 2446
c1c9c9c4
BP
2447static struct htb *
2448htb_get__(const struct netdev *netdev)
2449{
2450 struct netdev_dev_linux *netdev_dev =
2451 netdev_dev_linux_cast(netdev_get_dev(netdev));
2452 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2453}
2454
24045e35 2455static void
c1c9c9c4
BP
2456htb_install__(struct netdev *netdev, uint64_t max_rate)
2457{
2458 struct netdev_dev_linux *netdev_dev =
2459 netdev_dev_linux_cast(netdev_get_dev(netdev));
2460 struct htb *htb;
2461
2462 htb = xmalloc(sizeof *htb);
2463 tc_init(&htb->tc, &tc_ops_htb);
2464 htb->max_rate = max_rate;
2465
2466 netdev_dev->tc = &htb->tc;
c1c9c9c4
BP
2467}
2468
2469/* Create an HTB qdisc.
2470 *
a339aa81 2471 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
2472static int
2473htb_setup_qdisc__(struct netdev *netdev)
2474{
2475 size_t opt_offset;
2476 struct tc_htb_glob opt;
2477 struct ofpbuf request;
2478 struct tcmsg *tcmsg;
2479
2480 tc_del_qdisc(netdev);
2481
2482 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2483 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
2484 if (!tcmsg) {
2485 return ENODEV;
2486 }
c1c9c9c4
BP
2487 tcmsg->tcm_handle = tc_make_handle(1, 0);
2488 tcmsg->tcm_parent = TC_H_ROOT;
2489
2490 nl_msg_put_string(&request, TCA_KIND, "htb");
2491
2492 memset(&opt, 0, sizeof opt);
2493 opt.rate2quantum = 10;
2494 opt.version = 3;
4ecf12d5 2495 opt.defcls = 1;
c1c9c9c4
BP
2496
2497 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2498 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2499 nl_msg_end_nested(&request, opt_offset);
2500
2501 return tc_transact(&request, NULL);
2502}
2503
2504/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2505 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2506static int
2507htb_setup_class__(struct netdev *netdev, unsigned int handle,
2508 unsigned int parent, struct htb_class *class)
2509{
2510 size_t opt_offset;
2511 struct tc_htb_opt opt;
2512 struct ofpbuf request;
2513 struct tcmsg *tcmsg;
2514 int error;
2515 int mtu;
2516
9b020780
PS
2517 error = netdev_get_mtu(netdev, &mtu);
2518 if (error) {
f915f1a8
BP
2519 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2520 netdev_get_name(netdev));
9b020780 2521 return error;
f915f1a8 2522 }
c1c9c9c4
BP
2523
2524 memset(&opt, 0, sizeof opt);
2525 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2526 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2527 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2528 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2529 opt.prio = class->priority;
2530
2531 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
2532 if (!tcmsg) {
2533 return ENODEV;
2534 }
c1c9c9c4
BP
2535 tcmsg->tcm_handle = handle;
2536 tcmsg->tcm_parent = parent;
2537
2538 nl_msg_put_string(&request, TCA_KIND, "htb");
2539 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2540 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2541 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2542 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2543 nl_msg_end_nested(&request, opt_offset);
2544
2545 error = tc_transact(&request, NULL);
2546 if (error) {
2547 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2548 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2549 netdev_get_name(netdev),
2550 tc_get_major(handle), tc_get_minor(handle),
2551 tc_get_major(parent), tc_get_minor(parent),
2552 class->min_rate, class->max_rate,
2553 class->burst, class->priority, strerror(error));
2554 }
2555 return error;
2556}
2557
2558/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2559 * description of them into 'details'. The description complies with the
2560 * specification given in the vswitch database documentation for linux-htb
2561 * queue details. */
2562static int
2563htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2564{
2565 static const struct nl_policy tca_htb_policy[] = {
2566 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2567 .min_len = sizeof(struct tc_htb_opt) },
2568 };
2569
2570 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2571 const struct tc_htb_opt *htb;
2572
2573 if (!nl_parse_nested(nl_options, tca_htb_policy,
2574 attrs, ARRAY_SIZE(tca_htb_policy))) {
2575 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2576 return EPROTO;
2577 }
2578
2579 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2580 class->min_rate = htb->rate.rate;
2581 class->max_rate = htb->ceil.rate;
2582 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2583 class->priority = htb->prio;
2584 return 0;
2585}
2586
2587static int
2588htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2589 struct htb_class *options,
2590 struct netdev_queue_stats *stats)
2591{
2592 struct nlattr *nl_options;
2593 unsigned int handle;
2594 int error;
2595
2596 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2597 if (!error && queue_id) {
17ee3c1f
BP
2598 unsigned int major = tc_get_major(handle);
2599 unsigned int minor = tc_get_minor(handle);
2600 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2601 *queue_id = minor - 1;
c1c9c9c4
BP
2602 } else {
2603 error = EPROTO;
2604 }
2605 }
2606 if (!error && options) {
2607 error = htb_parse_tca_options__(nl_options, options);
2608 }
2609 return error;
2610}
2611
2612static void
2613htb_parse_qdisc_details__(struct netdev *netdev,
2614 const struct shash *details, struct htb_class *hc)
2615{
2616 const char *max_rate_s;
2617
2618 max_rate_s = shash_find_data(details, "max-rate");
2619 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2620 if (!hc->max_rate) {
2621 uint32_t current;
2622
2623 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2624 hc->max_rate = netdev_features_to_bps(current) / 8;
2625 }
2626 hc->min_rate = hc->max_rate;
2627 hc->burst = 0;
2628 hc->priority = 0;
2629}
2630
2631static int
2632htb_parse_class_details__(struct netdev *netdev,
2633 const struct shash *details, struct htb_class *hc)
2634{
2635 const struct htb *htb = htb_get__(netdev);
2636 const char *min_rate_s = shash_find_data(details, "min-rate");
2637 const char *max_rate_s = shash_find_data(details, "max-rate");
2638 const char *burst_s = shash_find_data(details, "burst");
2639 const char *priority_s = shash_find_data(details, "priority");
9b020780 2640 int mtu, error;
c1c9c9c4 2641
9b020780
PS
2642 error = netdev_get_mtu(netdev, &mtu);
2643 if (error) {
f915f1a8
BP
2644 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2645 netdev_get_name(netdev));
9b020780 2646 return error;
f915f1a8
BP
2647 }
2648
4f104611
EJ
2649 /* HTB requires at least an mtu sized min-rate to send any traffic even
2650 * on uncongested links. */
c45ab5e9 2651 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4f104611 2652 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
2653 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2654
2655 /* max-rate */
2656 hc->max_rate = (max_rate_s
2657 ? strtoull(max_rate_s, NULL, 10) / 8
2658 : htb->max_rate);
2659 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2660 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2661
2662 /* burst
2663 *
2664 * According to hints in the documentation that I've read, it is important
2665 * that 'burst' be at least as big as the largest frame that might be
2666 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2667 * but having it a bit too small is a problem. Since netdev_get_mtu()
2668 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2669 * the MTU. We actually add 64, instead of 14, as a guard against
2670 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
2671 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2672 hc->burst = MAX(hc->burst, mtu + 64);
2673
2674 /* priority */
2675 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2676
2677 return 0;
2678}
2679
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and parses
 * the reply into 'options' and 'stats' (either may be null, as accepted by
 * htb_parse_tcmsg__()).
 *
 * Returns 0 on success, otherwise the error from the query or from parsing
 * the reply. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2695
2696static int
2697htb_tc_install(struct netdev *netdev, const struct shash *details)
2698{
2699 int error;
2700
2701 error = htb_setup_qdisc__(netdev);
2702 if (!error) {
2703 struct htb_class hc;
2704
2705 htb_parse_qdisc_details__(netdev, details, &hc);
2706 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2707 tc_make_handle(1, 0), &hc);
2708 if (!error) {
2709 htb_install__(netdev, hc.max_rate);
2710 }
2711 }
2712 return error;
2713}
2714
93b13be8
BP
2715static struct htb_class *
2716htb_class_cast__(const struct tc_queue *queue)
2717{
2718 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2719}
2720
c1c9c9c4
BP
2721static void
2722htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2723 const struct htb_class *hc)
2724{
2725 struct htb *htb = htb_get__(netdev);
93b13be8
BP
2726 size_t hash = hash_int(queue_id, 0);
2727 struct tc_queue *queue;
c1c9c9c4
BP
2728 struct htb_class *hcp;
2729
93b13be8
BP
2730 queue = tc_find_queue__(netdev, queue_id, hash);
2731 if (queue) {
2732 hcp = htb_class_cast__(queue);
2733 } else {
c1c9c9c4 2734 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
2735 queue = &hcp->tc_queue;
2736 queue->queue_id = queue_id;
2737 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 2738 }
93b13be8
BP
2739
2740 hcp->min_rate = hc->min_rate;
2741 hcp->max_rate = hc->max_rate;
2742 hcp->burst = hc->burst;
2743 hcp->priority = hc->priority;
c1c9c9c4
BP
2744}
2745
2746static int
2747htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2748{
c1c9c9c4
BP
2749 struct ofpbuf msg;
2750 struct nl_dump dump;
2751 struct htb_class hc;
c1c9c9c4
BP
2752
2753 /* Get qdisc options. */
2754 hc.max_rate = 0;
2755 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 2756 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
2757
2758 /* Get queues. */
23a98ffe
BP
2759 if (!start_queue_dump(netdev, &dump)) {
2760 return ENODEV;
2761 }
c1c9c9c4
BP
2762 while (nl_dump_next(&dump, &msg)) {
2763 unsigned int queue_id;
2764
2765 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2766 htb_update_queue__(netdev, queue_id, &hc);
2767 }
2768 }
2769 nl_dump_done(&dump);
2770
2771 return 0;
2772}
2773
2774static void
2775htb_tc_destroy(struct tc *tc)
2776{
2777 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 2778 struct htb_class *hc, *next;
c1c9c9c4 2779
4e8e4213 2780 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 2781 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
2782 free(hc);
2783 }
2784 tc_destroy(tc);
2785 free(htb);
2786}
2787
2788static int
2789htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2790{
2791 const struct htb *htb = htb_get__(netdev);
2792 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2793 return 0;
2794}
2795
2796static int
2797htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2798{
2799 struct htb_class hc;
2800 int error;
2801
2802 htb_parse_qdisc_details__(netdev, details, &hc);
2803 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2804 tc_make_handle(1, 0), &hc);
2805 if (!error) {
2806 htb_get__(netdev)->max_rate = hc.max_rate;
2807 }
2808 return error;
2809}
2810
2811static int
93b13be8
BP
2812htb_class_get(const struct netdev *netdev OVS_UNUSED,
2813 const struct tc_queue *queue, struct shash *details)
c1c9c9c4 2814{
93b13be8 2815 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4
BP
2816
2817 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2818 if (hc->min_rate != hc->max_rate) {
2819 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2820 }
2821 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2822 if (hc->priority) {
2823 shash_add(details, "priority", xasprintf("%u", hc->priority));
2824 }
2825 return 0;
2826}
2827
2828static int
2829htb_class_set(struct netdev *netdev, unsigned int queue_id,
2830 const struct shash *details)
2831{
2832 struct htb_class hc;
2833 int error;
2834
2835 error = htb_parse_class_details__(netdev, details, &hc);
2836 if (error) {
2837 return error;
2838 }
2839
17ee3c1f 2840 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
2841 tc_make_handle(1, 0xfffe), &hc);
2842 if (error) {
2843 return error;
2844 }
2845
2846 htb_update_queue__(netdev, queue_id, &hc);
2847 return 0;
2848}
2849
2850static int
93b13be8 2851htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 2852{
93b13be8 2853 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2854 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
2855 int error;
2856
93b13be8 2857 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 2858 if (!error) {
93b13be8 2859 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 2860 free(hc);
c1c9c9c4
BP
2861 }
2862 return error;
2863}
2864
2865static int
93b13be8 2866htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
2867 struct netdev_queue_stats *stats)
2868{
93b13be8 2869 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
2870 tc_make_handle(1, 0xfffe), NULL, stats);
2871}
2872
2873static int
2874htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2875 const struct ofpbuf *nlmsg,
2876 netdev_dump_queue_stats_cb *cb, void *aux)
2877{
2878 struct netdev_queue_stats stats;
17ee3c1f 2879 unsigned int handle, major, minor;
c1c9c9c4
BP
2880 int error;
2881
2882 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2883 if (error) {
2884 return error;
2885 }
2886
17ee3c1f
BP
2887 major = tc_get_major(handle);
2888 minor = tc_get_minor(handle);
2889 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 2890 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
2891 }
2892 return 0;
2893}
2894
2895static const struct tc_ops tc_ops_htb = {
2896 "htb", /* linux_name */
2897 "linux-htb", /* ovs_name */
2898 HTB_N_QUEUES, /* n_queues */
2899 htb_tc_install,
2900 htb_tc_load,
2901 htb_tc_destroy,
2902 htb_qdisc_get,
2903 htb_qdisc_set,
2904 htb_class_get,
2905 htb_class_set,
2906 htb_class_delete,
2907 htb_class_get_stats,
2908 htb_class_dump_stats
2909};
2910\f
a339aa81
EJ
/* "linux-hfsc" traffic control class. */

/* Upper bound on HFSC class minor numbers accepted as queues when parsing
 * class messages. */
#define HFSC_N_QUEUES 0xf000
2914
2915struct hfsc {
2916 struct tc tc;
2917 uint32_t max_rate;
2918};
2919
2920struct hfsc_class {
2921 struct tc_queue tc_queue;
2922 uint32_t min_rate;
2923 uint32_t max_rate;
2924};
2925
2926static struct hfsc *
2927hfsc_get__(const struct netdev *netdev)
2928{
2929 struct netdev_dev_linux *netdev_dev;
2930 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2931 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2932}
2933
2934static struct hfsc_class *
2935hfsc_class_cast__(const struct tc_queue *queue)
2936{
2937 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2938}
2939
24045e35 2940static void
a339aa81
EJ
2941hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2942{
2943 struct netdev_dev_linux * netdev_dev;
2944 struct hfsc *hfsc;
2945
2946 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2947 hfsc = xmalloc(sizeof *hfsc);
2948 tc_init(&hfsc->tc, &tc_ops_hfsc);
2949 hfsc->max_rate = max_rate;
2950 netdev_dev->tc = &hfsc->tc;
a339aa81
EJ
2951}
2952
2953static void
2954hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2955 const struct hfsc_class *hc)
2956{
2957 size_t hash;
2958 struct hfsc *hfsc;
2959 struct hfsc_class *hcp;
2960 struct tc_queue *queue;
2961
2962 hfsc = hfsc_get__(netdev);
2963 hash = hash_int(queue_id, 0);
2964
2965 queue = tc_find_queue__(netdev, queue_id, hash);
2966 if (queue) {
2967 hcp = hfsc_class_cast__(queue);
2968 } else {
2969 hcp = xmalloc(sizeof *hcp);
2970 queue = &hcp->tc_queue;
2971 queue->queue_id = queue_id;
2972 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2973 }
2974
2975 hcp->min_rate = hc->min_rate;
2976 hcp->max_rate = hc->max_rate;
2977}
2978
2979static int
2980hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2981{
2982 const struct tc_service_curve *rsc, *fsc, *usc;
2983 static const struct nl_policy tca_hfsc_policy[] = {
2984 [TCA_HFSC_RSC] = {
2985 .type = NL_A_UNSPEC,
2986 .optional = false,
2987 .min_len = sizeof(struct tc_service_curve),
2988 },
2989 [TCA_HFSC_FSC] = {
2990 .type = NL_A_UNSPEC,
2991 .optional = false,
2992 .min_len = sizeof(struct tc_service_curve),
2993 },
2994 [TCA_HFSC_USC] = {
2995 .type = NL_A_UNSPEC,
2996 .optional = false,
2997 .min_len = sizeof(struct tc_service_curve),
2998 },
2999 };
3000 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3001
3002 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3003 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3004 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3005 return EPROTO;
3006 }
3007
3008 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3009 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3010 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3011
3012 if (rsc->m1 != 0 || rsc->d != 0 ||
3013 fsc->m1 != 0 || fsc->d != 0 ||
3014 usc->m1 != 0 || usc->d != 0) {
3015 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3016 "Non-linear service curves are not supported.");
3017 return EPROTO;
3018 }
3019
3020 if (rsc->m2 != fsc->m2) {
3021 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3022 "Real-time service curves are not supported ");
3023 return EPROTO;
3024 }
3025
3026 if (rsc->m2 > usc->m2) {
3027 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3028 "Min-rate service curve is greater than "
3029 "the max-rate service curve.");
3030 return EPROTO;
3031 }
3032
3033 class->min_rate = fsc->m2;
3034 class->max_rate = usc->m2;
3035 return 0;
3036}
3037
3038static int
3039hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3040 struct hfsc_class *options,
3041 struct netdev_queue_stats *stats)
3042{
3043 int error;
3044 unsigned int handle;
3045 struct nlattr *nl_options;
3046
3047 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3048 if (error) {
3049 return error;
3050 }
3051
3052 if (queue_id) {
3053 unsigned int major, minor;
3054
3055 major = tc_get_major(handle);
3056 minor = tc_get_minor(handle);
3057 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3058 *queue_id = minor - 1;
3059 } else {
3060 return EPROTO;
3061 }
3062 }
3063
3064 if (options) {
3065 error = hfsc_parse_tca_options__(nl_options, options);
3066 }
3067
3068 return error;
3069}
3070
/* Asks the kernel about the class 'handle' (with parent 'parent') on 'netdev'
 * and parses the reply into 'options' and 'stats', either of which may be
 * null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3088
3089static void
3090hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3091 struct hfsc_class *class)
3092{
3093 uint32_t max_rate;
3094 const char *max_rate_s;
3095
3096 max_rate_s = shash_find_data(details, "max-rate");
3097 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3098
3099 if (!max_rate) {
3100 uint32_t current;
3101
3102 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3103 max_rate = netdev_features_to_bps(current) / 8;
3104 }
3105
3106 class->min_rate = max_rate;
3107 class->max_rate = max_rate;
3108}
3109
3110static int
3111hfsc_parse_class_details__(struct netdev *netdev,
3112 const struct shash *details,
3113 struct hfsc_class * class)
3114{
3115 const struct hfsc *hfsc;
3116 uint32_t min_rate, max_rate;
3117 const char *min_rate_s, *max_rate_s;
3118
3119 hfsc = hfsc_get__(netdev);
3120 min_rate_s = shash_find_data(details, "min-rate");
3121 max_rate_s = shash_find_data(details, "max-rate");
3122
c45ab5e9 3123 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
79398bad 3124 min_rate = MAX(min_rate, 1);
a339aa81
EJ
3125 min_rate = MIN(min_rate, hfsc->max_rate);
3126
3127 max_rate = (max_rate_s
3128 ? strtoull(max_rate_s, NULL, 10) / 8
3129 : hfsc->max_rate);
3130 max_rate = MAX(max_rate, min_rate);
3131 max_rate = MIN(max_rate, hfsc->max_rate);
3132
3133 class->min_rate = min_rate;
3134 class->max_rate = max_rate;
3135
3136 return 0;
3137}
3138
3139/* Create an HFSC qdisc.
3140 *
3141 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3142static int
3143hfsc_setup_qdisc__(struct netdev * netdev)
3144{
3145 struct tcmsg *tcmsg;
3146 struct ofpbuf request;
3147 struct tc_hfsc_qopt opt;
3148
3149 tc_del_qdisc(netdev);
3150
3151 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3152 NLM_F_EXCL | NLM_F_CREATE, &request);
3153
3154 if (!tcmsg) {
3155 return ENODEV;
3156 }
3157
3158 tcmsg->tcm_handle = tc_make_handle(1, 0);
3159 tcmsg->tcm_parent = TC_H_ROOT;
3160
3161 memset(&opt, 0, sizeof opt);
3162 opt.defcls = 1;
3163
3164 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3165 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3166
3167 return tc_transact(&request, NULL);
3168}
3169
3170/* Create an HFSC class.
3171 *
3172 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3173 * sc rate <min_rate> ul rate <max_rate>" */
3174static int
3175hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3176 unsigned int parent, struct hfsc_class *class)
3177{
3178 int error;
3179 size_t opt_offset;
3180 struct tcmsg *tcmsg;
3181 struct ofpbuf request;
3182 struct tc_service_curve min, max;
3183
3184 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3185
3186 if (!tcmsg) {
3187 return ENODEV;
3188 }
3189
3190 tcmsg->tcm_handle = handle;
3191 tcmsg->tcm_parent = parent;
3192
3193 min.m1 = 0;
3194 min.d = 0;
3195 min.m2 = class->min_rate;
3196
3197 max.m1 = 0;
3198 max.d = 0;
3199 max.m2 = class->max_rate;
3200
3201 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3202 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3203 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3204 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3205 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3206 nl_msg_end_nested(&request, opt_offset);
3207
3208 error = tc_transact(&request, NULL);
3209 if (error) {
3210 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3211 "min-rate %ubps, max-rate %ubps (%s)",
3212 netdev_get_name(netdev),
3213 tc_get_major(handle), tc_get_minor(handle),
3214 tc_get_major(parent), tc_get_minor(parent),
3215 class->min_rate, class->max_rate, strerror(error));
3216 }
3217
3218 return error;
3219}
3220
3221static int
3222hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3223{
3224 int error;
3225 struct hfsc_class class;
3226
3227 error = hfsc_setup_qdisc__(netdev);
3228
3229 if (error) {
3230 return error;
3231 }
3232
3233 hfsc_parse_qdisc_details__(netdev, details, &class);
3234 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3235 tc_make_handle(1, 0), &class);
3236
3237 if (error) {
3238 return error;
3239 }
3240
3241 hfsc_install__(netdev, class.max_rate);
3242 return 0;
3243}
3244
3245static int
3246hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3247{
3248 struct ofpbuf msg;
a339aa81
EJ
3249 struct nl_dump dump;
3250 struct hfsc_class hc;
3251
3252 hc.max_rate = 0;
3253 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3254 hfsc_install__(netdev, hc.max_rate);
a339aa81
EJ
3255
3256 if (!start_queue_dump(netdev, &dump)) {
3257 return ENODEV;
3258 }
3259
3260 while (nl_dump_next(&dump, &msg)) {
3261 unsigned int queue_id;
3262
3263 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3264 hfsc_update_queue__(netdev, queue_id, &hc);
3265 }
3266 }
3267
3268 nl_dump_done(&dump);
3269 return 0;
3270}
3271
3272static void
3273hfsc_tc_destroy(struct tc *tc)
3274{
3275 struct hfsc *hfsc;
3276 struct hfsc_class *hc, *next;
3277
3278 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3279
3280 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3281 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3282 free(hc);
3283 }
3284
3285 tc_destroy(tc);
3286 free(hfsc);
3287}
3288
3289static int
3290hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3291{
3292 const struct hfsc *hfsc;
3293 hfsc = hfsc_get__(netdev);
3294 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3295 return 0;
3296}
3297
3298static int
3299hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3300{
3301 int error;
3302 struct hfsc_class class;
3303
3304 hfsc_parse_qdisc_details__(netdev, details, &class);
3305 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3306 tc_make_handle(1, 0), &class);
3307
3308 if (!error) {
3309 hfsc_get__(netdev)->max_rate = class.max_rate;
3310 }
3311
3312 return error;
3313}
3314
3315static int
3316hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3317 const struct tc_queue *queue, struct shash *details)
3318{
3319 const struct hfsc_class *hc;
3320
3321 hc = hfsc_class_cast__(queue);
3322 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3323 if (hc->min_rate != hc->max_rate) {
3324 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3325 }
3326 return 0;
3327}
3328
3329static int
3330hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3331 const struct shash *details)
3332{
3333 int error;
3334 struct hfsc_class class;
3335
3336 error = hfsc_parse_class_details__(netdev, details, &class);
3337 if (error) {
3338 return error;
3339 }
3340
3341 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3342 tc_make_handle(1, 0xfffe), &class);
3343 if (error) {
3344 return error;
3345 }
3346
3347 hfsc_update_queue__(netdev, queue_id, &class);
3348 return 0;
3349}
3350
3351static int
3352hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3353{
3354 int error;
3355 struct hfsc *hfsc;
3356 struct hfsc_class *hc;
3357
3358 hc = hfsc_class_cast__(queue);
3359 hfsc = hfsc_get__(netdev);
3360
3361 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3362 if (!error) {
3363 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3364 free(hc);
3365 }
3366 return error;
3367}
3368
3369static int
3370hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3371 struct netdev_queue_stats *stats)
3372{
3373 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3374 tc_make_handle(1, 0xfffe), NULL, stats);
3375}
3376
3377static int
3378hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3379 const struct ofpbuf *nlmsg,
3380 netdev_dump_queue_stats_cb *cb, void *aux)
3381{
3382 struct netdev_queue_stats stats;
3383 unsigned int handle, major, minor;
3384 int error;
3385
3386 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3387 if (error) {
3388 return error;
3389 }
3390
3391 major = tc_get_major(handle);
3392 minor = tc_get_minor(handle);
3393 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3394 (*cb)(minor - 1, &stats, aux);
3395 }
3396 return 0;
3397}
3398
3399static const struct tc_ops tc_ops_hfsc = {
3400 "hfsc", /* linux_name */
3401 "linux-hfsc", /* ovs_name */
3402 HFSC_N_QUEUES, /* n_queues */
3403 hfsc_tc_install, /* tc_install */
3404 hfsc_tc_load, /* tc_load */
3405 hfsc_tc_destroy, /* tc_destroy */
3406 hfsc_qdisc_get, /* qdisc_get */
3407 hfsc_qdisc_set, /* qdisc_set */
3408 hfsc_class_get, /* class_get */
3409 hfsc_class_set, /* class_set */
3410 hfsc_class_delete, /* class_delete */
3411 hfsc_class_get_stats, /* class_get_stats */
3412 hfsc_class_dump_stats /* class_dump_stats */
3413};
3414\f
c1c9c9c4
BP
3415/* "linux-default" traffic control class.
3416 *
3417 * This class represents the default, unnamed Linux qdisc. It corresponds to
3418 * the "" (empty string) QoS type in the OVS database. */
3419
3420static void
3421default_install__(struct netdev *netdev)
3422{
3423 struct netdev_dev_linux *netdev_dev =
3424 netdev_dev_linux_cast(netdev_get_dev(netdev));
3425 static struct tc *tc;
3426
3427 if (!tc) {
3428 tc = xmalloc(sizeof *tc);
3429 tc_init(tc, &tc_ops_default);
3430 }
3431 netdev_dev->tc = tc;
3432}
3433
3434static int
3435default_tc_install(struct netdev *netdev,
3436 const struct shash *details OVS_UNUSED)
3437{
3438 default_install__(netdev);
3439 return 0;
3440}
3441
3442static int
3443default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3444{
3445 default_install__(netdev);
3446 return 0;
3447}
3448
3449static const struct tc_ops tc_ops_default = {
3450 NULL, /* linux_name */
3451 "", /* ovs_name */
3452 0, /* n_queues */
3453 default_tc_install,
3454 default_tc_load,
3455 NULL, /* tc_destroy */
3456 NULL, /* qdisc_get */
3457 NULL, /* qdisc_set */
3458 NULL, /* class_get */
3459 NULL, /* class_set */
3460 NULL, /* class_delete */
3461 NULL, /* class_get_stats */
3462 NULL /* class_dump_stats */
3463};
3464\f
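/* "linux-other" traffic control class.
 *
 * This class stands in for a qdisc that is present in the kernel but that no
 * other traffic control class here can handle: its description could not be
 * parsed, its kind is not recognized, or the query for it failed.  It can be
 * loaded but not installed or configured. */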
3468
3469static int
3470other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3471{
3472 struct netdev_dev_linux *netdev_dev =
3473 netdev_dev_linux_cast(netdev_get_dev(netdev));
3474 static struct tc *tc;
3475
3476 if (!tc) {
3477 tc = xmalloc(sizeof *tc);
3478 tc_init(tc, &tc_ops_other);
3479 }
3480 netdev_dev->tc = tc;
3481 return 0;
3482}
3483
3484static const struct tc_ops tc_ops_other = {
3485 NULL, /* linux_name */
3486 "linux-other", /* ovs_name */
3487 0, /* n_queues */
3488 NULL, /* tc_install */
3489 other_tc_load,
3490 NULL, /* tc_destroy */
3491 NULL, /* qdisc_get */
3492 NULL, /* qdisc_set */
3493 NULL, /* class_get */
3494 NULL, /* class_set */
3495 NULL, /* class_delete */
3496 NULL, /* class_get_stats */
3497 NULL /* class_dump_stats */
3498};
3499\f
3500/* Traffic control. */
3501
3502/* Number of kernel "tc" ticks per second. */
3503static double ticks_per_s;
3504
3505/* Number of kernel "jiffies" per second. This is used for the purpose of
3506 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3507 * one jiffy's worth of data.
3508 *
3509 * There are two possibilities here:
3510 *
3511 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3512 * approximate range of 100 to 1024. That means that we really need to
3513 * make sure that the qdisc can buffer that much data.
3514 *
3515 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3516 * has finely granular timers and there's no need to fudge additional room
3517 * for buffers. (There's no extra effort needed to implement that: the
3518 * large 'buffer_hz' is used as a divisor, so practically any number will
3519 * come out as 0 in the division. Small integer results in the case of
3520 * really high dividends won't have any real effect anyhow.)
3521 */
3522static unsigned int buffer_hz;
3523
/* Combines 'major' and 'minor' into the tc handle 'major':'minor', with
 * 'major' in the upper 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
3530
/* Extracts the major number of tc 'handle', shifted down to start at bit 0. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3537
/* Extracts the minor number of tc 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3544
3545static struct tcmsg *
3546tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3547 struct ofpbuf *request)
3548{
3549 struct tcmsg *tcmsg;
3550 int ifindex;
3551 int error;
3552
3553 error = get_ifindex(netdev, &ifindex);
3554 if (error) {
3555 return NULL;
3556 }
3557
3558 ofpbuf_init(request, 512);
3559 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3560 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3561 tcmsg->tcm_family = AF_UNSPEC;
3562 tcmsg->tcm_ifindex = ifindex;
3563 /* Caller should fill in tcmsg->tcm_handle. */
3564 /* Caller should fill in tcmsg->tcm_parent. */
3565
3566 return tcmsg;
3567}
3568
3569static int
3570tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3571{
3572 int error = nl_sock_transact(rtnl_sock, request, replyp);
3573 ofpbuf_uninit(request);
3574 return error;
3575}
3576
3577static void
3578read_psched(void)
3579{
3580 /* The values in psched are not individually very meaningful, but they are
3581 * important. The tables below show some values seen in the wild.
3582 *
3583 * Some notes:
3584 *
3585 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3586 * (Before that, there are hints that it was 1000000000.)
3587 *
3588 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3589 * above.
3590 *
3591 * /proc/net/psched
3592 * -----------------------------------
3593 * [1] 000c8000 000f4240 000f4240 00000064
3594 * [2] 000003e8 00000400 000f4240 3b9aca00
3595 * [3] 000003e8 00000400 000f4240 3b9aca00
3596 * [4] 000003e8 00000400 000f4240 00000064
3597 * [5] 000003e8 00000040 000f4240 3b9aca00
3598 * [6] 000003e8 00000040 000f4240 000000f9
3599 *
3600 * a b c d ticks_per_s buffer_hz
3601 * ------- --------- ---------- ------------- ----------- -------------
3602 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3603 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3604 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3605 * [4] 1,000 1,024 1,000,000 100 976,562 100
3606 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3607 * [6] 1,000 64 1,000,000 249 15,625,000 249
3608 *
3609 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3610 * [2] 2.6.26-1-686-bigmem from Debian lenny
3611 * [3] 2.6.26-2-sparc64 from Debian lenny
3612 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3613 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3614 * [6] 2.6.34 from kernel.org on KVM
3615 */
3616 static const char fn[] = "/proc/net/psched";
3617 unsigned int a, b, c, d;
3618 FILE *stream;
3619
3620 ticks_per_s = 1.0;
3621 buffer_hz = 100;
3622
3623 stream = fopen(fn, "r");
3624 if (!stream) {
3625 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3626 return;
3627 }
3628
3629 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3630 VLOG_WARN("%s: read failed", fn);
3631 fclose(stream);
3632 return;
3633 }
3634 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3635 fclose(stream);
3636
3637 if (!a || !c) {
3638 VLOG_WARN("%s: invalid scheduler parameters", fn);
3639 return;
3640 }
3641
3642 ticks_per_s = (double) a * c / b;
3643 if (c == 1000000) {
3644 buffer_hz = d;
3645 } else {
3646 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3647 fn, a, b, c, d);
3648 }
3649 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3650}
3651
3652/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3653 * rate of 'rate' bytes per second. */
3654static unsigned int
3655tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3656{
3657 if (!buffer_hz) {
3658 read_psched();
3659 }
3660 return (rate * ticks) / ticks_per_s;
3661}
3662
3663/* Returns the number of ticks that it would take to transmit 'size' bytes at a
3664 * rate of 'rate' bytes per second. */
3665static unsigned int
3666tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3667{
3668 if (!buffer_hz) {
3669 read_psched();
3670 }
015c93a4 3671 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
3672}
3673
3674/* Returns the number of bytes that need to be reserved for qdisc buffering at
3675 * a transmission rate of 'rate' bytes per second. */
3676static unsigned int
3677tc_buffer_per_jiffy(unsigned int rate)
3678{
3679 if (!buffer_hz) {
3680 read_psched();
3681 }
3682 return rate / buffer_hz;
3683}
3684
3685/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3686 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3687 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3688 * stores NULL into it if it is absent.
3689 *
3690 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3691 * 'msg'.
3692 *
3693 * Returns 0 if successful, otherwise a positive errno value. */
3694static int
3695tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3696 struct nlattr **options)
3697{
3698 static const struct nl_policy tca_policy[] = {
3699 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3700 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3701 };
3702 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3703
3704 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3705 tca_policy, ta, ARRAY_SIZE(ta))) {
3706 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3707 goto error;
3708 }
3709
3710 if (kind) {
3711 *kind = nl_attr_get_string(ta[TCA_KIND]);
3712 }
3713
3714 if (options) {
3715 *options = ta[TCA_OPTIONS];
3716 }
3717
3718 return 0;
3719
3720error:
3721 if (kind) {
3722 *kind = NULL;
3723 }
3724 if (options) {
3725 *options = NULL;
3726 }
3727 return EPROTO;
3728}
3729
3730/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3731 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3732 * into '*options', and its queue statistics into '*stats'. Any of the output
3733 * arguments may be null.
3734 *
3735 * Returns 0 if successful, otherwise a positive errno value. */
3736static int
3737tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3738 struct nlattr **options, struct netdev_queue_stats *stats)
3739{
3740 static const struct nl_policy tca_policy[] = {
3741 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3742 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3743 };
3744 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3745
3746 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3747 tca_policy, ta, ARRAY_SIZE(ta))) {
3748 VLOG_WARN_RL(&rl, "failed to parse class message");
3749 goto error;
3750 }
3751
3752 if (handlep) {
3753 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3754 *handlep = tc->tcm_handle;
3755 }
3756
3757 if (options) {
3758 *options = ta[TCA_OPTIONS];
3759 }
3760
3761 if (stats) {
3762 const struct gnet_stats_queue *gsq;
3763 struct gnet_stats_basic gsb;
3764
3765 static const struct nl_policy stats_policy[] = {
3766 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3767 .min_len = sizeof gsb },
3768 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3769 .min_len = sizeof *gsq },
3770 };
3771 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3772
3773 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3774 sa, ARRAY_SIZE(sa))) {
3775 VLOG_WARN_RL(&rl, "failed to parse class stats");
3776 goto error;
3777 }
3778
3779 /* Alignment issues screw up the length of struct gnet_stats_basic on
3780 * some arch/bitsize combinations. Newer versions of Linux have a
3781 * struct gnet_stats_basic_packed, but we can't depend on that. The
3782 * easiest thing to do is just to make a copy. */
3783 memset(&gsb, 0, sizeof gsb);
3784 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3785 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3786 stats->tx_bytes = gsb.bytes;
3787 stats->tx_packets = gsb.packets;
3788
3789 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3790 stats->tx_errors = gsq->drops;
3791 }
3792
3793 return 0;
3794
3795error:
3796 if (options) {
3797 *options = NULL;
3798 }
3799 if (stats) {
3800 memset(stats, 0, sizeof *stats);
3801 }
3802 return EPROTO;
3803}
3804
3805/* Queries the kernel for class with identifier 'handle' and parent 'parent'
3806 * on 'netdev'. */
3807static int
3808tc_query_class(const struct netdev *netdev,
3809 unsigned int handle, unsigned int parent,
3810 struct ofpbuf **replyp)
3811{
3812 struct ofpbuf request;
3813 struct tcmsg *tcmsg;
3814 int error;
3815
3816 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
3817 if (!tcmsg) {
3818 return ENODEV;
3819 }
c1c9c9c4
BP
3820 tcmsg->tcm_handle = handle;
3821 tcmsg->tcm_parent = parent;
3822
3823 error = tc_transact(&request, replyp);
3824 if (error) {
3825 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3826 netdev_get_name(netdev),
3827 tc_get_major(handle), tc_get_minor(handle),
3828 tc_get_major(parent), tc_get_minor(parent),
3829 strerror(error));
3830 }
3831 return error;
3832}
3833
3834/* Equivalent to "tc class del dev <name> handle <handle>". */
3835static int
3836tc_delete_class(const struct netdev *netdev, unsigned int handle)
3837{
3838 struct ofpbuf request;
3839 struct tcmsg *tcmsg;
3840 int error;
3841
3842 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
3843 if (!tcmsg) {
3844 return ENODEV;
3845 }
c1c9c9c4
BP
3846 tcmsg->tcm_handle = handle;
3847 tcmsg->tcm_parent = 0;
3848
3849 error = tc_transact(&request, NULL);
3850 if (error) {
3851 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3852 netdev_get_name(netdev),
3853 tc_get_major(handle), tc_get_minor(handle),
3854 strerror(error));
3855 }
3856 return error;
3857}
3858
3859/* Equivalent to "tc qdisc del dev <name> root". */
3860static int
3861tc_del_qdisc(struct netdev *netdev)
3862{
3863 struct netdev_dev_linux *netdev_dev =
3864 netdev_dev_linux_cast(netdev_get_dev(netdev));
3865 struct ofpbuf request;
3866 struct tcmsg *tcmsg;
3867 int error;
3868
3869 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
3870 if (!tcmsg) {
3871 return ENODEV;
3872 }
c1c9c9c4
BP
3873 tcmsg->tcm_handle = tc_make_handle(1, 0);
3874 tcmsg->tcm_parent = TC_H_ROOT;
3875
3876 error = tc_transact(&request, NULL);
3877 if (error == EINVAL) {
3878 /* EINVAL probably means that the default qdisc was in use, in which
3879 * case we've accomplished our purpose. */
3880 error = 0;
3881 }
3882 if (!error && netdev_dev->tc) {
3883 if (netdev_dev->tc->ops->tc_destroy) {
3884 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3885 }
3886 netdev_dev->tc = NULL;
3887 }
3888 return error;
3889}
3890
3891/* If 'netdev''s qdisc type and parameters are not yet known, queries the
3892 * kernel to determine what they are. Returns 0 if successful, otherwise a
3893 * positive errno value. */
3894static int
3895tc_query_qdisc(const struct netdev *netdev)
3896{
3897 struct netdev_dev_linux *netdev_dev =
3898 netdev_dev_linux_cast(netdev_get_dev(netdev));
3899 struct ofpbuf request, *qdisc;
3900 const struct tc_ops *ops;
3901 struct tcmsg *tcmsg;
3902 int load_error;
3903 int error;
3904
3905 if (netdev_dev->tc) {
3906 return 0;
3907 }
3908
3909 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3910 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3911 * 2.6.35 without that fix backported to it.
3912 *
3913 * To avoid the OOPS, we must not make a request that would attempt to dump
3914 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3915 * few others. There are a few ways that I can see to do this, but most of
3916 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3917 * technique chosen here is to assume that any non-default qdisc that we
3918 * create will have a class with handle 1:0. The built-in qdiscs only have
3919 * a class with handle 0:0.
3920 *
3921 * We could check for Linux 2.6.35+ and use a more straightforward method
3922 * there. */
3923 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
3924 if (!tcmsg) {
3925 return ENODEV;
3926 }
c1c9c9c4
BP
3927 tcmsg->tcm_handle = tc_make_handle(1, 0);
3928 tcmsg->tcm_parent = 0;
3929
3930 /* Figure out what tc class to instantiate. */
3931 error = tc_transact(&request, &qdisc);
3932 if (!error) {
3933 const char *kind;
3934
3935 error = tc_parse_qdisc(qdisc, &kind, NULL);
3936 if (error) {
3937 ops = &tc_ops_other;
3938 } else {
3939 ops = tc_lookup_linux_name(kind);
3940 if (!ops) {
3941 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3942 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3943
3944 ops = &tc_ops_other;
3945 }
3946 }
3947 } else if (error == ENOENT) {
3948 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3949 * other entity that doesn't have a handle 1:0. We will assume
3950 * that it's the system default qdisc. */
3951 ops = &tc_ops_default;
3952 error = 0;
3953 } else {
3954 /* Who knows? Maybe the device got deleted. */
3955 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3956 netdev_get_name(netdev), strerror(error));
3957 ops = &tc_ops_other;
3958 }
3959
3960 /* Instantiate it. */
3961 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3962 assert((load_error == 0) == (netdev_dev->tc != NULL));
3963 ofpbuf_delete(qdisc);
3964
3965 return error ? error : load_error;
3966}
3967
3968/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3969 approximate the time to transmit packets of various lengths. For an MTU of
3970 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3971 represents two possible packet lengths; for a MTU of 513 through 1024, four
3972 possible lengths; and so on.
3973
3974 Returns, for the specified 'mtu', the number of bits that packet lengths
3975 need to be shifted right to fit within such a 256-entry table. */
3976static int
3977tc_calc_cell_log(unsigned int mtu)
3978{
3979 int cell_log;
3980
3981 if (!mtu) {
3982 mtu = ETH_PAYLOAD_MAX;
3983 }
3984 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3985
3986 for (cell_log = 0; mtu >= 256; cell_log++) {
3987 mtu >>= 1;
3988 }
3989
3990 return cell_log;
3991}
3992
3993/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3994 * of 'mtu'. */
3995static void
3996tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3997{
3998 memset(rate, 0, sizeof *rate);
3999 rate->cell_log = tc_calc_cell_log(mtu);
4000 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4001 /* rate->cell_align = 0; */ /* distro headers. */
4002 rate->mpu = ETH_TOTAL_MIN;
4003 rate->rate = Bps;
4004}
4005
4006/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4007 * attribute of the specified "type".
4008 *
4009 * See tc_calc_cell_log() above for a description of "rtab"s. */
4010static void
4011tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4012{
4013 uint32_t *rtab;
4014 unsigned int i;
4015
4016 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4017 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4018 unsigned packet_size = (i + 1) << rate->cell_log;
4019 if (packet_size < rate->mpu) {
4020 packet_size = rate->mpu;
4021 }
4022 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4023 }
4024}
4025
/* Computes the value of 'buffer' or 'cbuffer' in HTB options for a rate of
 * 'Bps' bytes per second, the given 'mtu', and a user-requested burst size of
 * 'burst_bytes'.  (Pass 0 for 'burst_bytes' if no value was requested.)
 *
 * The burst used is never less than one jiffy's worth of data at 'Bps' plus
 * 'mtu'.  The result is expressed in ticks. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > floor_bytes ? burst_bytes : floor_bytes;

    return tc_bytes_to_ticks(Bps, burst);
}
d3980822 4036\f
d3980822 4037/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 4038static void
d3980822
BP
4039netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4040 const struct rtnl_link_stats *src)
4041{
f613a0d7
PS
4042 dst->rx_packets = src->rx_packets;
4043 dst->tx_packets = src->tx_packets;
4044 dst->rx_bytes = src->rx_bytes;
4045 dst->tx_bytes = src->tx_bytes;
4046 dst->rx_errors = src->rx_errors;
4047 dst->tx_errors = src->tx_errors;
4048 dst->rx_dropped = src->rx_dropped;
4049 dst->tx_dropped = src->tx_dropped;
4050 dst->multicast = src->multicast;
4051 dst->collisions = src->collisions;
4052 dst->rx_length_errors = src->rx_length_errors;
4053 dst->rx_over_errors = src->rx_over_errors;
4054 dst->rx_crc_errors = src->rx_crc_errors;
4055 dst->rx_frame_errors = src->rx_frame_errors;
4056 dst->rx_fifo_errors = src->rx_fifo_errors;
4057 dst->rx_missed_errors = src->rx_missed_errors;
4058 dst->tx_aborted_errors = src->tx_aborted_errors;
4059 dst->tx_carrier_errors = src->tx_carrier_errors;
4060 dst->tx_fifo_errors = src->tx_fifo_errors;
4061 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4062 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
4063}
4064
c1c9c9c4
BP
4065\f
4066/* Utility functions. */
4067
4068static int
4069get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4070{
4071 /* Policy for RTNLGRP_LINK messages.
4072 *
4073 * There are *many* more fields in these messages, but currently we only
4074 * care about these fields. */
4075 static const struct nl_policy rtnlgrp_link_policy[] = {
4076 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4077 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4078 .min_len = sizeof(struct rtnl_link_stats) },
4079 };
4080
4081 struct ofpbuf request;
4082 struct ofpbuf *reply;
4083 struct ifinfomsg *ifi;
c1c9c9c4
BP
4084 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4085 int error;
4086
4087 ofpbuf_init(&request, 0);
4088 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4089 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4090 ifi->ifi_family = PF_UNSPEC;
4091 ifi->ifi_index = ifindex;
4092 error = nl_sock_transact(rtnl_sock, &request, &reply);
4093 ofpbuf_uninit(&request);
4094 if (error) {
4095 return error;
4096 }
4097
4098 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4099 rtnlgrp_link_policy,
4100 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4101 ofpbuf_delete(reply);
4102 return EPROTO;
4103 }
4104
4105 if (!attrs[IFLA_STATS]) {
4106 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4107 ofpbuf_delete(reply);
4108 return EPROTO;
4109 }
8b61709d 4110
d3980822 4111 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
8b61709d 4112
576e26d7
BP
4113 ofpbuf_delete(reply);
4114
8b61709d
BP
4115 return 0;
4116}
4117
4118static int
4119get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4120{
4121 static const char fn[] = "/proc/net/dev";
4122 char line[1024];
4123 FILE *stream;
4124 int ln;
4125
4126 stream = fopen(fn, "r");
4127 if (!stream) {
4128 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4129 return errno;
4130 }
4131
4132 ln = 0;
4133 while (fgets(line, sizeof line, stream)) {
4134 if (++ln >= 3) {
4135 char devname[16];
4136#define X64 "%"SCNu64
4137 if (sscanf(line,
4138 " %15[^:]:"
4139 X64 X64 X64 X64 X64 X64 X64 "%*u"
4140 X64 X64 X64 X64 X64 X64 X64 "%*u",
4141 devname,
4142 &stats->rx_bytes,
4143 &stats->rx_packets,
4144 &stats->rx_errors,
4145 &stats->rx_dropped,
4146 &stats->rx_fifo_errors,
4147 &stats->rx_frame_errors,
4148 &stats->multicast,
4149 &stats->tx_bytes,
4150 &stats->tx_packets,
4151 &stats->tx_errors,
4152 &stats->tx_dropped,
4153 &stats->tx_fifo_errors,
4154 &stats->collisions,
4155 &stats->tx_carrier_errors) != 15) {
4156 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4157 } else if (!strcmp(devname, netdev_name)) {
4158 stats->rx_length_errors = UINT64_MAX;
4159 stats->rx_over_errors = UINT64_MAX;
4160 stats->rx_crc_errors = UINT64_MAX;
4161 stats->rx_missed_errors = UINT64_MAX;
4162 stats->tx_aborted_errors = UINT64_MAX;
4163 stats->tx_heartbeat_errors = UINT64_MAX;
4164 stats->tx_window_errors = UINT64_MAX;
4165 fclose(stream);
4166 return 0;
4167 }
4168 }
4169 }
4170 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4171 fclose(stream);
4172 return ENODEV;
4173}
c1c9c9c4 4174
8b61709d
BP
4175static int
4176get_flags(const struct netdev *netdev, int *flags)
4177{
4178 struct ifreq ifr;
4179 int error;
4180
149f577a
JG
4181 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4182 "SIOCGIFFLAGS");
8b61709d
BP
4183 *flags = ifr.ifr_flags;
4184 return error;
4185}
4186
4187static int
4188set_flags(struct netdev *netdev, int flags)
4189{
4190 struct ifreq ifr;
4191
4192 ifr.ifr_flags = flags;
149f577a
JG
4193 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4194 "SIOCSIFFLAGS");
8b61709d
BP
4195}
4196
4197static int
4198do_get_ifindex(const char *netdev_name)
4199{
4200 struct ifreq ifr;
4201
71d7c22f 4202 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4203 COVERAGE_INC(netdev_get_ifindex);
4204 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4205 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4206 netdev_name, strerror(errno));
4207 return -errno;
4208 }
4209 return ifr.ifr_ifindex;
4210}
4211
4212static int
4213get_ifindex(const struct netdev *netdev_, int *ifindexp)
4214{
149f577a
JG
4215 struct netdev_dev_linux *netdev_dev =
4216 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 4217 *ifindexp = 0;
149f577a 4218 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
8b61709d
BP
4219 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4220 if (ifindex < 0) {
4221 return -ifindex;
4222 }
149f577a
JG
4223 netdev_dev->cache_valid |= VALID_IFINDEX;
4224 netdev_dev->ifindex = ifindex;
8b61709d 4225 }
149f577a 4226 *ifindexp = netdev_dev->ifindex;
8b61709d
BP
4227 return 0;
4228}
4229
4230static int
4231get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4232{
4233 struct ifreq ifr;
4234 int hwaddr_family;
4235
4236 memset(&ifr, 0, sizeof ifr);
71d7c22f 4237 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4238 COVERAGE_INC(netdev_get_hwaddr);
4239 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
78857dfb
BP
4240 /* ENODEV probably means that a vif disappeared asynchronously and
4241 * hasn't been removed from the database yet, so reduce the log level
4242 * to INFO for that case. */
4243 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4244 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4245 netdev_name, strerror(errno));
8b61709d
BP
4246 return errno;
4247 }
4248 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4249 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4250 VLOG_WARN("%s device has unknown hardware address family %d",
4251 netdev_name, hwaddr_family);
4252 }
4253 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4254 return 0;
4255}
4256
4257static int
4258set_etheraddr(const char *netdev_name, int hwaddr_family,
4259 const uint8_t mac[ETH_ADDR_LEN])
4260{
4261 struct ifreq ifr;
4262
4263 memset(&ifr, 0, sizeof ifr);
71d7c22f 4264 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4265 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4266 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4267 COVERAGE_INC(netdev_set_hwaddr);
4268 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4269 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4270 netdev_name, strerror(errno));
4271 return errno;
4272 }
4273 return 0;
4274}
4275
4276static int
0b0544d7 4277netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
4278 int cmd, const char *cmd_name)
4279{
4280 struct ifreq ifr;
4281
4282 memset(&ifr, 0, sizeof ifr);
71d7c22f 4283 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
4284 ifr.ifr_data = (caddr_t) ecmd;
4285
4286 ecmd->cmd = cmd;
4287 COVERAGE_INC(netdev_ethtool);
4288 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4289 return 0;
4290 } else {
4291 if (errno != EOPNOTSUPP) {
4292 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
0b0544d7 4293 "failed: %s", cmd_name, name, strerror(errno));
8b61709d
BP
4294 } else {
4295 /* The device doesn't support this operation. That's pretty
4296 * common, so there's no point in logging anything. */
4297 }
4298 return errno;
4299 }
4300}
4301
e47bd51a
JP
4302/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4303 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4304int
4305netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4306 const char *flag_name, bool enable)
4307{
4308 const char *netdev_name = netdev_get_name(netdev);
4309 struct ethtool_value evalue;
4310 uint32_t new_flags;
4311 int error;
4312
4313 memset(&evalue, 0, sizeof evalue);
4314 error = netdev_linux_do_ethtool(netdev_name,
4315 (struct ethtool_cmd *)&evalue,
4316 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4317 if (error) {
4318 return error;
4319 }
4320
4321 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4322 error = netdev_linux_do_ethtool(netdev_name,
4323 (struct ethtool_cmd *)&evalue,
4324 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4325 if (error) {
4326 return error;
4327 }
4328
4329 memset(&evalue, 0, sizeof evalue);
4330 error = netdev_linux_do_ethtool(netdev_name,
4331 (struct ethtool_cmd *)&evalue,
4332 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4333 if (error) {
4334 return error;
4335 }
4336
4337 if (new_flags != evalue.data) {
4338 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4339 "device %s failed", enable ? "enable" : "disable",
4340 flag_name, netdev_name);
4341 return EOPNOTSUPP;
4342 }
4343
4344 return 0;
4345}
4346
8b61709d 4347static int
149f577a
JG
4348netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4349 const char *cmd_name)
8b61709d 4350{
71d7c22f 4351 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
8b61709d 4352 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
149f577a
JG
4353 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4354 strerror(errno));
8b61709d
BP
4355 return errno;
4356 }
4357 return 0;
4358}
f1acd62b
BP
4359
4360static int
4361netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4362 int cmd, const char *cmd_name)
4363{
4364 struct ifreq ifr;
4365 int error;
4366
4367 ifr.ifr_addr.sa_family = AF_INET;
149f577a 4368 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b
BP
4369 if (!error) {
4370 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4371 *ip = sin->sin_addr;
4372 }
4373 return error;
4374}
488d734d
BP
4375
/* Returns an AF_PACKET raw socket or a negative errno value.
 *
 * The socket is created, and made nonblocking, on the first call.  The
 * outcome of that first attempt, whether a file descriptor or a negative
 * errno, is kept in a static variable and returned by every later call. */
static int
af_packet_sock(void)
{
    static int sock = INT_MIN;      /* INT_MIN: not attempted yet. */

    if (sock != INT_MIN) {
        return sock;
    }

    sock = socket(AF_PACKET, SOCK_RAW, 0);
    if (sock < 0) {
        sock = -errno;
        VLOG_ERR("failed to create packet socket: %s", strerror(errno));
    } else {
        set_nonblocking(sock);
    }
    return sock;
}