]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
netdev: Don't divide by zero when "linux-htb" zero min-rate is used
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
149f577a 2 * Copyright (c) 2009, 2010 Nicira Networks.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
8b61709d 18#include <assert.h>
e9e28be3 19#include <errno.h>
8b61709d
BP
20#include <fcntl.h>
21#include <arpa/inet.h>
22#include <inttypes.h>
c1c9c9c4 23#include <linux/gen_stats.h>
8b61709d 24#include <linux/if_tun.h>
a740f0de 25#include <linux/ip.h>
8b61709d
BP
26#include <linux/types.h>
27#include <linux/ethtool.h>
6f42c8ea 28#include <linux/pkt_sched.h>
e9e28be3 29#include <linux/rtnetlink.h>
8b61709d
BP
30#include <linux/sockios.h>
31#include <linux/version.h>
32#include <sys/types.h>
33#include <sys/ioctl.h>
34#include <sys/socket.h>
35#include <netpacket/packet.h>
36#include <net/ethernet.h>
37#include <net/if.h>
a740f0de 38#include <linux/if_tunnel.h>
8b61709d
BP
39#include <net/if_arp.h>
40#include <net/if_packet.h>
41#include <net/route.h>
42#include <netinet/in.h>
e9e28be3 43#include <poll.h>
8b61709d
BP
44#include <stdlib.h>
45#include <string.h>
46#include <unistd.h>
e9e28be3
BP
47
48#include "coverage.h"
8b61709d
BP
49#include "dynamic-string.h"
50#include "fatal-signal.h"
93b13be8
BP
51#include "hash.h"
52#include "hmap.h"
8b61709d 53#include "netdev-provider.h"
7fbef77a 54#include "netdev-vport.h"
e9e28be3
BP
55#include "netlink.h"
56#include "ofpbuf.h"
8b61709d
BP
57#include "openflow/openflow.h"
58#include "packets.h"
59#include "poll-loop.h"
559843ed 60#include "rtnetlink.h"
8b61709d
BP
61#include "socket-util.h"
62#include "shash.h"
63#include "svec.h"
e9e28be3 64#include "vlog.h"
5136ce49
BP
65
66VLOG_DEFINE_THIS_MODULE(netdev_linux)
8b61709d
BP
67\f
68/* These were introduced in Linux 2.6.14, so they might be missing if we have
69 * old headers. */
70#ifndef ADVERTISED_Pause
71#define ADVERTISED_Pause (1 << 13)
72#endif
73#ifndef ADVERTISED_Asym_Pause
74#define ADVERTISED_Asym_Pause (1 << 14)
75#endif
76
c1c9c9c4
BP
77/* This was introduced in Linux 2.6.25, so it might be missing if we have old
78 * headers. */
79#ifndef TC_RTAB_SIZE
80#define TC_RTAB_SIZE 1024
81#endif
82
149f577a 83static struct rtnetlink_notifier netdev_linux_cache_notifier;
46415c90 84static int cache_notifier_refcount;
8b61709d
BP
85
/* Flag bits stored in 'cache_valid' of struct netdev_dev_linux.  Each bit
 * says that the matching cached field(s) currently hold usable data. */
enum {
    VALID_IFINDEX          = 0x001,
    VALID_ETHERADDR        = 0x002,
    VALID_IN4              = 0x004,
    VALID_IN6              = 0x008,
    VALID_MTU              = 0x010,
    VALID_CARRIER          = 0x020,
    VALID_IS_PSEUDO        = 0x040,  /* Covers is_internal and is_tap. */
    VALID_POLICING         = 0x080,
    VALID_HAVE_VPORT_STATS = 0x100
};
97
149f577a
JG
/* Per-device state kept for tap devices. */
struct tap_state {
    int fd;         /* Descriptor obtained by opening the tun/tap device. */
    bool opened;    /* Set once a netdev_linux_open() caller took 'fd'. */
};
c1c9c9c4
BP
102\f
103/* Traffic control. */
104
105/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
106 * network device.
107 *
108 * Each TC implementation subclasses this with whatever additional data it
109 * needs. */
c1c9c9c4
BP
110struct tc {
111 const struct tc_ops *ops;
93b13be8
BP
112 struct hmap queues; /* Contains "struct tc_queue"s.
113 * Read by generic TC layer.
114 * Written only by TC implementation. */
115};
c1c9c9c4 116
93b13be8
BP
117/* One traffic control queue.
118 *
119 * Each TC implementation subclasses this with whatever additional data it
120 * needs. */
121struct tc_queue {
122 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
123 unsigned int queue_id; /* OpenFlow queue ID. */
c1c9c9c4
BP
124};
125
126/* A particular kind of traffic control. Each implementation generally maps to
127 * one particular Linux qdisc class.
128 *
129 * The functions below return 0 if successful or a positive errno value on
130 * failure, except where otherwise noted. All of them must be provided, except
131 * where otherwise noted. */
132struct tc_ops {
133 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
134 * This is null for tc_ops_default and tc_ops_other, for which there are no
135 * appropriate values. */
136 const char *linux_name;
137
138 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
139 const char *ovs_name;
140
141 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
142 * queues. The queues are numbered 0 through n_queues - 1. */
143 unsigned int n_queues;
144
145 /* Called to install this TC class on 'netdev'. The implementation should
146 * make the Netlink calls required to set up 'netdev' with the right qdisc
147 * and configure it according to 'details'. The implementation may assume
148 * that the current qdisc is the default; that is, there is no need for it
149 * to delete the current qdisc before installing itself.
150 *
151 * The contents of 'details' should be documented as valid for 'ovs_name'
152 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
153 * (which is built as ovs-vswitchd.conf.db(8)).
154 *
155 * This function must return 0 if and only if it sets 'netdev->tc' to an
156 * initialized 'struct tc'.
157 *
158 * (This function is null for tc_ops_other, which cannot be installed. For
159 * other TC classes it should always be nonnull.) */
160 int (*tc_install)(struct netdev *netdev, const struct shash *details);
161
162 /* Called when the netdev code determines (through a Netlink query) that
163 * this TC class's qdisc is installed on 'netdev', but we didn't install
164 * it ourselves and so don't know any of the details.
165 *
166 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
167 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
168 * implementation should parse the other attributes of 'nlmsg' as
169 * necessary to determine its configuration. If necessary it should also
170 * use Netlink queries to determine the configuration of queues on
171 * 'netdev'.
172 *
173 * This function must return 0 if and only if it sets 'netdev->tc' to an
174 * initialized 'struct tc'. */
175 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
176
177 /* Destroys the data structures allocated by the implementation as part of
178 * 'tc'. (This includes destroying 'tc->queues' by calling
179 * tc_destroy(tc).
180 *
181 * The implementation should not need to perform any Netlink calls. If
182 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
183 * (But it may not be desirable.)
184 *
185 * This function may be null if 'tc' is trivial. */
186 void (*tc_destroy)(struct tc *tc);
187
188 /* Retrieves details of 'netdev->tc' configuration into 'details'.
189 *
190 * The implementation should not need to perform any Netlink calls, because
191 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
192 * cached the configuration.
193 *
194 * The contents of 'details' should be documented as valid for 'ovs_name'
195 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
196 * (which is built as ovs-vswitchd.conf.db(8)).
197 *
198 * This function may be null if 'tc' is not configurable.
199 */
200 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
201
202 /* Reconfigures 'netdev->tc' according to 'details', performing any
203 * required Netlink calls to complete the reconfiguration.
204 *
205 * The contents of 'details' should be documented as valid for 'ovs_name'
206 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
207 * (which is built as ovs-vswitchd.conf.db(8)).
208 *
209 * This function may be null if 'tc' is not configurable.
210 */
211 int (*qdisc_set)(struct netdev *, const struct shash *details);
212
93b13be8
BP
213 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
214 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
215 *
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "Queue" table in
218 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
219 *
220 * The implementation should not need to perform any Netlink calls, because
221 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
222 * cached the queue configuration.
223 *
224 * This function may be null if 'tc' does not have queues ('n_queues' is
225 * 0). */
93b13be8 226 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
227 struct shash *details);
228
229 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
230 * 'details', perfoming any required Netlink calls to complete the
231 * reconfiguration. The caller ensures that 'queue_id' is less than
232 * 'n_queues'.
233 *
234 * The contents of 'details' should be documented as valid for 'ovs_name'
235 * in the "other_config" column in the "Queue" table in
236 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
237 *
238 * This function may be null if 'tc' does not have queues or its queues are
239 * not configurable. */
240 int (*class_set)(struct netdev *, unsigned int queue_id,
241 const struct shash *details);
242
93b13be8
BP
243 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
244 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
245 *
246 * This function may be null if 'tc' does not have queues or its queues
247 * cannot be deleted. */
93b13be8 248 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 249
93b13be8
BP
250 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
251 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
252 *
253 * On success, initializes '*stats'.
254 *
255 * This function may be null if 'tc' does not have queues or if it cannot
256 * report queue statistics. */
93b13be8
BP
257 int (*class_get_stats)(const struct netdev *netdev,
258 const struct tc_queue *queue,
c1c9c9c4
BP
259 struct netdev_queue_stats *stats);
260
261 /* Extracts queue stats from 'nlmsg', which is a response to a
262 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
263 *
264 * This function may be null if 'tc' does not have queues or if it cannot
265 * report queue statistics. */
266 int (*class_dump_stats)(const struct netdev *netdev,
267 const struct ofpbuf *nlmsg,
268 netdev_dump_queue_stats_cb *cb, void *aux);
269};
270
271static void
272tc_init(struct tc *tc, const struct tc_ops *ops)
273{
274 tc->ops = ops;
93b13be8 275 hmap_init(&tc->queues);
c1c9c9c4
BP
276}
277
278static void
279tc_destroy(struct tc *tc)
280{
93b13be8 281 hmap_destroy(&tc->queues);
c1c9c9c4
BP
282}
283
284static const struct tc_ops tc_ops_htb;
285static const struct tc_ops tc_ops_default;
286static const struct tc_ops tc_ops_other;
287
288static const struct tc_ops *tcs[] = {
289 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
290 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
291 &tc_ops_other, /* Some other qdisc. */
292 NULL
293};
149f577a 294
c1c9c9c4
BP
295static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
296static unsigned int tc_get_major(unsigned int handle);
297static unsigned int tc_get_minor(unsigned int handle);
298
299static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
300static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
301static unsigned int tc_buffer_per_jiffy(unsigned int rate);
302
303static struct tcmsg *tc_make_request(const struct netdev *, int type,
304 unsigned int flags, struct ofpbuf *);
305static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
306
307static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
308 struct nlattr **options);
309static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
310 struct nlattr **options,
311 struct netdev_queue_stats *);
312static int tc_query_class(const struct netdev *,
313 unsigned int handle, unsigned int parent,
314 struct ofpbuf **replyp);
315static int tc_delete_class(const struct netdev *, unsigned int handle);
316
317static int tc_del_qdisc(struct netdev *netdev);
318static int tc_query_qdisc(const struct netdev *netdev);
319
320static int tc_calc_cell_log(unsigned int mtu);
321static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
322static void tc_put_rtab(struct ofpbuf *, uint16_t type,
323 const struct tc_ratespec *rate);
324static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
325\f
149f577a
JG
326struct netdev_dev_linux {
327 struct netdev_dev netdev_dev;
328
8b61709d 329 struct shash_node *shash_node;
149f577a 330 unsigned int cache_valid;
8b61709d 331
8722022c
BP
332 /* The following are figured out "on demand" only. They are only valid
333 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d
BP
334 int ifindex;
335 uint8_t etheraddr[ETH_ADDR_LEN];
f1acd62b 336 struct in_addr address, netmask;
8b61709d
BP
337 struct in6_addr in6;
338 int mtu;
339 int carrier;
8722022c
BP
340 bool is_internal; /* Is this an openvswitch internal device? */
341 bool is_tap; /* Is this a tuntap device? */
80a86fbe
BP
342 uint32_t kbits_rate; /* Policing data. */
343 uint32_t kbits_burst;
7fbef77a 344 bool have_vport_stats;
c1c9c9c4 345 struct tc *tc;
149f577a
JG
346
347 union {
348 struct tap_state tap;
349 } state;
8b61709d
BP
350};
351
149f577a
JG
352struct netdev_linux {
353 struct netdev netdev;
5b7448ed 354 int fd;
149f577a 355};
8b61709d 356
8b61709d
BP
357/* An AF_INET socket (used for ioctl operations). */
358static int af_inet_sock = -1;
359
ff4ed3c9
BP
360/* A Netlink routing socket that is not subscribed to any multicast groups. */
361static struct nl_sock *rtnl_sock;
362
8b61709d
BP
363struct netdev_linux_notifier {
364 struct netdev_notifier notifier;
365 struct list node;
366};
367
368static struct shash netdev_linux_notifiers =
369 SHASH_INITIALIZER(&netdev_linux_notifiers);
46097491 370static struct rtnetlink_notifier netdev_linux_poll_notifier;
8b61709d
BP
371
372/* This is set pretty low because we probably won't learn anything from the
373 * additional log messages. */
374static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
375
15b3596a 376static int netdev_linux_init(void);
6f643e49 377
0b0544d7 378static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 379 int cmd, const char *cmd_name);
149f577a
JG
380static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
381 const char *cmd_name);
f1acd62b
BP
382static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
383 int cmd, const char *cmd_name);
8b61709d
BP
384static int get_flags(const struct netdev *, int *flagsp);
385static int set_flags(struct netdev *, int flags);
386static int do_get_ifindex(const char *netdev_name);
387static int get_ifindex(const struct netdev *, int *ifindexp);
388static int do_set_addr(struct netdev *netdev,
389 int ioctl_nr, const char *ioctl_name,
390 struct in_addr addr);
391static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
392static int set_etheraddr(const char *netdev_name, int hwaddr_family,
393 const uint8_t[ETH_ADDR_LEN]);
394static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
395static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
396
15b3596a
JG
397static bool
398is_netdev_linux_class(const struct netdev_class *netdev_class)
399{
400 return netdev_class->init == netdev_linux_init;
401}
402
149f577a
JG
403static struct netdev_dev_linux *
404netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
6c88d577 405{
15b3596a
JG
406 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
407 assert(is_netdev_linux_class(netdev_class));
408
149f577a 409 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
6c88d577
JP
410}
411
8b61709d
BP
412static struct netdev_linux *
413netdev_linux_cast(const struct netdev *netdev)
414{
15b3596a
JG
415 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
416 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
417 assert(is_netdev_linux_class(netdev_class));
418
8b61709d
BP
419 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
420}
ff4ed3c9 421\f
8b61709d
BP
422static int
423netdev_linux_init(void)
424{
425 static int status = -1;
426 if (status < 0) {
ff4ed3c9 427 /* Create AF_INET socket. */
8b61709d
BP
428 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
429 status = af_inet_sock >= 0 ? 0 : errno;
430 if (status) {
431 VLOG_ERR("failed to create inet socket: %s", strerror(status));
432 }
ff4ed3c9
BP
433
434 /* Create rtnetlink socket. */
435 if (!status) {
436 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
437 if (status) {
438 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
439 strerror(status));
440 }
441 }
8b61709d
BP
442 }
443 return status;
444}
445
/* Periodic work for this netdev provider: delegates to the rtnetlink
 * notifier's run function. */
static void
netdev_linux_run(void)
{
    rtnetlink_notifier_run();
    return;
}
451
/* Wait hook for this netdev provider: delegates to the rtnetlink notifier's
 * wait function. */
static void
netdev_linux_wait(void)
{
    rtnetlink_notifier_wait();
    return;
}
457
458static void
46097491 459netdev_linux_cache_cb(const struct rtnetlink_change *change,
67a4917b 460 void *aux OVS_UNUSED)
8b61709d 461{
149f577a 462 struct netdev_dev_linux *dev;
8b61709d 463 if (change) {
46415c90
JG
464 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
465 if (base_dev) {
15b3596a
JG
466 const struct netdev_class *netdev_class =
467 netdev_dev_get_class(base_dev);
468
469 if (is_netdev_linux_class(netdev_class)) {
470 dev = netdev_dev_linux_cast(base_dev);
471 dev->cache_valid = 0;
472 }
8b61709d
BP
473 }
474 } else {
46415c90 475 struct shash device_shash;
8b61709d 476 struct shash_node *node;
46415c90
JG
477
478 shash_init(&device_shash);
479 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
480 SHASH_FOR_EACH (node, &device_shash) {
149f577a
JG
481 dev = node->data;
482 dev->cache_valid = 0;
8b61709d 483 }
46415c90 484 shash_destroy(&device_shash);
8b61709d
BP
485 }
486}
487
149f577a 488/* Creates the netdev device of 'type' with 'name'. */
8b61709d 489static int
b8dcf5e9
BP
490netdev_linux_create_system(const struct netdev_class *class OVS_UNUSED,
491 const char *name, const struct shash *args,
492 struct netdev_dev **netdev_devp)
6c88d577 493{
149f577a
JG
494 struct netdev_dev_linux *netdev_dev;
495 int error;
6c88d577
JP
496
497 if (!shash_is_empty(args)) {
149f577a 498 VLOG_WARN("%s: arguments for system devices should be empty", name);
6c88d577
JP
499 }
500
46415c90 501 if (!cache_notifier_refcount) {
149f577a
JG
502 error = rtnetlink_notifier_register(&netdev_linux_cache_notifier,
503 netdev_linux_cache_cb, NULL);
504 if (error) {
505 return error;
506 }
507 }
46415c90 508 cache_notifier_refcount++;
6c88d577 509
149f577a 510 netdev_dev = xzalloc(sizeof *netdev_dev);
149f577a 511 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_linux_class);
46415c90 512
149f577a 513 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
514 return 0;
515}
516
5b7448ed
JG
517/* For most types of netdevs we open the device for each call of
518 * netdev_open(). However, this is not the case with tap devices,
519 * since it is only possible to open the device once. In this
520 * situation we share a single file descriptor, and consequently
521 * buffers, across all readers. Therefore once data is read it will
522 * be unavailable to other reads for tap devices. */
a740f0de 523static int
b8dcf5e9
BP
524netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
525 const char *name, const struct shash *args,
526 struct netdev_dev **netdev_devp)
a740f0de 527{
149f577a 528 struct netdev_dev_linux *netdev_dev;
a740f0de
JG
529 struct tap_state *state;
530 static const char tap_dev[] = "/dev/net/tun";
531 struct ifreq ifr;
532 int error;
533
534 if (!shash_is_empty(args)) {
149f577a 535 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
6c88d577
JP
536 }
537
149f577a
JG
538 netdev_dev = xzalloc(sizeof *netdev_dev);
539 state = &netdev_dev->state.tap;
a740f0de 540
6c88d577 541 /* Open tap device. */
149f577a
JG
542 state->fd = open(tap_dev, O_RDWR);
543 if (state->fd < 0) {
6c88d577
JP
544 error = errno;
545 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
546 goto error;
547 }
548
549 /* Create tap device. */
550 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
551 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
149f577a 552 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
6c88d577
JP
553 VLOG_WARN("%s: creating tap device failed: %s", name,
554 strerror(errno));
555 error = errno;
556 goto error;
557 }
558
559 /* Make non-blocking. */
149f577a 560 error = set_nonblocking(state->fd);
a740f0de
JG
561 if (error) {
562 goto error;
563 }
564
149f577a
JG
565 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
566 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
567 return 0;
568
569error:
149f577a 570 free(netdev_dev);
a740f0de
JG
571 return error;
572}
573
a740f0de 574static void
149f577a 575destroy_tap(struct netdev_dev_linux *netdev_dev)
a740f0de 576{
149f577a
JG
577 struct tap_state *state = &netdev_dev->state.tap;
578
579 if (state->fd >= 0) {
580 close(state->fd);
a740f0de
JG
581 }
582}
583
149f577a 584/* Destroys the netdev device 'netdev_dev_'. */
6c88d577 585static void
149f577a 586netdev_linux_destroy(struct netdev_dev *netdev_dev_)
6c88d577 587{
149f577a
JG
588 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
589 const char *type = netdev_dev_get_type(netdev_dev_);
6c88d577 590
c1c9c9c4
BP
591 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
592 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
593 }
594
149f577a 595 if (!strcmp(type, "system")) {
46415c90 596 cache_notifier_refcount--;
149f577a 597
46415c90 598 if (!cache_notifier_refcount) {
149f577a
JG
599 rtnetlink_notifier_unregister(&netdev_linux_cache_notifier);
600 }
601 } else if (!strcmp(type, "tap")) {
602 destroy_tap(netdev_dev);
6c88d577 603 }
149f577a 604
658797c8 605 free(netdev_dev);
6c88d577
JP
606}
607
8b61709d 608static int
5b7448ed 609netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
149f577a 610 struct netdev **netdevp)
8b61709d 611{
5b7448ed 612 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
8b61709d
BP
613 struct netdev_linux *netdev;
614 enum netdev_flags flags;
615 int error;
616
617 /* Allocate network device. */
ec6fde61 618 netdev = xzalloc(sizeof *netdev);
49a6a163 619 netdev->fd = -1;
5b7448ed 620 netdev_init(&netdev->netdev, netdev_dev_);
8b61709d
BP
621
622 error = netdev_get_flags(&netdev->netdev, &flags);
623 if (error == ENODEV) {
624 goto error;
625 }
626
61b999dd
JG
627 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
628 !netdev_dev->state.tap.opened) {
629
630 /* We assume that the first user of the tap device is the primary user
631 * and give them the tap FD. Subsequent users probably just expect
632 * this to be a system device so open it normally to avoid send/receive
633 * directions appearing to be reversed. */
5b7448ed 634 netdev->fd = netdev_dev->state.tap.fd;
61b999dd 635 netdev_dev->state.tap.opened = true;
5b7448ed 636 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
8b61709d
BP
637 struct sockaddr_ll sll;
638 int protocol;
639 int ifindex;
640
641 /* Create file descriptor. */
642 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
643 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
644 : ethertype);
5b7448ed
JG
645 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
646 if (netdev->fd < 0) {
8b61709d
BP
647 error = errno;
648 goto error;
649 }
8b61709d
BP
650
651 /* Set non-blocking mode. */
5b7448ed 652 error = set_nonblocking(netdev->fd);
8b61709d
BP
653 if (error) {
654 goto error;
655 }
656
657 /* Get ethernet device index. */
658 error = get_ifindex(&netdev->netdev, &ifindex);
659 if (error) {
660 goto error;
661 }
662
663 /* Bind to specific ethernet device. */
664 memset(&sll, 0, sizeof sll);
665 sll.sll_family = AF_PACKET;
666 sll.sll_ifindex = ifindex;
5b7448ed 667 if (bind(netdev->fd,
8b61709d
BP
668 (struct sockaddr *) &sll, sizeof sll) < 0) {
669 error = errno;
5b7448ed 670 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
149f577a 671 strerror(error));
8b61709d
BP
672 goto error;
673 }
674
675 /* Between the socket() and bind() calls above, the socket receives all
676 * packets of the requested type on all system interfaces. We do not
677 * want to receive that data, but there is no way to avoid it. So we
678 * must now drain out the receive queue. */
5b7448ed 679 error = drain_rcvbuf(netdev->fd);
8b61709d
BP
680 if (error) {
681 goto error;
682 }
683 }
684
685 *netdevp = &netdev->netdev;
686 return 0;
687
688error:
149f577a 689 netdev_uninit(&netdev->netdev, true);
8b61709d
BP
690 return error;
691}
692
693/* Closes and destroys 'netdev'. */
694static void
695netdev_linux_close(struct netdev *netdev_)
696{
697 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
698
49a6a163 699 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
5b7448ed 700 close(netdev->fd);
8b61709d
BP
701 }
702 free(netdev);
703}
e9e28be3 704
8b61709d
BP
/* Adds the names of all known network devices to 'svec'.
 *
 * Returns 0 on success.  If the list of interfaces cannot be obtained, logs
 * a warning and returns the errno value reported by if_nameindex(). */
static int
netdev_linux_enumerate(struct svec *svec)
{
    struct if_nameindex *names;
    size_t i;

    names = if_nameindex();
    if (!names) {
        /* Save errno before logging, which may overwrite it. */
        int error = errno;

        VLOG_WARN("could not obtain list of network device names: %s",
                  strerror(error));
        return error;
    }

    for (i = 0; names[i].if_name != NULL; i++) {
        svec_add(svec, names[i].if_name);
    }
    if_freenameindex(names);
    return 0;
}
726
727static int
728netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
729{
730 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
731
5b7448ed 732 if (netdev->fd < 0) {
8b61709d 733 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
c0e5f6ca 734 return -EAGAIN;
8b61709d
BP
735 }
736
737 for (;;) {
5b7448ed 738 ssize_t retval = read(netdev->fd, data, size);
8b61709d
BP
739 if (retval >= 0) {
740 return retval;
741 } else if (errno != EINTR) {
742 if (errno != EAGAIN) {
743 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
744 strerror(errno), netdev_get_name(netdev_));
745 }
c0e5f6ca 746 return -errno;
8b61709d
BP
747 }
748 }
749}
750
751/* Registers with the poll loop to wake up from the next call to poll_block()
752 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
753static void
754netdev_linux_recv_wait(struct netdev *netdev_)
755{
756 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed
JG
757 if (netdev->fd >= 0) {
758 poll_fd_wait(netdev->fd, POLLIN);
8b61709d
BP
759 }
760}
761
762/* Discards all packets waiting to be received from 'netdev'. */
763static int
764netdev_linux_drain(struct netdev *netdev_)
765{
766 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 767 if (netdev->fd < 0) {
8b61709d 768 return 0;
5b7448ed 769 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
8b61709d 770 struct ifreq ifr;
149f577a 771 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
8b61709d
BP
772 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
773 if (error) {
774 return error;
775 }
5b7448ed 776 drain_fd(netdev->fd, ifr.ifr_qlen);
8b61709d
BP
777 return 0;
778 } else {
5b7448ed 779 return drain_rcvbuf(netdev->fd);
8b61709d
BP
780 }
781}
782
783/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
784 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
785 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
786 * the packet is too big or too small to transmit on the device.
787 *
788 * The caller retains ownership of 'buffer' in all cases.
789 *
790 * The kernel maintains a packet transmission queue, so the caller is not
791 * expected to do additional queuing of packets. */
792static int
793netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
794{
795 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796
797 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
798 */
5b7448ed 799 if (netdev->fd < 0) {
8b61709d
BP
800 return EPIPE;
801 }
802
803 for (;;) {
5b7448ed 804 ssize_t retval = write(netdev->fd, data, size);
8b61709d
BP
805 if (retval < 0) {
806 /* The Linux AF_PACKET implementation never blocks waiting for room
807 * for packets, instead returning ENOBUFS. Translate this into
808 * EAGAIN for the caller. */
809 if (errno == ENOBUFS) {
810 return EAGAIN;
811 } else if (errno == EINTR) {
812 continue;
813 } else if (errno != EAGAIN) {
814 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
815 netdev_get_name(netdev_), strerror(errno));
816 }
817 return errno;
818 } else if (retval != size) {
819 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
820 "%zu) on %s", retval, size, netdev_get_name(netdev_));
821 return EMSGSIZE;
822 } else {
823 return 0;
824 }
825 }
826}
827
828/* Registers with the poll loop to wake up from the next call to poll_block()
829 * when the packet transmission queue has sufficient room to transmit a packet
830 * with netdev_send().
831 *
832 * The kernel maintains a packet transmission queue, so the client is not
833 * expected to do additional queuing of packets. Thus, this function is
834 * unlikely to ever be used. It is included for completeness. */
835static void
836netdev_linux_send_wait(struct netdev *netdev_)
837{
838 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 839 if (netdev->fd < 0) {
8b61709d 840 /* Nothing to do. */
5b7448ed
JG
841 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
842 poll_fd_wait(netdev->fd, POLLOUT);
8b61709d
BP
843 } else {
844 /* TAP device always accepts packets.*/
845 poll_immediate_wake();
846 }
847}
848
849/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
850 * otherwise a positive errno value. */
851static int
852netdev_linux_set_etheraddr(struct netdev *netdev_,
853 const uint8_t mac[ETH_ADDR_LEN])
854{
149f577a
JG
855 struct netdev_dev_linux *netdev_dev =
856 netdev_dev_linux_cast(netdev_get_dev(netdev_));
eb395f2e
BP
857 int error;
858
149f577a
JG
859 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
860 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
eb395f2e
BP
861 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
862 if (!error) {
149f577a
JG
863 netdev_dev->cache_valid |= VALID_ETHERADDR;
864 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e
BP
865 }
866 } else {
867 error = 0;
8b61709d
BP
868 }
869 return error;
870}
871
872/* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
873 * free the returned buffer. */
874static int
875netdev_linux_get_etheraddr(const struct netdev *netdev_,
876 uint8_t mac[ETH_ADDR_LEN])
877{
149f577a
JG
878 struct netdev_dev_linux *netdev_dev =
879 netdev_dev_linux_cast(netdev_get_dev(netdev_));
880 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
8b61709d 881 int error = get_etheraddr(netdev_get_name(netdev_),
149f577a 882 netdev_dev->etheraddr);
8b61709d
BP
883 if (error) {
884 return error;
885 }
149f577a 886 netdev_dev->cache_valid |= VALID_ETHERADDR;
8b61709d 887 }
149f577a 888 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
8b61709d
BP
889 return 0;
890}
891
892/* Returns the maximum size of transmitted (and received) packets on 'netdev',
893 * in bytes, not including the hardware header; thus, this is typically 1500
894 * bytes for Ethernet devices. */
895static int
896netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
897{
149f577a
JG
898 struct netdev_dev_linux *netdev_dev =
899 netdev_dev_linux_cast(netdev_get_dev(netdev_));
900 if (!(netdev_dev->cache_valid & VALID_MTU)) {
8b61709d
BP
901 struct ifreq ifr;
902 int error;
903
149f577a
JG
904 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
905 SIOCGIFMTU, "SIOCGIFMTU");
8b61709d
BP
906 if (error) {
907 return error;
908 }
149f577a
JG
909 netdev_dev->mtu = ifr.ifr_mtu;
910 netdev_dev->cache_valid |= VALID_MTU;
8b61709d 911 }
149f577a 912 *mtup = netdev_dev->mtu;
8b61709d
BP
913 return 0;
914}
915
9ab3d9a3
BP
/* Returns the ifindex of 'netdev' as obtained from get_ifindex().  If
 * get_ifindex() reports an error, returns that error negated instead. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
926
8b61709d
BP
927static int
928netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
929{
149f577a
JG
930 struct netdev_dev_linux *netdev_dev =
931 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
932 int error = 0;
933 char *fn = NULL;
934 int fd = -1;
935
149f577a 936 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
8b61709d
BP
937 char line[8];
938 int retval;
939
149f577a
JG
940 fn = xasprintf("/sys/class/net/%s/carrier",
941 netdev_get_name(netdev_));
8b61709d
BP
942 fd = open(fn, O_RDONLY);
943 if (fd < 0) {
944 error = errno;
945 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
946 goto exit;
947 }
948
949 retval = read(fd, line, sizeof line);
950 if (retval < 0) {
951 error = errno;
952 if (error == EINVAL) {
953 /* This is the normal return value when we try to check carrier
954 * if the network device is not up. */
955 } else {
956 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
957 }
958 goto exit;
959 } else if (retval == 0) {
960 error = EPROTO;
961 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
962 goto exit;
963 }
964
965 if (line[0] != '0' && line[0] != '1') {
966 error = EPROTO;
967 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
968 fn, line[0]);
969 goto exit;
970 }
149f577a
JG
971 netdev_dev->carrier = line[0] != '0';
972 netdev_dev->cache_valid |= VALID_CARRIER;
8b61709d 973 }
149f577a 974 *carrier = netdev_dev->carrier;
8b61709d
BP
975 error = 0;
976
977exit:
978 if (fd >= 0) {
979 close(fd);
980 }
981 free(fn);
982 return error;
983}
984
985/* Check whether we can we use RTM_GETLINK to get network device statistics.
986 * In pre-2.6.19 kernels, this was only available if wireless extensions were
987 * enabled. */
988static bool
989check_for_working_netlink_stats(void)
990{
991 /* Decide on the netdev_get_stats() implementation to use. Netlink is
992 * preferable, so if that works, we'll use it. */
993 int ifindex = do_get_ifindex("lo");
994 if (ifindex < 0) {
995 VLOG_WARN("failed to get ifindex for lo, "
996 "obtaining netdev stats from proc");
997 return false;
998 } else {
999 struct netdev_stats stats;
1000 int error = get_stats_via_netlink(ifindex, &stats);
1001 if (!error) {
1002 VLOG_DBG("obtaining netdev stats via rtnetlink");
1003 return true;
1004 } else {
1005 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1006 "via proc (you are probably running a pre-2.6.19 "
1007 "kernel)", strerror(error));
1008 return false;
1009 }
1010 }
1011}
1012
8722022c
BP
1013/* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1014static void
1015netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1016{
1017 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1018 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1019 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
d295e8e9 1020
8722022c
BP
1021 netdev_dev->is_tap = !strcmp(type, "tap");
1022 netdev_dev->is_internal = false;
1023 if (!netdev_dev->is_tap) {
1024 struct ethtool_drvinfo drvinfo;
1025 int error;
1026
1027 memset(&drvinfo, 0, sizeof drvinfo);
1028 error = netdev_linux_do_ethtool(name,
1029 (struct ethtool_cmd *)&drvinfo,
1030 ETHTOOL_GDRVINFO,
1031 "ETHTOOL_GDRVINFO");
1032
1033 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1034 netdev_dev->is_internal = true;
1035 }
1036 }
1037
1038 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1039 }
1040}
1041
92df599c
JG
/* Exchanges the values of '*a' and '*b'.  Safe when 'a' and 'b' point to the
 * same object (an XOR-based swap would zero the value in that case). */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;

    *a = *b;
    *b = tmp;
}
1049
7fbef77a 1050/* Retrieves current device stats for 'netdev'. */
8b61709d 1051static int
149f577a
JG
1052netdev_linux_get_stats(const struct netdev *netdev_,
1053 struct netdev_stats *stats)
8b61709d 1054{
149f577a
JG
1055 struct netdev_dev_linux *netdev_dev =
1056 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1057 static int use_netlink_stats = -1;
1058 int error;
1059
1060 COVERAGE_INC(netdev_get_stats);
fe6b0e03 1061
7fbef77a
JG
1062 if (netdev_dev->have_vport_stats ||
1063 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1064
1065 error = netdev_vport_get_stats(netdev_, stats);
1066 netdev_dev->have_vport_stats = !error;
1067 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
8b61709d 1068 }
8b61709d 1069
7fbef77a
JG
1070 if (!netdev_dev->have_vport_stats) {
1071 if (use_netlink_stats < 0) {
1072 use_netlink_stats = check_for_working_netlink_stats();
1073 }
1074 if (use_netlink_stats) {
1075 int ifindex;
1076
1077 error = get_ifindex(netdev_, &ifindex);
1078 if (!error) {
1079 error = get_stats_via_netlink(ifindex, stats);
1080 }
1081 } else {
1082 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
8b61709d 1083 }
8b61709d 1084 }
fe6b0e03
JG
1085
1086 /* If this port is an internal port then the transmit and receive stats
1087 * will appear to be swapped relative to the other ports since we are the
1088 * one sending the data, not a remote computer. For consistency, we swap
7fbef77a
JG
1089 * them back here. This does not apply if we are getting stats from the
1090 * vport layer because it always tracks stats from the perspective of the
1091 * switch. */
92df599c 1092 netdev_linux_update_is_pseudo(netdev_dev);
7fbef77a
JG
1093 if (!error && !netdev_dev->have_vport_stats &&
1094 (netdev_dev->is_internal || netdev_dev->is_tap)) {
92df599c
JG
1095 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1096 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1097 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1098 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1099 stats->rx_length_errors = 0;
1100 stats->rx_over_errors = 0;
1101 stats->rx_crc_errors = 0;
1102 stats->rx_frame_errors = 0;
1103 stats->rx_fifo_errors = 0;
1104 stats->rx_missed_errors = 0;
1105 stats->tx_aborted_errors = 0;
1106 stats->tx_carrier_errors = 0;
1107 stats->tx_fifo_errors = 0;
1108 stats->tx_heartbeat_errors = 0;
1109 stats->tx_window_errors = 0;
1110 }
1111
8b61709d
BP
1112 return error;
1113}
1114
1115/* Stores the features supported by 'netdev' into each of '*current',
1116 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1117 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
7671589a 1118 * successful, otherwise a positive errno value. */
8b61709d
BP
1119static int
1120netdev_linux_get_features(struct netdev *netdev,
1121 uint32_t *current, uint32_t *advertised,
1122 uint32_t *supported, uint32_t *peer)
1123{
1124 struct ethtool_cmd ecmd;
1125 int error;
1126
1127 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1128 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1129 ETHTOOL_GSET, "ETHTOOL_GSET");
1130 if (error) {
1131 return error;
1132 }
1133
1134 /* Supported features. */
1135 *supported = 0;
1136 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1137 *supported |= OFPPF_10MB_HD;
1138 }
1139 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1140 *supported |= OFPPF_10MB_FD;
1141 }
1142 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1143 *supported |= OFPPF_100MB_HD;
1144 }
1145 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1146 *supported |= OFPPF_100MB_FD;
1147 }
1148 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1149 *supported |= OFPPF_1GB_HD;
1150 }
1151 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1152 *supported |= OFPPF_1GB_FD;
1153 }
1154 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1155 *supported |= OFPPF_10GB_FD;
1156 }
1157 if (ecmd.supported & SUPPORTED_TP) {
1158 *supported |= OFPPF_COPPER;
1159 }
1160 if (ecmd.supported & SUPPORTED_FIBRE) {
1161 *supported |= OFPPF_FIBER;
1162 }
1163 if (ecmd.supported & SUPPORTED_Autoneg) {
1164 *supported |= OFPPF_AUTONEG;
1165 }
1166 if (ecmd.supported & SUPPORTED_Pause) {
1167 *supported |= OFPPF_PAUSE;
1168 }
1169 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1170 *supported |= OFPPF_PAUSE_ASYM;
1171 }
1172
1173 /* Advertised features. */
1174 *advertised = 0;
1175 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1176 *advertised |= OFPPF_10MB_HD;
1177 }
1178 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1179 *advertised |= OFPPF_10MB_FD;
1180 }
1181 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1182 *advertised |= OFPPF_100MB_HD;
1183 }
1184 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1185 *advertised |= OFPPF_100MB_FD;
1186 }
1187 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1188 *advertised |= OFPPF_1GB_HD;
1189 }
1190 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1191 *advertised |= OFPPF_1GB_FD;
1192 }
1193 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1194 *advertised |= OFPPF_10GB_FD;
1195 }
1196 if (ecmd.advertising & ADVERTISED_TP) {
1197 *advertised |= OFPPF_COPPER;
1198 }
1199 if (ecmd.advertising & ADVERTISED_FIBRE) {
1200 *advertised |= OFPPF_FIBER;
1201 }
1202 if (ecmd.advertising & ADVERTISED_Autoneg) {
1203 *advertised |= OFPPF_AUTONEG;
1204 }
1205 if (ecmd.advertising & ADVERTISED_Pause) {
1206 *advertised |= OFPPF_PAUSE;
1207 }
1208 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1209 *advertised |= OFPPF_PAUSE_ASYM;
1210 }
1211
1212 /* Current settings. */
1213 if (ecmd.speed == SPEED_10) {
1214 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1215 } else if (ecmd.speed == SPEED_100) {
1216 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1217 } else if (ecmd.speed == SPEED_1000) {
1218 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1219 } else if (ecmd.speed == SPEED_10000) {
1220 *current = OFPPF_10GB_FD;
1221 } else {
1222 *current = 0;
1223 }
1224
1225 if (ecmd.port == PORT_TP) {
1226 *current |= OFPPF_COPPER;
1227 } else if (ecmd.port == PORT_FIBRE) {
1228 *current |= OFPPF_FIBER;
1229 }
1230
1231 if (ecmd.autoneg) {
1232 *current |= OFPPF_AUTONEG;
1233 }
1234
1235 /* Peer advertisements. */
1236 *peer = 0; /* XXX */
1237
1238 return 0;
1239}
1240
1241/* Set the features advertised by 'netdev' to 'advertise'. */
1242static int
1243netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1244{
1245 struct ethtool_cmd ecmd;
1246 int error;
1247
1248 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1249 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1250 ETHTOOL_GSET, "ETHTOOL_GSET");
1251 if (error) {
1252 return error;
1253 }
1254
1255 ecmd.advertising = 0;
1256 if (advertise & OFPPF_10MB_HD) {
1257 ecmd.advertising |= ADVERTISED_10baseT_Half;
1258 }
1259 if (advertise & OFPPF_10MB_FD) {
1260 ecmd.advertising |= ADVERTISED_10baseT_Full;
1261 }
1262 if (advertise & OFPPF_100MB_HD) {
1263 ecmd.advertising |= ADVERTISED_100baseT_Half;
1264 }
1265 if (advertise & OFPPF_100MB_FD) {
1266 ecmd.advertising |= ADVERTISED_100baseT_Full;
1267 }
1268 if (advertise & OFPPF_1GB_HD) {
1269 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1270 }
1271 if (advertise & OFPPF_1GB_FD) {
1272 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1273 }
1274 if (advertise & OFPPF_10GB_FD) {
1275 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1276 }
1277 if (advertise & OFPPF_COPPER) {
1278 ecmd.advertising |= ADVERTISED_TP;
1279 }
1280 if (advertise & OFPPF_FIBER) {
1281 ecmd.advertising |= ADVERTISED_FIBRE;
1282 }
1283 if (advertise & OFPPF_AUTONEG) {
1284 ecmd.advertising |= ADVERTISED_Autoneg;
1285 }
1286 if (advertise & OFPPF_PAUSE) {
1287 ecmd.advertising |= ADVERTISED_Pause;
1288 }
1289 if (advertise & OFPPF_PAUSE_ASYM) {
1290 ecmd.advertising |= ADVERTISED_Asym_Pause;
1291 }
0b0544d7 1292 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1293 ETHTOOL_SSET, "ETHTOOL_SSET");
1294}
1295
1296/* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1297 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1298 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1299 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1300 * sets '*vlan_vid' to -1. */
1301static int
1302netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1303{
1304 const char *netdev_name = netdev_get_name(netdev);
1305 struct ds line = DS_EMPTY_INITIALIZER;
1306 FILE *stream = NULL;
1307 int error;
1308 char *fn;
1309
1310 COVERAGE_INC(netdev_get_vlan_vid);
1311 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1312 stream = fopen(fn, "r");
1313 if (!stream) {
1314 error = errno;
1315 goto done;
1316 }
1317
1318 if (ds_get_line(&line, stream)) {
1319 if (ferror(stream)) {
1320 error = errno;
1321 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1322 } else {
1323 error = EPROTO;
1324 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1325 }
1326 goto done;
1327 }
1328
1329 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1330 error = EPROTO;
1331 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1332 fn, ds_cstr(&line));
1333 goto done;
1334 }
1335
1336 error = 0;
1337
1338done:
1339 free(fn);
1340 if (stream) {
1341 fclose(stream);
1342 }
1343 ds_destroy(&line);
1344 if (error) {
1345 *vlan_vid = -1;
1346 }
1347 return error;
1348}
1349
/* Shell command templates used to configure ingress policing through tc(8).
 * POLICE_ADD_CMD takes the device name; POLICE_CONFIG_CMD takes the device
 * name followed by the rate and burst as uint32_t values (hence PRIu32). */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %"PRIu32"kbit burst %"PRIu32"k mtu 65535 drop flowid :1"
8b61709d 1352
8e460221 1353/* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
6f42c8ea
BP
1354 * positive errno value.
1355 *
1356 * This function is equivalent to running
1357 * /sbin/tc qdisc del dev %s handle ffff: ingress
1358 * but it is much, much faster.
1359 */
8e460221
BP
1360static int
1361netdev_linux_remove_policing(struct netdev *netdev)
1362{
80a86fbe
BP
1363 struct netdev_dev_linux *netdev_dev =
1364 netdev_dev_linux_cast(netdev_get_dev(netdev));
8e460221 1365 const char *netdev_name = netdev_get_name(netdev);
8e460221 1366
6f42c8ea 1367 struct ofpbuf request;
6f42c8ea 1368 struct tcmsg *tcmsg;
6f42c8ea
BP
1369 int error;
1370
c1c9c9c4
BP
1371 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1372 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
6f42c8ea
BP
1373 tcmsg->tcm_parent = TC_H_INGRESS;
1374 nl_msg_put_string(&request, TCA_KIND, "ingress");
1375 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
c1c9c9c4
BP
1376
1377 error = tc_transact(&request, NULL);
4d10512c 1378 if (error && error != ENOENT && error != EINVAL) {
6f42c8ea
BP
1379 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1380 netdev_name, strerror(error));
1381 return error;
1382 }
1383
80a86fbe
BP
1384 netdev_dev->kbits_rate = 0;
1385 netdev_dev->kbits_burst = 0;
1386 netdev_dev->cache_valid |= VALID_POLICING;
8e460221
BP
1387 return 0;
1388}
1389
8b61709d
BP
1390/* Attempts to set input rate limiting (policing) policy. */
1391static int
1392netdev_linux_set_policing(struct netdev *netdev,
1393 uint32_t kbits_rate, uint32_t kbits_burst)
1394{
80a86fbe
BP
1395 struct netdev_dev_linux *netdev_dev =
1396 netdev_dev_linux_cast(netdev_get_dev(netdev));
8b61709d
BP
1397 const char *netdev_name = netdev_get_name(netdev);
1398 char command[1024];
1399
1400 COVERAGE_INC(netdev_set_policing);
8e460221 1401
80a86fbe
BP
1402 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1403 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1404 : kbits_burst); /* Stick with user-specified value. */
1405
1406 if (netdev_dev->cache_valid & VALID_POLICING
1407 && netdev_dev->kbits_rate == kbits_rate
1408 && netdev_dev->kbits_burst == kbits_burst) {
1409 /* Assume that settings haven't changed since we last set them. */
1410 return 0;
1411 }
1412
8e460221 1413 netdev_linux_remove_policing(netdev);
8b61709d 1414 if (kbits_rate) {
8b61709d
BP
1415 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1416 if (system(command) != 0) {
1417 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1418 return -1;
1419 }
1420
1421 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1422 kbits_rate, kbits_burst);
1423 if (system(command) != 0) {
1424 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1425 netdev_name);
1426 return -1;
1427 }
80a86fbe
BP
1428
1429 netdev_dev->kbits_rate = kbits_rate;
1430 netdev_dev->kbits_burst = kbits_burst;
1431 netdev_dev->cache_valid |= VALID_POLICING;
8b61709d
BP
1432 }
1433
1434 return 0;
1435}
1436
c1c9c9c4
BP
1437static int
1438netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1439 struct svec *types)
1440{
1441 const struct tc_ops **opsp;
1442
1443 for (opsp = tcs; *opsp != NULL; opsp++) {
1444 const struct tc_ops *ops = *opsp;
1445 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1446 svec_add(types, ops->ovs_name);
1447 }
1448 }
1449 return 0;
1450}
1451
1452static const struct tc_ops *
1453tc_lookup_ovs_name(const char *name)
1454{
1455 const struct tc_ops **opsp;
1456
1457 for (opsp = tcs; *opsp != NULL; opsp++) {
1458 const struct tc_ops *ops = *opsp;
1459 if (!strcmp(name, ops->ovs_name)) {
1460 return ops;
1461 }
1462 }
1463 return NULL;
1464}
1465
1466static const struct tc_ops *
1467tc_lookup_linux_name(const char *name)
1468{
1469 const struct tc_ops **opsp;
1470
1471 for (opsp = tcs; *opsp != NULL; opsp++) {
1472 const struct tc_ops *ops = *opsp;
1473 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1474 return ops;
1475 }
1476 }
1477 return NULL;
1478}
1479
93b13be8
BP
1480static struct tc_queue *
1481tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1482 size_t hash)
1483{
1484 struct netdev_dev_linux *netdev_dev =
1485 netdev_dev_linux_cast(netdev_get_dev(netdev));
1486 struct tc_queue *queue;
1487
4e8e4213 1488 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
93b13be8
BP
1489 if (queue->queue_id == queue_id) {
1490 return queue;
1491 }
1492 }
1493 return NULL;
1494}
1495
/* Returns 'netdev''s queue with ID 'queue_id', or NULL if there is none.  The
 * bucket is chosen by hashing 'queue_id' with hash_int() and basis 0. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1501
c1c9c9c4
BP
1502static int
1503netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1504 const char *type,
1505 struct netdev_qos_capabilities *caps)
1506{
1507 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1508 if (!ops) {
1509 return EOPNOTSUPP;
1510 }
1511 caps->n_queues = ops->n_queues;
1512 return 0;
1513}
1514
1515static int
1516netdev_linux_get_qos(const struct netdev *netdev,
1517 const char **typep, struct shash *details)
1518{
1519 struct netdev_dev_linux *netdev_dev =
1520 netdev_dev_linux_cast(netdev_get_dev(netdev));
1521 int error;
1522
1523 error = tc_query_qdisc(netdev);
1524 if (error) {
1525 return error;
1526 }
1527
1528 *typep = netdev_dev->tc->ops->ovs_name;
1529 return (netdev_dev->tc->ops->qdisc_get
1530 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1531 : 0);
1532}
1533
1534static int
1535netdev_linux_set_qos(struct netdev *netdev,
1536 const char *type, const struct shash *details)
1537{
1538 struct netdev_dev_linux *netdev_dev =
1539 netdev_dev_linux_cast(netdev_get_dev(netdev));
1540 const struct tc_ops *new_ops;
1541 int error;
1542
1543 new_ops = tc_lookup_ovs_name(type);
1544 if (!new_ops || !new_ops->tc_install) {
1545 return EOPNOTSUPP;
1546 }
1547
1548 error = tc_query_qdisc(netdev);
1549 if (error) {
1550 return error;
1551 }
1552
1553 if (new_ops == netdev_dev->tc->ops) {
1554 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1555 } else {
1556 /* Delete existing qdisc. */
1557 error = tc_del_qdisc(netdev);
1558 if (error) {
1559 return error;
1560 }
1561 assert(netdev_dev->tc == NULL);
1562
1563 /* Install new qdisc. */
1564 error = new_ops->tc_install(netdev, details);
1565 assert((error == 0) == (netdev_dev->tc != NULL));
1566
1567 return error;
1568 }
1569}
1570
1571static int
1572netdev_linux_get_queue(const struct netdev *netdev,
1573 unsigned int queue_id, struct shash *details)
1574{
1575 struct netdev_dev_linux *netdev_dev =
1576 netdev_dev_linux_cast(netdev_get_dev(netdev));
1577 int error;
1578
1579 error = tc_query_qdisc(netdev);
1580 if (error) {
1581 return error;
93b13be8
BP
1582 } else {
1583 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1584 return (queue
1585 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1586 : ENOENT);
c1c9c9c4 1587 }
c1c9c9c4
BP
1588}
1589
1590static int
1591netdev_linux_set_queue(struct netdev *netdev,
1592 unsigned int queue_id, const struct shash *details)
1593{
1594 struct netdev_dev_linux *netdev_dev =
1595 netdev_dev_linux_cast(netdev_get_dev(netdev));
1596 int error;
1597
1598 error = tc_query_qdisc(netdev);
1599 if (error) {
1600 return error;
1601 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1602 || !netdev_dev->tc->ops->class_set) {
1603 return EINVAL;
1604 }
1605
1606 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1607}
1608
1609static int
1610netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1611{
1612 struct netdev_dev_linux *netdev_dev =
1613 netdev_dev_linux_cast(netdev_get_dev(netdev));
1614 int error;
1615
1616 error = tc_query_qdisc(netdev);
1617 if (error) {
1618 return error;
1619 } else if (!netdev_dev->tc->ops->class_delete) {
1620 return EINVAL;
93b13be8
BP
1621 } else {
1622 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1623 return (queue
1624 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1625 : ENOENT);
c1c9c9c4 1626 }
c1c9c9c4
BP
1627}
1628
1629static int
1630netdev_linux_get_queue_stats(const struct netdev *netdev,
1631 unsigned int queue_id,
1632 struct netdev_queue_stats *stats)
1633{
1634 struct netdev_dev_linux *netdev_dev =
1635 netdev_dev_linux_cast(netdev_get_dev(netdev));
1636 int error;
1637
1638 error = tc_query_qdisc(netdev);
1639 if (error) {
1640 return error;
c1c9c9c4
BP
1641 } else if (!netdev_dev->tc->ops->class_get_stats) {
1642 return EOPNOTSUPP;
93b13be8
BP
1643 } else {
1644 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1645 return (queue
1646 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1647 : ENOENT);
c1c9c9c4 1648 }
c1c9c9c4
BP
1649}
1650
1651static void
1652start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1653{
1654 struct ofpbuf request;
1655 struct tcmsg *tcmsg;
1656
1657 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
3c4de644 1658 tcmsg->tcm_parent = 0;
c1c9c9c4
BP
1659 nl_dump_start(dump, rtnl_sock, &request);
1660 ofpbuf_uninit(&request);
1661}
1662
1663static int
1664netdev_linux_dump_queues(const struct netdev *netdev,
1665 netdev_dump_queues_cb *cb, void *aux)
1666{
1667 struct netdev_dev_linux *netdev_dev =
1668 netdev_dev_linux_cast(netdev_get_dev(netdev));
93b13be8 1669 struct tc_queue *queue;
c1c9c9c4
BP
1670 struct shash details;
1671 int last_error;
c1c9c9c4
BP
1672 int error;
1673
1674 error = tc_query_qdisc(netdev);
1675 if (error) {
1676 return error;
1677 } else if (!netdev_dev->tc->ops->class_get) {
1678 return EOPNOTSUPP;
1679 }
1680
1681 last_error = 0;
1682 shash_init(&details);
4e8e4213 1683 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
c1c9c9c4
BP
1684 shash_clear(&details);
1685
93b13be8 1686 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
c1c9c9c4 1687 if (!error) {
93b13be8 1688 (*cb)(queue->queue_id, &details, aux);
c1c9c9c4
BP
1689 } else {
1690 last_error = error;
1691 }
1692 }
1693 shash_destroy(&details);
1694
1695 return last_error;
1696}
1697
1698static int
1699netdev_linux_dump_queue_stats(const struct netdev *netdev,
1700 netdev_dump_queue_stats_cb *cb, void *aux)
1701{
1702 struct netdev_dev_linux *netdev_dev =
1703 netdev_dev_linux_cast(netdev_get_dev(netdev));
1704 struct nl_dump dump;
1705 struct ofpbuf msg;
1706 int last_error;
1707 int error;
1708
1709 error = tc_query_qdisc(netdev);
1710 if (error) {
1711 return error;
1712 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1713 return EOPNOTSUPP;
1714 }
1715
1716 last_error = 0;
1717 start_queue_dump(netdev, &dump);
1718 while (nl_dump_next(&dump, &msg)) {
1719 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1720 if (error) {
1721 last_error = error;
1722 }
1723 }
1724
1725 error = nl_dump_done(&dump);
1726 return error ? error : last_error;
1727}
1728
8b61709d 1729static int
f1acd62b
BP
1730netdev_linux_get_in4(const struct netdev *netdev_,
1731 struct in_addr *address, struct in_addr *netmask)
8b61709d 1732{
149f577a
JG
1733 struct netdev_dev_linux *netdev_dev =
1734 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1735
1736 if (!(netdev_dev->cache_valid & VALID_IN4)) {
8b61709d
BP
1737 int error;
1738
149f577a 1739 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
8b61709d
BP
1740 SIOCGIFADDR, "SIOCGIFADDR");
1741 if (error) {
1742 return error;
1743 }
1744
149f577a 1745 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
f1acd62b
BP
1746 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1747 if (error) {
1748 return error;
1749 }
1750
149f577a 1751 netdev_dev->cache_valid |= VALID_IN4;
8b61709d 1752 }
149f577a
JG
1753 *address = netdev_dev->address;
1754 *netmask = netdev_dev->netmask;
f1acd62b 1755 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
8b61709d
BP
1756}
1757
8b61709d 1758static int
f1acd62b
BP
1759netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1760 struct in_addr netmask)
8b61709d 1761{
149f577a
JG
1762 struct netdev_dev_linux *netdev_dev =
1763 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1764 int error;
1765
f1acd62b 1766 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 1767 if (!error) {
149f577a
JG
1768 netdev_dev->cache_valid |= VALID_IN4;
1769 netdev_dev->address = address;
1770 netdev_dev->netmask = netmask;
f1acd62b 1771 if (address.s_addr != INADDR_ANY) {
8b61709d 1772 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 1773 "SIOCSIFNETMASK", netmask);
8b61709d
BP
1774 }
1775 }
1776 return error;
1777}
1778
/* Parses 'line' as 16 two-digit hex bytes, four hex fields that are
 * discarded, and an interface name of at most 16 characters.  Stores the
 * bytes in 'in6' and the name in 'ifname'.  Returns true only if all 17
 * stored fields were converted. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *bytes = in6->s6_addr;
    int n_fields;

#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &bytes[0], &bytes[1], &bytes[2], &bytes[3],
                      &bytes[4], &bytes[5], &bytes[6], &bytes[7],
                      &bytes[8], &bytes[9], &bytes[10], &bytes[11],
                      &bytes[12], &bytes[13], &bytes[14], &bytes[15],
                      ifname);
    return n_fields == 17;
}
1794
1795/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1796 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1797static int
1798netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1799{
149f577a
JG
1800 struct netdev_dev_linux *netdev_dev =
1801 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1802 if (!(netdev_dev->cache_valid & VALID_IN6)) {
8b61709d
BP
1803 FILE *file;
1804 char line[128];
1805
149f577a 1806 netdev_dev->in6 = in6addr_any;
8b61709d
BP
1807
1808 file = fopen("/proc/net/if_inet6", "r");
1809 if (file != NULL) {
1810 const char *name = netdev_get_name(netdev_);
1811 while (fgets(line, sizeof line, file)) {
2a022368 1812 struct in6_addr in6_tmp;
8b61709d 1813 char ifname[16 + 1];
2a022368 1814 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
1815 && !strcmp(name, ifname))
1816 {
2a022368 1817 netdev_dev->in6 = in6_tmp;
8b61709d
BP
1818 break;
1819 }
1820 }
1821 fclose(file);
1822 }
149f577a 1823 netdev_dev->cache_valid |= VALID_IN6;
8b61709d 1824 }
149f577a 1825 *in6 = netdev_dev->in6;
8b61709d
BP
1826 return 0;
1827}
1828
/* Fills '*sa' with an AF_INET socket address for 'addr' and port 0.  All
 * remaining bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
1841
1842static int
1843do_set_addr(struct netdev *netdev,
1844 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1845{
1846 struct ifreq ifr;
149f577a 1847 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
8b61709d 1848 make_in4_sockaddr(&ifr.ifr_addr, addr);
149f577a
JG
1849
1850 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1851 ioctl_name);
8b61709d
BP
1852}
1853
1854/* Adds 'router' as a default IP gateway. */
1855static int
67a4917b 1856netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
1857{
1858 struct in_addr any = { INADDR_ANY };
1859 struct rtentry rt;
1860 int error;
1861
1862 memset(&rt, 0, sizeof rt);
1863 make_in4_sockaddr(&rt.rt_dst, any);
1864 make_in4_sockaddr(&rt.rt_gateway, router);
1865 make_in4_sockaddr(&rt.rt_genmask, any);
1866 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1867 COVERAGE_INC(netdev_add_router);
1868 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1869 if (error) {
1870 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1871 }
1872 return error;
1873}
1874
f1acd62b
BP
1875static int
1876netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1877 char **netdev_name)
1878{
1879 static const char fn[] = "/proc/net/route";
1880 FILE *stream;
1881 char line[256];
1882 int ln;
1883
1884 *netdev_name = NULL;
1885 stream = fopen(fn, "r");
1886 if (stream == NULL) {
1887 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1888 return errno;
1889 }
1890
1891 ln = 0;
1892 while (fgets(line, sizeof line, stream)) {
1893 if (++ln >= 2) {
1894 char iface[17];
1895 uint32_t dest, gateway, mask;
1896 int refcnt, metric, mtu;
1897 unsigned int flags, use, window, irtt;
1898
1899 if (sscanf(line,
1900 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1901 " %d %u %u\n",
1902 iface, &dest, &gateway, &flags, &refcnt,
1903 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1904
d295e8e9 1905 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
1906 fn, ln, line);
1907 continue;
1908 }
1909 if (!(flags & RTF_UP)) {
1910 /* Skip routes that aren't up. */
1911 continue;
1912 }
1913
1914 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 1915 * network byte order, so we don't need need any endian
f1acd62b
BP
1916 * conversions here. */
1917 if ((dest & mask) == (host->s_addr & mask)) {
1918 if (!gateway) {
1919 /* The host is directly reachable. */
1920 next_hop->s_addr = 0;
1921 } else {
1922 /* To reach the host, we must go through a gateway. */
1923 next_hop->s_addr = gateway;
1924 }
1925 *netdev_name = xstrdup(iface);
1926 fclose(stream);
1927 return 0;
1928 }
1929 }
1930 }
1931
1932 fclose(stream);
1933 return ENXIO;
1934}
1935
8b61709d
BP
1936/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
1937 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
1938 * returns 0. Otherwise, it returns a positive errno value; in particular,
1939 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
1940static int
1941netdev_linux_arp_lookup(const struct netdev *netdev,
1942 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
1943{
1944 struct arpreq r;
c100e025 1945 struct sockaddr_in sin;
8b61709d
BP
1946 int retval;
1947
1948 memset(&r, 0, sizeof r);
c100e025
BP
1949 sin.sin_family = AF_INET;
1950 sin.sin_addr.s_addr = ip;
1951 sin.sin_port = 0;
1952 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
1953 r.arp_ha.sa_family = ARPHRD_ETHER;
1954 r.arp_flags = 0;
149f577a 1955 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d
BP
1956 COVERAGE_INC(netdev_arp_lookup);
1957 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
1958 if (!retval) {
1959 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
1960 } else if (retval != ENXIO) {
1961 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
149f577a 1962 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
8b61709d
BP
1963 }
1964 return retval;
1965}
1966
1967static int
1968nd_to_iff_flags(enum netdev_flags nd)
1969{
1970 int iff = 0;
1971 if (nd & NETDEV_UP) {
1972 iff |= IFF_UP;
1973 }
1974 if (nd & NETDEV_PROMISC) {
1975 iff |= IFF_PROMISC;
1976 }
1977 return iff;
1978}
1979
1980static int
1981iff_to_nd_flags(int iff)
1982{
1983 enum netdev_flags nd = 0;
1984 if (iff & IFF_UP) {
1985 nd |= NETDEV_UP;
1986 }
1987 if (iff & IFF_PROMISC) {
1988 nd |= NETDEV_PROMISC;
1989 }
1990 return nd;
1991}
1992
1993static int
1994netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
1995 enum netdev_flags on, enum netdev_flags *old_flagsp)
1996{
1997 int old_flags, new_flags;
1998 int error;
1999
2000 error = get_flags(netdev, &old_flags);
2001 if (!error) {
2002 *old_flagsp = iff_to_nd_flags(old_flags);
2003 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2004 if (new_flags != old_flags) {
2005 error = set_flags(netdev, new_flags);
2006 }
2007 }
2008 return error;
2009}
2010
2011static void
2012poll_notify(struct list *list)
2013{
2014 struct netdev_linux_notifier *notifier;
4e8e4213 2015 LIST_FOR_EACH (notifier, node, list) {
8b61709d
BP
2016 struct netdev_notifier *n = &notifier->notifier;
2017 n->cb(n);
2018 }
2019}
2020
2021static void
46097491 2022netdev_linux_poll_cb(const struct rtnetlink_change *change,
67a4917b 2023 void *aux OVS_UNUSED)
8b61709d
BP
2024{
2025 if (change) {
2026 struct list *list = shash_find_data(&netdev_linux_notifiers,
2027 change->ifname);
2028 if (list) {
2029 poll_notify(list);
2030 }
2031 } else {
2032 struct shash_node *node;
2033 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2034 poll_notify(node->data);
2035 }
2036 }
2037}
2038
2039static int
2040netdev_linux_poll_add(struct netdev *netdev,
2041 void (*cb)(struct netdev_notifier *), void *aux,
2042 struct netdev_notifier **notifierp)
2043{
2044 const char *netdev_name = netdev_get_name(netdev);
2045 struct netdev_linux_notifier *notifier;
2046 struct list *list;
2047
2048 if (shash_is_empty(&netdev_linux_notifiers)) {
46097491 2049 int error = rtnetlink_notifier_register(&netdev_linux_poll_notifier,
8b61709d
BP
2050 netdev_linux_poll_cb, NULL);
2051 if (error) {
2052 return error;
2053 }
2054 }
2055
2056 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2057 if (!list) {
2058 list = xmalloc(sizeof *list);
2059 list_init(list);
2060 shash_add(&netdev_linux_notifiers, netdev_name, list);
2061 }
2062
2063 notifier = xmalloc(sizeof *notifier);
2064 netdev_notifier_init(&notifier->notifier, netdev, cb, aux);
2065 list_push_back(list, &notifier->node);
2066 *notifierp = &notifier->notifier;
2067 return 0;
2068}
2069
2070static void
2071netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2072{
2073 struct netdev_linux_notifier *notifier =
2074 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2075 struct list *list;
2076
2077 /* Remove 'notifier' from its list. */
2078 list = list_remove(&notifier->node);
2079 if (list_is_empty(list)) {
2080 /* The list is now empty. Remove it from the hash and free it. */
2081 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2082 shash_delete(&netdev_linux_notifiers,
2083 shash_find(&netdev_linux_notifiers, netdev_name));
2084 free(list);
2085 }
2086 free(notifier);
2087
2088 /* If that was the last notifier, unregister. */
2089 if (shash_is_empty(&netdev_linux_notifiers)) {
46097491 2090 rtnetlink_notifier_unregister(&netdev_linux_poll_notifier);
8b61709d
BP
2091 }
2092}
2093
/* netdev class whose first member is the string "system" (presumably the
 * class's type name -- confirm against struct netdev_class).
 *
 * This is a positional initializer: each entry fills the next member of
 * struct netdev_class, which is declared elsewhere, so entries must not be
 * reordered, added, or removed without matching that declaration.  The blank
 * lines appear to group related callbacks. */
2094const struct netdev_class netdev_linux_class = {
149f577a 2095 "system",
8b61709d
BP
2096
2097 netdev_linux_init,
2098 netdev_linux_run,
2099 netdev_linux_wait,
2100
a740f0de 2101 netdev_linux_create_system,
6c88d577
JP
2102 netdev_linux_destroy,
2103 NULL, /* reconfigure */
2104
8b61709d
BP
2105 netdev_linux_open,
2106 netdev_linux_close,
2107
2108 netdev_linux_enumerate,
2109
2110 netdev_linux_recv,
2111 netdev_linux_recv_wait,
2112 netdev_linux_drain,
2113
2114 netdev_linux_send,
2115 netdev_linux_send_wait,
2116
2117 netdev_linux_set_etheraddr,
2118 netdev_linux_get_etheraddr,
2119 netdev_linux_get_mtu,
9ab3d9a3 2120 netdev_linux_get_ifindex,
8b61709d
BP
2121 netdev_linux_get_carrier,
2122 netdev_linux_get_stats,
f4b6076a 2123 netdev_vport_set_stats,
8b61709d
BP
2124
2125 netdev_linux_get_features,
2126 netdev_linux_set_advertisements,
2127 netdev_linux_get_vlan_vid,
c1c9c9c4 2128
8b61709d 2129 netdev_linux_set_policing,
c1c9c9c4
BP
2130 netdev_linux_get_qos_types,
2131 netdev_linux_get_qos_capabilities,
2132 netdev_linux_get_qos,
2133 netdev_linux_set_qos,
2134 netdev_linux_get_queue,
2135 netdev_linux_set_queue,
2136 netdev_linux_delete_queue,
2137 netdev_linux_get_queue_stats,
2138 netdev_linux_dump_queues,
2139 netdev_linux_dump_queue_stats,
8b61709d
BP
2140
2141 netdev_linux_get_in4,
2142 netdev_linux_set_in4,
2143 netdev_linux_get_in6,
2144 netdev_linux_add_router,
f1acd62b 2145 netdev_linux_get_next_hop,
8b61709d
BP
2146 netdev_linux_arp_lookup,
2147
2148 netdev_linux_update_flags,
2149
2150 netdev_linux_poll_add,
2151 netdev_linux_poll_remove,
2152};
2153
/* netdev class whose first member is the string "tap" (presumably the class's
 * type name -- confirm against struct netdev_class).
 *
 * Like netdev_linux_class, this is a positional initializer whose entries
 * must stay in the member order of struct netdev_class (declared elsewhere).
 * Compared with netdev_linux_class, it uses netdev_linux_create_tap in the
 * creation slot and leaves the enumerate and set_stats slots NULL. */
2154const struct netdev_class netdev_tap_class = {
149f577a 2155 "tap",
8b61709d
BP
2156
2157 netdev_linux_init,
149f577a
JG
2158 netdev_linux_run,
2159 netdev_linux_wait,
8b61709d 2160
a740f0de 2161 netdev_linux_create_tap,
6c88d577
JP
2162 netdev_linux_destroy,
2163 NULL, /* reconfigure */
2164
8b61709d
BP
2165 netdev_linux_open,
2166 netdev_linux_close,
2167
149f577a 2168 NULL, /* enumerate */
8b61709d
BP
2169
2170 netdev_linux_recv,
2171 netdev_linux_recv_wait,
2172 netdev_linux_drain,
2173
2174 netdev_linux_send,
2175 netdev_linux_send_wait,
2176
2177 netdev_linux_set_etheraddr,
2178 netdev_linux_get_etheraddr,
2179 netdev_linux_get_mtu,
9ab3d9a3 2180 netdev_linux_get_ifindex,
8b61709d
BP
2181 netdev_linux_get_carrier,
2182 netdev_linux_get_stats,
8722022c 2183 NULL, /* set_stats */
8b61709d
BP
2184
2185 netdev_linux_get_features,
2186 netdev_linux_set_advertisements,
a740f0de 2187 netdev_linux_get_vlan_vid,
c1c9c9c4 2188
a740f0de 2189 netdev_linux_set_policing,
c1c9c9c4
BP
2190 netdev_linux_get_qos_types,
2191 netdev_linux_get_qos_capabilities,
2192 netdev_linux_get_qos,
2193 netdev_linux_set_qos,
2194 netdev_linux_get_queue,
2195 netdev_linux_set_queue,
2196 netdev_linux_delete_queue,
2197 netdev_linux_get_queue_stats,
2198 netdev_linux_dump_queues,
2199 netdev_linux_dump_queue_stats,
a740f0de
JG
2200
2201 netdev_linux_get_in4,
2202 netdev_linux_set_in4,
2203 netdev_linux_get_in6,
2204 netdev_linux_add_router,
2205 netdev_linux_get_next_hop,
2206 netdev_linux_arp_lookup,
2207
2208 netdev_linux_update_flags,
2209
2210 netdev_linux_poll_add,
2211 netdev_linux_poll_remove,
2212};
8b61709d 2213\f
c1c9c9c4 2214/* HTB traffic control class. */
559843ed 2215
c1c9c9c4 2216#define HTB_N_QUEUES 0xf000
8b61709d 2217
c1c9c9c4
BP
/* State for an HTB qdisc on a network device.
 *
 * 'tc' is the embedded generic traffic-control state; htb_get__() recovers
 * the enclosing struct htb from a device's 'tc' pointer with CONTAINER_OF.
 * 'max_rate' is the qdisc-wide rate that htb_parse_class_details__() uses as
 * the upper bound for each class's rates. */
2218struct htb {
2219 struct tc tc;
2220 unsigned int max_rate; /* In bytes/s. */
2221};
8b61709d 2222
/* Settings of a single HTB class (one queue).
 *
 * 'tc_queue' is the embedded generic queue; htb_class_cast__() recovers the
 * enclosing struct htb_class from it with CONTAINER_OF.  The same struct is
 * also used as a plain parameter bundle (e.g. by htb_setup_class__()), in
 * which case 'tc_queue' is not filled in. */
c1c9c9c4 2223struct htb_class {
93b13be8 2224 struct tc_queue tc_queue;
c1c9c9c4
BP
2225 unsigned int min_rate; /* In bytes/s. */
2226 unsigned int max_rate; /* In bytes/s. */
2227 unsigned int burst; /* In bytes. */
2228 unsigned int priority; /* Lower values are higher priorities. */
2229};
8b61709d 2230
c1c9c9c4
BP
2231static struct htb *
2232htb_get__(const struct netdev *netdev)
2233{
2234 struct netdev_dev_linux *netdev_dev =
2235 netdev_dev_linux_cast(netdev_get_dev(netdev));
2236 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2237}
2238
2239static struct htb *
2240htb_install__(struct netdev *netdev, uint64_t max_rate)
2241{
2242 struct netdev_dev_linux *netdev_dev =
2243 netdev_dev_linux_cast(netdev_get_dev(netdev));
2244 struct htb *htb;
2245
2246 htb = xmalloc(sizeof *htb);
2247 tc_init(&htb->tc, &tc_ops_htb);
2248 htb->max_rate = max_rate;
2249
2250 netdev_dev->tc = &htb->tc;
2251
2252 return htb;
2253}
2254
2255/* Create an HTB qdisc.
2256 *
2257 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default
2258 * 0". */
2259static int
2260htb_setup_qdisc__(struct netdev *netdev)
2261{
2262 size_t opt_offset;
2263 struct tc_htb_glob opt;
2264 struct ofpbuf request;
2265 struct tcmsg *tcmsg;
2266
2267 tc_del_qdisc(netdev);
2268
2269 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2270 NLM_F_EXCL | NLM_F_CREATE, &request);
2271 tcmsg->tcm_handle = tc_make_handle(1, 0);
2272 tcmsg->tcm_parent = TC_H_ROOT;
2273
2274 nl_msg_put_string(&request, TCA_KIND, "htb");
2275
2276 memset(&opt, 0, sizeof opt);
2277 opt.rate2quantum = 10;
2278 opt.version = 3;
2279 opt.defcls = 0;
2280
2281 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2282 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2283 nl_msg_end_nested(&request, opt_offset);
2284
2285 return tc_transact(&request, NULL);
2286}
2287
2288/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2289 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2290static int
2291htb_setup_class__(struct netdev *netdev, unsigned int handle,
2292 unsigned int parent, struct htb_class *class)
2293{
2294 size_t opt_offset;
2295 struct tc_htb_opt opt;
2296 struct ofpbuf request;
2297 struct tcmsg *tcmsg;
2298 int error;
2299 int mtu;
2300
2301 netdev_get_mtu(netdev, &mtu);
2302
2303 memset(&opt, 0, sizeof opt);
2304 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2305 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2306 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2307 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2308 opt.prio = class->priority;
2309
2310 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2311 tcmsg->tcm_handle = handle;
2312 tcmsg->tcm_parent = parent;
2313
2314 nl_msg_put_string(&request, TCA_KIND, "htb");
2315 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2316 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2317 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2318 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2319 nl_msg_end_nested(&request, opt_offset);
2320
2321 error = tc_transact(&request, NULL);
2322 if (error) {
2323 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2324 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2325 netdev_get_name(netdev),
2326 tc_get_major(handle), tc_get_minor(handle),
2327 tc_get_major(parent), tc_get_minor(parent),
2328 class->min_rate, class->max_rate,
2329 class->burst, class->priority, strerror(error));
2330 }
2331 return error;
2332}
2333
2334/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2335 * description of them into 'details'. The description complies with the
2336 * specification given in the vswitch database documentation for linux-htb
2337 * queue details. */
2338static int
2339htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2340{
2341 static const struct nl_policy tca_htb_policy[] = {
2342 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2343 .min_len = sizeof(struct tc_htb_opt) },
2344 };
2345
2346 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2347 const struct tc_htb_opt *htb;
2348
2349 if (!nl_parse_nested(nl_options, tca_htb_policy,
2350 attrs, ARRAY_SIZE(tca_htb_policy))) {
2351 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2352 return EPROTO;
2353 }
2354
2355 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2356 class->min_rate = htb->rate.rate;
2357 class->max_rate = htb->ceil.rate;
2358 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2359 class->priority = htb->prio;
2360 return 0;
2361}
2362
2363static int
2364htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2365 struct htb_class *options,
2366 struct netdev_queue_stats *stats)
2367{
2368 struct nlattr *nl_options;
2369 unsigned int handle;
2370 int error;
2371
2372 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2373 if (!error && queue_id) {
17ee3c1f
BP
2374 unsigned int major = tc_get_major(handle);
2375 unsigned int minor = tc_get_minor(handle);
2376 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2377 *queue_id = minor - 1;
c1c9c9c4
BP
2378 } else {
2379 error = EPROTO;
2380 }
2381 }
2382 if (!error && options) {
2383 error = htb_parse_tca_options__(nl_options, options);
2384 }
2385 return error;
2386}
2387
2388static void
2389htb_parse_qdisc_details__(struct netdev *netdev,
2390 const struct shash *details, struct htb_class *hc)
2391{
2392 const char *max_rate_s;
2393
2394 max_rate_s = shash_find_data(details, "max-rate");
2395 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2396 if (!hc->max_rate) {
2397 uint32_t current;
2398
2399 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2400 hc->max_rate = netdev_features_to_bps(current) / 8;
2401 }
2402 hc->min_rate = hc->max_rate;
2403 hc->burst = 0;
2404 hc->priority = 0;
2405}
2406
2407static int
2408htb_parse_class_details__(struct netdev *netdev,
2409 const struct shash *details, struct htb_class *hc)
2410{
2411 const struct htb *htb = htb_get__(netdev);
2412 const char *min_rate_s = shash_find_data(details, "min-rate");
2413 const char *max_rate_s = shash_find_data(details, "max-rate");
2414 const char *burst_s = shash_find_data(details, "burst");
2415 const char *priority_s = shash_find_data(details, "priority");
2416 int mtu;
2417
2418 /* min-rate */
2419 if (!min_rate_s) {
2420 /* min-rate is required. */
2421 return EINVAL;
2422 }
2423 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2424 hc->min_rate = MAX(hc->min_rate, 0);
2425 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2426
2427 /* max-rate */
2428 hc->max_rate = (max_rate_s
2429 ? strtoull(max_rate_s, NULL, 10) / 8
2430 : htb->max_rate);
2431 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2432 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2433
2434 /* burst
2435 *
2436 * According to hints in the documentation that I've read, it is important
2437 * that 'burst' be at least as big as the largest frame that might be
2438 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2439 * but having it a bit too small is a problem. Since netdev_get_mtu()
2440 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2441 * the MTU. We actually add 64, instead of 14, as a guard against
2442 * additional headers get tacked on somewhere that we're not aware of. */
2443 netdev_get_mtu(netdev, &mtu);
2444 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2445 hc->burst = MAX(hc->burst, mtu + 64);
2446
2447 /* priority */
2448 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2449
2450 return 0;
2451}
2452
2453static int
2454htb_query_class__(const struct netdev *netdev, unsigned int handle,
2455 unsigned int parent, struct htb_class *options,
2456 struct netdev_queue_stats *stats)
2457{
2458 struct ofpbuf *reply;
2459 int error;
2460
2461 error = tc_query_class(netdev, handle, parent, &reply);
2462 if (!error) {
2463 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2464 ofpbuf_delete(reply);
2465 }
2466 return error;
2467}
2468
2469static int
2470htb_tc_install(struct netdev *netdev, const struct shash *details)
2471{
2472 int error;
2473
2474 error = htb_setup_qdisc__(netdev);
2475 if (!error) {
2476 struct htb_class hc;
2477
2478 htb_parse_qdisc_details__(netdev, details, &hc);
2479 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2480 tc_make_handle(1, 0), &hc);
2481 if (!error) {
2482 htb_install__(netdev, hc.max_rate);
2483 }
2484 }
2485 return error;
2486}
2487
93b13be8
BP
2488static struct htb_class *
2489htb_class_cast__(const struct tc_queue *queue)
2490{
2491 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2492}
2493
c1c9c9c4
BP
2494static void
2495htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2496 const struct htb_class *hc)
2497{
2498 struct htb *htb = htb_get__(netdev);
93b13be8
BP
2499 size_t hash = hash_int(queue_id, 0);
2500 struct tc_queue *queue;
c1c9c9c4
BP
2501 struct htb_class *hcp;
2502
93b13be8
BP
2503 queue = tc_find_queue__(netdev, queue_id, hash);
2504 if (queue) {
2505 hcp = htb_class_cast__(queue);
2506 } else {
c1c9c9c4 2507 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
2508 queue = &hcp->tc_queue;
2509 queue->queue_id = queue_id;
2510 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 2511 }
93b13be8
BP
2512
2513 hcp->min_rate = hc->min_rate;
2514 hcp->max_rate = hc->max_rate;
2515 hcp->burst = hc->burst;
2516 hcp->priority = hc->priority;
c1c9c9c4
BP
2517}
2518
2519static int
2520htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2521{
2522 struct shash details = SHASH_INITIALIZER(&details);
2523 struct ofpbuf msg;
2524 struct nl_dump dump;
2525 struct htb_class hc;
2526 struct htb *htb;
2527
2528 /* Get qdisc options. */
2529 hc.max_rate = 0;
2530 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2531 htb = htb_install__(netdev, hc.max_rate);
2532
2533 /* Get queues. */
2534 start_queue_dump(netdev, &dump);
2535 shash_init(&details);
2536 while (nl_dump_next(&dump, &msg)) {
2537 unsigned int queue_id;
2538
2539 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2540 htb_update_queue__(netdev, queue_id, &hc);
2541 }
2542 }
2543 nl_dump_done(&dump);
2544
2545 return 0;
2546}
2547
2548static void
2549htb_tc_destroy(struct tc *tc)
2550{
2551 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 2552 struct htb_class *hc, *next;
c1c9c9c4 2553
4e8e4213 2554 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 2555 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
2556 free(hc);
2557 }
2558 tc_destroy(tc);
2559 free(htb);
2560}
2561
2562static int
2563htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2564{
2565 const struct htb *htb = htb_get__(netdev);
2566 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2567 return 0;
2568}
2569
2570static int
2571htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2572{
2573 struct htb_class hc;
2574 int error;
2575
2576 htb_parse_qdisc_details__(netdev, details, &hc);
2577 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2578 tc_make_handle(1, 0), &hc);
2579 if (!error) {
2580 htb_get__(netdev)->max_rate = hc.max_rate;
2581 }
2582 return error;
2583}
2584
2585static int
93b13be8
BP
2586htb_class_get(const struct netdev *netdev OVS_UNUSED,
2587 const struct tc_queue *queue, struct shash *details)
c1c9c9c4 2588{
93b13be8 2589 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4
BP
2590
2591 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2592 if (hc->min_rate != hc->max_rate) {
2593 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2594 }
2595 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2596 if (hc->priority) {
2597 shash_add(details, "priority", xasprintf("%u", hc->priority));
2598 }
2599 return 0;
2600}
2601
2602static int
2603htb_class_set(struct netdev *netdev, unsigned int queue_id,
2604 const struct shash *details)
2605{
2606 struct htb_class hc;
2607 int error;
2608
2609 error = htb_parse_class_details__(netdev, details, &hc);
2610 if (error) {
2611 return error;
2612 }
2613
17ee3c1f 2614 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
2615 tc_make_handle(1, 0xfffe), &hc);
2616 if (error) {
2617 return error;
2618 }
2619
2620 htb_update_queue__(netdev, queue_id, &hc);
2621 return 0;
2622}
2623
2624static int
93b13be8 2625htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 2626{
93b13be8 2627 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2628 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
2629 int error;
2630
93b13be8 2631 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 2632 if (!error) {
93b13be8 2633 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 2634 free(hc);
c1c9c9c4
BP
2635 }
2636 return error;
2637}
2638
2639static int
93b13be8 2640htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
2641 struct netdev_queue_stats *stats)
2642{
93b13be8 2643 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
2644 tc_make_handle(1, 0xfffe), NULL, stats);
2645}
2646
2647static int
2648htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2649 const struct ofpbuf *nlmsg,
2650 netdev_dump_queue_stats_cb *cb, void *aux)
2651{
2652 struct netdev_queue_stats stats;
17ee3c1f 2653 unsigned int handle, major, minor;
c1c9c9c4
BP
2654 int error;
2655
2656 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2657 if (error) {
2658 return error;
2659 }
2660
17ee3c1f
BP
2661 major = tc_get_major(handle);
2662 minor = tc_get_minor(handle);
2663 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 2664 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
2665 }
2666 return 0;
2667}
2668
/* Traffic-control operations for HTB.
 *
 * This is a positional initializer, so its entries must stay in the member
 * order of struct tc_ops (declared elsewhere). */
2669static const struct tc_ops tc_ops_htb = {
2670 "htb", /* linux_name */
2671 "linux-htb", /* ovs_name */
2672 HTB_N_QUEUES, /* n_queues */
2673 htb_tc_install,
2674 htb_tc_load,
2675 htb_tc_destroy,
2676 htb_qdisc_get,
2677 htb_qdisc_set,
2678 htb_class_get,
2679 htb_class_set,
2680 htb_class_delete,
2681 htb_class_get_stats,
2682 htb_class_dump_stats
2683};
2684\f
2685/* "linux-default" traffic control class.
2686 *
2687 * This class represents the default, unnamed Linux qdisc. It corresponds to
2688 * the "" (empty string) QoS type in the OVS database. */
2689
2690static void
2691default_install__(struct netdev *netdev)
2692{
2693 struct netdev_dev_linux *netdev_dev =
2694 netdev_dev_linux_cast(netdev_get_dev(netdev));
2695 static struct tc *tc;
2696
2697 if (!tc) {
2698 tc = xmalloc(sizeof *tc);
2699 tc_init(tc, &tc_ops_default);
2700 }
2701 netdev_dev->tc = tc;
2702}
2703
2704static int
2705default_tc_install(struct netdev *netdev,
2706 const struct shash *details OVS_UNUSED)
2707{
2708 default_install__(netdev);
2709 return 0;
2710}
2711
2712static int
2713default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2714{
2715 default_install__(netdev);
2716 return 0;
2717}
2718
/* Traffic-control operations for the "linux-default" class: only the install
 * and load operations are provided, it has no queues, and every other
 * operation is NULL.
 *
 * This is a positional initializer, so its entries must stay in the member
 * order of struct tc_ops (declared elsewhere). */
2719static const struct tc_ops tc_ops_default = {
2720 NULL, /* linux_name */
2721 "", /* ovs_name */
2722 0, /* n_queues */
2723 default_tc_install,
2724 default_tc_load,
2725 NULL, /* tc_destroy */
2726 NULL, /* qdisc_get */
2727 NULL, /* qdisc_set */
2728 NULL, /* class_get */
2729 NULL, /* class_set */
2730 NULL, /* class_delete */
2731 NULL, /* class_get_stats */
2732 NULL /* class_dump_stats */
2733};
2734\f
2735/* "linux-other" traffic control class.
2736 *
2737 * */
2738
2739static int
2740other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2741{
2742 struct netdev_dev_linux *netdev_dev =
2743 netdev_dev_linux_cast(netdev_get_dev(netdev));
2744 static struct tc *tc;
2745
2746 if (!tc) {
2747 tc = xmalloc(sizeof *tc);
2748 tc_init(tc, &tc_ops_other);
2749 }
2750 netdev_dev->tc = tc;
2751 return 0;
2752}
2753
/* Traffic-control operations for the "linux-other" class: it can only be
 * loaded (there is no install operation), it has no queues, and every other
 * operation is NULL.
 *
 * This is a positional initializer, so its entries must stay in the member
 * order of struct tc_ops (declared elsewhere). */
2754static const struct tc_ops tc_ops_other = {
2755 NULL, /* linux_name */
2756 "linux-other", /* ovs_name */
2757 0, /* n_queues */
2758 NULL, /* tc_install */
2759 other_tc_load,
2760 NULL, /* tc_destroy */
2761 NULL, /* qdisc_get */
2762 NULL, /* qdisc_set */
2763 NULL, /* class_get */
2764 NULL, /* class_set */
2765 NULL, /* class_delete */
2766 NULL, /* class_get_stats */
2767 NULL /* class_dump_stats */
2768};
2769\f
2770/* Traffic control. */
2771
2772/* Number of kernel "tc" ticks per second. */
2773static double ticks_per_s;
2774
2775/* Number of kernel "jiffies" per second. This is used for the purpose of
2776 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
2777 * one jiffy's worth of data.
2778 *
2779 * There are two possibilities here:
2780 *
2781 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
2782 * approximate range of 100 to 1024. That means that we really need to
2783 * make sure that the qdisc can buffer that much data.
2784 *
2785 * - 'buffer_hz' is an absurdly large number. That means that the kernel
2786 * has finely granular timers and there's no need to fudge additional room
2787 * for buffers. (There's no extra effort needed to implement that: the
2788 * large 'buffer_hz' is used as a divisor, so practically any number will
2789 * come out as 0 in the division. Small integer results in the case of
2790 * really high dividends won't have any real effect anyhow.)
2791 */
2792static unsigned int buffer_hz;
2793
/* Combines 'major' and 'minor' into the tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
2800
/* Extracts the major number from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
2807
/* Extracts the minor number from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
2814
2815static struct tcmsg *
2816tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
2817 struct ofpbuf *request)
2818{
2819 struct tcmsg *tcmsg;
2820 int ifindex;
2821 int error;
2822
2823 error = get_ifindex(netdev, &ifindex);
2824 if (error) {
2825 return NULL;
2826 }
2827
2828 ofpbuf_init(request, 512);
2829 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
2830 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
2831 tcmsg->tcm_family = AF_UNSPEC;
2832 tcmsg->tcm_ifindex = ifindex;
2833 /* Caller should fill in tcmsg->tcm_handle. */
2834 /* Caller should fill in tcmsg->tcm_parent. */
2835
2836 return tcmsg;
2837}
2838
2839static int
2840tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
2841{
2842 int error = nl_sock_transact(rtnl_sock, request, replyp);
2843 ofpbuf_uninit(request);
2844 return error;
2845}
2846
2847static void
2848read_psched(void)
2849{
2850 /* The values in psched are not individually very meaningful, but they are
2851 * important. The tables below show some values seen in the wild.
2852 *
2853 * Some notes:
2854 *
2855 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
2856 * (Before that, there are hints that it was 1000000000.)
2857 *
2858 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
2859 * above.
2860 *
2861 * /proc/net/psched
2862 * -----------------------------------
2863 * [1] 000c8000 000f4240 000f4240 00000064
2864 * [2] 000003e8 00000400 000f4240 3b9aca00
2865 * [3] 000003e8 00000400 000f4240 3b9aca00
2866 * [4] 000003e8 00000400 000f4240 00000064
2867 * [5] 000003e8 00000040 000f4240 3b9aca00
2868 * [6] 000003e8 00000040 000f4240 000000f9
2869 *
2870 * a b c d ticks_per_s buffer_hz
2871 * ------- --------- ---------- ------------- ----------- -------------
2872 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
2873 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
2874 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
2875 * [4] 1,000 1,024 1,000,000 100 976,562 100
2876 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
2877 * [6] 1,000 64 1,000,000 249 15,625,000 249
2878 *
2879 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
2880 * [2] 2.6.26-1-686-bigmem from Debian lenny
2881 * [3] 2.6.26-2-sparc64 from Debian lenny
2882 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
2883 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
2884 * [6] 2.6.34 from kernel.org on KVM
2885 */
2886 static const char fn[] = "/proc/net/psched";
2887 unsigned int a, b, c, d;
2888 FILE *stream;
2889
2890 ticks_per_s = 1.0;
2891 buffer_hz = 100;
2892
2893 stream = fopen(fn, "r");
2894 if (!stream) {
2895 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
2896 return;
2897 }
2898
2899 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
2900 VLOG_WARN("%s: read failed", fn);
2901 fclose(stream);
2902 return;
2903 }
2904 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
2905 fclose(stream);
2906
2907 if (!a || !c) {
2908 VLOG_WARN("%s: invalid scheduler parameters", fn);
2909 return;
2910 }
2911
2912 ticks_per_s = (double) a * c / b;
2913 if (c == 1000000) {
2914 buffer_hz = d;
2915 } else {
2916 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
2917 fn, a, b, c, d);
2918 }
2919 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
2920}
2921
2922/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
2923 * rate of 'rate' bytes per second. */
2924static unsigned int
2925tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
2926{
2927 if (!buffer_hz) {
2928 read_psched();
2929 }
2930 return (rate * ticks) / ticks_per_s;
2931}
2932
2933/* Returns the number of ticks that it would take to transmit 'size' bytes at a
2934 * rate of 'rate' bytes per second. */
2935static unsigned int
2936tc_bytes_to_ticks(unsigned int rate, unsigned int size)
2937{
2938 if (!buffer_hz) {
2939 read_psched();
2940 }
015c93a4 2941 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
2942}
2943
2944/* Returns the number of bytes that need to be reserved for qdisc buffering at
2945 * a transmission rate of 'rate' bytes per second. */
2946static unsigned int
2947tc_buffer_per_jiffy(unsigned int rate)
2948{
2949 if (!buffer_hz) {
2950 read_psched();
2951 }
2952 return rate / buffer_hz;
2953}
2954
2955/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
2956 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
2957 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
2958 * stores NULL into it if it is absent.
2959 *
2960 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
2961 * 'msg'.
2962 *
2963 * Returns 0 if successful, otherwise a positive errno value. */
2964static int
2965tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
2966 struct nlattr **options)
2967{
2968 static const struct nl_policy tca_policy[] = {
2969 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
2970 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
2971 };
2972 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
2973
2974 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
2975 tca_policy, ta, ARRAY_SIZE(ta))) {
2976 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
2977 goto error;
2978 }
2979
2980 if (kind) {
2981 *kind = nl_attr_get_string(ta[TCA_KIND]);
2982 }
2983
2984 if (options) {
2985 *options = ta[TCA_OPTIONS];
2986 }
2987
2988 return 0;
2989
2990error:
2991 if (kind) {
2992 *kind = NULL;
2993 }
2994 if (options) {
2995 *options = NULL;
2996 }
2997 return EPROTO;
2998}
2999
3000/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3001 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3002 * into '*options', and its queue statistics into '*stats'. Any of the output
3003 * arguments may be null.
3004 *
3005 * Returns 0 if successful, otherwise a positive errno value. */
3006static int
3007tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3008 struct nlattr **options, struct netdev_queue_stats *stats)
3009{
3010 static const struct nl_policy tca_policy[] = {
3011 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3012 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3013 };
3014 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3015
3016 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3017 tca_policy, ta, ARRAY_SIZE(ta))) {
3018 VLOG_WARN_RL(&rl, "failed to parse class message");
3019 goto error;
3020 }
3021
3022 if (handlep) {
3023 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3024 *handlep = tc->tcm_handle;
3025 }
3026
3027 if (options) {
3028 *options = ta[TCA_OPTIONS];
3029 }
3030
3031 if (stats) {
3032 const struct gnet_stats_queue *gsq;
3033 struct gnet_stats_basic gsb;
3034
3035 static const struct nl_policy stats_policy[] = {
3036 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3037 .min_len = sizeof gsb },
3038 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3039 .min_len = sizeof *gsq },
3040 };
3041 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3042
3043 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3044 sa, ARRAY_SIZE(sa))) {
3045 VLOG_WARN_RL(&rl, "failed to parse class stats");
3046 goto error;
3047 }
3048
3049 /* Alignment issues screw up the length of struct gnet_stats_basic on
3050 * some arch/bitsize combinations. Newer versions of Linux have a
3051 * struct gnet_stats_basic_packed, but we can't depend on that. The
3052 * easiest thing to do is just to make a copy. */
3053 memset(&gsb, 0, sizeof gsb);
3054 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3055 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3056 stats->tx_bytes = gsb.bytes;
3057 stats->tx_packets = gsb.packets;
3058
3059 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3060 stats->tx_errors = gsq->drops;
3061 }
3062
3063 return 0;
3064
3065error:
3066 if (options) {
3067 *options = NULL;
3068 }
3069 if (stats) {
3070 memset(stats, 0, sizeof *stats);
3071 }
3072 return EPROTO;
3073}
3074
3075/* Queries the kernel for class with identifier 'handle' and parent 'parent'
3076 * on 'netdev'. */
3077static int
3078tc_query_class(const struct netdev *netdev,
3079 unsigned int handle, unsigned int parent,
3080 struct ofpbuf **replyp)
3081{
3082 struct ofpbuf request;
3083 struct tcmsg *tcmsg;
3084 int error;
3085
3086 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3087 tcmsg->tcm_handle = handle;
3088 tcmsg->tcm_parent = parent;
3089
3090 error = tc_transact(&request, replyp);
3091 if (error) {
3092 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3093 netdev_get_name(netdev),
3094 tc_get_major(handle), tc_get_minor(handle),
3095 tc_get_major(parent), tc_get_minor(parent),
3096 strerror(error));
3097 }
3098 return error;
3099}
3100
3101/* Equivalent to "tc class del dev <name> handle <handle>". */
3102static int
3103tc_delete_class(const struct netdev *netdev, unsigned int handle)
3104{
3105 struct ofpbuf request;
3106 struct tcmsg *tcmsg;
3107 int error;
3108
3109 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3110 tcmsg->tcm_handle = handle;
3111 tcmsg->tcm_parent = 0;
3112
3113 error = tc_transact(&request, NULL);
3114 if (error) {
3115 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3116 netdev_get_name(netdev),
3117 tc_get_major(handle), tc_get_minor(handle),
3118 strerror(error));
3119 }
3120 return error;
3121}
3122
3123/* Equivalent to "tc qdisc del dev <name> root". */
3124static int
3125tc_del_qdisc(struct netdev *netdev)
3126{
3127 struct netdev_dev_linux *netdev_dev =
3128 netdev_dev_linux_cast(netdev_get_dev(netdev));
3129 struct ofpbuf request;
3130 struct tcmsg *tcmsg;
3131 int error;
3132
3133 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3134 tcmsg->tcm_handle = tc_make_handle(1, 0);
3135 tcmsg->tcm_parent = TC_H_ROOT;
3136
3137 error = tc_transact(&request, NULL);
3138 if (error == EINVAL) {
3139 /* EINVAL probably means that the default qdisc was in use, in which
3140 * case we've accomplished our purpose. */
3141 error = 0;
3142 }
3143 if (!error && netdev_dev->tc) {
3144 if (netdev_dev->tc->ops->tc_destroy) {
3145 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3146 }
3147 netdev_dev->tc = NULL;
3148 }
3149 return error;
3150}
3151
3152/* If 'netdev''s qdisc type and parameters are not yet known, queries the
3153 * kernel to determine what they are. Returns 0 if successful, otherwise a
3154 * positive errno value. */
3155static int
3156tc_query_qdisc(const struct netdev *netdev)
3157{
3158 struct netdev_dev_linux *netdev_dev =
3159 netdev_dev_linux_cast(netdev_get_dev(netdev));
3160 struct ofpbuf request, *qdisc;
3161 const struct tc_ops *ops;
3162 struct tcmsg *tcmsg;
3163 int load_error;
3164 int error;
3165
3166 if (netdev_dev->tc) {
3167 return 0;
3168 }
3169
3170 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3171 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3172 * 2.6.35 without that fix backported to it.
3173 *
3174 * To avoid the OOPS, we must not make a request that would attempt to dump
3175 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3176 * few others. There are a few ways that I can see to do this, but most of
3177 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3178 * technique chosen here is to assume that any non-default qdisc that we
3179 * create will have a class with handle 1:0. The built-in qdiscs only have
3180 * a class with handle 0:0.
3181 *
3182 * We could check for Linux 2.6.35+ and use a more straightforward method
3183 * there. */
3184 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3185 tcmsg->tcm_handle = tc_make_handle(1, 0);
3186 tcmsg->tcm_parent = 0;
3187
3188 /* Figure out what tc class to instantiate. */
3189 error = tc_transact(&request, &qdisc);
3190 if (!error) {
3191 const char *kind;
3192
3193 error = tc_parse_qdisc(qdisc, &kind, NULL);
3194 if (error) {
3195 ops = &tc_ops_other;
3196 } else {
3197 ops = tc_lookup_linux_name(kind);
3198 if (!ops) {
3199 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3200 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3201
3202 ops = &tc_ops_other;
3203 }
3204 }
3205 } else if (error == ENOENT) {
3206 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3207 * other entity that doesn't have a handle 1:0. We will assume
3208 * that it's the system default qdisc. */
3209 ops = &tc_ops_default;
3210 error = 0;
3211 } else {
3212 /* Who knows? Maybe the device got deleted. */
3213 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3214 netdev_get_name(netdev), strerror(error));
3215 ops = &tc_ops_other;
3216 }
3217
3218 /* Instantiate it. */
3219 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3220 assert((load_error == 0) == (netdev_dev->tc != NULL));
3221 ofpbuf_delete(qdisc);
3222
3223 return error ? error : load_error;
3224}
3225
3226/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3227 approximate the time to transmit packets of various lengths. For an MTU of
3228 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3229 represents two possible packet lengths; for a MTU of 513 through 1024, four
3230 possible lengths; and so on.
3231
3232 Returns, for the specified 'mtu', the number of bits that packet lengths
3233 need to be shifted right to fit within such a 256-entry table. */
3234static int
3235tc_calc_cell_log(unsigned int mtu)
3236{
3237 int cell_log;
3238
3239 if (!mtu) {
3240 mtu = ETH_PAYLOAD_MAX;
3241 }
3242 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3243
3244 for (cell_log = 0; mtu >= 256; cell_log++) {
3245 mtu >>= 1;
3246 }
3247
3248 return cell_log;
3249}
3250
3251/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3252 * of 'mtu'. */
3253static void
3254tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3255{
3256 memset(rate, 0, sizeof *rate);
3257 rate->cell_log = tc_calc_cell_log(mtu);
3258 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3259 /* rate->cell_align = 0; */ /* distro headers. */
3260 rate->mpu = ETH_TOTAL_MIN;
3261 rate->rate = Bps;
3262}
3263
3264/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3265 * attribute of the specified "type".
3266 *
3267 * See tc_calc_cell_log() above for a description of "rtab"s. */
3268static void
3269tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3270{
3271 uint32_t *rtab;
3272 unsigned int i;
3273
3274 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3275 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3276 unsigned packet_size = (i + 1) << rate->cell_log;
3277 if (packet_size < rate->mpu) {
3278 packet_size = rate->mpu;
3279 }
3280 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3281 }
3282}
3283
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes'
 * of 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of
 * buffering at 'Bps' plus 'mtu'.  The result is expressed in ticks.
 *
 * NOTE(review): the larger of the two burst sizes is a 64-bit quantity that
 * is then passed as an 'unsigned int' argument; confirm that callers never
 * request bursts too large for that. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;

    if (burst_bytes > floor_bytes) {
        return tc_bytes_to_ticks(Bps, burst_bytes);
    }
    return tc_bytes_to_ticks(Bps, floor_bytes);
}
3294
3295\f
3296/* Utility functions. */
3297
3298static int
3299get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3300{
3301 /* Policy for RTNLGRP_LINK messages.
3302 *
3303 * There are *many* more fields in these messages, but currently we only
3304 * care about these fields. */
3305 static const struct nl_policy rtnlgrp_link_policy[] = {
3306 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3307 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3308 .min_len = sizeof(struct rtnl_link_stats) },
3309 };
3310
3311 struct ofpbuf request;
3312 struct ofpbuf *reply;
3313 struct ifinfomsg *ifi;
3314 const struct rtnl_link_stats *rtnl_stats;
3315 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
3316 int error;
3317
3318 ofpbuf_init(&request, 0);
3319 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3320 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3321 ifi->ifi_family = PF_UNSPEC;
3322 ifi->ifi_index = ifindex;
3323 error = nl_sock_transact(rtnl_sock, &request, &reply);
3324 ofpbuf_uninit(&request);
3325 if (error) {
3326 return error;
3327 }
3328
3329 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3330 rtnlgrp_link_policy,
3331 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3332 ofpbuf_delete(reply);
3333 return EPROTO;
3334 }
3335
3336 if (!attrs[IFLA_STATS]) {
3337 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3338 ofpbuf_delete(reply);
3339 return EPROTO;
3340 }
8b61709d
BP
3341
3342 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3343 stats->rx_packets = rtnl_stats->rx_packets;
3344 stats->tx_packets = rtnl_stats->tx_packets;
3345 stats->rx_bytes = rtnl_stats->rx_bytes;
3346 stats->tx_bytes = rtnl_stats->tx_bytes;
3347 stats->rx_errors = rtnl_stats->rx_errors;
3348 stats->tx_errors = rtnl_stats->tx_errors;
3349 stats->rx_dropped = rtnl_stats->rx_dropped;
3350 stats->tx_dropped = rtnl_stats->tx_dropped;
3351 stats->multicast = rtnl_stats->multicast;
3352 stats->collisions = rtnl_stats->collisions;
3353 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3354 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3355 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3356 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3357 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3358 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3359 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3360 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3361 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3362 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3363 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3364
576e26d7
BP
3365 ofpbuf_delete(reply);
3366
8b61709d
BP
3367 return 0;
3368}
3369
3370static int
3371get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3372{
3373 static const char fn[] = "/proc/net/dev";
3374 char line[1024];
3375 FILE *stream;
3376 int ln;
3377
3378 stream = fopen(fn, "r");
3379 if (!stream) {
3380 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3381 return errno;
3382 }
3383
3384 ln = 0;
3385 while (fgets(line, sizeof line, stream)) {
3386 if (++ln >= 3) {
3387 char devname[16];
3388#define X64 "%"SCNu64
3389 if (sscanf(line,
3390 " %15[^:]:"
3391 X64 X64 X64 X64 X64 X64 X64 "%*u"
3392 X64 X64 X64 X64 X64 X64 X64 "%*u",
3393 devname,
3394 &stats->rx_bytes,
3395 &stats->rx_packets,
3396 &stats->rx_errors,
3397 &stats->rx_dropped,
3398 &stats->rx_fifo_errors,
3399 &stats->rx_frame_errors,
3400 &stats->multicast,
3401 &stats->tx_bytes,
3402 &stats->tx_packets,
3403 &stats->tx_errors,
3404 &stats->tx_dropped,
3405 &stats->tx_fifo_errors,
3406 &stats->collisions,
3407 &stats->tx_carrier_errors) != 15) {
3408 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3409 } else if (!strcmp(devname, netdev_name)) {
3410 stats->rx_length_errors = UINT64_MAX;
3411 stats->rx_over_errors = UINT64_MAX;
3412 stats->rx_crc_errors = UINT64_MAX;
3413 stats->rx_missed_errors = UINT64_MAX;
3414 stats->tx_aborted_errors = UINT64_MAX;
3415 stats->tx_heartbeat_errors = UINT64_MAX;
3416 stats->tx_window_errors = UINT64_MAX;
3417 fclose(stream);
3418 return 0;
3419 }
3420 }
3421 }
3422 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
3423 fclose(stream);
3424 return ENODEV;
3425}
c1c9c9c4 3426
8b61709d
BP
3427static int
3428get_flags(const struct netdev *netdev, int *flags)
3429{
3430 struct ifreq ifr;
3431 int error;
3432
149f577a
JG
3433 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
3434 "SIOCGIFFLAGS");
8b61709d
BP
3435 *flags = ifr.ifr_flags;
3436 return error;
3437}
3438
3439static int
3440set_flags(struct netdev *netdev, int flags)
3441{
3442 struct ifreq ifr;
3443
3444 ifr.ifr_flags = flags;
149f577a
JG
3445 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
3446 "SIOCSIFFLAGS");
8b61709d
BP
3447}
3448
3449static int
3450do_get_ifindex(const char *netdev_name)
3451{
3452 struct ifreq ifr;
3453
3454 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3455 COVERAGE_INC(netdev_get_ifindex);
3456 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
3457 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
3458 netdev_name, strerror(errno));
3459 return -errno;
3460 }
3461 return ifr.ifr_ifindex;
3462}
3463
3464static int
3465get_ifindex(const struct netdev *netdev_, int *ifindexp)
3466{
149f577a
JG
3467 struct netdev_dev_linux *netdev_dev =
3468 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 3469 *ifindexp = 0;
149f577a 3470 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
8b61709d
BP
3471 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
3472 if (ifindex < 0) {
3473 return -ifindex;
3474 }
149f577a
JG
3475 netdev_dev->cache_valid |= VALID_IFINDEX;
3476 netdev_dev->ifindex = ifindex;
8b61709d 3477 }
149f577a 3478 *ifindexp = netdev_dev->ifindex;
8b61709d
BP
3479 return 0;
3480}
3481
3482static int
3483get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
3484{
3485 struct ifreq ifr;
3486 int hwaddr_family;
3487
3488 memset(&ifr, 0, sizeof ifr);
3489 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3490 COVERAGE_INC(netdev_get_hwaddr);
3491 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
3492 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
3493 netdev_name, strerror(errno));
3494 return errno;
3495 }
3496 hwaddr_family = ifr.ifr_hwaddr.sa_family;
3497 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
3498 VLOG_WARN("%s device has unknown hardware address family %d",
3499 netdev_name, hwaddr_family);
3500 }
3501 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
3502 return 0;
3503}
3504
3505static int
3506set_etheraddr(const char *netdev_name, int hwaddr_family,
3507 const uint8_t mac[ETH_ADDR_LEN])
3508{
3509 struct ifreq ifr;
3510
3511 memset(&ifr, 0, sizeof ifr);
3512 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3513 ifr.ifr_hwaddr.sa_family = hwaddr_family;
3514 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
3515 COVERAGE_INC(netdev_set_hwaddr);
3516 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
3517 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
3518 netdev_name, strerror(errno));
3519 return errno;
3520 }
3521 return 0;
3522}
3523
3524static int
0b0544d7 3525netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
3526 int cmd, const char *cmd_name)
3527{
3528 struct ifreq ifr;
3529
3530 memset(&ifr, 0, sizeof ifr);
0b0544d7 3531 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
3532 ifr.ifr_data = (caddr_t) ecmd;
3533
3534 ecmd->cmd = cmd;
3535 COVERAGE_INC(netdev_ethtool);
3536 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
3537 return 0;
3538 } else {
3539 if (errno != EOPNOTSUPP) {
3540 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
0b0544d7 3541 "failed: %s", cmd_name, name, strerror(errno));
8b61709d
BP
3542 } else {
3543 /* The device doesn't support this operation. That's pretty
3544 * common, so there's no point in logging anything. */
3545 }
3546 return errno;
3547 }
3548}
3549
3550static int
149f577a
JG
3551netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
3552 const char *cmd_name)
8b61709d 3553{
149f577a 3554 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
8b61709d 3555 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
149f577a
JG
3556 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
3557 strerror(errno));
8b61709d
BP
3558 return errno;
3559 }
3560 return 0;
3561}
f1acd62b
BP
3562
3563static int
3564netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
3565 int cmd, const char *cmd_name)
3566{
3567 struct ifreq ifr;
3568 int error;
3569
3570 ifr.ifr_addr.sa_family = AF_INET;
149f577a 3571 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b
BP
3572 if (!error) {
3573 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
3574 *ip = sin->sin_addr;
3575 }
3576 return error;
3577}