]> git.proxmox.com Git - mirror_ovs.git/blame - lib/netdev-linux.c
route-table: Handle route updates more robustly.
[mirror_ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
149f577a 2 * Copyright (c) 2009, 2010 Nicira Networks.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
8b61709d 18#include <assert.h>
e9e28be3 19#include <errno.h>
8b61709d
BP
20#include <fcntl.h>
21#include <arpa/inet.h>
22#include <inttypes.h>
c1c9c9c4 23#include <linux/gen_stats.h>
8b61709d 24#include <linux/if_tun.h>
a740f0de 25#include <linux/ip.h>
8b61709d
BP
26#include <linux/types.h>
27#include <linux/ethtool.h>
63331829 28#include <linux/mii.h>
6f42c8ea 29#include <linux/pkt_sched.h>
e9e28be3 30#include <linux/rtnetlink.h>
8b61709d
BP
31#include <linux/sockios.h>
32#include <linux/version.h>
33#include <sys/types.h>
34#include <sys/ioctl.h>
35#include <sys/socket.h>
36#include <netpacket/packet.h>
37#include <net/ethernet.h>
38#include <net/if.h>
a740f0de 39#include <linux/if_tunnel.h>
8b61709d
BP
40#include <net/if_arp.h>
41#include <net/if_packet.h>
42#include <net/route.h>
43#include <netinet/in.h>
e9e28be3 44#include <poll.h>
8b61709d
BP
45#include <stdlib.h>
46#include <string.h>
47#include <unistd.h>
e9e28be3
BP
48
49#include "coverage.h"
8b61709d
BP
50#include "dynamic-string.h"
51#include "fatal-signal.h"
93b13be8
BP
52#include "hash.h"
53#include "hmap.h"
8b61709d 54#include "netdev-provider.h"
7fbef77a 55#include "netdev-vport.h"
e9e28be3 56#include "netlink.h"
2fe27d5a 57#include "netlink-socket.h"
e9e28be3 58#include "ofpbuf.h"
8b61709d
BP
59#include "openflow/openflow.h"
60#include "packets.h"
61#include "poll-loop.h"
559843ed 62#include "rtnetlink.h"
21d6e22e 63#include "rtnetlink-link.h"
8b61709d
BP
64#include "socket-util.h"
65#include "shash.h"
66#include "svec.h"
e9e28be3 67#include "vlog.h"
5136ce49 68
d98e6007 69VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea
BP
70
71COVERAGE_DEFINE(netdev_get_vlan_vid);
72COVERAGE_DEFINE(netdev_set_policing);
73COVERAGE_DEFINE(netdev_arp_lookup);
74COVERAGE_DEFINE(netdev_get_ifindex);
75COVERAGE_DEFINE(netdev_get_hwaddr);
76COVERAGE_DEFINE(netdev_set_hwaddr);
77COVERAGE_DEFINE(netdev_ethtool);
8b61709d
BP
78\f
/* ADVERTISED_Pause and ADVERTISED_Asym_Pause first appeared in Linux 2.6.14,
 * so supply fallback definitions when building against older headers. */
#if !defined(ADVERTISED_Pause)
#define ADVERTISED_Pause                (1 << 13)
#endif
#if !defined(ADVERTISED_Asym_Pause)
#define ADVERTISED_Asym_Pause           (1 << 14)
#endif

/* TC_RTAB_SIZE first appeared in Linux 2.6.25, so supply a fallback definition
 * when building against older headers. */
#if !defined(TC_RTAB_SIZE)
#define TC_RTAB_SIZE 1024
#endif
93
149f577a 94static struct rtnetlink_notifier netdev_linux_cache_notifier;
46415c90 95static int cache_notifier_refcount;
8b61709d
BP
96
/* Bits for 'cache_valid' in struct netdev_dev_linux.  A set bit means that
 * the corresponding cached member currently holds a valid value. */
enum {
    VALID_IFINDEX          = 0x001,
    VALID_ETHERADDR        = 0x002,
    VALID_IN4              = 0x004,
    VALID_IN6              = 0x008,
    VALID_MTU              = 0x010,
    VALID_CARRIER          = 0x020,
    VALID_IS_PSEUDO        = 0x040,  /* Represents is_internal and is_tap. */
    VALID_POLICING         = 0x080,
    VALID_HAVE_VPORT_STATS = 0x100
};
108
149f577a
JG
/* Per-device state for tap devices. */
struct tap_state {
    int fd;                     /* Descriptor from opening /dev/net/tun. */
    bool opened;                /* Set once the first netdev_open() caller has
                                 * been handed 'fd'. */
};
c1c9c9c4
BP
113\f
114/* Traffic control. */
115
116/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
117 * network device.
118 *
119 * Each TC implementation subclasses this with whatever additional data it
120 * needs. */
c1c9c9c4
BP
121struct tc {
122 const struct tc_ops *ops;
93b13be8
BP
123 struct hmap queues; /* Contains "struct tc_queue"s.
124 * Read by generic TC layer.
125 * Written only by TC implementation. */
126};
c1c9c9c4 127
93b13be8
BP
128/* One traffic control queue.
129 *
130 * Each TC implementation subclasses this with whatever additional data it
131 * needs. */
132struct tc_queue {
133 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
134 unsigned int queue_id; /* OpenFlow queue ID. */
c1c9c9c4
BP
135};
136
137/* A particular kind of traffic control. Each implementation generally maps to
138 * one particular Linux qdisc class.
139 *
140 * The functions below return 0 if successful or a positive errno value on
141 * failure, except where otherwise noted. All of them must be provided, except
142 * where otherwise noted. */
143struct tc_ops {
144 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
145 * This is null for tc_ops_default and tc_ops_other, for which there are no
146 * appropriate values. */
147 const char *linux_name;
148
149 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
150 const char *ovs_name;
151
152 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
153 * queues. The queues are numbered 0 through n_queues - 1. */
154 unsigned int n_queues;
155
156 /* Called to install this TC class on 'netdev'. The implementation should
157 * make the Netlink calls required to set up 'netdev' with the right qdisc
158 * and configure it according to 'details'. The implementation may assume
159 * that the current qdisc is the default; that is, there is no need for it
160 * to delete the current qdisc before installing itself.
161 *
162 * The contents of 'details' should be documented as valid for 'ovs_name'
163 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
164 * (which is built as ovs-vswitchd.conf.db(8)).
165 *
166 * This function must return 0 if and only if it sets 'netdev->tc' to an
167 * initialized 'struct tc'.
168 *
169 * (This function is null for tc_ops_other, which cannot be installed. For
170 * other TC classes it should always be nonnull.) */
171 int (*tc_install)(struct netdev *netdev, const struct shash *details);
172
173 /* Called when the netdev code determines (through a Netlink query) that
174 * this TC class's qdisc is installed on 'netdev', but we didn't install
175 * it ourselves and so don't know any of the details.
176 *
177 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
178 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
179 * implementation should parse the other attributes of 'nlmsg' as
180 * necessary to determine its configuration. If necessary it should also
181 * use Netlink queries to determine the configuration of queues on
182 * 'netdev'.
183 *
184 * This function must return 0 if and only if it sets 'netdev->tc' to an
185 * initialized 'struct tc'. */
186 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
187
188 /* Destroys the data structures allocated by the implementation as part of
189 * 'tc'. (This includes destroying 'tc->queues' by calling
190 * tc_destroy(tc).
191 *
192 * The implementation should not need to perform any Netlink calls. If
193 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
194 * (But it may not be desirable.)
195 *
196 * This function may be null if 'tc' is trivial. */
197 void (*tc_destroy)(struct tc *tc);
198
199 /* Retrieves details of 'netdev->tc' configuration into 'details'.
200 *
201 * The implementation should not need to perform any Netlink calls, because
202 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
203 * cached the configuration.
204 *
205 * The contents of 'details' should be documented as valid for 'ovs_name'
206 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
207 * (which is built as ovs-vswitchd.conf.db(8)).
208 *
209 * This function may be null if 'tc' is not configurable.
210 */
211 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
212
213 /* Reconfigures 'netdev->tc' according to 'details', performing any
214 * required Netlink calls to complete the reconfiguration.
215 *
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
219 *
220 * This function may be null if 'tc' is not configurable.
221 */
222 int (*qdisc_set)(struct netdev *, const struct shash *details);
223
93b13be8
BP
224 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
225 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
226 *
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "Queue" table in
229 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
230 *
231 * The implementation should not need to perform any Netlink calls, because
232 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
233 * cached the queue configuration.
234 *
235 * This function may be null if 'tc' does not have queues ('n_queues' is
236 * 0). */
93b13be8 237 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
238 struct shash *details);
239
240 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
241 * 'details', perfoming any required Netlink calls to complete the
242 * reconfiguration. The caller ensures that 'queue_id' is less than
243 * 'n_queues'.
244 *
245 * The contents of 'details' should be documented as valid for 'ovs_name'
246 * in the "other_config" column in the "Queue" table in
247 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
248 *
249 * This function may be null if 'tc' does not have queues or its queues are
250 * not configurable. */
251 int (*class_set)(struct netdev *, unsigned int queue_id,
252 const struct shash *details);
253
93b13be8
BP
254 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
255 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
256 *
257 * This function may be null if 'tc' does not have queues or its queues
258 * cannot be deleted. */
93b13be8 259 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 260
93b13be8
BP
261 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
262 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
263 *
264 * On success, initializes '*stats'.
265 *
266 * This function may be null if 'tc' does not have queues or if it cannot
267 * report queue statistics. */
93b13be8
BP
268 int (*class_get_stats)(const struct netdev *netdev,
269 const struct tc_queue *queue,
c1c9c9c4
BP
270 struct netdev_queue_stats *stats);
271
272 /* Extracts queue stats from 'nlmsg', which is a response to a
273 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
274 *
275 * This function may be null if 'tc' does not have queues or if it cannot
276 * report queue statistics. */
277 int (*class_dump_stats)(const struct netdev *netdev,
278 const struct ofpbuf *nlmsg,
279 netdev_dump_queue_stats_cb *cb, void *aux);
280};
281
282static void
283tc_init(struct tc *tc, const struct tc_ops *ops)
284{
285 tc->ops = ops;
93b13be8 286 hmap_init(&tc->queues);
c1c9c9c4
BP
287}
288
289static void
290tc_destroy(struct tc *tc)
291{
93b13be8 292 hmap_destroy(&tc->queues);
c1c9c9c4
BP
293}
294
295static const struct tc_ops tc_ops_htb;
a339aa81 296static const struct tc_ops tc_ops_hfsc;
c1c9c9c4
BP
297static const struct tc_ops tc_ops_default;
298static const struct tc_ops tc_ops_other;
299
300static const struct tc_ops *tcs[] = {
301 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 302 &tc_ops_hfsc, /* Hierarchical fair service curve. */
c1c9c9c4
BP
303 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
304 &tc_ops_other, /* Some other qdisc. */
305 NULL
306};
149f577a 307
c1c9c9c4
BP
/* Prototypes for the file-local traffic control helpers. */

static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
static unsigned int tc_get_major(unsigned int handle);
static unsigned int tc_get_minor(unsigned int handle);

static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
static unsigned int tc_buffer_per_jiffy(unsigned int rate);

static struct tcmsg *tc_make_request(const struct netdev *,
                                     int type, unsigned int flags,
                                     struct ofpbuf *);
static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);

static int tc_parse_qdisc(const struct ofpbuf *,
                          const char **kind, struct nlattr **options);
static int tc_parse_class(const struct ofpbuf *,
                          unsigned int *queue_id, struct nlattr **options,
                          struct netdev_queue_stats *);
static int tc_query_class(const struct netdev *,
                          unsigned int handle, unsigned int parent,
                          struct ofpbuf **replyp);
static int tc_delete_class(const struct netdev *, unsigned int handle);

static int tc_del_qdisc(struct netdev *netdev);
static int tc_query_qdisc(const struct netdev *netdev);

static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static void tc_put_rtab(struct ofpbuf *,
                        uint16_t type, const struct tc_ratespec *rate);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
338\f
149f577a
JG
339struct netdev_dev_linux {
340 struct netdev_dev netdev_dev;
341
8b61709d 342 struct shash_node *shash_node;
149f577a 343 unsigned int cache_valid;
8b61709d 344
8722022c
BP
345 /* The following are figured out "on demand" only. They are only valid
346 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d
BP
347 int ifindex;
348 uint8_t etheraddr[ETH_ADDR_LEN];
f1acd62b 349 struct in_addr address, netmask;
8b61709d
BP
350 struct in6_addr in6;
351 int mtu;
352 int carrier;
8722022c
BP
353 bool is_internal; /* Is this an openvswitch internal device? */
354 bool is_tap; /* Is this a tuntap device? */
80a86fbe
BP
355 uint32_t kbits_rate; /* Policing data. */
356 uint32_t kbits_burst;
7fbef77a 357 bool have_vport_stats;
c1c9c9c4 358 struct tc *tc;
149f577a
JG
359
360 union {
361 struct tap_state tap;
362 } state;
8b61709d
BP
363};
364
149f577a
JG
365struct netdev_linux {
366 struct netdev netdev;
5b7448ed 367 int fd;
149f577a 368};
8b61709d 369
8b61709d
BP
/* AF_INET datagram socket, used for ioctl operations.  Created by
 * netdev_linux_init(); -1 until then. */
static int af_inet_sock = -1;

/* Netlink routing socket with no multicast group subscriptions.  Created by
 * netdev_linux_init(). */
static struct nl_sock *rtnl_sock;
375
8b61709d
BP
376struct netdev_linux_notifier {
377 struct netdev_notifier notifier;
378 struct list node;
379};
380
381static struct shash netdev_linux_notifiers =
382 SHASH_INITIALIZER(&netdev_linux_notifiers);
46097491 383static struct rtnetlink_notifier netdev_linux_poll_notifier;
8b61709d
BP
384
385/* This is set pretty low because we probably won't learn anything from the
386 * additional log messages. */
387static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
388
15b3596a 389static int netdev_linux_init(void);
6f643e49 390
0b0544d7 391static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 392 int cmd, const char *cmd_name);
149f577a
JG
393static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
394 const char *cmd_name);
f1acd62b
BP
395static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
396 int cmd, const char *cmd_name);
8b61709d
BP
397static int get_flags(const struct netdev *, int *flagsp);
398static int set_flags(struct netdev *, int flags);
399static int do_get_ifindex(const char *netdev_name);
400static int get_ifindex(const struct netdev *, int *ifindexp);
401static int do_set_addr(struct netdev *netdev,
402 int ioctl_nr, const char *ioctl_name,
403 struct in_addr addr);
404static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
405static int set_etheraddr(const char *netdev_name, int hwaddr_family,
406 const uint8_t[ETH_ADDR_LEN]);
407static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
408static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
409
15b3596a
JG
410static bool
411is_netdev_linux_class(const struct netdev_class *netdev_class)
412{
413 return netdev_class->init == netdev_linux_init;
414}
415
149f577a
JG
416static struct netdev_dev_linux *
417netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
6c88d577 418{
15b3596a
JG
419 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
420 assert(is_netdev_linux_class(netdev_class));
421
149f577a 422 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
6c88d577
JP
423}
424
8b61709d
BP
425static struct netdev_linux *
426netdev_linux_cast(const struct netdev *netdev)
427{
15b3596a
JG
428 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
429 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
430 assert(is_netdev_linux_class(netdev_class));
431
8b61709d
BP
432 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
433}
ff4ed3c9 434\f
8b61709d
BP
435static int
436netdev_linux_init(void)
437{
438 static int status = -1;
439 if (status < 0) {
ff4ed3c9 440 /* Create AF_INET socket. */
8b61709d
BP
441 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
442 status = af_inet_sock >= 0 ? 0 : errno;
443 if (status) {
444 VLOG_ERR("failed to create inet socket: %s", strerror(status));
445 }
ff4ed3c9
BP
446
447 /* Create rtnetlink socket. */
448 if (!status) {
449 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
450 if (status) {
451 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
452 strerror(status));
453 }
454 }
8b61709d
BP
455 }
456 return status;
457}
458
/* Periodic work for this netdev provider: runs the rtnetlink link
 * notifier. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_notifier_run();
}
464
/* Wait counterpart of netdev_linux_run(): calls the rtnetlink link
 * notifier's wait function. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_notifier_wait();
}
470
471static void
21d6e22e 472netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
67a4917b 473 void *aux OVS_UNUSED)
8b61709d 474{
149f577a 475 struct netdev_dev_linux *dev;
8b61709d 476 if (change) {
46415c90
JG
477 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
478 if (base_dev) {
15b3596a
JG
479 const struct netdev_class *netdev_class =
480 netdev_dev_get_class(base_dev);
481
482 if (is_netdev_linux_class(netdev_class)) {
483 dev = netdev_dev_linux_cast(base_dev);
484 dev->cache_valid = 0;
485 }
8b61709d
BP
486 }
487 } else {
46415c90 488 struct shash device_shash;
8b61709d 489 struct shash_node *node;
46415c90
JG
490
491 shash_init(&device_shash);
492 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
493 SHASH_FOR_EACH (node, &device_shash) {
149f577a
JG
494 dev = node->data;
495 dev->cache_valid = 0;
8b61709d 496 }
46415c90 497 shash_destroy(&device_shash);
8b61709d
BP
498 }
499}
500
c3827f61 501/* Creates system and internal devices. */
8b61709d 502static int
c3827f61 503netdev_linux_create(const struct netdev_class *class,
b8dcf5e9
BP
504 const char *name, const struct shash *args,
505 struct netdev_dev **netdev_devp)
6c88d577 506{
149f577a
JG
507 struct netdev_dev_linux *netdev_dev;
508 int error;
6c88d577
JP
509
510 if (!shash_is_empty(args)) {
c3827f61
BP
511 VLOG_WARN("%s: arguments for %s devices should be empty",
512 name, class->type);
6c88d577
JP
513 }
514
46415c90 515 if (!cache_notifier_refcount) {
21d6e22e
EJ
516 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
517 netdev_linux_cache_cb, NULL);
149f577a
JG
518 if (error) {
519 return error;
520 }
521 }
46415c90 522 cache_notifier_refcount++;
6c88d577 523
149f577a 524 netdev_dev = xzalloc(sizeof *netdev_dev);
c3827f61 525 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
46415c90 526
149f577a 527 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
528 return 0;
529}
530
5b7448ed
JG
531/* For most types of netdevs we open the device for each call of
532 * netdev_open(). However, this is not the case with tap devices,
533 * since it is only possible to open the device once. In this
534 * situation we share a single file descriptor, and consequently
535 * buffers, across all readers. Therefore once data is read it will
536 * be unavailable to other reads for tap devices. */
a740f0de 537static int
b8dcf5e9
BP
538netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
539 const char *name, const struct shash *args,
540 struct netdev_dev **netdev_devp)
a740f0de 541{
149f577a 542 struct netdev_dev_linux *netdev_dev;
a740f0de
JG
543 struct tap_state *state;
544 static const char tap_dev[] = "/dev/net/tun";
545 struct ifreq ifr;
546 int error;
547
548 if (!shash_is_empty(args)) {
149f577a 549 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
6c88d577
JP
550 }
551
149f577a
JG
552 netdev_dev = xzalloc(sizeof *netdev_dev);
553 state = &netdev_dev->state.tap;
a740f0de 554
6c88d577 555 /* Open tap device. */
149f577a
JG
556 state->fd = open(tap_dev, O_RDWR);
557 if (state->fd < 0) {
6c88d577
JP
558 error = errno;
559 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
560 goto error;
561 }
562
563 /* Create tap device. */
564 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
565 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
149f577a 566 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
6c88d577
JP
567 VLOG_WARN("%s: creating tap device failed: %s", name,
568 strerror(errno));
569 error = errno;
570 goto error;
571 }
572
573 /* Make non-blocking. */
149f577a 574 error = set_nonblocking(state->fd);
a740f0de
JG
575 if (error) {
576 goto error;
577 }
578
149f577a
JG
579 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
580 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
581 return 0;
582
583error:
149f577a 584 free(netdev_dev);
a740f0de
JG
585 return error;
586}
587
a740f0de 588static void
149f577a 589destroy_tap(struct netdev_dev_linux *netdev_dev)
a740f0de 590{
149f577a
JG
591 struct tap_state *state = &netdev_dev->state.tap;
592
593 if (state->fd >= 0) {
594 close(state->fd);
a740f0de
JG
595 }
596}
597
149f577a 598/* Destroys the netdev device 'netdev_dev_'. */
6c88d577 599static void
149f577a 600netdev_linux_destroy(struct netdev_dev *netdev_dev_)
6c88d577 601{
149f577a 602 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
d2bb2799 603 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
6c88d577 604
c1c9c9c4
BP
605 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
606 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
607 }
608
d2bb2799 609 if (class == &netdev_linux_class || class == &netdev_internal_class) {
46415c90 610 cache_notifier_refcount--;
149f577a 611
46415c90 612 if (!cache_notifier_refcount) {
21d6e22e 613 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
149f577a 614 }
d2bb2799 615 } else if (class == &netdev_tap_class) {
149f577a 616 destroy_tap(netdev_dev);
d2bb2799
BP
617 } else {
618 NOT_REACHED();
6c88d577 619 }
149f577a 620
658797c8 621 free(netdev_dev);
6c88d577
JP
622}
623
8b61709d 624static int
5b7448ed 625netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
149f577a 626 struct netdev **netdevp)
8b61709d 627{
5b7448ed 628 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
8b61709d
BP
629 struct netdev_linux *netdev;
630 enum netdev_flags flags;
631 int error;
632
633 /* Allocate network device. */
ec6fde61 634 netdev = xzalloc(sizeof *netdev);
49a6a163 635 netdev->fd = -1;
5b7448ed 636 netdev_init(&netdev->netdev, netdev_dev_);
8b61709d 637
c3827f61
BP
638 /* Verify that the device really exists, by attempting to read its flags.
639 * (The flags might be cached, in which case this won't actually do an
640 * ioctl.)
641 *
642 * Don't do this for "internal" netdevs, though, because those have to be
643 * created as netdev objects before they exist in the kernel, because
644 * creating them in the kernel happens by passing a netdev object to
645 * dpif_port_add(). */
646 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
647 error = netdev_get_flags(&netdev->netdev, &flags);
648 if (error == ENODEV) {
649 goto error;
650 }
8b61709d
BP
651 }
652
61b999dd
JG
653 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
654 !netdev_dev->state.tap.opened) {
655
656 /* We assume that the first user of the tap device is the primary user
657 * and give them the tap FD. Subsequent users probably just expect
658 * this to be a system device so open it normally to avoid send/receive
659 * directions appearing to be reversed. */
5b7448ed 660 netdev->fd = netdev_dev->state.tap.fd;
61b999dd 661 netdev_dev->state.tap.opened = true;
5b7448ed 662 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
8b61709d
BP
663 struct sockaddr_ll sll;
664 int protocol;
665 int ifindex;
666
667 /* Create file descriptor. */
668 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
669 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
670 : ethertype);
5b7448ed
JG
671 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
672 if (netdev->fd < 0) {
8b61709d
BP
673 error = errno;
674 goto error;
675 }
8b61709d
BP
676
677 /* Set non-blocking mode. */
5b7448ed 678 error = set_nonblocking(netdev->fd);
8b61709d
BP
679 if (error) {
680 goto error;
681 }
682
683 /* Get ethernet device index. */
684 error = get_ifindex(&netdev->netdev, &ifindex);
685 if (error) {
686 goto error;
687 }
688
689 /* Bind to specific ethernet device. */
690 memset(&sll, 0, sizeof sll);
691 sll.sll_family = AF_PACKET;
692 sll.sll_ifindex = ifindex;
5b7448ed 693 if (bind(netdev->fd,
8b61709d
BP
694 (struct sockaddr *) &sll, sizeof sll) < 0) {
695 error = errno;
5b7448ed 696 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
149f577a 697 strerror(error));
8b61709d
BP
698 goto error;
699 }
700
701 /* Between the socket() and bind() calls above, the socket receives all
702 * packets of the requested type on all system interfaces. We do not
703 * want to receive that data, but there is no way to avoid it. So we
704 * must now drain out the receive queue. */
5b7448ed 705 error = drain_rcvbuf(netdev->fd);
8b61709d
BP
706 if (error) {
707 goto error;
708 }
709 }
710
711 *netdevp = &netdev->netdev;
712 return 0;
713
714error:
149f577a 715 netdev_uninit(&netdev->netdev, true);
8b61709d
BP
716 return error;
717}
718
719/* Closes and destroys 'netdev'. */
720static void
721netdev_linux_close(struct netdev *netdev_)
722{
723 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
724
49a6a163 725 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
5b7448ed 726 close(netdev->fd);
8b61709d
BP
727 }
728 free(netdev);
729}
e9e28be3 730
8b61709d
BP
/* Appends the names of all known network devices to 'svec'.
 *
 * Returns 0 if successful, otherwise the positive errno value reported by
 * if_nameindex(). */
static int
netdev_linux_enumerate(struct svec *svec)
{
    struct if_nameindex *names = if_nameindex();
    size_t i;

    if (!names) {
        /* Capture errno before calling anything else. */
        int error = errno;

        VLOG_WARN("could not obtain list of network device names: %s",
                  strerror(error));
        return error;
    }

    for (i = 0; names[i].if_name != NULL; i++) {
        svec_add(svec, names[i].if_name);
    }
    if_freenameindex(names);
    return 0;
}
752
753static int
754netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
755{
756 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
757
5b7448ed 758 if (netdev->fd < 0) {
8b61709d 759 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
c0e5f6ca 760 return -EAGAIN;
8b61709d
BP
761 }
762
763 for (;;) {
5b7448ed 764 ssize_t retval = read(netdev->fd, data, size);
8b61709d
BP
765 if (retval >= 0) {
766 return retval;
767 } else if (errno != EINTR) {
768 if (errno != EAGAIN) {
769 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
770 strerror(errno), netdev_get_name(netdev_));
771 }
c0e5f6ca 772 return -errno;
8b61709d
BP
773 }
774 }
775}
776
777/* Registers with the poll loop to wake up from the next call to poll_block()
778 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
779static void
780netdev_linux_recv_wait(struct netdev *netdev_)
781{
782 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed
JG
783 if (netdev->fd >= 0) {
784 poll_fd_wait(netdev->fd, POLLIN);
8b61709d
BP
785 }
786}
787
788/* Discards all packets waiting to be received from 'netdev'. */
789static int
790netdev_linux_drain(struct netdev *netdev_)
791{
792 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 793 if (netdev->fd < 0) {
8b61709d 794 return 0;
5b7448ed 795 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
8b61709d 796 struct ifreq ifr;
149f577a 797 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
8b61709d
BP
798 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
799 if (error) {
800 return error;
801 }
5b7448ed 802 drain_fd(netdev->fd, ifr.ifr_qlen);
8b61709d
BP
803 return 0;
804 } else {
5b7448ed 805 return drain_rcvbuf(netdev->fd);
8b61709d
BP
806 }
807}
808
809/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
810 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
811 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
812 * the packet is too big or too small to transmit on the device.
813 *
814 * The caller retains ownership of 'buffer' in all cases.
815 *
816 * The kernel maintains a packet transmission queue, so the caller is not
817 * expected to do additional queuing of packets. */
818static int
819netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
820{
821 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
822
823 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
824 */
5b7448ed 825 if (netdev->fd < 0) {
8b61709d
BP
826 return EPIPE;
827 }
828
829 for (;;) {
5b7448ed 830 ssize_t retval = write(netdev->fd, data, size);
8b61709d
BP
831 if (retval < 0) {
832 /* The Linux AF_PACKET implementation never blocks waiting for room
833 * for packets, instead returning ENOBUFS. Translate this into
834 * EAGAIN for the caller. */
835 if (errno == ENOBUFS) {
836 return EAGAIN;
837 } else if (errno == EINTR) {
838 continue;
839 } else if (errno != EAGAIN) {
840 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
841 netdev_get_name(netdev_), strerror(errno));
842 }
843 return errno;
844 } else if (retval != size) {
845 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
846 "%zu) on %s", retval, size, netdev_get_name(netdev_));
847 return EMSGSIZE;
848 } else {
849 return 0;
850 }
851 }
852}
853
854/* Registers with the poll loop to wake up from the next call to poll_block()
855 * when the packet transmission queue has sufficient room to transmit a packet
856 * with netdev_send().
857 *
858 * The kernel maintains a packet transmission queue, so the client is not
859 * expected to do additional queuing of packets. Thus, this function is
860 * unlikely to ever be used. It is included for completeness. */
861static void
862netdev_linux_send_wait(struct netdev *netdev_)
863{
864 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 865 if (netdev->fd < 0) {
8b61709d 866 /* Nothing to do. */
5b7448ed
JG
867 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
868 poll_fd_wait(netdev->fd, POLLOUT);
8b61709d
BP
869 } else {
870 /* TAP device always accepts packets.*/
871 poll_immediate_wake();
872 }
873}
874
875/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
876 * otherwise a positive errno value. */
877static int
878netdev_linux_set_etheraddr(struct netdev *netdev_,
879 const uint8_t mac[ETH_ADDR_LEN])
880{
149f577a
JG
881 struct netdev_dev_linux *netdev_dev =
882 netdev_dev_linux_cast(netdev_get_dev(netdev_));
eb395f2e
BP
883 int error;
884
149f577a
JG
885 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
886 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
eb395f2e
BP
887 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
888 if (!error) {
149f577a
JG
889 netdev_dev->cache_valid |= VALID_ETHERADDR;
890 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e
BP
891 }
892 } else {
893 error = 0;
8b61709d
BP
894 }
895 return error;
896}
897
898/* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
899 * free the returned buffer. */
900static int
901netdev_linux_get_etheraddr(const struct netdev *netdev_,
902 uint8_t mac[ETH_ADDR_LEN])
903{
149f577a
JG
904 struct netdev_dev_linux *netdev_dev =
905 netdev_dev_linux_cast(netdev_get_dev(netdev_));
906 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
8b61709d 907 int error = get_etheraddr(netdev_get_name(netdev_),
149f577a 908 netdev_dev->etheraddr);
8b61709d
BP
909 if (error) {
910 return error;
911 }
149f577a 912 netdev_dev->cache_valid |= VALID_ETHERADDR;
8b61709d 913 }
149f577a 914 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
8b61709d
BP
915 return 0;
916}
917
918/* Returns the maximum size of transmitted (and received) packets on 'netdev',
919 * in bytes, not including the hardware header; thus, this is typically 1500
920 * bytes for Ethernet devices. */
921static int
922netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
923{
149f577a
JG
924 struct netdev_dev_linux *netdev_dev =
925 netdev_dev_linux_cast(netdev_get_dev(netdev_));
926 if (!(netdev_dev->cache_valid & VALID_MTU)) {
8b61709d
BP
927 struct ifreq ifr;
928 int error;
929
149f577a
JG
930 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
931 SIOCGIFMTU, "SIOCGIFMTU");
8b61709d
BP
932 if (error) {
933 return error;
934 }
149f577a
JG
935 netdev_dev->mtu = ifr.ifr_mtu;
936 netdev_dev->cache_valid |= VALID_MTU;
8b61709d 937 }
149f577a 938 *mtup = netdev_dev->mtu;
8b61709d
BP
939 return 0;
940}
941
9ab3d9a3
BP
/* Looks up the ifindex of 'netdev' with get_ifindex().  On success, returns
 * the ifindex; on failure, returns the negated error value from
 * get_ifindex(). */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int retval = get_ifindex(netdev, &ifindex);

    if (retval) {
        return -retval;
    }
    return ifindex;
}
952
8b61709d
BP
953static int
954netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
955{
149f577a
JG
956 struct netdev_dev_linux *netdev_dev =
957 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
958 int error = 0;
959 char *fn = NULL;
960 int fd = -1;
961
149f577a 962 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
8b61709d
BP
963 char line[8];
964 int retval;
965
149f577a
JG
966 fn = xasprintf("/sys/class/net/%s/carrier",
967 netdev_get_name(netdev_));
8b61709d
BP
968 fd = open(fn, O_RDONLY);
969 if (fd < 0) {
970 error = errno;
971 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
972 goto exit;
973 }
974
975 retval = read(fd, line, sizeof line);
976 if (retval < 0) {
977 error = errno;
978 if (error == EINVAL) {
979 /* This is the normal return value when we try to check carrier
980 * if the network device is not up. */
981 } else {
982 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
983 }
984 goto exit;
985 } else if (retval == 0) {
986 error = EPROTO;
987 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
988 goto exit;
989 }
990
991 if (line[0] != '0' && line[0] != '1') {
992 error = EPROTO;
993 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
994 fn, line[0]);
995 goto exit;
996 }
149f577a
JG
997 netdev_dev->carrier = line[0] != '0';
998 netdev_dev->cache_valid |= VALID_CARRIER;
8b61709d 999 }
149f577a 1000 *carrier = netdev_dev->carrier;
8b61709d
BP
1001 error = 0;
1002
1003exit:
1004 if (fd >= 0) {
1005 close(fd);
1006 }
1007 free(fn);
1008 return error;
1009}
1010
63331829
EJ
1011static int
1012netdev_linux_get_miimon(const struct netdev *netdev_, bool *miimon)
1013{
1014 int error;
1015 struct ifreq ifr;
1016 const char *name = netdev_get_name(netdev_);
1017
1018 *miimon = false;
1019 memset(&ifr, 0, sizeof ifr);
1020
1021 error = netdev_linux_do_ioctl(name, &ifr, SIOCGMIIPHY, "SIOCGMIIPHY");
1022 if (!error) {
1023 struct mii_ioctl_data *data = (struct mii_ioctl_data *)&ifr.ifr_data;
1024
1025 /* data->phy_id is filled out by previous SIOCGMIIPHY ioctl call. */
1026 data->reg_num = MII_BMSR;
1027 error = netdev_linux_do_ioctl(name, &ifr, SIOCGMIIREG, "SIOCGMIIREG");
1028
1029 if (!error) {
1030 *miimon = !!(data->val_out & BMSR_LSTATUS);
1031 } else {
1032 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1033 }
1034 } else {
1035 struct ethtool_cmd ecmd;
1036 struct ethtool_value *eval = (struct ethtool_value *) &ecmd;
1037
1038 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1039 name);
1040
1041 memset(&ecmd, 0, sizeof ecmd);
1042 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1043 "ETHTOOL_GLINK");
1044 if (!error) {
1045 *miimon = !!eval->data;
1046 } else {
1047 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1048 }
1049 }
1050
1051 return error;
1052}
1053
8b61709d
BP
1054/* Check whether we can we use RTM_GETLINK to get network device statistics.
1055 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1056 * enabled. */
1057static bool
1058check_for_working_netlink_stats(void)
1059{
1060 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1061 * preferable, so if that works, we'll use it. */
1062 int ifindex = do_get_ifindex("lo");
1063 if (ifindex < 0) {
1064 VLOG_WARN("failed to get ifindex for lo, "
1065 "obtaining netdev stats from proc");
1066 return false;
1067 } else {
1068 struct netdev_stats stats;
1069 int error = get_stats_via_netlink(ifindex, &stats);
1070 if (!error) {
1071 VLOG_DBG("obtaining netdev stats via rtnetlink");
1072 return true;
1073 } else {
1074 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1075 "via proc (you are probably running a pre-2.6.19 "
1076 "kernel)", strerror(error));
1077 return false;
1078 }
1079 }
1080}
1081
8722022c
BP
1082/* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1083static void
1084netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1085{
1086 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1087 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1088 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
d295e8e9 1089
8722022c
BP
1090 netdev_dev->is_tap = !strcmp(type, "tap");
1091 netdev_dev->is_internal = false;
1092 if (!netdev_dev->is_tap) {
1093 struct ethtool_drvinfo drvinfo;
1094 int error;
1095
1096 memset(&drvinfo, 0, sizeof drvinfo);
1097 error = netdev_linux_do_ethtool(name,
1098 (struct ethtool_cmd *)&drvinfo,
1099 ETHTOOL_GDRVINFO,
1100 "ETHTOOL_GDRVINFO");
1101
1102 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1103 netdev_dev->is_internal = true;
1104 }
1105 }
1106
1107 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1108 }
1109}
1110
92df599c
JG
/* Exchanges the values stored in '*a' and '*b'.
 *
 * A temporary is used instead of the XOR trick so that the function is also
 * correct when 'a' and 'b' point to the same object; an XOR swap would zero
 * the value in that case. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;

    *a = *b;
    *b = tmp;
}
1118
7fbef77a 1119/* Retrieves current device stats for 'netdev'. */
8b61709d 1120static int
149f577a
JG
1121netdev_linux_get_stats(const struct netdev *netdev_,
1122 struct netdev_stats *stats)
8b61709d 1123{
149f577a
JG
1124 struct netdev_dev_linux *netdev_dev =
1125 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1126 static int use_netlink_stats = -1;
1127 int error;
1128
7fbef77a
JG
1129 if (netdev_dev->have_vport_stats ||
1130 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1131
1132 error = netdev_vport_get_stats(netdev_, stats);
1133 netdev_dev->have_vport_stats = !error;
1134 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
8b61709d 1135 }
8b61709d 1136
7fbef77a
JG
1137 if (!netdev_dev->have_vport_stats) {
1138 if (use_netlink_stats < 0) {
1139 use_netlink_stats = check_for_working_netlink_stats();
1140 }
1141 if (use_netlink_stats) {
1142 int ifindex;
1143
1144 error = get_ifindex(netdev_, &ifindex);
1145 if (!error) {
1146 error = get_stats_via_netlink(ifindex, stats);
1147 }
1148 } else {
1149 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
8b61709d 1150 }
8b61709d 1151 }
fe6b0e03
JG
1152
1153 /* If this port is an internal port then the transmit and receive stats
1154 * will appear to be swapped relative to the other ports since we are the
1155 * one sending the data, not a remote computer. For consistency, we swap
7fbef77a
JG
1156 * them back here. This does not apply if we are getting stats from the
1157 * vport layer because it always tracks stats from the perspective of the
1158 * switch. */
92df599c 1159 netdev_linux_update_is_pseudo(netdev_dev);
7fbef77a
JG
1160 if (!error && !netdev_dev->have_vport_stats &&
1161 (netdev_dev->is_internal || netdev_dev->is_tap)) {
92df599c
JG
1162 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1163 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1164 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1165 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1166 stats->rx_length_errors = 0;
1167 stats->rx_over_errors = 0;
1168 stats->rx_crc_errors = 0;
1169 stats->rx_frame_errors = 0;
1170 stats->rx_fifo_errors = 0;
1171 stats->rx_missed_errors = 0;
1172 stats->tx_aborted_errors = 0;
1173 stats->tx_carrier_errors = 0;
1174 stats->tx_fifo_errors = 0;
1175 stats->tx_heartbeat_errors = 0;
1176 stats->tx_window_errors = 0;
1177 }
1178
8b61709d
BP
1179 return error;
1180}
1181
1182/* Stores the features supported by 'netdev' into each of '*current',
1183 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1184 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
7671589a 1185 * successful, otherwise a positive errno value. */
8b61709d
BP
1186static int
1187netdev_linux_get_features(struct netdev *netdev,
1188 uint32_t *current, uint32_t *advertised,
1189 uint32_t *supported, uint32_t *peer)
1190{
1191 struct ethtool_cmd ecmd;
1192 int error;
1193
1194 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1195 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1196 ETHTOOL_GSET, "ETHTOOL_GSET");
1197 if (error) {
1198 return error;
1199 }
1200
1201 /* Supported features. */
1202 *supported = 0;
1203 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1204 *supported |= OFPPF_10MB_HD;
1205 }
1206 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1207 *supported |= OFPPF_10MB_FD;
1208 }
1209 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1210 *supported |= OFPPF_100MB_HD;
1211 }
1212 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1213 *supported |= OFPPF_100MB_FD;
1214 }
1215 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1216 *supported |= OFPPF_1GB_HD;
1217 }
1218 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1219 *supported |= OFPPF_1GB_FD;
1220 }
1221 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1222 *supported |= OFPPF_10GB_FD;
1223 }
1224 if (ecmd.supported & SUPPORTED_TP) {
1225 *supported |= OFPPF_COPPER;
1226 }
1227 if (ecmd.supported & SUPPORTED_FIBRE) {
1228 *supported |= OFPPF_FIBER;
1229 }
1230 if (ecmd.supported & SUPPORTED_Autoneg) {
1231 *supported |= OFPPF_AUTONEG;
1232 }
1233 if (ecmd.supported & SUPPORTED_Pause) {
1234 *supported |= OFPPF_PAUSE;
1235 }
1236 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1237 *supported |= OFPPF_PAUSE_ASYM;
1238 }
1239
1240 /* Advertised features. */
1241 *advertised = 0;
1242 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1243 *advertised |= OFPPF_10MB_HD;
1244 }
1245 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1246 *advertised |= OFPPF_10MB_FD;
1247 }
1248 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1249 *advertised |= OFPPF_100MB_HD;
1250 }
1251 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1252 *advertised |= OFPPF_100MB_FD;
1253 }
1254 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1255 *advertised |= OFPPF_1GB_HD;
1256 }
1257 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1258 *advertised |= OFPPF_1GB_FD;
1259 }
1260 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1261 *advertised |= OFPPF_10GB_FD;
1262 }
1263 if (ecmd.advertising & ADVERTISED_TP) {
1264 *advertised |= OFPPF_COPPER;
1265 }
1266 if (ecmd.advertising & ADVERTISED_FIBRE) {
1267 *advertised |= OFPPF_FIBER;
1268 }
1269 if (ecmd.advertising & ADVERTISED_Autoneg) {
1270 *advertised |= OFPPF_AUTONEG;
1271 }
1272 if (ecmd.advertising & ADVERTISED_Pause) {
1273 *advertised |= OFPPF_PAUSE;
1274 }
1275 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1276 *advertised |= OFPPF_PAUSE_ASYM;
1277 }
1278
1279 /* Current settings. */
1280 if (ecmd.speed == SPEED_10) {
1281 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1282 } else if (ecmd.speed == SPEED_100) {
1283 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1284 } else if (ecmd.speed == SPEED_1000) {
1285 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1286 } else if (ecmd.speed == SPEED_10000) {
1287 *current = OFPPF_10GB_FD;
1288 } else {
1289 *current = 0;
1290 }
1291
1292 if (ecmd.port == PORT_TP) {
1293 *current |= OFPPF_COPPER;
1294 } else if (ecmd.port == PORT_FIBRE) {
1295 *current |= OFPPF_FIBER;
1296 }
1297
1298 if (ecmd.autoneg) {
1299 *current |= OFPPF_AUTONEG;
1300 }
1301
1302 /* Peer advertisements. */
1303 *peer = 0; /* XXX */
1304
1305 return 0;
1306}
1307
1308/* Set the features advertised by 'netdev' to 'advertise'. */
1309static int
1310netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1311{
1312 struct ethtool_cmd ecmd;
1313 int error;
1314
1315 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1316 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1317 ETHTOOL_GSET, "ETHTOOL_GSET");
1318 if (error) {
1319 return error;
1320 }
1321
1322 ecmd.advertising = 0;
1323 if (advertise & OFPPF_10MB_HD) {
1324 ecmd.advertising |= ADVERTISED_10baseT_Half;
1325 }
1326 if (advertise & OFPPF_10MB_FD) {
1327 ecmd.advertising |= ADVERTISED_10baseT_Full;
1328 }
1329 if (advertise & OFPPF_100MB_HD) {
1330 ecmd.advertising |= ADVERTISED_100baseT_Half;
1331 }
1332 if (advertise & OFPPF_100MB_FD) {
1333 ecmd.advertising |= ADVERTISED_100baseT_Full;
1334 }
1335 if (advertise & OFPPF_1GB_HD) {
1336 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1337 }
1338 if (advertise & OFPPF_1GB_FD) {
1339 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1340 }
1341 if (advertise & OFPPF_10GB_FD) {
1342 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1343 }
1344 if (advertise & OFPPF_COPPER) {
1345 ecmd.advertising |= ADVERTISED_TP;
1346 }
1347 if (advertise & OFPPF_FIBER) {
1348 ecmd.advertising |= ADVERTISED_FIBRE;
1349 }
1350 if (advertise & OFPPF_AUTONEG) {
1351 ecmd.advertising |= ADVERTISED_Autoneg;
1352 }
1353 if (advertise & OFPPF_PAUSE) {
1354 ecmd.advertising |= ADVERTISED_Pause;
1355 }
1356 if (advertise & OFPPF_PAUSE_ASYM) {
1357 ecmd.advertising |= ADVERTISED_Asym_Pause;
1358 }
0b0544d7 1359 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1360 ETHTOOL_SSET, "ETHTOOL_SSET");
1361}
1362
1363/* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1364 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1365 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1366 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1367 * sets '*vlan_vid' to -1. */
1368static int
1369netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1370{
1371 const char *netdev_name = netdev_get_name(netdev);
1372 struct ds line = DS_EMPTY_INITIALIZER;
1373 FILE *stream = NULL;
1374 int error;
1375 char *fn;
1376
1377 COVERAGE_INC(netdev_get_vlan_vid);
1378 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1379 stream = fopen(fn, "r");
1380 if (!stream) {
1381 error = errno;
1382 goto done;
1383 }
1384
1385 if (ds_get_line(&line, stream)) {
1386 if (ferror(stream)) {
1387 error = errno;
1388 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1389 } else {
1390 error = EPROTO;
1391 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1392 }
1393 goto done;
1394 }
1395
1396 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1397 error = EPROTO;
1398 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1399 fn, ds_cstr(&line));
1400 goto done;
1401 }
1402
1403 error = 0;
1404
1405done:
1406 free(fn);
1407 if (stream) {
1408 fclose(stream);
1409 }
1410 ds_destroy(&line);
1411 if (error) {
1412 *vlan_vid = -1;
1413 }
1414 return error;
1415}
1416
1417#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1418#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
8b61709d 1419
8e460221 1420/* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
6f42c8ea
BP
1421 * positive errno value.
1422 *
1423 * This function is equivalent to running
1424 * /sbin/tc qdisc del dev %s handle ffff: ingress
1425 * but it is much, much faster.
1426 */
8e460221
BP
1427static int
1428netdev_linux_remove_policing(struct netdev *netdev)
1429{
80a86fbe
BP
1430 struct netdev_dev_linux *netdev_dev =
1431 netdev_dev_linux_cast(netdev_get_dev(netdev));
8e460221 1432 const char *netdev_name = netdev_get_name(netdev);
8e460221 1433
6f42c8ea 1434 struct ofpbuf request;
6f42c8ea 1435 struct tcmsg *tcmsg;
6f42c8ea
BP
1436 int error;
1437
c1c9c9c4 1438 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
1439 if (!tcmsg) {
1440 return ENODEV;
1441 }
c1c9c9c4 1442 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
6f42c8ea
BP
1443 tcmsg->tcm_parent = TC_H_INGRESS;
1444 nl_msg_put_string(&request, TCA_KIND, "ingress");
1445 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
c1c9c9c4
BP
1446
1447 error = tc_transact(&request, NULL);
4d10512c 1448 if (error && error != ENOENT && error != EINVAL) {
6f42c8ea
BP
1449 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1450 netdev_name, strerror(error));
1451 return error;
1452 }
1453
80a86fbe
BP
1454 netdev_dev->kbits_rate = 0;
1455 netdev_dev->kbits_burst = 0;
1456 netdev_dev->cache_valid |= VALID_POLICING;
8e460221
BP
1457 return 0;
1458}
1459
8b61709d
BP
1460/* Attempts to set input rate limiting (policing) policy. */
1461static int
1462netdev_linux_set_policing(struct netdev *netdev,
1463 uint32_t kbits_rate, uint32_t kbits_burst)
1464{
80a86fbe
BP
1465 struct netdev_dev_linux *netdev_dev =
1466 netdev_dev_linux_cast(netdev_get_dev(netdev));
8b61709d
BP
1467 const char *netdev_name = netdev_get_name(netdev);
1468 char command[1024];
1469
1470 COVERAGE_INC(netdev_set_policing);
8e460221 1471
80a86fbe
BP
1472 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1473 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1474 : kbits_burst); /* Stick with user-specified value. */
1475
1476 if (netdev_dev->cache_valid & VALID_POLICING
1477 && netdev_dev->kbits_rate == kbits_rate
1478 && netdev_dev->kbits_burst == kbits_burst) {
1479 /* Assume that settings haven't changed since we last set them. */
1480 return 0;
1481 }
1482
8e460221 1483 netdev_linux_remove_policing(netdev);
8b61709d 1484 if (kbits_rate) {
8b61709d
BP
1485 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1486 if (system(command) != 0) {
1487 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1488 return -1;
1489 }
1490
1491 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1492 kbits_rate, kbits_burst);
1493 if (system(command) != 0) {
1494 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1495 netdev_name);
1496 return -1;
1497 }
80a86fbe
BP
1498
1499 netdev_dev->kbits_rate = kbits_rate;
1500 netdev_dev->kbits_burst = kbits_burst;
1501 netdev_dev->cache_valid |= VALID_POLICING;
8b61709d
BP
1502 }
1503
1504 return 0;
1505}
1506
c1c9c9c4
BP
1507static int
1508netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1509 struct svec *types)
1510{
1511 const struct tc_ops **opsp;
1512
1513 for (opsp = tcs; *opsp != NULL; opsp++) {
1514 const struct tc_ops *ops = *opsp;
1515 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1516 svec_add(types, ops->ovs_name);
1517 }
1518 }
1519 return 0;
1520}
1521
1522static const struct tc_ops *
1523tc_lookup_ovs_name(const char *name)
1524{
1525 const struct tc_ops **opsp;
1526
1527 for (opsp = tcs; *opsp != NULL; opsp++) {
1528 const struct tc_ops *ops = *opsp;
1529 if (!strcmp(name, ops->ovs_name)) {
1530 return ops;
1531 }
1532 }
1533 return NULL;
1534}
1535
1536static const struct tc_ops *
1537tc_lookup_linux_name(const char *name)
1538{
1539 const struct tc_ops **opsp;
1540
1541 for (opsp = tcs; *opsp != NULL; opsp++) {
1542 const struct tc_ops *ops = *opsp;
1543 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1544 return ops;
1545 }
1546 }
1547 return NULL;
1548}
1549
93b13be8
BP
1550static struct tc_queue *
1551tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1552 size_t hash)
1553{
1554 struct netdev_dev_linux *netdev_dev =
1555 netdev_dev_linux_cast(netdev_get_dev(netdev));
1556 struct tc_queue *queue;
1557
4e8e4213 1558 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
93b13be8
BP
1559 if (queue->queue_id == queue_id) {
1560 return queue;
1561 }
1562 }
1563 return NULL;
1564}
1565
/* Returns the queue with the given 'queue_id' on 'netdev', or NULL if there
 * is none.  The bucket hash is computed as hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1571
c1c9c9c4
BP
1572static int
1573netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1574 const char *type,
1575 struct netdev_qos_capabilities *caps)
1576{
1577 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1578 if (!ops) {
1579 return EOPNOTSUPP;
1580 }
1581 caps->n_queues = ops->n_queues;
1582 return 0;
1583}
1584
1585static int
1586netdev_linux_get_qos(const struct netdev *netdev,
1587 const char **typep, struct shash *details)
1588{
1589 struct netdev_dev_linux *netdev_dev =
1590 netdev_dev_linux_cast(netdev_get_dev(netdev));
1591 int error;
1592
1593 error = tc_query_qdisc(netdev);
1594 if (error) {
1595 return error;
1596 }
1597
1598 *typep = netdev_dev->tc->ops->ovs_name;
1599 return (netdev_dev->tc->ops->qdisc_get
1600 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1601 : 0);
1602}
1603
1604static int
1605netdev_linux_set_qos(struct netdev *netdev,
1606 const char *type, const struct shash *details)
1607{
1608 struct netdev_dev_linux *netdev_dev =
1609 netdev_dev_linux_cast(netdev_get_dev(netdev));
1610 const struct tc_ops *new_ops;
1611 int error;
1612
1613 new_ops = tc_lookup_ovs_name(type);
1614 if (!new_ops || !new_ops->tc_install) {
1615 return EOPNOTSUPP;
1616 }
1617
1618 error = tc_query_qdisc(netdev);
1619 if (error) {
1620 return error;
1621 }
1622
1623 if (new_ops == netdev_dev->tc->ops) {
1624 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1625 } else {
1626 /* Delete existing qdisc. */
1627 error = tc_del_qdisc(netdev);
1628 if (error) {
1629 return error;
1630 }
1631 assert(netdev_dev->tc == NULL);
1632
1633 /* Install new qdisc. */
1634 error = new_ops->tc_install(netdev, details);
1635 assert((error == 0) == (netdev_dev->tc != NULL));
1636
1637 return error;
1638 }
1639}
1640
1641static int
1642netdev_linux_get_queue(const struct netdev *netdev,
1643 unsigned int queue_id, struct shash *details)
1644{
1645 struct netdev_dev_linux *netdev_dev =
1646 netdev_dev_linux_cast(netdev_get_dev(netdev));
1647 int error;
1648
1649 error = tc_query_qdisc(netdev);
1650 if (error) {
1651 return error;
93b13be8
BP
1652 } else {
1653 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1654 return (queue
1655 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1656 : ENOENT);
c1c9c9c4 1657 }
c1c9c9c4
BP
1658}
1659
1660static int
1661netdev_linux_set_queue(struct netdev *netdev,
1662 unsigned int queue_id, const struct shash *details)
1663{
1664 struct netdev_dev_linux *netdev_dev =
1665 netdev_dev_linux_cast(netdev_get_dev(netdev));
1666 int error;
1667
1668 error = tc_query_qdisc(netdev);
1669 if (error) {
1670 return error;
1671 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1672 || !netdev_dev->tc->ops->class_set) {
1673 return EINVAL;
1674 }
1675
1676 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1677}
1678
1679static int
1680netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1681{
1682 struct netdev_dev_linux *netdev_dev =
1683 netdev_dev_linux_cast(netdev_get_dev(netdev));
1684 int error;
1685
1686 error = tc_query_qdisc(netdev);
1687 if (error) {
1688 return error;
1689 } else if (!netdev_dev->tc->ops->class_delete) {
1690 return EINVAL;
93b13be8
BP
1691 } else {
1692 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1693 return (queue
1694 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1695 : ENOENT);
c1c9c9c4 1696 }
c1c9c9c4
BP
1697}
1698
1699static int
1700netdev_linux_get_queue_stats(const struct netdev *netdev,
1701 unsigned int queue_id,
1702 struct netdev_queue_stats *stats)
1703{
1704 struct netdev_dev_linux *netdev_dev =
1705 netdev_dev_linux_cast(netdev_get_dev(netdev));
1706 int error;
1707
1708 error = tc_query_qdisc(netdev);
1709 if (error) {
1710 return error;
c1c9c9c4
BP
1711 } else if (!netdev_dev->tc->ops->class_get_stats) {
1712 return EOPNOTSUPP;
93b13be8
BP
1713 } else {
1714 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1715 return (queue
1716 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1717 : ENOENT);
c1c9c9c4 1718 }
c1c9c9c4
BP
1719}
1720
23a98ffe 1721static bool
c1c9c9c4
BP
1722start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1723{
1724 struct ofpbuf request;
1725 struct tcmsg *tcmsg;
1726
1727 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
1728 if (!tcmsg) {
1729 return false;
1730 }
3c4de644 1731 tcmsg->tcm_parent = 0;
c1c9c9c4
BP
1732 nl_dump_start(dump, rtnl_sock, &request);
1733 ofpbuf_uninit(&request);
23a98ffe 1734 return true;
c1c9c9c4
BP
1735}
1736
1737static int
1738netdev_linux_dump_queues(const struct netdev *netdev,
1739 netdev_dump_queues_cb *cb, void *aux)
1740{
1741 struct netdev_dev_linux *netdev_dev =
1742 netdev_dev_linux_cast(netdev_get_dev(netdev));
93b13be8 1743 struct tc_queue *queue;
c1c9c9c4
BP
1744 struct shash details;
1745 int last_error;
c1c9c9c4
BP
1746 int error;
1747
1748 error = tc_query_qdisc(netdev);
1749 if (error) {
1750 return error;
1751 } else if (!netdev_dev->tc->ops->class_get) {
1752 return EOPNOTSUPP;
1753 }
1754
1755 last_error = 0;
1756 shash_init(&details);
4e8e4213 1757 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
c1c9c9c4
BP
1758 shash_clear(&details);
1759
93b13be8 1760 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
c1c9c9c4 1761 if (!error) {
93b13be8 1762 (*cb)(queue->queue_id, &details, aux);
c1c9c9c4
BP
1763 } else {
1764 last_error = error;
1765 }
1766 }
1767 shash_destroy(&details);
1768
1769 return last_error;
1770}
1771
1772static int
1773netdev_linux_dump_queue_stats(const struct netdev *netdev,
1774 netdev_dump_queue_stats_cb *cb, void *aux)
1775{
1776 struct netdev_dev_linux *netdev_dev =
1777 netdev_dev_linux_cast(netdev_get_dev(netdev));
1778 struct nl_dump dump;
1779 struct ofpbuf msg;
1780 int last_error;
1781 int error;
1782
1783 error = tc_query_qdisc(netdev);
1784 if (error) {
1785 return error;
1786 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1787 return EOPNOTSUPP;
1788 }
1789
1790 last_error = 0;
23a98ffe
BP
1791 if (!start_queue_dump(netdev, &dump)) {
1792 return ENODEV;
1793 }
c1c9c9c4
BP
1794 while (nl_dump_next(&dump, &msg)) {
1795 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1796 if (error) {
1797 last_error = error;
1798 }
1799 }
1800
1801 error = nl_dump_done(&dump);
1802 return error ? error : last_error;
1803}
1804
8b61709d 1805static int
f1acd62b
BP
1806netdev_linux_get_in4(const struct netdev *netdev_,
1807 struct in_addr *address, struct in_addr *netmask)
8b61709d 1808{
149f577a
JG
1809 struct netdev_dev_linux *netdev_dev =
1810 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1811
1812 if (!(netdev_dev->cache_valid & VALID_IN4)) {
8b61709d
BP
1813 int error;
1814
149f577a 1815 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
8b61709d
BP
1816 SIOCGIFADDR, "SIOCGIFADDR");
1817 if (error) {
1818 return error;
1819 }
1820
149f577a 1821 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
f1acd62b
BP
1822 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1823 if (error) {
1824 return error;
1825 }
1826
149f577a 1827 netdev_dev->cache_valid |= VALID_IN4;
8b61709d 1828 }
149f577a
JG
1829 *address = netdev_dev->address;
1830 *netmask = netdev_dev->netmask;
f1acd62b 1831 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
8b61709d
BP
1832}
1833
8b61709d 1834static int
f1acd62b
BP
1835netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1836 struct in_addr netmask)
8b61709d 1837{
149f577a
JG
1838 struct netdev_dev_linux *netdev_dev =
1839 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1840 int error;
1841
f1acd62b 1842 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 1843 if (!error) {
149f577a
JG
1844 netdev_dev->cache_valid |= VALID_IN4;
1845 netdev_dev->address = address;
1846 netdev_dev->netmask = netmask;
f1acd62b 1847 if (address.s_addr != INADDR_ANY) {
8b61709d 1848 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 1849 "SIOCSIFNETMASK", netmask);
8b61709d
BP
1850 }
1851 }
1852 return error;
1853}
1854
/* Parses 'line', in the format used by /proc/net/if_inet6: 32 hex digits of
 * IPv6 address, four hex fields that are ignored, and an interface name of
 * at most 16 characters.
 *
 * On success stores the address in '*in6' and the name in 'ifname', and
 * returns true.  Returns false if the line does not match, in which case the
 * outputs may have been partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *octets = in6->s6_addr;
    int n_fields;

    /* One address byte: exactly two hex digits. */
#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &octets[0], &octets[1], &octets[2], &octets[3],
                      &octets[4], &octets[5], &octets[6], &octets[7],
                      &octets[8], &octets[9], &octets[10], &octets[11],
                      &octets[12], &octets[13], &octets[14], &octets[15],
                      ifname);

    /* 16 address bytes plus the interface name. */
    return n_fields == 17;
}
1870
1871/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1872 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1873static int
1874netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1875{
149f577a
JG
1876 struct netdev_dev_linux *netdev_dev =
1877 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1878 if (!(netdev_dev->cache_valid & VALID_IN6)) {
8b61709d
BP
1879 FILE *file;
1880 char line[128];
1881
149f577a 1882 netdev_dev->in6 = in6addr_any;
8b61709d
BP
1883
1884 file = fopen("/proc/net/if_inet6", "r");
1885 if (file != NULL) {
1886 const char *name = netdev_get_name(netdev_);
1887 while (fgets(line, sizeof line, file)) {
2a022368 1888 struct in6_addr in6_tmp;
8b61709d 1889 char ifname[16 + 1];
2a022368 1890 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
1891 && !strcmp(name, ifname))
1892 {
2a022368 1893 netdev_dev->in6 = in6_tmp;
8b61709d
BP
1894 break;
1895 }
1896 }
1897 fclose(file);
1898 }
149f577a 1899 netdev_dev->cache_valid |= VALID_IN6;
8b61709d 1900 }
149f577a 1901 *in6 = netdev_dev->in6;
8b61709d
BP
1902 return 0;
1903}
1904
/* Initializes '*sa' as an AF_INET socket address with IPv4 address 'addr'
 * and port 0.  All other bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Build the IPv4 form in a local, then copy it over a zeroed '*sa'. */
    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
1917
1918static int
1919do_set_addr(struct netdev *netdev,
1920 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1921{
1922 struct ifreq ifr;
149f577a 1923 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
8b61709d 1924 make_in4_sockaddr(&ifr.ifr_addr, addr);
149f577a
JG
1925
1926 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1927 ioctl_name);
8b61709d
BP
1928}
1929
1930/* Adds 'router' as a default IP gateway. */
1931static int
67a4917b 1932netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
1933{
1934 struct in_addr any = { INADDR_ANY };
1935 struct rtentry rt;
1936 int error;
1937
1938 memset(&rt, 0, sizeof rt);
1939 make_in4_sockaddr(&rt.rt_dst, any);
1940 make_in4_sockaddr(&rt.rt_gateway, router);
1941 make_in4_sockaddr(&rt.rt_genmask, any);
1942 rt.rt_flags = RTF_UP | RTF_GATEWAY;
8b61709d
BP
1943 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1944 if (error) {
1945 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1946 }
1947 return error;
1948}
1949
f1acd62b
BP
1950static int
1951netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1952 char **netdev_name)
1953{
1954 static const char fn[] = "/proc/net/route";
1955 FILE *stream;
1956 char line[256];
1957 int ln;
1958
1959 *netdev_name = NULL;
1960 stream = fopen(fn, "r");
1961 if (stream == NULL) {
1962 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1963 return errno;
1964 }
1965
1966 ln = 0;
1967 while (fgets(line, sizeof line, stream)) {
1968 if (++ln >= 2) {
1969 char iface[17];
1970 uint32_t dest, gateway, mask;
1971 int refcnt, metric, mtu;
1972 unsigned int flags, use, window, irtt;
1973
1974 if (sscanf(line,
1975 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1976 " %d %u %u\n",
1977 iface, &dest, &gateway, &flags, &refcnt,
1978 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1979
d295e8e9 1980 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
1981 fn, ln, line);
1982 continue;
1983 }
1984 if (!(flags & RTF_UP)) {
1985 /* Skip routes that aren't up. */
1986 continue;
1987 }
1988
1989 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 1990 * network byte order, so we don't need need any endian
f1acd62b
BP
1991 * conversions here. */
1992 if ((dest & mask) == (host->s_addr & mask)) {
1993 if (!gateway) {
1994 /* The host is directly reachable. */
1995 next_hop->s_addr = 0;
1996 } else {
1997 /* To reach the host, we must go through a gateway. */
1998 next_hop->s_addr = gateway;
1999 }
2000 *netdev_name = xstrdup(iface);
2001 fclose(stream);
2002 return 0;
2003 }
2004 }
2005 }
2006
2007 fclose(stream);
2008 return ENXIO;
2009}
2010
8b61709d
BP
2011/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2012 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2013 * returns 0. Otherwise, it returns a positive errno value; in particular,
2014 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2015static int
2016netdev_linux_arp_lookup(const struct netdev *netdev,
2017 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
2018{
2019 struct arpreq r;
c100e025 2020 struct sockaddr_in sin;
8b61709d
BP
2021 int retval;
2022
2023 memset(&r, 0, sizeof r);
c100e025
BP
2024 sin.sin_family = AF_INET;
2025 sin.sin_addr.s_addr = ip;
2026 sin.sin_port = 0;
2027 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2028 r.arp_ha.sa_family = ARPHRD_ETHER;
2029 r.arp_flags = 0;
149f577a 2030 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d
BP
2031 COVERAGE_INC(netdev_arp_lookup);
2032 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2033 if (!retval) {
2034 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2035 } else if (retval != ENXIO) {
2036 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
149f577a 2037 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
8b61709d
BP
2038 }
2039 return retval;
2040}
2041
2042static int
2043nd_to_iff_flags(enum netdev_flags nd)
2044{
2045 int iff = 0;
2046 if (nd & NETDEV_UP) {
2047 iff |= IFF_UP;
2048 }
2049 if (nd & NETDEV_PROMISC) {
2050 iff |= IFF_PROMISC;
2051 }
2052 return iff;
2053}
2054
2055static int
2056iff_to_nd_flags(int iff)
2057{
2058 enum netdev_flags nd = 0;
2059 if (iff & IFF_UP) {
2060 nd |= NETDEV_UP;
2061 }
2062 if (iff & IFF_PROMISC) {
2063 nd |= NETDEV_PROMISC;
2064 }
2065 return nd;
2066}
2067
2068static int
2069netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2070 enum netdev_flags on, enum netdev_flags *old_flagsp)
2071{
2072 int old_flags, new_flags;
2073 int error;
2074
2075 error = get_flags(netdev, &old_flags);
2076 if (!error) {
2077 *old_flagsp = iff_to_nd_flags(old_flags);
2078 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2079 if (new_flags != old_flags) {
2080 error = set_flags(netdev, new_flags);
2081 }
2082 }
2083 return error;
2084}
2085
2086static void
2087poll_notify(struct list *list)
2088{
2089 struct netdev_linux_notifier *notifier;
4e8e4213 2090 LIST_FOR_EACH (notifier, node, list) {
8b61709d
BP
2091 struct netdev_notifier *n = &notifier->notifier;
2092 n->cb(n);
2093 }
2094}
2095
2096static void
21d6e22e 2097netdev_linux_poll_cb(const struct rtnetlink_link_change *change,
67a4917b 2098 void *aux OVS_UNUSED)
8b61709d
BP
2099{
2100 if (change) {
2101 struct list *list = shash_find_data(&netdev_linux_notifiers,
2102 change->ifname);
2103 if (list) {
2104 poll_notify(list);
2105 }
2106 } else {
2107 struct shash_node *node;
2108 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2109 poll_notify(node->data);
2110 }
2111 }
2112}
2113
2114static int
2115netdev_linux_poll_add(struct netdev *netdev,
2116 void (*cb)(struct netdev_notifier *), void *aux,
2117 struct netdev_notifier **notifierp)
2118{
2119 const char *netdev_name = netdev_get_name(netdev);
2120 struct netdev_linux_notifier *notifier;
2121 struct list *list;
2122
2123 if (shash_is_empty(&netdev_linux_notifiers)) {
21d6e22e
EJ
2124 int error;
2125 error = rtnetlink_link_notifier_register(&netdev_linux_poll_notifier,
2126 netdev_linux_poll_cb, NULL);
8b61709d
BP
2127 if (error) {
2128 return error;
2129 }
2130 }
2131
2132 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2133 if (!list) {
2134 list = xmalloc(sizeof *list);
2135 list_init(list);
2136 shash_add(&netdev_linux_notifiers, netdev_name, list);
2137 }
2138
2139 notifier = xmalloc(sizeof *notifier);
2140 netdev_notifier_init(&notifier->notifier, netdev, cb, aux);
2141 list_push_back(list, &notifier->node);
2142 *notifierp = &notifier->notifier;
2143 return 0;
2144}
2145
2146static void
2147netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2148{
2149 struct netdev_linux_notifier *notifier =
2150 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2151 struct list *list;
2152
2153 /* Remove 'notifier' from its list. */
2154 list = list_remove(&notifier->node);
2155 if (list_is_empty(list)) {
2156 /* The list is now empty. Remove it from the hash and free it. */
2157 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2158 shash_delete(&netdev_linux_notifiers,
2159 shash_find(&netdev_linux_notifiers, netdev_name));
2160 free(list);
2161 }
2162 free(notifier);
2163
2164 /* If that was the last notifier, unregister. */
2165 if (shash_is_empty(&netdev_linux_notifiers)) {
21d6e22e 2166 rtnetlink_link_notifier_unregister(&netdev_linux_poll_notifier);
8b61709d
BP
2167 }
2168}
2169
c3827f61
BP
/* Expands to a positional initializer for a netdev class.  The instances
 * built from it differ only in the four macro arguments:
 *
 *   NAME       first slot of the initializer.
 *   CREATE     slot listed before netdev_linux_destroy.
 *   ENUMERATE  slot listed between the open/close and recv groups.
 *   SET_STATS  slot listed right after netdev_linux_get_stats.
 *
 * Every other slot is filled with the shared netdev_linux_* function, or
 * NULL where indicated.  The slot order must not be changed. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS)  \
{                                                               \
    NAME,                                                       \
                                                                \
    /* init / run / wait */                                     \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    /* create / destroy / reconfigure */                        \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* reconfigure */               \
                                                                \
    /* open / close */                                          \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    ENUMERATE,                                                  \
                                                                \
    /* recv */                                                  \
    netdev_linux_recv,                                          \
    netdev_linux_recv_wait,                                     \
    netdev_linux_drain,                                         \
                                                                \
    /* send */                                                  \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    /* Ethernet address, MTU, ifindex, carrier, miimon, stats */ \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_miimon,                                    \
    netdev_linux_get_stats,                                     \
    SET_STATS,                                                  \
                                                                \
    /* features, advertisements, VLAN VID */                    \
    netdev_linux_get_features,                                  \
    netdev_linux_set_advertisements,                            \
    netdev_linux_get_vlan_vid,                                  \
                                                                \
    /* policing, QoS, queues */                                 \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    /* addresses, routing, ARP */                               \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    NULL,                       /* get_status */                \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    /* poll_add / poll_remove */                                \
    netdev_linux_poll_add,                                      \
    netdev_linux_poll_remove                                    \
}
2232
2233const struct netdev_class netdev_linux_class =
2234 NETDEV_LINUX_CLASS(
2235 "system",
2236 netdev_linux_create,
2237 netdev_linux_enumerate,
98563392 2238 NULL); /* set_stats */
c3827f61
BP
2239
2240const struct netdev_class netdev_tap_class =
2241 NETDEV_LINUX_CLASS(
2242 "tap",
2243 netdev_linux_create_tap,
2244 NULL, /* enumerate */
2245 NULL); /* set_stats */
2246
2247const struct netdev_class netdev_internal_class =
2248 NETDEV_LINUX_CLASS(
2249 "internal",
2250 netdev_linux_create,
2251 NULL, /* enumerate */
2252 netdev_vport_set_stats);
8b61709d 2253\f
c1c9c9c4 2254/* HTB traffic control class. */
559843ed 2255
c1c9c9c4 2256#define HTB_N_QUEUES 0xf000
8b61709d 2257
c1c9c9c4
BP
2258struct htb {
2259 struct tc tc;
2260 unsigned int max_rate; /* In bytes/s. */
2261};
8b61709d 2262
c1c9c9c4 2263struct htb_class {
93b13be8 2264 struct tc_queue tc_queue;
c1c9c9c4
BP
2265 unsigned int min_rate; /* In bytes/s. */
2266 unsigned int max_rate; /* In bytes/s. */
2267 unsigned int burst; /* In bytes. */
2268 unsigned int priority; /* Lower values are higher priorities. */
2269};
8b61709d 2270
c1c9c9c4
BP
2271static struct htb *
2272htb_get__(const struct netdev *netdev)
2273{
2274 struct netdev_dev_linux *netdev_dev =
2275 netdev_dev_linux_cast(netdev_get_dev(netdev));
2276 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2277}
2278
2279static struct htb *
2280htb_install__(struct netdev *netdev, uint64_t max_rate)
2281{
2282 struct netdev_dev_linux *netdev_dev =
2283 netdev_dev_linux_cast(netdev_get_dev(netdev));
2284 struct htb *htb;
2285
2286 htb = xmalloc(sizeof *htb);
2287 tc_init(&htb->tc, &tc_ops_htb);
2288 htb->max_rate = max_rate;
2289
2290 netdev_dev->tc = &htb->tc;
2291
2292 return htb;
2293}
2294
2295/* Create an HTB qdisc.
2296 *
a339aa81 2297 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
2298static int
2299htb_setup_qdisc__(struct netdev *netdev)
2300{
2301 size_t opt_offset;
2302 struct tc_htb_glob opt;
2303 struct ofpbuf request;
2304 struct tcmsg *tcmsg;
2305
2306 tc_del_qdisc(netdev);
2307
2308 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2309 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
2310 if (!tcmsg) {
2311 return ENODEV;
2312 }
c1c9c9c4
BP
2313 tcmsg->tcm_handle = tc_make_handle(1, 0);
2314 tcmsg->tcm_parent = TC_H_ROOT;
2315
2316 nl_msg_put_string(&request, TCA_KIND, "htb");
2317
2318 memset(&opt, 0, sizeof opt);
2319 opt.rate2quantum = 10;
2320 opt.version = 3;
4ecf12d5 2321 opt.defcls = 1;
c1c9c9c4
BP
2322
2323 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2324 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2325 nl_msg_end_nested(&request, opt_offset);
2326
2327 return tc_transact(&request, NULL);
2328}
2329
2330/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2331 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2332static int
2333htb_setup_class__(struct netdev *netdev, unsigned int handle,
2334 unsigned int parent, struct htb_class *class)
2335{
2336 size_t opt_offset;
2337 struct tc_htb_opt opt;
2338 struct ofpbuf request;
2339 struct tcmsg *tcmsg;
2340 int error;
2341 int mtu;
2342
2343 netdev_get_mtu(netdev, &mtu);
2344
2345 memset(&opt, 0, sizeof opt);
2346 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2347 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2348 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2349 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2350 opt.prio = class->priority;
2351
2352 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
2353 if (!tcmsg) {
2354 return ENODEV;
2355 }
c1c9c9c4
BP
2356 tcmsg->tcm_handle = handle;
2357 tcmsg->tcm_parent = parent;
2358
2359 nl_msg_put_string(&request, TCA_KIND, "htb");
2360 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2361 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2362 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2363 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2364 nl_msg_end_nested(&request, opt_offset);
2365
2366 error = tc_transact(&request, NULL);
2367 if (error) {
2368 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2369 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2370 netdev_get_name(netdev),
2371 tc_get_major(handle), tc_get_minor(handle),
2372 tc_get_major(parent), tc_get_minor(parent),
2373 class->min_rate, class->max_rate,
2374 class->burst, class->priority, strerror(error));
2375 }
2376 return error;
2377}
2378
2379/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2380 * description of them into 'details'. The description complies with the
2381 * specification given in the vswitch database documentation for linux-htb
2382 * queue details. */
2383static int
2384htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2385{
2386 static const struct nl_policy tca_htb_policy[] = {
2387 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2388 .min_len = sizeof(struct tc_htb_opt) },
2389 };
2390
2391 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2392 const struct tc_htb_opt *htb;
2393
2394 if (!nl_parse_nested(nl_options, tca_htb_policy,
2395 attrs, ARRAY_SIZE(tca_htb_policy))) {
2396 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2397 return EPROTO;
2398 }
2399
2400 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2401 class->min_rate = htb->rate.rate;
2402 class->max_rate = htb->ceil.rate;
2403 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2404 class->priority = htb->prio;
2405 return 0;
2406}
2407
2408static int
2409htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2410 struct htb_class *options,
2411 struct netdev_queue_stats *stats)
2412{
2413 struct nlattr *nl_options;
2414 unsigned int handle;
2415 int error;
2416
2417 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2418 if (!error && queue_id) {
17ee3c1f
BP
2419 unsigned int major = tc_get_major(handle);
2420 unsigned int minor = tc_get_minor(handle);
2421 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2422 *queue_id = minor - 1;
c1c9c9c4
BP
2423 } else {
2424 error = EPROTO;
2425 }
2426 }
2427 if (!error && options) {
2428 error = htb_parse_tca_options__(nl_options, options);
2429 }
2430 return error;
2431}
2432
2433static void
2434htb_parse_qdisc_details__(struct netdev *netdev,
2435 const struct shash *details, struct htb_class *hc)
2436{
2437 const char *max_rate_s;
2438
2439 max_rate_s = shash_find_data(details, "max-rate");
2440 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2441 if (!hc->max_rate) {
2442 uint32_t current;
2443
2444 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2445 hc->max_rate = netdev_features_to_bps(current) / 8;
2446 }
2447 hc->min_rate = hc->max_rate;
2448 hc->burst = 0;
2449 hc->priority = 0;
2450}
2451
2452static int
2453htb_parse_class_details__(struct netdev *netdev,
2454 const struct shash *details, struct htb_class *hc)
2455{
2456 const struct htb *htb = htb_get__(netdev);
2457 const char *min_rate_s = shash_find_data(details, "min-rate");
2458 const char *max_rate_s = shash_find_data(details, "max-rate");
2459 const char *burst_s = shash_find_data(details, "burst");
2460 const char *priority_s = shash_find_data(details, "priority");
2461 int mtu;
2462
da3827b5 2463 /* min-rate. Don't allow a min-rate below 1500 bytes/s. */
c1c9c9c4
BP
2464 if (!min_rate_s) {
2465 /* min-rate is required. */
2466 return EINVAL;
2467 }
2468 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
da3827b5 2469 hc->min_rate = MAX(hc->min_rate, 1500);
c1c9c9c4
BP
2470 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2471
2472 /* max-rate */
2473 hc->max_rate = (max_rate_s
2474 ? strtoull(max_rate_s, NULL, 10) / 8
2475 : htb->max_rate);
2476 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2477 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2478
2479 /* burst
2480 *
2481 * According to hints in the documentation that I've read, it is important
2482 * that 'burst' be at least as big as the largest frame that might be
2483 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2484 * but having it a bit too small is a problem. Since netdev_get_mtu()
2485 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2486 * the MTU. We actually add 64, instead of 14, as a guard against
2487 * additional headers get tacked on somewhere that we're not aware of. */
2488 netdev_get_mtu(netdev, &mtu);
2489 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2490 hc->burst = MAX(hc->burst, mtu + 64);
2491
2492 /* priority */
2493 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2494
2495 return 0;
2496}
2497
2498static int
2499htb_query_class__(const struct netdev *netdev, unsigned int handle,
2500 unsigned int parent, struct htb_class *options,
2501 struct netdev_queue_stats *stats)
2502{
2503 struct ofpbuf *reply;
2504 int error;
2505
2506 error = tc_query_class(netdev, handle, parent, &reply);
2507 if (!error) {
2508 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2509 ofpbuf_delete(reply);
2510 }
2511 return error;
2512}
2513
2514static int
2515htb_tc_install(struct netdev *netdev, const struct shash *details)
2516{
2517 int error;
2518
2519 error = htb_setup_qdisc__(netdev);
2520 if (!error) {
2521 struct htb_class hc;
2522
2523 htb_parse_qdisc_details__(netdev, details, &hc);
2524 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2525 tc_make_handle(1, 0), &hc);
2526 if (!error) {
2527 htb_install__(netdev, hc.max_rate);
2528 }
2529 }
2530 return error;
2531}
2532
93b13be8
BP
2533static struct htb_class *
2534htb_class_cast__(const struct tc_queue *queue)
2535{
2536 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2537}
2538
c1c9c9c4
BP
2539static void
2540htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2541 const struct htb_class *hc)
2542{
2543 struct htb *htb = htb_get__(netdev);
93b13be8
BP
2544 size_t hash = hash_int(queue_id, 0);
2545 struct tc_queue *queue;
c1c9c9c4
BP
2546 struct htb_class *hcp;
2547
93b13be8
BP
2548 queue = tc_find_queue__(netdev, queue_id, hash);
2549 if (queue) {
2550 hcp = htb_class_cast__(queue);
2551 } else {
c1c9c9c4 2552 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
2553 queue = &hcp->tc_queue;
2554 queue->queue_id = queue_id;
2555 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 2556 }
93b13be8
BP
2557
2558 hcp->min_rate = hc->min_rate;
2559 hcp->max_rate = hc->max_rate;
2560 hcp->burst = hc->burst;
2561 hcp->priority = hc->priority;
c1c9c9c4
BP
2562}
2563
2564static int
2565htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2566{
c1c9c9c4
BP
2567 struct ofpbuf msg;
2568 struct nl_dump dump;
2569 struct htb_class hc;
2570 struct htb *htb;
2571
2572 /* Get qdisc options. */
2573 hc.max_rate = 0;
2574 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2575 htb = htb_install__(netdev, hc.max_rate);
2576
2577 /* Get queues. */
23a98ffe
BP
2578 if (!start_queue_dump(netdev, &dump)) {
2579 return ENODEV;
2580 }
c1c9c9c4
BP
2581 while (nl_dump_next(&dump, &msg)) {
2582 unsigned int queue_id;
2583
2584 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2585 htb_update_queue__(netdev, queue_id, &hc);
2586 }
2587 }
2588 nl_dump_done(&dump);
2589
2590 return 0;
2591}
2592
2593static void
2594htb_tc_destroy(struct tc *tc)
2595{
2596 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 2597 struct htb_class *hc, *next;
c1c9c9c4 2598
4e8e4213 2599 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 2600 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
2601 free(hc);
2602 }
2603 tc_destroy(tc);
2604 free(htb);
2605}
2606
2607static int
2608htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2609{
2610 const struct htb *htb = htb_get__(netdev);
2611 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2612 return 0;
2613}
2614
2615static int
2616htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2617{
2618 struct htb_class hc;
2619 int error;
2620
2621 htb_parse_qdisc_details__(netdev, details, &hc);
2622 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2623 tc_make_handle(1, 0), &hc);
2624 if (!error) {
2625 htb_get__(netdev)->max_rate = hc.max_rate;
2626 }
2627 return error;
2628}
2629
2630static int
93b13be8
BP
2631htb_class_get(const struct netdev *netdev OVS_UNUSED,
2632 const struct tc_queue *queue, struct shash *details)
c1c9c9c4 2633{
93b13be8 2634 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4
BP
2635
2636 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2637 if (hc->min_rate != hc->max_rate) {
2638 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2639 }
2640 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2641 if (hc->priority) {
2642 shash_add(details, "priority", xasprintf("%u", hc->priority));
2643 }
2644 return 0;
2645}
2646
2647static int
2648htb_class_set(struct netdev *netdev, unsigned int queue_id,
2649 const struct shash *details)
2650{
2651 struct htb_class hc;
2652 int error;
2653
2654 error = htb_parse_class_details__(netdev, details, &hc);
2655 if (error) {
2656 return error;
2657 }
2658
17ee3c1f 2659 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
2660 tc_make_handle(1, 0xfffe), &hc);
2661 if (error) {
2662 return error;
2663 }
2664
2665 htb_update_queue__(netdev, queue_id, &hc);
2666 return 0;
2667}
2668
2669static int
93b13be8 2670htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 2671{
93b13be8 2672 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2673 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
2674 int error;
2675
93b13be8 2676 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 2677 if (!error) {
93b13be8 2678 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 2679 free(hc);
c1c9c9c4
BP
2680 }
2681 return error;
2682}
2683
2684static int
93b13be8 2685htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
2686 struct netdev_queue_stats *stats)
2687{
93b13be8 2688 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
2689 tc_make_handle(1, 0xfffe), NULL, stats);
2690}
2691
2692static int
2693htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2694 const struct ofpbuf *nlmsg,
2695 netdev_dump_queue_stats_cb *cb, void *aux)
2696{
2697 struct netdev_queue_stats stats;
17ee3c1f 2698 unsigned int handle, major, minor;
c1c9c9c4
BP
2699 int error;
2700
2701 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2702 if (error) {
2703 return error;
2704 }
2705
17ee3c1f
BP
2706 major = tc_get_major(handle);
2707 minor = tc_get_minor(handle);
2708 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 2709 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
2710 }
2711 return 0;
2712}
2713
2714static const struct tc_ops tc_ops_htb = {
2715 "htb", /* linux_name */
2716 "linux-htb", /* ovs_name */
2717 HTB_N_QUEUES, /* n_queues */
2718 htb_tc_install,
2719 htb_tc_load,
2720 htb_tc_destroy,
2721 htb_qdisc_get,
2722 htb_qdisc_set,
2723 htb_class_get,
2724 htb_class_set,
2725 htb_class_delete,
2726 htb_class_get_stats,
2727 htb_class_dump_stats
2728};
2729\f
a339aa81
EJ
2730/* "linux-hfsc" traffic control class. */
2731
2732#define HFSC_N_QUEUES 0xf000
2733
2734struct hfsc {
2735 struct tc tc;
2736 uint32_t max_rate;
2737};
2738
2739struct hfsc_class {
2740 struct tc_queue tc_queue;
2741 uint32_t min_rate;
2742 uint32_t max_rate;
2743};
2744
2745static struct hfsc *
2746hfsc_get__(const struct netdev *netdev)
2747{
2748 struct netdev_dev_linux *netdev_dev;
2749 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2750 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2751}
2752
2753static struct hfsc_class *
2754hfsc_class_cast__(const struct tc_queue *queue)
2755{
2756 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2757}
2758
2759static struct hfsc *
2760hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2761{
2762 struct netdev_dev_linux * netdev_dev;
2763 struct hfsc *hfsc;
2764
2765 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2766 hfsc = xmalloc(sizeof *hfsc);
2767 tc_init(&hfsc->tc, &tc_ops_hfsc);
2768 hfsc->max_rate = max_rate;
2769 netdev_dev->tc = &hfsc->tc;
2770
2771 return hfsc;
2772}
2773
2774static void
2775hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2776 const struct hfsc_class *hc)
2777{
2778 size_t hash;
2779 struct hfsc *hfsc;
2780 struct hfsc_class *hcp;
2781 struct tc_queue *queue;
2782
2783 hfsc = hfsc_get__(netdev);
2784 hash = hash_int(queue_id, 0);
2785
2786 queue = tc_find_queue__(netdev, queue_id, hash);
2787 if (queue) {
2788 hcp = hfsc_class_cast__(queue);
2789 } else {
2790 hcp = xmalloc(sizeof *hcp);
2791 queue = &hcp->tc_queue;
2792 queue->queue_id = queue_id;
2793 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2794 }
2795
2796 hcp->min_rate = hc->min_rate;
2797 hcp->max_rate = hc->max_rate;
2798}
2799
2800static int
2801hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2802{
2803 const struct tc_service_curve *rsc, *fsc, *usc;
2804 static const struct nl_policy tca_hfsc_policy[] = {
2805 [TCA_HFSC_RSC] = {
2806 .type = NL_A_UNSPEC,
2807 .optional = false,
2808 .min_len = sizeof(struct tc_service_curve),
2809 },
2810 [TCA_HFSC_FSC] = {
2811 .type = NL_A_UNSPEC,
2812 .optional = false,
2813 .min_len = sizeof(struct tc_service_curve),
2814 },
2815 [TCA_HFSC_USC] = {
2816 .type = NL_A_UNSPEC,
2817 .optional = false,
2818 .min_len = sizeof(struct tc_service_curve),
2819 },
2820 };
2821 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2822
2823 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2824 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2825 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2826 return EPROTO;
2827 }
2828
2829 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2830 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2831 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2832
2833 if (rsc->m1 != 0 || rsc->d != 0 ||
2834 fsc->m1 != 0 || fsc->d != 0 ||
2835 usc->m1 != 0 || usc->d != 0) {
2836 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2837 "Non-linear service curves are not supported.");
2838 return EPROTO;
2839 }
2840
2841 if (rsc->m2 != fsc->m2) {
2842 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2843 "Real-time service curves are not supported ");
2844 return EPROTO;
2845 }
2846
2847 if (rsc->m2 > usc->m2) {
2848 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2849 "Min-rate service curve is greater than "
2850 "the max-rate service curve.");
2851 return EPROTO;
2852 }
2853
2854 class->min_rate = fsc->m2;
2855 class->max_rate = usc->m2;
2856 return 0;
2857}
2858
2859static int
2860hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2861 struct hfsc_class *options,
2862 struct netdev_queue_stats *stats)
2863{
2864 int error;
2865 unsigned int handle;
2866 struct nlattr *nl_options;
2867
2868 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2869 if (error) {
2870 return error;
2871 }
2872
2873 if (queue_id) {
2874 unsigned int major, minor;
2875
2876 major = tc_get_major(handle);
2877 minor = tc_get_minor(handle);
2878 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2879 *queue_id = minor - 1;
2880 } else {
2881 return EPROTO;
2882 }
2883 }
2884
2885 if (options) {
2886 error = hfsc_parse_tca_options__(nl_options, options);
2887 }
2888
2889 return error;
2890}
2891
2892static int
2893hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2894 unsigned int parent, struct hfsc_class *options,
2895 struct netdev_queue_stats *stats)
2896{
2897 int error;
2898 struct ofpbuf *reply;
2899
2900 error = tc_query_class(netdev, handle, parent, &reply);
2901 if (error) {
2902 return error;
2903 }
2904
2905 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2906 ofpbuf_delete(reply);
2907 return error;
2908}
2909
2910static void
2911hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2912 struct hfsc_class *class)
2913{
2914 uint32_t max_rate;
2915 const char *max_rate_s;
2916
2917 max_rate_s = shash_find_data(details, "max-rate");
2918 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2919
2920 if (!max_rate) {
2921 uint32_t current;
2922
2923 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2924 max_rate = netdev_features_to_bps(current) / 8;
2925 }
2926
2927 class->min_rate = max_rate;
2928 class->max_rate = max_rate;
2929}
2930
2931static int
2932hfsc_parse_class_details__(struct netdev *netdev,
2933 const struct shash *details,
2934 struct hfsc_class * class)
2935{
2936 const struct hfsc *hfsc;
2937 uint32_t min_rate, max_rate;
2938 const char *min_rate_s, *max_rate_s;
2939
2940 hfsc = hfsc_get__(netdev);
2941 min_rate_s = shash_find_data(details, "min-rate");
2942 max_rate_s = shash_find_data(details, "max-rate");
2943
2944 if (!min_rate_s) {
2945 return EINVAL;
2946 }
2947
2948 min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2949 min_rate = MAX(min_rate, 1500);
2950 min_rate = MIN(min_rate, hfsc->max_rate);
2951
2952 max_rate = (max_rate_s
2953 ? strtoull(max_rate_s, NULL, 10) / 8
2954 : hfsc->max_rate);
2955 max_rate = MAX(max_rate, min_rate);
2956 max_rate = MIN(max_rate, hfsc->max_rate);
2957
2958 class->min_rate = min_rate;
2959 class->max_rate = max_rate;
2960
2961 return 0;
2962}
2963
2964/* Create an HFSC qdisc.
2965 *
2966 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
2967static int
2968hfsc_setup_qdisc__(struct netdev * netdev)
2969{
2970 struct tcmsg *tcmsg;
2971 struct ofpbuf request;
2972 struct tc_hfsc_qopt opt;
2973
2974 tc_del_qdisc(netdev);
2975
2976 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2977 NLM_F_EXCL | NLM_F_CREATE, &request);
2978
2979 if (!tcmsg) {
2980 return ENODEV;
2981 }
2982
2983 tcmsg->tcm_handle = tc_make_handle(1, 0);
2984 tcmsg->tcm_parent = TC_H_ROOT;
2985
2986 memset(&opt, 0, sizeof opt);
2987 opt.defcls = 1;
2988
2989 nl_msg_put_string(&request, TCA_KIND, "hfsc");
2990 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
2991
2992 return tc_transact(&request, NULL);
2993}
2994
2995/* Create an HFSC class.
2996 *
2997 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
2998 * sc rate <min_rate> ul rate <max_rate>" */
2999static int
3000hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3001 unsigned int parent, struct hfsc_class *class)
3002{
3003 int error;
3004 size_t opt_offset;
3005 struct tcmsg *tcmsg;
3006 struct ofpbuf request;
3007 struct tc_service_curve min, max;
3008
3009 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3010
3011 if (!tcmsg) {
3012 return ENODEV;
3013 }
3014
3015 tcmsg->tcm_handle = handle;
3016 tcmsg->tcm_parent = parent;
3017
3018 min.m1 = 0;
3019 min.d = 0;
3020 min.m2 = class->min_rate;
3021
3022 max.m1 = 0;
3023 max.d = 0;
3024 max.m2 = class->max_rate;
3025
3026 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3027 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3028 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3029 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3030 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3031 nl_msg_end_nested(&request, opt_offset);
3032
3033 error = tc_transact(&request, NULL);
3034 if (error) {
3035 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3036 "min-rate %ubps, max-rate %ubps (%s)",
3037 netdev_get_name(netdev),
3038 tc_get_major(handle), tc_get_minor(handle),
3039 tc_get_major(parent), tc_get_minor(parent),
3040 class->min_rate, class->max_rate, strerror(error));
3041 }
3042
3043 return error;
3044}
3045
3046static int
3047hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3048{
3049 int error;
3050 struct hfsc_class class;
3051
3052 error = hfsc_setup_qdisc__(netdev);
3053
3054 if (error) {
3055 return error;
3056 }
3057
3058 hfsc_parse_qdisc_details__(netdev, details, &class);
3059 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3060 tc_make_handle(1, 0), &class);
3061
3062 if (error) {
3063 return error;
3064 }
3065
3066 hfsc_install__(netdev, class.max_rate);
3067 return 0;
3068}
3069
3070static int
3071hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3072{
3073 struct ofpbuf msg;
3074 struct hfsc *hfsc;
3075 struct nl_dump dump;
3076 struct hfsc_class hc;
3077
3078 hc.max_rate = 0;
3079 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3080 hfsc = hfsc_install__(netdev, hc.max_rate);
3081
3082 if (!start_queue_dump(netdev, &dump)) {
3083 return ENODEV;
3084 }
3085
3086 while (nl_dump_next(&dump, &msg)) {
3087 unsigned int queue_id;
3088
3089 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3090 hfsc_update_queue__(netdev, queue_id, &hc);
3091 }
3092 }
3093
3094 nl_dump_done(&dump);
3095 return 0;
3096}
3097
3098static void
3099hfsc_tc_destroy(struct tc *tc)
3100{
3101 struct hfsc *hfsc;
3102 struct hfsc_class *hc, *next;
3103
3104 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3105
3106 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3107 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3108 free(hc);
3109 }
3110
3111 tc_destroy(tc);
3112 free(hfsc);
3113}
3114
3115static int
3116hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3117{
3118 const struct hfsc *hfsc;
3119 hfsc = hfsc_get__(netdev);
3120 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3121 return 0;
3122}
3123
3124static int
3125hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3126{
3127 int error;
3128 struct hfsc_class class;
3129
3130 hfsc_parse_qdisc_details__(netdev, details, &class);
3131 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3132 tc_make_handle(1, 0), &class);
3133
3134 if (!error) {
3135 hfsc_get__(netdev)->max_rate = class.max_rate;
3136 }
3137
3138 return error;
3139}
3140
3141static int
3142hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3143 const struct tc_queue *queue, struct shash *details)
3144{
3145 const struct hfsc_class *hc;
3146
3147 hc = hfsc_class_cast__(queue);
3148 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3149 if (hc->min_rate != hc->max_rate) {
3150 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3151 }
3152 return 0;
3153}
3154
3155static int
3156hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3157 const struct shash *details)
3158{
3159 int error;
3160 struct hfsc_class class;
3161
3162 error = hfsc_parse_class_details__(netdev, details, &class);
3163 if (error) {
3164 return error;
3165 }
3166
3167 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3168 tc_make_handle(1, 0xfffe), &class);
3169 if (error) {
3170 return error;
3171 }
3172
3173 hfsc_update_queue__(netdev, queue_id, &class);
3174 return 0;
3175}
3176
3177static int
3178hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3179{
3180 int error;
3181 struct hfsc *hfsc;
3182 struct hfsc_class *hc;
3183
3184 hc = hfsc_class_cast__(queue);
3185 hfsc = hfsc_get__(netdev);
3186
3187 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3188 if (!error) {
3189 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3190 free(hc);
3191 }
3192 return error;
3193}
3194
3195static int
3196hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3197 struct netdev_queue_stats *stats)
3198{
3199 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3200 tc_make_handle(1, 0xfffe), NULL, stats);
3201}
3202
3203static int
3204hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3205 const struct ofpbuf *nlmsg,
3206 netdev_dump_queue_stats_cb *cb, void *aux)
3207{
3208 struct netdev_queue_stats stats;
3209 unsigned int handle, major, minor;
3210 int error;
3211
3212 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3213 if (error) {
3214 return error;
3215 }
3216
3217 major = tc_get_major(handle);
3218 minor = tc_get_minor(handle);
3219 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3220 (*cb)(minor - 1, &stats, aux);
3221 }
3222 return 0;
3223}
3224
3225static const struct tc_ops tc_ops_hfsc = {
3226 "hfsc", /* linux_name */
3227 "linux-hfsc", /* ovs_name */
3228 HFSC_N_QUEUES, /* n_queues */
3229 hfsc_tc_install, /* tc_install */
3230 hfsc_tc_load, /* tc_load */
3231 hfsc_tc_destroy, /* tc_destroy */
3232 hfsc_qdisc_get, /* qdisc_get */
3233 hfsc_qdisc_set, /* qdisc_set */
3234 hfsc_class_get, /* class_get */
3235 hfsc_class_set, /* class_set */
3236 hfsc_class_delete, /* class_delete */
3237 hfsc_class_get_stats, /* class_get_stats */
3238 hfsc_class_dump_stats /* class_dump_stats */
3239};
3240\f
c1c9c9c4
BP
3241/* "linux-default" traffic control class.
3242 *
3243 * This class represents the default, unnamed Linux qdisc. It corresponds to
3244 * the "" (empty string) QoS type in the OVS database. */
3245
3246static void
3247default_install__(struct netdev *netdev)
3248{
3249 struct netdev_dev_linux *netdev_dev =
3250 netdev_dev_linux_cast(netdev_get_dev(netdev));
3251 static struct tc *tc;
3252
3253 if (!tc) {
3254 tc = xmalloc(sizeof *tc);
3255 tc_init(tc, &tc_ops_default);
3256 }
3257 netdev_dev->tc = tc;
3258}
3259
3260static int
3261default_tc_install(struct netdev *netdev,
3262 const struct shash *details OVS_UNUSED)
3263{
3264 default_install__(netdev);
3265 return 0;
3266}
3267
3268static int
3269default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3270{
3271 default_install__(netdev);
3272 return 0;
3273}
3274
3275static const struct tc_ops tc_ops_default = {
3276 NULL, /* linux_name */
3277 "", /* ovs_name */
3278 0, /* n_queues */
3279 default_tc_install,
3280 default_tc_load,
3281 NULL, /* tc_destroy */
3282 NULL, /* qdisc_get */
3283 NULL, /* qdisc_set */
3284 NULL, /* class_get */
3285 NULL, /* class_set */
3286 NULL, /* class_delete */
3287 NULL, /* class_get_stats */
3288 NULL /* class_dump_stats */
3289};
3290\f
3291/* "linux-other" traffic control class.
3292 *
3293 * */
3294
3295static int
3296other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3297{
3298 struct netdev_dev_linux *netdev_dev =
3299 netdev_dev_linux_cast(netdev_get_dev(netdev));
3300 static struct tc *tc;
3301
3302 if (!tc) {
3303 tc = xmalloc(sizeof *tc);
3304 tc_init(tc, &tc_ops_other);
3305 }
3306 netdev_dev->tc = tc;
3307 return 0;
3308}
3309
3310static const struct tc_ops tc_ops_other = {
3311 NULL, /* linux_name */
3312 "linux-other", /* ovs_name */
3313 0, /* n_queues */
3314 NULL, /* tc_install */
3315 other_tc_load,
3316 NULL, /* tc_destroy */
3317 NULL, /* qdisc_get */
3318 NULL, /* qdisc_set */
3319 NULL, /* class_get */
3320 NULL, /* class_set */
3321 NULL, /* class_delete */
3322 NULL, /* class_get_stats */
3323 NULL /* class_dump_stats */
3324};
3325\f
3326/* Traffic control. */
3327
/* Number of kernel "tc" ticks per second, as computed by read_psched(). */
static double ticks_per_s;

/* Number of kernel "jiffies" per second, used to compute buffer sizes: in
 * general a kernel qdisc must be able to buffer one jiffy's worth of data.
 *
 * The value falls into one of two cases:
 *
 *   - It is the kernel's real timer tick rate, a small number roughly in the
 *     range 100 to 1024.  Then the qdisc really does need room for that much
 *     data.
 *
 *   - It is an absurdly large number, which means the kernel has finely
 *     granular timers and no extra buffer room needs to be fudged in.
 *     Nothing special is required to handle this: 'buffer_hz' is used as a
 *     divisor, so practically any dividend yields 0, and the small integer
 *     results produced by really high dividends have no real effect.
 *
 * A value of 0 means that read_psched() has not been called yet. */
static unsigned int buffer_hz;
3349
/* Combines 'major' and 'minor' into the tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
3356
/* Extracts the major number from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3363
/* Extracts the minor number from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3370
3371static struct tcmsg *
3372tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3373 struct ofpbuf *request)
3374{
3375 struct tcmsg *tcmsg;
3376 int ifindex;
3377 int error;
3378
3379 error = get_ifindex(netdev, &ifindex);
3380 if (error) {
3381 return NULL;
3382 }
3383
3384 ofpbuf_init(request, 512);
3385 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3386 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3387 tcmsg->tcm_family = AF_UNSPEC;
3388 tcmsg->tcm_ifindex = ifindex;
3389 /* Caller should fill in tcmsg->tcm_handle. */
3390 /* Caller should fill in tcmsg->tcm_parent. */
3391
3392 return tcmsg;
3393}
3394
3395static int
3396tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3397{
3398 int error = nl_sock_transact(rtnl_sock, request, replyp);
3399 ofpbuf_uninit(request);
3400 return error;
3401}
3402
3403static void
3404read_psched(void)
3405{
3406 /* The values in psched are not individually very meaningful, but they are
3407 * important. The tables below show some values seen in the wild.
3408 *
3409 * Some notes:
3410 *
3411 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3412 * (Before that, there are hints that it was 1000000000.)
3413 *
3414 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3415 * above.
3416 *
3417 * /proc/net/psched
3418 * -----------------------------------
3419 * [1] 000c8000 000f4240 000f4240 00000064
3420 * [2] 000003e8 00000400 000f4240 3b9aca00
3421 * [3] 000003e8 00000400 000f4240 3b9aca00
3422 * [4] 000003e8 00000400 000f4240 00000064
3423 * [5] 000003e8 00000040 000f4240 3b9aca00
3424 * [6] 000003e8 00000040 000f4240 000000f9
3425 *
3426 * a b c d ticks_per_s buffer_hz
3427 * ------- --------- ---------- ------------- ----------- -------------
3428 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3429 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3430 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3431 * [4] 1,000 1,024 1,000,000 100 976,562 100
3432 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3433 * [6] 1,000 64 1,000,000 249 15,625,000 249
3434 *
3435 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3436 * [2] 2.6.26-1-686-bigmem from Debian lenny
3437 * [3] 2.6.26-2-sparc64 from Debian lenny
3438 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3439 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3440 * [6] 2.6.34 from kernel.org on KVM
3441 */
3442 static const char fn[] = "/proc/net/psched";
3443 unsigned int a, b, c, d;
3444 FILE *stream;
3445
3446 ticks_per_s = 1.0;
3447 buffer_hz = 100;
3448
3449 stream = fopen(fn, "r");
3450 if (!stream) {
3451 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3452 return;
3453 }
3454
3455 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3456 VLOG_WARN("%s: read failed", fn);
3457 fclose(stream);
3458 return;
3459 }
3460 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3461 fclose(stream);
3462
3463 if (!a || !c) {
3464 VLOG_WARN("%s: invalid scheduler parameters", fn);
3465 return;
3466 }
3467
3468 ticks_per_s = (double) a * c / b;
3469 if (c == 1000000) {
3470 buffer_hz = d;
3471 } else {
3472 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3473 fn, a, b, c, d);
3474 }
3475 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3476}
3477
3478/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3479 * rate of 'rate' bytes per second. */
3480static unsigned int
3481tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3482{
3483 if (!buffer_hz) {
3484 read_psched();
3485 }
3486 return (rate * ticks) / ticks_per_s;
3487}
3488
3489/* Returns the number of ticks that it would take to transmit 'size' bytes at a
3490 * rate of 'rate' bytes per second. */
3491static unsigned int
3492tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3493{
3494 if (!buffer_hz) {
3495 read_psched();
3496 }
015c93a4 3497 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
3498}
3499
3500/* Returns the number of bytes that need to be reserved for qdisc buffering at
3501 * a transmission rate of 'rate' bytes per second. */
3502static unsigned int
3503tc_buffer_per_jiffy(unsigned int rate)
3504{
3505 if (!buffer_hz) {
3506 read_psched();
3507 }
3508 return rate / buffer_hz;
3509}
3510
3511/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3512 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3513 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3514 * stores NULL into it if it is absent.
3515 *
3516 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3517 * 'msg'.
3518 *
3519 * Returns 0 if successful, otherwise a positive errno value. */
3520static int
3521tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3522 struct nlattr **options)
3523{
3524 static const struct nl_policy tca_policy[] = {
3525 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3526 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3527 };
3528 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3529
3530 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3531 tca_policy, ta, ARRAY_SIZE(ta))) {
3532 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3533 goto error;
3534 }
3535
3536 if (kind) {
3537 *kind = nl_attr_get_string(ta[TCA_KIND]);
3538 }
3539
3540 if (options) {
3541 *options = ta[TCA_OPTIONS];
3542 }
3543
3544 return 0;
3545
3546error:
3547 if (kind) {
3548 *kind = NULL;
3549 }
3550 if (options) {
3551 *options = NULL;
3552 }
3553 return EPROTO;
3554}
3555
3556/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3557 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3558 * into '*options', and its queue statistics into '*stats'. Any of the output
3559 * arguments may be null.
3560 *
3561 * Returns 0 if successful, otherwise a positive errno value. */
3562static int
3563tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3564 struct nlattr **options, struct netdev_queue_stats *stats)
3565{
3566 static const struct nl_policy tca_policy[] = {
3567 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3568 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3569 };
3570 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3571
3572 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3573 tca_policy, ta, ARRAY_SIZE(ta))) {
3574 VLOG_WARN_RL(&rl, "failed to parse class message");
3575 goto error;
3576 }
3577
3578 if (handlep) {
3579 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3580 *handlep = tc->tcm_handle;
3581 }
3582
3583 if (options) {
3584 *options = ta[TCA_OPTIONS];
3585 }
3586
3587 if (stats) {
3588 const struct gnet_stats_queue *gsq;
3589 struct gnet_stats_basic gsb;
3590
3591 static const struct nl_policy stats_policy[] = {
3592 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3593 .min_len = sizeof gsb },
3594 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3595 .min_len = sizeof *gsq },
3596 };
3597 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3598
3599 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3600 sa, ARRAY_SIZE(sa))) {
3601 VLOG_WARN_RL(&rl, "failed to parse class stats");
3602 goto error;
3603 }
3604
3605 /* Alignment issues screw up the length of struct gnet_stats_basic on
3606 * some arch/bitsize combinations. Newer versions of Linux have a
3607 * struct gnet_stats_basic_packed, but we can't depend on that. The
3608 * easiest thing to do is just to make a copy. */
3609 memset(&gsb, 0, sizeof gsb);
3610 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3611 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3612 stats->tx_bytes = gsb.bytes;
3613 stats->tx_packets = gsb.packets;
3614
3615 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3616 stats->tx_errors = gsq->drops;
3617 }
3618
3619 return 0;
3620
3621error:
3622 if (options) {
3623 *options = NULL;
3624 }
3625 if (stats) {
3626 memset(stats, 0, sizeof *stats);
3627 }
3628 return EPROTO;
3629}
3630
3631/* Queries the kernel for class with identifier 'handle' and parent 'parent'
3632 * on 'netdev'. */
3633static int
3634tc_query_class(const struct netdev *netdev,
3635 unsigned int handle, unsigned int parent,
3636 struct ofpbuf **replyp)
3637{
3638 struct ofpbuf request;
3639 struct tcmsg *tcmsg;
3640 int error;
3641
3642 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
3643 if (!tcmsg) {
3644 return ENODEV;
3645 }
c1c9c9c4
BP
3646 tcmsg->tcm_handle = handle;
3647 tcmsg->tcm_parent = parent;
3648
3649 error = tc_transact(&request, replyp);
3650 if (error) {
3651 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3652 netdev_get_name(netdev),
3653 tc_get_major(handle), tc_get_minor(handle),
3654 tc_get_major(parent), tc_get_minor(parent),
3655 strerror(error));
3656 }
3657 return error;
3658}
3659
3660/* Equivalent to "tc class del dev <name> handle <handle>". */
3661static int
3662tc_delete_class(const struct netdev *netdev, unsigned int handle)
3663{
3664 struct ofpbuf request;
3665 struct tcmsg *tcmsg;
3666 int error;
3667
3668 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
3669 if (!tcmsg) {
3670 return ENODEV;
3671 }
c1c9c9c4
BP
3672 tcmsg->tcm_handle = handle;
3673 tcmsg->tcm_parent = 0;
3674
3675 error = tc_transact(&request, NULL);
3676 if (error) {
3677 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3678 netdev_get_name(netdev),
3679 tc_get_major(handle), tc_get_minor(handle),
3680 strerror(error));
3681 }
3682 return error;
3683}
3684
3685/* Equivalent to "tc qdisc del dev <name> root". */
3686static int
3687tc_del_qdisc(struct netdev *netdev)
3688{
3689 struct netdev_dev_linux *netdev_dev =
3690 netdev_dev_linux_cast(netdev_get_dev(netdev));
3691 struct ofpbuf request;
3692 struct tcmsg *tcmsg;
3693 int error;
3694
3695 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
3696 if (!tcmsg) {
3697 return ENODEV;
3698 }
c1c9c9c4
BP
3699 tcmsg->tcm_handle = tc_make_handle(1, 0);
3700 tcmsg->tcm_parent = TC_H_ROOT;
3701
3702 error = tc_transact(&request, NULL);
3703 if (error == EINVAL) {
3704 /* EINVAL probably means that the default qdisc was in use, in which
3705 * case we've accomplished our purpose. */
3706 error = 0;
3707 }
3708 if (!error && netdev_dev->tc) {
3709 if (netdev_dev->tc->ops->tc_destroy) {
3710 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3711 }
3712 netdev_dev->tc = NULL;
3713 }
3714 return error;
3715}
3716
3717/* If 'netdev''s qdisc type and parameters are not yet known, queries the
3718 * kernel to determine what they are. Returns 0 if successful, otherwise a
3719 * positive errno value. */
3720static int
3721tc_query_qdisc(const struct netdev *netdev)
3722{
3723 struct netdev_dev_linux *netdev_dev =
3724 netdev_dev_linux_cast(netdev_get_dev(netdev));
3725 struct ofpbuf request, *qdisc;
3726 const struct tc_ops *ops;
3727 struct tcmsg *tcmsg;
3728 int load_error;
3729 int error;
3730
3731 if (netdev_dev->tc) {
3732 return 0;
3733 }
3734
3735 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3736 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3737 * 2.6.35 without that fix backported to it.
3738 *
3739 * To avoid the OOPS, we must not make a request that would attempt to dump
3740 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3741 * few others. There are a few ways that I can see to do this, but most of
3742 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3743 * technique chosen here is to assume that any non-default qdisc that we
3744 * create will have a class with handle 1:0. The built-in qdiscs only have
3745 * a class with handle 0:0.
3746 *
3747 * We could check for Linux 2.6.35+ and use a more straightforward method
3748 * there. */
3749 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
3750 if (!tcmsg) {
3751 return ENODEV;
3752 }
c1c9c9c4
BP
3753 tcmsg->tcm_handle = tc_make_handle(1, 0);
3754 tcmsg->tcm_parent = 0;
3755
3756 /* Figure out what tc class to instantiate. */
3757 error = tc_transact(&request, &qdisc);
3758 if (!error) {
3759 const char *kind;
3760
3761 error = tc_parse_qdisc(qdisc, &kind, NULL);
3762 if (error) {
3763 ops = &tc_ops_other;
3764 } else {
3765 ops = tc_lookup_linux_name(kind);
3766 if (!ops) {
3767 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3768 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3769
3770 ops = &tc_ops_other;
3771 }
3772 }
3773 } else if (error == ENOENT) {
3774 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3775 * other entity that doesn't have a handle 1:0. We will assume
3776 * that it's the system default qdisc. */
3777 ops = &tc_ops_default;
3778 error = 0;
3779 } else {
3780 /* Who knows? Maybe the device got deleted. */
3781 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3782 netdev_get_name(netdev), strerror(error));
3783 ops = &tc_ops_other;
3784 }
3785
3786 /* Instantiate it. */
3787 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3788 assert((load_error == 0) == (netdev_dev->tc != NULL));
3789 ofpbuf_delete(qdisc);
3790
3791 return error ? error : load_error;
3792}
3793
3794/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3795 approximate the time to transmit packets of various lengths. For an MTU of
3796 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3797 represents two possible packet lengths; for a MTU of 513 through 1024, four
3798 possible lengths; and so on.
3799
3800 Returns, for the specified 'mtu', the number of bits that packet lengths
3801 need to be shifted right to fit within such a 256-entry table. */
3802static int
3803tc_calc_cell_log(unsigned int mtu)
3804{
3805 int cell_log;
3806
3807 if (!mtu) {
3808 mtu = ETH_PAYLOAD_MAX;
3809 }
3810 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3811
3812 for (cell_log = 0; mtu >= 256; cell_log++) {
3813 mtu >>= 1;
3814 }
3815
3816 return cell_log;
3817}
3818
3819/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3820 * of 'mtu'. */
3821static void
3822tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3823{
3824 memset(rate, 0, sizeof *rate);
3825 rate->cell_log = tc_calc_cell_log(mtu);
3826 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3827 /* rate->cell_align = 0; */ /* distro headers. */
3828 rate->mpu = ETH_TOTAL_MIN;
3829 rate->rate = Bps;
3830}
3831
3832/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3833 * attribute of the specified "type".
3834 *
3835 * See tc_calc_cell_log() above for a description of "rtab"s. */
3836static void
3837tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3838{
3839 uint32_t *rtab;
3840 unsigned int i;
3841
3842 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3843 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3844 unsigned packet_size = (i + 1) << rate->cell_log;
3845 if (packet_size < rate->mpu) {
3846 packet_size = rate->mpu;
3847 }
3848 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3849 }
3850}
3851
/* Computes the value of 'buffer' or 'cbuffer' in HTB options for a rate of
 * 'Bps' bytes per second, the specified 'mtu', and a user-requested burst
 * size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of 0 is
 * fine.)
 *
 * The burst used is the larger of 'burst_bytes' and one jiffy's worth of
 * data plus 'mtu'; the result is that burst converted to ticks. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes;

    floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, floor_bytes));
}
3862
3863\f
3864/* Utility functions. */
3865
3866static int
3867get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3868{
3869 /* Policy for RTNLGRP_LINK messages.
3870 *
3871 * There are *many* more fields in these messages, but currently we only
3872 * care about these fields. */
3873 static const struct nl_policy rtnlgrp_link_policy[] = {
3874 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3875 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3876 .min_len = sizeof(struct rtnl_link_stats) },
3877 };
3878
3879 struct ofpbuf request;
3880 struct ofpbuf *reply;
3881 struct ifinfomsg *ifi;
3882 const struct rtnl_link_stats *rtnl_stats;
3883 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
3884 int error;
3885
3886 ofpbuf_init(&request, 0);
3887 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3888 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3889 ifi->ifi_family = PF_UNSPEC;
3890 ifi->ifi_index = ifindex;
3891 error = nl_sock_transact(rtnl_sock, &request, &reply);
3892 ofpbuf_uninit(&request);
3893 if (error) {
3894 return error;
3895 }
3896
3897 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3898 rtnlgrp_link_policy,
3899 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3900 ofpbuf_delete(reply);
3901 return EPROTO;
3902 }
3903
3904 if (!attrs[IFLA_STATS]) {
3905 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3906 ofpbuf_delete(reply);
3907 return EPROTO;
3908 }
8b61709d
BP
3909
3910 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3911 stats->rx_packets = rtnl_stats->rx_packets;
3912 stats->tx_packets = rtnl_stats->tx_packets;
3913 stats->rx_bytes = rtnl_stats->rx_bytes;
3914 stats->tx_bytes = rtnl_stats->tx_bytes;
3915 stats->rx_errors = rtnl_stats->rx_errors;
3916 stats->tx_errors = rtnl_stats->tx_errors;
3917 stats->rx_dropped = rtnl_stats->rx_dropped;
3918 stats->tx_dropped = rtnl_stats->tx_dropped;
3919 stats->multicast = rtnl_stats->multicast;
3920 stats->collisions = rtnl_stats->collisions;
3921 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3922 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3923 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3924 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3925 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3926 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3927 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3928 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3929 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3930 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3931 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3932
576e26d7
BP
3933 ofpbuf_delete(reply);
3934
8b61709d
BP
3935 return 0;
3936}
3937
3938static int
3939get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3940{
3941 static const char fn[] = "/proc/net/dev";
3942 char line[1024];
3943 FILE *stream;
3944 int ln;
3945
3946 stream = fopen(fn, "r");
3947 if (!stream) {
3948 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3949 return errno;
3950 }
3951
3952 ln = 0;
3953 while (fgets(line, sizeof line, stream)) {
3954 if (++ln >= 3) {
3955 char devname[16];
3956#define X64 "%"SCNu64
3957 if (sscanf(line,
3958 " %15[^:]:"
3959 X64 X64 X64 X64 X64 X64 X64 "%*u"
3960 X64 X64 X64 X64 X64 X64 X64 "%*u",
3961 devname,
3962 &stats->rx_bytes,
3963 &stats->rx_packets,
3964 &stats->rx_errors,
3965 &stats->rx_dropped,
3966 &stats->rx_fifo_errors,
3967 &stats->rx_frame_errors,
3968 &stats->multicast,
3969 &stats->tx_bytes,
3970 &stats->tx_packets,
3971 &stats->tx_errors,
3972 &stats->tx_dropped,
3973 &stats->tx_fifo_errors,
3974 &stats->collisions,
3975 &stats->tx_carrier_errors) != 15) {
3976 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3977 } else if (!strcmp(devname, netdev_name)) {
3978 stats->rx_length_errors = UINT64_MAX;
3979 stats->rx_over_errors = UINT64_MAX;
3980 stats->rx_crc_errors = UINT64_MAX;
3981 stats->rx_missed_errors = UINT64_MAX;
3982 stats->tx_aborted_errors = UINT64_MAX;
3983 stats->tx_heartbeat_errors = UINT64_MAX;
3984 stats->tx_window_errors = UINT64_MAX;
3985 fclose(stream);
3986 return 0;
3987 }
3988 }
3989 }
3990 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
3991 fclose(stream);
3992 return ENODEV;
3993}
c1c9c9c4 3994
8b61709d
BP
3995static int
3996get_flags(const struct netdev *netdev, int *flags)
3997{
3998 struct ifreq ifr;
3999 int error;
4000
149f577a
JG
4001 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4002 "SIOCGIFFLAGS");
8b61709d
BP
4003 *flags = ifr.ifr_flags;
4004 return error;
4005}
4006
4007static int
4008set_flags(struct netdev *netdev, int flags)
4009{
4010 struct ifreq ifr;
4011
4012 ifr.ifr_flags = flags;
149f577a
JG
4013 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4014 "SIOCSIFFLAGS");
8b61709d
BP
4015}
4016
4017static int
4018do_get_ifindex(const char *netdev_name)
4019{
4020 struct ifreq ifr;
4021
4022 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4023 COVERAGE_INC(netdev_get_ifindex);
4024 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4025 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4026 netdev_name, strerror(errno));
4027 return -errno;
4028 }
4029 return ifr.ifr_ifindex;
4030}
4031
4032static int
4033get_ifindex(const struct netdev *netdev_, int *ifindexp)
4034{
149f577a
JG
4035 struct netdev_dev_linux *netdev_dev =
4036 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 4037 *ifindexp = 0;
149f577a 4038 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
8b61709d
BP
4039 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4040 if (ifindex < 0) {
4041 return -ifindex;
4042 }
149f577a
JG
4043 netdev_dev->cache_valid |= VALID_IFINDEX;
4044 netdev_dev->ifindex = ifindex;
8b61709d 4045 }
149f577a 4046 *ifindexp = netdev_dev->ifindex;
8b61709d
BP
4047 return 0;
4048}
4049
4050static int
4051get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4052{
4053 struct ifreq ifr;
4054 int hwaddr_family;
4055
4056 memset(&ifr, 0, sizeof ifr);
4057 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4058 COVERAGE_INC(netdev_get_hwaddr);
4059 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4060 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4061 netdev_name, strerror(errno));
4062 return errno;
4063 }
4064 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4065 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4066 VLOG_WARN("%s device has unknown hardware address family %d",
4067 netdev_name, hwaddr_family);
4068 }
4069 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4070 return 0;
4071}
4072
4073static int
4074set_etheraddr(const char *netdev_name, int hwaddr_family,
4075 const uint8_t mac[ETH_ADDR_LEN])
4076{
4077 struct ifreq ifr;
4078
4079 memset(&ifr, 0, sizeof ifr);
4080 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4081 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4082 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4083 COVERAGE_INC(netdev_set_hwaddr);
4084 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4085 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4086 netdev_name, strerror(errno));
4087 return errno;
4088 }
4089 return 0;
4090}
4091
4092static int
0b0544d7 4093netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
4094 int cmd, const char *cmd_name)
4095{
4096 struct ifreq ifr;
4097
4098 memset(&ifr, 0, sizeof ifr);
0b0544d7 4099 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
4100 ifr.ifr_data = (caddr_t) ecmd;
4101
4102 ecmd->cmd = cmd;
4103 COVERAGE_INC(netdev_ethtool);
4104 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4105 return 0;
4106 } else {
4107 if (errno != EOPNOTSUPP) {
4108 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
0b0544d7 4109 "failed: %s", cmd_name, name, strerror(errno));
8b61709d
BP
4110 } else {
4111 /* The device doesn't support this operation. That's pretty
4112 * common, so there's no point in logging anything. */
4113 }
4114 return errno;
4115 }
4116}
4117
4118static int
149f577a
JG
4119netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4120 const char *cmd_name)
8b61709d 4121{
149f577a 4122 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
8b61709d 4123 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
149f577a
JG
4124 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4125 strerror(errno));
8b61709d
BP
4126 return errno;
4127 }
4128 return 0;
4129}
f1acd62b
BP
4130
4131static int
4132netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4133 int cmd, const char *cmd_name)
4134{
4135 struct ifreq ifr;
4136 int error;
4137
4138 ifr.ifr_addr.sa_family = AF_INET;
149f577a 4139 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b
BP
4140 if (!error) {
4141 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4142 *ip = sin->sin_addr;
4143 }
4144 return error;
4145}