]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
netlink: Make netlink-protocol.h compatible with <linux/netlink.h>.
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
149f577a 2 * Copyright (c) 2009, 2010 Nicira Networks.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
8b61709d 18#include <assert.h>
e9e28be3 19#include <errno.h>
8b61709d
BP
20#include <fcntl.h>
21#include <arpa/inet.h>
22#include <inttypes.h>
c1c9c9c4 23#include <linux/gen_stats.h>
8b61709d 24#include <linux/if_tun.h>
a740f0de 25#include <linux/ip.h>
8b61709d
BP
26#include <linux/types.h>
27#include <linux/ethtool.h>
6f42c8ea 28#include <linux/pkt_sched.h>
e9e28be3 29#include <linux/rtnetlink.h>
8b61709d
BP
30#include <linux/sockios.h>
31#include <linux/version.h>
32#include <sys/types.h>
33#include <sys/ioctl.h>
34#include <sys/socket.h>
35#include <netpacket/packet.h>
36#include <net/ethernet.h>
37#include <net/if.h>
a740f0de 38#include <linux/if_tunnel.h>
8b61709d
BP
39#include <net/if_arp.h>
40#include <net/if_packet.h>
41#include <net/route.h>
42#include <netinet/in.h>
e9e28be3 43#include <poll.h>
8b61709d
BP
44#include <stdlib.h>
45#include <string.h>
46#include <unistd.h>
e9e28be3
BP
47
48#include "coverage.h"
8b61709d
BP
49#include "dynamic-string.h"
50#include "fatal-signal.h"
93b13be8
BP
51#include "hash.h"
52#include "hmap.h"
8b61709d 53#include "netdev-provider.h"
7fbef77a 54#include "netdev-vport.h"
e9e28be3
BP
55#include "netlink.h"
56#include "ofpbuf.h"
8b61709d
BP
57#include "openflow/openflow.h"
58#include "packets.h"
59#include "poll-loop.h"
559843ed 60#include "rtnetlink.h"
8b61709d
BP
61#include "socket-util.h"
62#include "shash.h"
63#include "svec.h"
e9e28be3 64#include "vlog.h"
5136ce49 65
d98e6007 66VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea
BP
67
68COVERAGE_DEFINE(netdev_get_vlan_vid);
69COVERAGE_DEFINE(netdev_set_policing);
70COVERAGE_DEFINE(netdev_arp_lookup);
71COVERAGE_DEFINE(netdev_get_ifindex);
72COVERAGE_DEFINE(netdev_get_hwaddr);
73COVERAGE_DEFINE(netdev_set_hwaddr);
74COVERAGE_DEFINE(netdev_ethtool);
8b61709d
BP
75\f
76/* These were introduced in Linux 2.6.14, so they might be missing if we have
77 * old headers. */
78#ifndef ADVERTISED_Pause
79#define ADVERTISED_Pause (1 << 13)
80#endif
81#ifndef ADVERTISED_Asym_Pause
82#define ADVERTISED_Asym_Pause (1 << 14)
83#endif
84
c1c9c9c4
BP
85/* This was introduced in Linux 2.6.25, so it might be missing if we have old
86 * headers. */
87#ifndef TC_RTAB_SIZE
88#define TC_RTAB_SIZE 1024
89#endif
90
149f577a 91static struct rtnetlink_notifier netdev_linux_cache_notifier;
46415c90 92static int cache_notifier_refcount;
8b61709d
BP
93
94enum {
7fbef77a
JG
95 VALID_IFINDEX = 1 << 0,
96 VALID_ETHERADDR = 1 << 1,
97 VALID_IN4 = 1 << 2,
98 VALID_IN6 = 1 << 3,
99 VALID_MTU = 1 << 4,
100 VALID_CARRIER = 1 << 5,
101 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
102 VALID_POLICING = 1 << 7,
103 VALID_HAVE_VPORT_STATS = 1 << 8
8b61709d
BP
104};
105
149f577a
JG
106struct tap_state {
107 int fd;
61b999dd 108 bool opened;
149f577a 109};
c1c9c9c4
BP
110\f
111/* Traffic control. */
112
113/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
114 * network device.
115 *
116 * Each TC implementation subclasses this with whatever additional data it
117 * needs. */
c1c9c9c4
BP
118struct tc {
119 const struct tc_ops *ops;
93b13be8
BP
120 struct hmap queues; /* Contains "struct tc_queue"s.
121 * Read by generic TC layer.
122 * Written only by TC implementation. */
123};
c1c9c9c4 124
93b13be8
BP
125/* One traffic control queue.
126 *
127 * Each TC implementation subclasses this with whatever additional data it
128 * needs. */
129struct tc_queue {
130 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
131 unsigned int queue_id; /* OpenFlow queue ID. */
c1c9c9c4
BP
132};
133
134/* A particular kind of traffic control. Each implementation generally maps to
135 * one particular Linux qdisc class.
136 *
137 * The functions below return 0 if successful or a positive errno value on
138 * failure, except where otherwise noted. All of them must be provided, except
139 * where otherwise noted. */
140struct tc_ops {
141 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
142 * This is null for tc_ops_default and tc_ops_other, for which there are no
143 * appropriate values. */
144 const char *linux_name;
145
146 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
147 const char *ovs_name;
148
149 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
150 * queues. The queues are numbered 0 through n_queues - 1. */
151 unsigned int n_queues;
152
153 /* Called to install this TC class on 'netdev'. The implementation should
154 * make the Netlink calls required to set up 'netdev' with the right qdisc
155 * and configure it according to 'details'. The implementation may assume
156 * that the current qdisc is the default; that is, there is no need for it
157 * to delete the current qdisc before installing itself.
158 *
159 * The contents of 'details' should be documented as valid for 'ovs_name'
160 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
161 * (which is built as ovs-vswitchd.conf.db(8)).
162 *
163 * This function must return 0 if and only if it sets 'netdev->tc' to an
164 * initialized 'struct tc'.
165 *
166 * (This function is null for tc_ops_other, which cannot be installed. For
167 * other TC classes it should always be nonnull.) */
168 int (*tc_install)(struct netdev *netdev, const struct shash *details);
169
170 /* Called when the netdev code determines (through a Netlink query) that
171 * this TC class's qdisc is installed on 'netdev', but we didn't install
172 * it ourselves and so don't know any of the details.
173 *
174 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
175 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
176 * implementation should parse the other attributes of 'nlmsg' as
177 * necessary to determine its configuration. If necessary it should also
178 * use Netlink queries to determine the configuration of queues on
179 * 'netdev'.
180 *
181 * This function must return 0 if and only if it sets 'netdev->tc' to an
182 * initialized 'struct tc'. */
183 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
184
185 /* Destroys the data structures allocated by the implementation as part of
186 * 'tc'. (This includes destroying 'tc->queues' by calling
187 * tc_destroy(tc).)
188 *
189 * The implementation should not need to perform any Netlink calls. If
190 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
191 * (But it may not be desirable.)
192 *
193 * This function may be null if 'tc' is trivial. */
194 void (*tc_destroy)(struct tc *tc);
195
196 /* Retrieves details of 'netdev->tc' configuration into 'details'.
197 *
198 * The implementation should not need to perform any Netlink calls, because
199 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
200 * cached the configuration.
201 *
202 * The contents of 'details' should be documented as valid for 'ovs_name'
203 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
204 * (which is built as ovs-vswitchd.conf.db(8)).
205 *
206 * This function may be null if 'tc' is not configurable.
207 */
208 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
209
210 /* Reconfigures 'netdev->tc' according to 'details', performing any
211 * required Netlink calls to complete the reconfiguration.
212 *
213 * The contents of 'details' should be documented as valid for 'ovs_name'
214 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
215 * (which is built as ovs-vswitchd.conf.db(8)).
216 *
217 * This function may be null if 'tc' is not configurable.
218 */
219 int (*qdisc_set)(struct netdev *, const struct shash *details);
220
93b13be8
BP
221 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
222 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
223 *
224 * The contents of 'details' should be documented as valid for 'ovs_name'
225 * in the "other_config" column in the "Queue" table in
226 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
227 *
228 * The implementation should not need to perform any Netlink calls, because
229 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
230 * cached the queue configuration.
231 *
232 * This function may be null if 'tc' does not have queues ('n_queues' is
233 * 0). */
93b13be8 234 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
235 struct shash *details);
236
237 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
238 * 'details', performing any required Netlink calls to complete the
239 * reconfiguration. The caller ensures that 'queue_id' is less than
240 * 'n_queues'.
241 *
242 * The contents of 'details' should be documented as valid for 'ovs_name'
243 * in the "other_config" column in the "Queue" table in
244 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
245 *
246 * This function may be null if 'tc' does not have queues or its queues are
247 * not configurable. */
248 int (*class_set)(struct netdev *, unsigned int queue_id,
249 const struct shash *details);
250
93b13be8
BP
251 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
252 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
253 *
254 * This function may be null if 'tc' does not have queues or its queues
255 * cannot be deleted. */
93b13be8 256 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 257
93b13be8
BP
258 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
259 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
260 *
261 * On success, initializes '*stats'.
262 *
263 * This function may be null if 'tc' does not have queues or if it cannot
264 * report queue statistics. */
93b13be8
BP
265 int (*class_get_stats)(const struct netdev *netdev,
266 const struct tc_queue *queue,
c1c9c9c4
BP
267 struct netdev_queue_stats *stats);
268
269 /* Extracts queue stats from 'nlmsg', which is a response to a
270 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
271 *
272 * This function may be null if 'tc' does not have queues or if it cannot
273 * report queue statistics. */
274 int (*class_dump_stats)(const struct netdev *netdev,
275 const struct ofpbuf *nlmsg,
276 netdev_dump_queue_stats_cb *cb, void *aux);
277};
278
279static void
280tc_init(struct tc *tc, const struct tc_ops *ops)
281{
282 tc->ops = ops;
93b13be8 283 hmap_init(&tc->queues);
c1c9c9c4
BP
284}
285
286static void
287tc_destroy(struct tc *tc)
288{
93b13be8 289 hmap_destroy(&tc->queues);
c1c9c9c4
BP
290}
291
292static const struct tc_ops tc_ops_htb;
a339aa81 293static const struct tc_ops tc_ops_hfsc;
c1c9c9c4
BP
294static const struct tc_ops tc_ops_default;
295static const struct tc_ops tc_ops_other;
296
297static const struct tc_ops *tcs[] = {
298 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 299 &tc_ops_hfsc, /* Hierarchical fair service curve. */
c1c9c9c4
BP
300 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
301 &tc_ops_other, /* Some other qdisc. */
302 NULL
303};
149f577a 304
c1c9c9c4
BP
305static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
306static unsigned int tc_get_major(unsigned int handle);
307static unsigned int tc_get_minor(unsigned int handle);
308
309static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
310static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
311static unsigned int tc_buffer_per_jiffy(unsigned int rate);
312
313static struct tcmsg *tc_make_request(const struct netdev *, int type,
314 unsigned int flags, struct ofpbuf *);
315static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
316
317static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
318 struct nlattr **options);
319static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
320 struct nlattr **options,
321 struct netdev_queue_stats *);
322static int tc_query_class(const struct netdev *,
323 unsigned int handle, unsigned int parent,
324 struct ofpbuf **replyp);
325static int tc_delete_class(const struct netdev *, unsigned int handle);
326
327static int tc_del_qdisc(struct netdev *netdev);
328static int tc_query_qdisc(const struct netdev *netdev);
329
330static int tc_calc_cell_log(unsigned int mtu);
331static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
332static void tc_put_rtab(struct ofpbuf *, uint16_t type,
333 const struct tc_ratespec *rate);
334static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
335\f
149f577a
JG
336struct netdev_dev_linux {
337 struct netdev_dev netdev_dev;
338
8b61709d 339 struct shash_node *shash_node;
149f577a 340 unsigned int cache_valid;
8b61709d 341
8722022c
BP
342 /* The following are figured out "on demand" only. They are only valid
343 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d
BP
344 int ifindex;
345 uint8_t etheraddr[ETH_ADDR_LEN];
f1acd62b 346 struct in_addr address, netmask;
8b61709d
BP
347 struct in6_addr in6;
348 int mtu;
349 int carrier;
8722022c
BP
350 bool is_internal; /* Is this an openvswitch internal device? */
351 bool is_tap; /* Is this a tuntap device? */
80a86fbe
BP
352 uint32_t kbits_rate; /* Policing data. */
353 uint32_t kbits_burst;
7fbef77a 354 bool have_vport_stats;
c1c9c9c4 355 struct tc *tc;
149f577a
JG
356
357 union {
358 struct tap_state tap;
359 } state;
8b61709d
BP
360};
361
149f577a
JG
362struct netdev_linux {
363 struct netdev netdev;
5b7448ed 364 int fd;
149f577a 365};
8b61709d 366
8b61709d
BP
367/* An AF_INET socket (used for ioctl operations). */
368static int af_inet_sock = -1;
369
ff4ed3c9
BP
370/* A Netlink routing socket that is not subscribed to any multicast groups. */
371static struct nl_sock *rtnl_sock;
372
8b61709d
BP
373struct netdev_linux_notifier {
374 struct netdev_notifier notifier;
375 struct list node;
376};
377
378static struct shash netdev_linux_notifiers =
379 SHASH_INITIALIZER(&netdev_linux_notifiers);
46097491 380static struct rtnetlink_notifier netdev_linux_poll_notifier;
8b61709d
BP
381
382/* This is set pretty low because we probably won't learn anything from the
383 * additional log messages. */
384static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
385
15b3596a 386static int netdev_linux_init(void);
6f643e49 387
0b0544d7 388static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 389 int cmd, const char *cmd_name);
149f577a
JG
390static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
391 const char *cmd_name);
f1acd62b
BP
392static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
393 int cmd, const char *cmd_name);
8b61709d
BP
394static int get_flags(const struct netdev *, int *flagsp);
395static int set_flags(struct netdev *, int flags);
396static int do_get_ifindex(const char *netdev_name);
397static int get_ifindex(const struct netdev *, int *ifindexp);
398static int do_set_addr(struct netdev *netdev,
399 int ioctl_nr, const char *ioctl_name,
400 struct in_addr addr);
401static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
402static int set_etheraddr(const char *netdev_name, int hwaddr_family,
403 const uint8_t[ETH_ADDR_LEN]);
404static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
405static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
406
15b3596a
JG
407static bool
408is_netdev_linux_class(const struct netdev_class *netdev_class)
409{
410 return netdev_class->init == netdev_linux_init;
411}
412
149f577a
JG
413static struct netdev_dev_linux *
414netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
6c88d577 415{
15b3596a
JG
416 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
417 assert(is_netdev_linux_class(netdev_class));
418
149f577a 419 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
6c88d577
JP
420}
421
8b61709d
BP
422static struct netdev_linux *
423netdev_linux_cast(const struct netdev *netdev)
424{
15b3596a
JG
425 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
426 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
427 assert(is_netdev_linux_class(netdev_class));
428
8b61709d
BP
429 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
430}
ff4ed3c9 431\f
8b61709d
BP
432static int
433netdev_linux_init(void)
434{
435 static int status = -1;
436 if (status < 0) {
ff4ed3c9 437 /* Create AF_INET socket. */
8b61709d
BP
438 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
439 status = af_inet_sock >= 0 ? 0 : errno;
440 if (status) {
441 VLOG_ERR("failed to create inet socket: %s", strerror(status));
442 }
ff4ed3c9
BP
443
444 /* Create rtnetlink socket. */
445 if (!status) {
446 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
447 if (status) {
448 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
449 strerror(status));
450 }
451 }
8b61709d
BP
452 }
453 return status;
454}
455
/* Periodic work for this netdev provider: delegates to
 * rtnetlink_notifier_run(). */
static void
netdev_linux_run(void)
{
    rtnetlink_notifier_run();
}
461
/* Wait hook for this netdev provider: delegates to
 * rtnetlink_notifier_wait(). */
static void
netdev_linux_wait(void)
{
    rtnetlink_notifier_wait();
}
467
468static void
46097491 469netdev_linux_cache_cb(const struct rtnetlink_change *change,
67a4917b 470 void *aux OVS_UNUSED)
8b61709d 471{
149f577a 472 struct netdev_dev_linux *dev;
8b61709d 473 if (change) {
46415c90
JG
474 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
475 if (base_dev) {
15b3596a
JG
476 const struct netdev_class *netdev_class =
477 netdev_dev_get_class(base_dev);
478
479 if (is_netdev_linux_class(netdev_class)) {
480 dev = netdev_dev_linux_cast(base_dev);
481 dev->cache_valid = 0;
482 }
8b61709d
BP
483 }
484 } else {
46415c90 485 struct shash device_shash;
8b61709d 486 struct shash_node *node;
46415c90
JG
487
488 shash_init(&device_shash);
489 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
490 SHASH_FOR_EACH (node, &device_shash) {
149f577a
JG
491 dev = node->data;
492 dev->cache_valid = 0;
8b61709d 493 }
46415c90 494 shash_destroy(&device_shash);
8b61709d
BP
495 }
496}
497
c3827f61 498/* Creates system and internal devices. */
8b61709d 499static int
c3827f61 500netdev_linux_create(const struct netdev_class *class,
b8dcf5e9
BP
501 const char *name, const struct shash *args,
502 struct netdev_dev **netdev_devp)
6c88d577 503{
149f577a
JG
504 struct netdev_dev_linux *netdev_dev;
505 int error;
6c88d577
JP
506
507 if (!shash_is_empty(args)) {
c3827f61
BP
508 VLOG_WARN("%s: arguments for %s devices should be empty",
509 name, class->type);
6c88d577
JP
510 }
511
46415c90 512 if (!cache_notifier_refcount) {
149f577a
JG
513 error = rtnetlink_notifier_register(&netdev_linux_cache_notifier,
514 netdev_linux_cache_cb, NULL);
515 if (error) {
516 return error;
517 }
518 }
46415c90 519 cache_notifier_refcount++;
6c88d577 520
149f577a 521 netdev_dev = xzalloc(sizeof *netdev_dev);
c3827f61 522 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
46415c90 523
149f577a 524 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
525 return 0;
526}
527
5b7448ed
JG
528/* For most types of netdevs we open the device for each call of
529 * netdev_open(). However, this is not the case with tap devices,
530 * since it is only possible to open the device once. In this
531 * situation we share a single file descriptor, and consequently
532 * buffers, across all readers. Therefore once data is read it will
533 * be unavailable to other reads for tap devices. */
a740f0de 534static int
b8dcf5e9
BP
535netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
536 const char *name, const struct shash *args,
537 struct netdev_dev **netdev_devp)
a740f0de 538{
149f577a 539 struct netdev_dev_linux *netdev_dev;
a740f0de
JG
540 struct tap_state *state;
541 static const char tap_dev[] = "/dev/net/tun";
542 struct ifreq ifr;
543 int error;
544
545 if (!shash_is_empty(args)) {
149f577a 546 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
6c88d577
JP
547 }
548
149f577a
JG
549 netdev_dev = xzalloc(sizeof *netdev_dev);
550 state = &netdev_dev->state.tap;
a740f0de 551
6c88d577 552 /* Open tap device. */
149f577a
JG
553 state->fd = open(tap_dev, O_RDWR);
554 if (state->fd < 0) {
6c88d577
JP
555 error = errno;
556 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
557 goto error;
558 }
559
560 /* Create tap device. */
561 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
562 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
149f577a 563 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
6c88d577
JP
564 VLOG_WARN("%s: creating tap device failed: %s", name,
565 strerror(errno));
566 error = errno;
567 goto error;
568 }
569
570 /* Make non-blocking. */
149f577a 571 error = set_nonblocking(state->fd);
a740f0de
JG
572 if (error) {
573 goto error;
574 }
575
149f577a
JG
576 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
577 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
578 return 0;
579
580error:
149f577a 581 free(netdev_dev);
a740f0de
JG
582 return error;
583}
584
a740f0de 585static void
149f577a 586destroy_tap(struct netdev_dev_linux *netdev_dev)
a740f0de 587{
149f577a
JG
588 struct tap_state *state = &netdev_dev->state.tap;
589
590 if (state->fd >= 0) {
591 close(state->fd);
a740f0de
JG
592 }
593}
594
149f577a 595/* Destroys the netdev device 'netdev_dev_'. */
6c88d577 596static void
149f577a 597netdev_linux_destroy(struct netdev_dev *netdev_dev_)
6c88d577 598{
149f577a
JG
599 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
600 const char *type = netdev_dev_get_type(netdev_dev_);
6c88d577 601
c1c9c9c4
BP
602 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
603 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
604 }
605
149f577a 606 if (!strcmp(type, "system")) {
46415c90 607 cache_notifier_refcount--;
149f577a 608
46415c90 609 if (!cache_notifier_refcount) {
149f577a
JG
610 rtnetlink_notifier_unregister(&netdev_linux_cache_notifier);
611 }
612 } else if (!strcmp(type, "tap")) {
613 destroy_tap(netdev_dev);
6c88d577 614 }
149f577a 615
658797c8 616 free(netdev_dev);
6c88d577
JP
617}
618
8b61709d 619static int
5b7448ed 620netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
149f577a 621 struct netdev **netdevp)
8b61709d 622{
5b7448ed 623 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
8b61709d
BP
624 struct netdev_linux *netdev;
625 enum netdev_flags flags;
626 int error;
627
628 /* Allocate network device. */
ec6fde61 629 netdev = xzalloc(sizeof *netdev);
49a6a163 630 netdev->fd = -1;
5b7448ed 631 netdev_init(&netdev->netdev, netdev_dev_);
8b61709d 632
c3827f61
BP
633 /* Verify that the device really exists, by attempting to read its flags.
634 * (The flags might be cached, in which case this won't actually do an
635 * ioctl.)
636 *
637 * Don't do this for "internal" netdevs, though, because those have to be
638 * created as netdev objects before they exist in the kernel, because
639 * creating them in the kernel happens by passing a netdev object to
640 * dpif_port_add(). */
641 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
642 error = netdev_get_flags(&netdev->netdev, &flags);
643 if (error == ENODEV) {
644 goto error;
645 }
8b61709d
BP
646 }
647
61b999dd
JG
648 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
649 !netdev_dev->state.tap.opened) {
650
651 /* We assume that the first user of the tap device is the primary user
652 * and give them the tap FD. Subsequent users probably just expect
653 * this to be a system device so open it normally to avoid send/receive
654 * directions appearing to be reversed. */
5b7448ed 655 netdev->fd = netdev_dev->state.tap.fd;
61b999dd 656 netdev_dev->state.tap.opened = true;
5b7448ed 657 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
8b61709d
BP
658 struct sockaddr_ll sll;
659 int protocol;
660 int ifindex;
661
662 /* Create file descriptor. */
663 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
664 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
665 : ethertype);
5b7448ed
JG
666 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
667 if (netdev->fd < 0) {
8b61709d
BP
668 error = errno;
669 goto error;
670 }
8b61709d
BP
671
672 /* Set non-blocking mode. */
5b7448ed 673 error = set_nonblocking(netdev->fd);
8b61709d
BP
674 if (error) {
675 goto error;
676 }
677
678 /* Get ethernet device index. */
679 error = get_ifindex(&netdev->netdev, &ifindex);
680 if (error) {
681 goto error;
682 }
683
684 /* Bind to specific ethernet device. */
685 memset(&sll, 0, sizeof sll);
686 sll.sll_family = AF_PACKET;
687 sll.sll_ifindex = ifindex;
5b7448ed 688 if (bind(netdev->fd,
8b61709d
BP
689 (struct sockaddr *) &sll, sizeof sll) < 0) {
690 error = errno;
5b7448ed 691 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
149f577a 692 strerror(error));
8b61709d
BP
693 goto error;
694 }
695
696 /* Between the socket() and bind() calls above, the socket receives all
697 * packets of the requested type on all system interfaces. We do not
698 * want to receive that data, but there is no way to avoid it. So we
699 * must now drain out the receive queue. */
5b7448ed 700 error = drain_rcvbuf(netdev->fd);
8b61709d
BP
701 if (error) {
702 goto error;
703 }
704 }
705
706 *netdevp = &netdev->netdev;
707 return 0;
708
709error:
149f577a 710 netdev_uninit(&netdev->netdev, true);
8b61709d
BP
711 return error;
712}
713
714/* Closes and destroys 'netdev'. */
715static void
716netdev_linux_close(struct netdev *netdev_)
717{
718 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
719
49a6a163 720 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
5b7448ed 721 close(netdev->fd);
8b61709d
BP
722 }
723 free(netdev);
724}
e9e28be3 725
8b61709d
BP
/* Adds the names of all network devices reported by if_nameindex() to 'svec'.
 *
 * Returns 0 on success.  If if_nameindex() fails, logs a warning and returns
 * the errno value that it set. */
static int
netdev_linux_enumerate(struct svec *svec)
{
    struct if_nameindex *names;

    names = if_nameindex();
    if (names) {
        size_t i;

        for (i = 0; names[i].if_name != NULL; i++) {
            svec_add(svec, names[i].if_name);
        }
        if_freenameindex(names);
        return 0;
    } else {
        /* Capture errno first: logging may change it. */
        int error = errno;

        VLOG_WARN("could not obtain list of network device names: %s",
                  strerror(error));
        return error;
    }
}
747
748static int
749netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
750{
751 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
752
5b7448ed 753 if (netdev->fd < 0) {
8b61709d 754 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
c0e5f6ca 755 return -EAGAIN;
8b61709d
BP
756 }
757
758 for (;;) {
5b7448ed 759 ssize_t retval = read(netdev->fd, data, size);
8b61709d
BP
760 if (retval >= 0) {
761 return retval;
762 } else if (errno != EINTR) {
763 if (errno != EAGAIN) {
764 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
765 strerror(errno), netdev_get_name(netdev_));
766 }
c0e5f6ca 767 return -errno;
8b61709d
BP
768 }
769 }
770}
771
772/* Registers with the poll loop to wake up from the next call to poll_block()
773 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
774static void
775netdev_linux_recv_wait(struct netdev *netdev_)
776{
777 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed
JG
778 if (netdev->fd >= 0) {
779 poll_fd_wait(netdev->fd, POLLIN);
8b61709d
BP
780 }
781}
782
783/* Discards all packets waiting to be received from 'netdev'. */
784static int
785netdev_linux_drain(struct netdev *netdev_)
786{
787 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 788 if (netdev->fd < 0) {
8b61709d 789 return 0;
5b7448ed 790 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
8b61709d 791 struct ifreq ifr;
149f577a 792 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
8b61709d
BP
793 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
794 if (error) {
795 return error;
796 }
5b7448ed 797 drain_fd(netdev->fd, ifr.ifr_qlen);
8b61709d
BP
798 return 0;
799 } else {
5b7448ed 800 return drain_rcvbuf(netdev->fd);
8b61709d
BP
801 }
802}
803
804/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
805 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
806 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
807 * the packet is too big or too small to transmit on the device.
808 *
809 * The caller retains ownership of 'buffer' in all cases.
810 *
811 * The kernel maintains a packet transmission queue, so the caller is not
812 * expected to do additional queuing of packets. */
813static int
814netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
815{
816 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
817
818 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
819 */
5b7448ed 820 if (netdev->fd < 0) {
8b61709d
BP
821 return EPIPE;
822 }
823
824 for (;;) {
5b7448ed 825 ssize_t retval = write(netdev->fd, data, size);
8b61709d
BP
826 if (retval < 0) {
827 /* The Linux AF_PACKET implementation never blocks waiting for room
828 * for packets, instead returning ENOBUFS. Translate this into
829 * EAGAIN for the caller. */
830 if (errno == ENOBUFS) {
831 return EAGAIN;
832 } else if (errno == EINTR) {
833 continue;
834 } else if (errno != EAGAIN) {
835 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
836 netdev_get_name(netdev_), strerror(errno));
837 }
838 return errno;
839 } else if (retval != size) {
840 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
841 "%zu) on %s", retval, size, netdev_get_name(netdev_));
842 return EMSGSIZE;
843 } else {
844 return 0;
845 }
846 }
847}
848
849/* Registers with the poll loop to wake up from the next call to poll_block()
850 * when the packet transmission queue has sufficient room to transmit a packet
851 * with netdev_send().
852 *
853 * The kernel maintains a packet transmission queue, so the client is not
854 * expected to do additional queuing of packets. Thus, this function is
855 * unlikely to ever be used. It is included for completeness. */
856static void
857netdev_linux_send_wait(struct netdev *netdev_)
858{
859 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 860 if (netdev->fd < 0) {
8b61709d 861 /* Nothing to do. */
5b7448ed
JG
862 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
863 poll_fd_wait(netdev->fd, POLLOUT);
8b61709d
BP
864 } else {
865 /* TAP device always accepts packets.*/
866 poll_immediate_wake();
867 }
868}
869
870/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
871 * otherwise a positive errno value. */
872static int
873netdev_linux_set_etheraddr(struct netdev *netdev_,
874 const uint8_t mac[ETH_ADDR_LEN])
875{
149f577a
JG
876 struct netdev_dev_linux *netdev_dev =
877 netdev_dev_linux_cast(netdev_get_dev(netdev_));
eb395f2e
BP
878 int error;
879
149f577a
JG
880 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
881 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
eb395f2e
BP
882 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
883 if (!error) {
149f577a
JG
884 netdev_dev->cache_valid |= VALID_ETHERADDR;
885 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e
BP
886 }
887 } else {
888 error = 0;
8b61709d
BP
889 }
890 return error;
891}
892
893/* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
894 * free the returned buffer. */
895static int
896netdev_linux_get_etheraddr(const struct netdev *netdev_,
897 uint8_t mac[ETH_ADDR_LEN])
898{
149f577a
JG
899 struct netdev_dev_linux *netdev_dev =
900 netdev_dev_linux_cast(netdev_get_dev(netdev_));
901 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
8b61709d 902 int error = get_etheraddr(netdev_get_name(netdev_),
149f577a 903 netdev_dev->etheraddr);
8b61709d
BP
904 if (error) {
905 return error;
906 }
149f577a 907 netdev_dev->cache_valid |= VALID_ETHERADDR;
8b61709d 908 }
149f577a 909 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
8b61709d
BP
910 return 0;
911}
912
913/* Returns the maximum size of transmitted (and received) packets on 'netdev',
914 * in bytes, not including the hardware header; thus, this is typically 1500
915 * bytes for Ethernet devices. */
916static int
917netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
918{
149f577a
JG
919 struct netdev_dev_linux *netdev_dev =
920 netdev_dev_linux_cast(netdev_get_dev(netdev_));
921 if (!(netdev_dev->cache_valid & VALID_MTU)) {
8b61709d
BP
922 struct ifreq ifr;
923 int error;
924
149f577a
JG
925 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
926 SIOCGIFMTU, "SIOCGIFMTU");
8b61709d
BP
927 if (error) {
928 return error;
929 }
149f577a
JG
930 netdev_dev->mtu = ifr.ifr_mtu;
931 netdev_dev->cache_valid |= VALID_MTU;
8b61709d 932 }
149f577a 933 *mtup = netdev_dev->mtu;
8b61709d
BP
934 return 0;
935}
936
9ab3d9a3
BP
/* Looks up the ifindex of 'netdev' with get_ifindex().  Returns the ifindex
 * on success.  On failure, returns the negation of get_ifindex()'s error
 * value, i.e. a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
947
8b61709d
BP
948static int
949netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
950{
149f577a
JG
951 struct netdev_dev_linux *netdev_dev =
952 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
953 int error = 0;
954 char *fn = NULL;
955 int fd = -1;
956
149f577a 957 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
8b61709d
BP
958 char line[8];
959 int retval;
960
149f577a
JG
961 fn = xasprintf("/sys/class/net/%s/carrier",
962 netdev_get_name(netdev_));
8b61709d
BP
963 fd = open(fn, O_RDONLY);
964 if (fd < 0) {
965 error = errno;
966 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
967 goto exit;
968 }
969
970 retval = read(fd, line, sizeof line);
971 if (retval < 0) {
972 error = errno;
973 if (error == EINVAL) {
974 /* This is the normal return value when we try to check carrier
975 * if the network device is not up. */
976 } else {
977 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
978 }
979 goto exit;
980 } else if (retval == 0) {
981 error = EPROTO;
982 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
983 goto exit;
984 }
985
986 if (line[0] != '0' && line[0] != '1') {
987 error = EPROTO;
988 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
989 fn, line[0]);
990 goto exit;
991 }
149f577a
JG
992 netdev_dev->carrier = line[0] != '0';
993 netdev_dev->cache_valid |= VALID_CARRIER;
8b61709d 994 }
149f577a 995 *carrier = netdev_dev->carrier;
8b61709d
BP
996 error = 0;
997
998exit:
999 if (fd >= 0) {
1000 close(fd);
1001 }
1002 free(fn);
1003 return error;
1004}
1005
1006/* Check whether we can we use RTM_GETLINK to get network device statistics.
1007 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1008 * enabled. */
1009static bool
1010check_for_working_netlink_stats(void)
1011{
1012 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1013 * preferable, so if that works, we'll use it. */
1014 int ifindex = do_get_ifindex("lo");
1015 if (ifindex < 0) {
1016 VLOG_WARN("failed to get ifindex for lo, "
1017 "obtaining netdev stats from proc");
1018 return false;
1019 } else {
1020 struct netdev_stats stats;
1021 int error = get_stats_via_netlink(ifindex, &stats);
1022 if (!error) {
1023 VLOG_DBG("obtaining netdev stats via rtnetlink");
1024 return true;
1025 } else {
1026 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1027 "via proc (you are probably running a pre-2.6.19 "
1028 "kernel)", strerror(error));
1029 return false;
1030 }
1031 }
1032}
1033
8722022c
BP
1034/* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1035static void
1036netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1037{
1038 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1039 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1040 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
d295e8e9 1041
8722022c
BP
1042 netdev_dev->is_tap = !strcmp(type, "tap");
1043 netdev_dev->is_internal = false;
1044 if (!netdev_dev->is_tap) {
1045 struct ethtool_drvinfo drvinfo;
1046 int error;
1047
1048 memset(&drvinfo, 0, sizeof drvinfo);
1049 error = netdev_linux_do_ethtool(name,
1050 (struct ethtool_cmd *)&drvinfo,
1051 ETHTOOL_GDRVINFO,
1052 "ETHTOOL_GDRVINFO");
1053
1054 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1055 netdev_dev->is_internal = true;
1056 }
1057 }
1058
1059 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1060 }
1061}
1062
92df599c
JG
/* Exchanges the values of '*a' and '*b'.
 *
 * A temporary is used rather than the XOR trick so that the function is also
 * correct when 'a' and 'b' point to the same object (an XOR swap would zero
 * the value in that case). */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;

    *a = *b;
    *b = tmp;
}
1070
7fbef77a 1071/* Retrieves current device stats for 'netdev'. */
8b61709d 1072static int
149f577a
JG
1073netdev_linux_get_stats(const struct netdev *netdev_,
1074 struct netdev_stats *stats)
8b61709d 1075{
149f577a
JG
1076 struct netdev_dev_linux *netdev_dev =
1077 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1078 static int use_netlink_stats = -1;
1079 int error;
1080
7fbef77a
JG
1081 if (netdev_dev->have_vport_stats ||
1082 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1083
1084 error = netdev_vport_get_stats(netdev_, stats);
1085 netdev_dev->have_vport_stats = !error;
1086 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
8b61709d 1087 }
8b61709d 1088
7fbef77a
JG
1089 if (!netdev_dev->have_vport_stats) {
1090 if (use_netlink_stats < 0) {
1091 use_netlink_stats = check_for_working_netlink_stats();
1092 }
1093 if (use_netlink_stats) {
1094 int ifindex;
1095
1096 error = get_ifindex(netdev_, &ifindex);
1097 if (!error) {
1098 error = get_stats_via_netlink(ifindex, stats);
1099 }
1100 } else {
1101 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
8b61709d 1102 }
8b61709d 1103 }
fe6b0e03
JG
1104
1105 /* If this port is an internal port then the transmit and receive stats
1106 * will appear to be swapped relative to the other ports since we are the
1107 * one sending the data, not a remote computer. For consistency, we swap
7fbef77a
JG
1108 * them back here. This does not apply if we are getting stats from the
1109 * vport layer because it always tracks stats from the perspective of the
1110 * switch. */
92df599c 1111 netdev_linux_update_is_pseudo(netdev_dev);
7fbef77a
JG
1112 if (!error && !netdev_dev->have_vport_stats &&
1113 (netdev_dev->is_internal || netdev_dev->is_tap)) {
92df599c
JG
1114 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1115 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1116 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1117 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1118 stats->rx_length_errors = 0;
1119 stats->rx_over_errors = 0;
1120 stats->rx_crc_errors = 0;
1121 stats->rx_frame_errors = 0;
1122 stats->rx_fifo_errors = 0;
1123 stats->rx_missed_errors = 0;
1124 stats->tx_aborted_errors = 0;
1125 stats->tx_carrier_errors = 0;
1126 stats->tx_fifo_errors = 0;
1127 stats->tx_heartbeat_errors = 0;
1128 stats->tx_window_errors = 0;
1129 }
1130
8b61709d
BP
1131 return error;
1132}
1133
1134/* Stores the features supported by 'netdev' into each of '*current',
1135 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1136 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
7671589a 1137 * successful, otherwise a positive errno value. */
8b61709d
BP
1138static int
1139netdev_linux_get_features(struct netdev *netdev,
1140 uint32_t *current, uint32_t *advertised,
1141 uint32_t *supported, uint32_t *peer)
1142{
1143 struct ethtool_cmd ecmd;
1144 int error;
1145
1146 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1147 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1148 ETHTOOL_GSET, "ETHTOOL_GSET");
1149 if (error) {
1150 return error;
1151 }
1152
1153 /* Supported features. */
1154 *supported = 0;
1155 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1156 *supported |= OFPPF_10MB_HD;
1157 }
1158 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1159 *supported |= OFPPF_10MB_FD;
1160 }
1161 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1162 *supported |= OFPPF_100MB_HD;
1163 }
1164 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1165 *supported |= OFPPF_100MB_FD;
1166 }
1167 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1168 *supported |= OFPPF_1GB_HD;
1169 }
1170 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1171 *supported |= OFPPF_1GB_FD;
1172 }
1173 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1174 *supported |= OFPPF_10GB_FD;
1175 }
1176 if (ecmd.supported & SUPPORTED_TP) {
1177 *supported |= OFPPF_COPPER;
1178 }
1179 if (ecmd.supported & SUPPORTED_FIBRE) {
1180 *supported |= OFPPF_FIBER;
1181 }
1182 if (ecmd.supported & SUPPORTED_Autoneg) {
1183 *supported |= OFPPF_AUTONEG;
1184 }
1185 if (ecmd.supported & SUPPORTED_Pause) {
1186 *supported |= OFPPF_PAUSE;
1187 }
1188 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1189 *supported |= OFPPF_PAUSE_ASYM;
1190 }
1191
1192 /* Advertised features. */
1193 *advertised = 0;
1194 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1195 *advertised |= OFPPF_10MB_HD;
1196 }
1197 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1198 *advertised |= OFPPF_10MB_FD;
1199 }
1200 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1201 *advertised |= OFPPF_100MB_HD;
1202 }
1203 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1204 *advertised |= OFPPF_100MB_FD;
1205 }
1206 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1207 *advertised |= OFPPF_1GB_HD;
1208 }
1209 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1210 *advertised |= OFPPF_1GB_FD;
1211 }
1212 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1213 *advertised |= OFPPF_10GB_FD;
1214 }
1215 if (ecmd.advertising & ADVERTISED_TP) {
1216 *advertised |= OFPPF_COPPER;
1217 }
1218 if (ecmd.advertising & ADVERTISED_FIBRE) {
1219 *advertised |= OFPPF_FIBER;
1220 }
1221 if (ecmd.advertising & ADVERTISED_Autoneg) {
1222 *advertised |= OFPPF_AUTONEG;
1223 }
1224 if (ecmd.advertising & ADVERTISED_Pause) {
1225 *advertised |= OFPPF_PAUSE;
1226 }
1227 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1228 *advertised |= OFPPF_PAUSE_ASYM;
1229 }
1230
1231 /* Current settings. */
1232 if (ecmd.speed == SPEED_10) {
1233 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1234 } else if (ecmd.speed == SPEED_100) {
1235 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1236 } else if (ecmd.speed == SPEED_1000) {
1237 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1238 } else if (ecmd.speed == SPEED_10000) {
1239 *current = OFPPF_10GB_FD;
1240 } else {
1241 *current = 0;
1242 }
1243
1244 if (ecmd.port == PORT_TP) {
1245 *current |= OFPPF_COPPER;
1246 } else if (ecmd.port == PORT_FIBRE) {
1247 *current |= OFPPF_FIBER;
1248 }
1249
1250 if (ecmd.autoneg) {
1251 *current |= OFPPF_AUTONEG;
1252 }
1253
1254 /* Peer advertisements. */
1255 *peer = 0; /* XXX */
1256
1257 return 0;
1258}
1259
1260/* Set the features advertised by 'netdev' to 'advertise'. */
1261static int
1262netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1263{
1264 struct ethtool_cmd ecmd;
1265 int error;
1266
1267 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1268 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1269 ETHTOOL_GSET, "ETHTOOL_GSET");
1270 if (error) {
1271 return error;
1272 }
1273
1274 ecmd.advertising = 0;
1275 if (advertise & OFPPF_10MB_HD) {
1276 ecmd.advertising |= ADVERTISED_10baseT_Half;
1277 }
1278 if (advertise & OFPPF_10MB_FD) {
1279 ecmd.advertising |= ADVERTISED_10baseT_Full;
1280 }
1281 if (advertise & OFPPF_100MB_HD) {
1282 ecmd.advertising |= ADVERTISED_100baseT_Half;
1283 }
1284 if (advertise & OFPPF_100MB_FD) {
1285 ecmd.advertising |= ADVERTISED_100baseT_Full;
1286 }
1287 if (advertise & OFPPF_1GB_HD) {
1288 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1289 }
1290 if (advertise & OFPPF_1GB_FD) {
1291 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1292 }
1293 if (advertise & OFPPF_10GB_FD) {
1294 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1295 }
1296 if (advertise & OFPPF_COPPER) {
1297 ecmd.advertising |= ADVERTISED_TP;
1298 }
1299 if (advertise & OFPPF_FIBER) {
1300 ecmd.advertising |= ADVERTISED_FIBRE;
1301 }
1302 if (advertise & OFPPF_AUTONEG) {
1303 ecmd.advertising |= ADVERTISED_Autoneg;
1304 }
1305 if (advertise & OFPPF_PAUSE) {
1306 ecmd.advertising |= ADVERTISED_Pause;
1307 }
1308 if (advertise & OFPPF_PAUSE_ASYM) {
1309 ecmd.advertising |= ADVERTISED_Asym_Pause;
1310 }
0b0544d7 1311 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1312 ETHTOOL_SSET, "ETHTOOL_SSET");
1313}
1314
1315/* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1316 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1317 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1318 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1319 * sets '*vlan_vid' to -1. */
1320static int
1321netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1322{
1323 const char *netdev_name = netdev_get_name(netdev);
1324 struct ds line = DS_EMPTY_INITIALIZER;
1325 FILE *stream = NULL;
1326 int error;
1327 char *fn;
1328
1329 COVERAGE_INC(netdev_get_vlan_vid);
1330 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1331 stream = fopen(fn, "r");
1332 if (!stream) {
1333 error = errno;
1334 goto done;
1335 }
1336
1337 if (ds_get_line(&line, stream)) {
1338 if (ferror(stream)) {
1339 error = errno;
1340 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1341 } else {
1342 error = EPROTO;
1343 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1344 }
1345 goto done;
1346 }
1347
1348 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1349 error = EPROTO;
1350 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1351 fn, ds_cstr(&line));
1352 goto done;
1353 }
1354
1355 error = 0;
1356
1357done:
1358 free(fn);
1359 if (stream) {
1360 fclose(stream);
1361 }
1362 ds_destroy(&line);
1363 if (error) {
1364 *vlan_vid = -1;
1365 }
1366 return error;
1367}
1368
/* printf-style templates for the tc(8) commands that set up ingress
 * policing.  POLICE_ADD_CMD takes the device name.  POLICE_CONFIG_CMD takes
 * the device name, the rate in kbit/s, and the burst size; the two numbers
 * are passed as uint32_t, hence the PRIu32 conversions. */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %"PRIu32"kbit burst %"PRIu32"k mtu 65535 drop flowid :1"
8b61709d 1371
8e460221 1372/* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
6f42c8ea
BP
1373 * positive errno value.
1374 *
1375 * This function is equivalent to running
1376 * /sbin/tc qdisc del dev %s handle ffff: ingress
1377 * but it is much, much faster.
1378 */
8e460221
BP
1379static int
1380netdev_linux_remove_policing(struct netdev *netdev)
1381{
80a86fbe
BP
1382 struct netdev_dev_linux *netdev_dev =
1383 netdev_dev_linux_cast(netdev_get_dev(netdev));
8e460221 1384 const char *netdev_name = netdev_get_name(netdev);
8e460221 1385
6f42c8ea 1386 struct ofpbuf request;
6f42c8ea 1387 struct tcmsg *tcmsg;
6f42c8ea
BP
1388 int error;
1389
c1c9c9c4 1390 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
1391 if (!tcmsg) {
1392 return ENODEV;
1393 }
c1c9c9c4 1394 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
6f42c8ea
BP
1395 tcmsg->tcm_parent = TC_H_INGRESS;
1396 nl_msg_put_string(&request, TCA_KIND, "ingress");
1397 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
c1c9c9c4
BP
1398
1399 error = tc_transact(&request, NULL);
4d10512c 1400 if (error && error != ENOENT && error != EINVAL) {
6f42c8ea
BP
1401 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1402 netdev_name, strerror(error));
1403 return error;
1404 }
1405
80a86fbe
BP
1406 netdev_dev->kbits_rate = 0;
1407 netdev_dev->kbits_burst = 0;
1408 netdev_dev->cache_valid |= VALID_POLICING;
8e460221
BP
1409 return 0;
1410}
1411
8b61709d
BP
1412/* Attempts to set input rate limiting (policing) policy. */
1413static int
1414netdev_linux_set_policing(struct netdev *netdev,
1415 uint32_t kbits_rate, uint32_t kbits_burst)
1416{
80a86fbe
BP
1417 struct netdev_dev_linux *netdev_dev =
1418 netdev_dev_linux_cast(netdev_get_dev(netdev));
8b61709d
BP
1419 const char *netdev_name = netdev_get_name(netdev);
1420 char command[1024];
1421
1422 COVERAGE_INC(netdev_set_policing);
8e460221 1423
80a86fbe
BP
1424 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1425 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1426 : kbits_burst); /* Stick with user-specified value. */
1427
1428 if (netdev_dev->cache_valid & VALID_POLICING
1429 && netdev_dev->kbits_rate == kbits_rate
1430 && netdev_dev->kbits_burst == kbits_burst) {
1431 /* Assume that settings haven't changed since we last set them. */
1432 return 0;
1433 }
1434
8e460221 1435 netdev_linux_remove_policing(netdev);
8b61709d 1436 if (kbits_rate) {
8b61709d
BP
1437 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1438 if (system(command) != 0) {
1439 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1440 return -1;
1441 }
1442
1443 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1444 kbits_rate, kbits_burst);
1445 if (system(command) != 0) {
1446 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1447 netdev_name);
1448 return -1;
1449 }
80a86fbe
BP
1450
1451 netdev_dev->kbits_rate = kbits_rate;
1452 netdev_dev->kbits_burst = kbits_burst;
1453 netdev_dev->cache_valid |= VALID_POLICING;
8b61709d
BP
1454 }
1455
1456 return 0;
1457}
1458
c1c9c9c4
BP
1459static int
1460netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1461 struct svec *types)
1462{
1463 const struct tc_ops **opsp;
1464
1465 for (opsp = tcs; *opsp != NULL; opsp++) {
1466 const struct tc_ops *ops = *opsp;
1467 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1468 svec_add(types, ops->ovs_name);
1469 }
1470 }
1471 return 0;
1472}
1473
1474static const struct tc_ops *
1475tc_lookup_ovs_name(const char *name)
1476{
1477 const struct tc_ops **opsp;
1478
1479 for (opsp = tcs; *opsp != NULL; opsp++) {
1480 const struct tc_ops *ops = *opsp;
1481 if (!strcmp(name, ops->ovs_name)) {
1482 return ops;
1483 }
1484 }
1485 return NULL;
1486}
1487
1488static const struct tc_ops *
1489tc_lookup_linux_name(const char *name)
1490{
1491 const struct tc_ops **opsp;
1492
1493 for (opsp = tcs; *opsp != NULL; opsp++) {
1494 const struct tc_ops *ops = *opsp;
1495 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1496 return ops;
1497 }
1498 }
1499 return NULL;
1500}
1501
93b13be8
BP
1502static struct tc_queue *
1503tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1504 size_t hash)
1505{
1506 struct netdev_dev_linux *netdev_dev =
1507 netdev_dev_linux_cast(netdev_get_dev(netdev));
1508 struct tc_queue *queue;
1509
4e8e4213 1510 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
93b13be8
BP
1511 if (queue->queue_id == queue_id) {
1512 return queue;
1513 }
1514 }
1515 return NULL;
1516}
1517
/* Returns the queue of 'netdev' whose ID is 'queue_id', or a null pointer if
 * there is none.  The queue ID is hashed with hash_int() using basis 0. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1523
c1c9c9c4
BP
1524static int
1525netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1526 const char *type,
1527 struct netdev_qos_capabilities *caps)
1528{
1529 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1530 if (!ops) {
1531 return EOPNOTSUPP;
1532 }
1533 caps->n_queues = ops->n_queues;
1534 return 0;
1535}
1536
1537static int
1538netdev_linux_get_qos(const struct netdev *netdev,
1539 const char **typep, struct shash *details)
1540{
1541 struct netdev_dev_linux *netdev_dev =
1542 netdev_dev_linux_cast(netdev_get_dev(netdev));
1543 int error;
1544
1545 error = tc_query_qdisc(netdev);
1546 if (error) {
1547 return error;
1548 }
1549
1550 *typep = netdev_dev->tc->ops->ovs_name;
1551 return (netdev_dev->tc->ops->qdisc_get
1552 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1553 : 0);
1554}
1555
1556static int
1557netdev_linux_set_qos(struct netdev *netdev,
1558 const char *type, const struct shash *details)
1559{
1560 struct netdev_dev_linux *netdev_dev =
1561 netdev_dev_linux_cast(netdev_get_dev(netdev));
1562 const struct tc_ops *new_ops;
1563 int error;
1564
1565 new_ops = tc_lookup_ovs_name(type);
1566 if (!new_ops || !new_ops->tc_install) {
1567 return EOPNOTSUPP;
1568 }
1569
1570 error = tc_query_qdisc(netdev);
1571 if (error) {
1572 return error;
1573 }
1574
1575 if (new_ops == netdev_dev->tc->ops) {
1576 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1577 } else {
1578 /* Delete existing qdisc. */
1579 error = tc_del_qdisc(netdev);
1580 if (error) {
1581 return error;
1582 }
1583 assert(netdev_dev->tc == NULL);
1584
1585 /* Install new qdisc. */
1586 error = new_ops->tc_install(netdev, details);
1587 assert((error == 0) == (netdev_dev->tc != NULL));
1588
1589 return error;
1590 }
1591}
1592
1593static int
1594netdev_linux_get_queue(const struct netdev *netdev,
1595 unsigned int queue_id, struct shash *details)
1596{
1597 struct netdev_dev_linux *netdev_dev =
1598 netdev_dev_linux_cast(netdev_get_dev(netdev));
1599 int error;
1600
1601 error = tc_query_qdisc(netdev);
1602 if (error) {
1603 return error;
93b13be8
BP
1604 } else {
1605 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1606 return (queue
1607 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1608 : ENOENT);
c1c9c9c4 1609 }
c1c9c9c4
BP
1610}
1611
1612static int
1613netdev_linux_set_queue(struct netdev *netdev,
1614 unsigned int queue_id, const struct shash *details)
1615{
1616 struct netdev_dev_linux *netdev_dev =
1617 netdev_dev_linux_cast(netdev_get_dev(netdev));
1618 int error;
1619
1620 error = tc_query_qdisc(netdev);
1621 if (error) {
1622 return error;
1623 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1624 || !netdev_dev->tc->ops->class_set) {
1625 return EINVAL;
1626 }
1627
1628 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1629}
1630
1631static int
1632netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1633{
1634 struct netdev_dev_linux *netdev_dev =
1635 netdev_dev_linux_cast(netdev_get_dev(netdev));
1636 int error;
1637
1638 error = tc_query_qdisc(netdev);
1639 if (error) {
1640 return error;
1641 } else if (!netdev_dev->tc->ops->class_delete) {
1642 return EINVAL;
93b13be8
BP
1643 } else {
1644 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1645 return (queue
1646 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1647 : ENOENT);
c1c9c9c4 1648 }
c1c9c9c4
BP
1649}
1650
1651static int
1652netdev_linux_get_queue_stats(const struct netdev *netdev,
1653 unsigned int queue_id,
1654 struct netdev_queue_stats *stats)
1655{
1656 struct netdev_dev_linux *netdev_dev =
1657 netdev_dev_linux_cast(netdev_get_dev(netdev));
1658 int error;
1659
1660 error = tc_query_qdisc(netdev);
1661 if (error) {
1662 return error;
c1c9c9c4
BP
1663 } else if (!netdev_dev->tc->ops->class_get_stats) {
1664 return EOPNOTSUPP;
93b13be8
BP
1665 } else {
1666 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1667 return (queue
1668 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1669 : ENOENT);
c1c9c9c4 1670 }
c1c9c9c4
BP
1671}
1672
23a98ffe 1673static bool
c1c9c9c4
BP
1674start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1675{
1676 struct ofpbuf request;
1677 struct tcmsg *tcmsg;
1678
1679 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
1680 if (!tcmsg) {
1681 return false;
1682 }
3c4de644 1683 tcmsg->tcm_parent = 0;
c1c9c9c4
BP
1684 nl_dump_start(dump, rtnl_sock, &request);
1685 ofpbuf_uninit(&request);
23a98ffe 1686 return true;
c1c9c9c4
BP
1687}
1688
1689static int
1690netdev_linux_dump_queues(const struct netdev *netdev,
1691 netdev_dump_queues_cb *cb, void *aux)
1692{
1693 struct netdev_dev_linux *netdev_dev =
1694 netdev_dev_linux_cast(netdev_get_dev(netdev));
93b13be8 1695 struct tc_queue *queue;
c1c9c9c4
BP
1696 struct shash details;
1697 int last_error;
c1c9c9c4
BP
1698 int error;
1699
1700 error = tc_query_qdisc(netdev);
1701 if (error) {
1702 return error;
1703 } else if (!netdev_dev->tc->ops->class_get) {
1704 return EOPNOTSUPP;
1705 }
1706
1707 last_error = 0;
1708 shash_init(&details);
4e8e4213 1709 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
c1c9c9c4
BP
1710 shash_clear(&details);
1711
93b13be8 1712 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
c1c9c9c4 1713 if (!error) {
93b13be8 1714 (*cb)(queue->queue_id, &details, aux);
c1c9c9c4
BP
1715 } else {
1716 last_error = error;
1717 }
1718 }
1719 shash_destroy(&details);
1720
1721 return last_error;
1722}
1723
1724static int
1725netdev_linux_dump_queue_stats(const struct netdev *netdev,
1726 netdev_dump_queue_stats_cb *cb, void *aux)
1727{
1728 struct netdev_dev_linux *netdev_dev =
1729 netdev_dev_linux_cast(netdev_get_dev(netdev));
1730 struct nl_dump dump;
1731 struct ofpbuf msg;
1732 int last_error;
1733 int error;
1734
1735 error = tc_query_qdisc(netdev);
1736 if (error) {
1737 return error;
1738 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1739 return EOPNOTSUPP;
1740 }
1741
1742 last_error = 0;
23a98ffe
BP
1743 if (!start_queue_dump(netdev, &dump)) {
1744 return ENODEV;
1745 }
c1c9c9c4
BP
1746 while (nl_dump_next(&dump, &msg)) {
1747 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1748 if (error) {
1749 last_error = error;
1750 }
1751 }
1752
1753 error = nl_dump_done(&dump);
1754 return error ? error : last_error;
1755}
1756
8b61709d 1757static int
f1acd62b
BP
1758netdev_linux_get_in4(const struct netdev *netdev_,
1759 struct in_addr *address, struct in_addr *netmask)
8b61709d 1760{
149f577a
JG
1761 struct netdev_dev_linux *netdev_dev =
1762 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1763
1764 if (!(netdev_dev->cache_valid & VALID_IN4)) {
8b61709d
BP
1765 int error;
1766
149f577a 1767 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
8b61709d
BP
1768 SIOCGIFADDR, "SIOCGIFADDR");
1769 if (error) {
1770 return error;
1771 }
1772
149f577a 1773 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
f1acd62b
BP
1774 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1775 if (error) {
1776 return error;
1777 }
1778
149f577a 1779 netdev_dev->cache_valid |= VALID_IN4;
8b61709d 1780 }
149f577a
JG
1781 *address = netdev_dev->address;
1782 *netmask = netdev_dev->netmask;
f1acd62b 1783 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
8b61709d
BP
1784}
1785
8b61709d 1786static int
f1acd62b
BP
1787netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1788 struct in_addr netmask)
8b61709d 1789{
149f577a
JG
1790 struct netdev_dev_linux *netdev_dev =
1791 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1792 int error;
1793
f1acd62b 1794 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 1795 if (!error) {
149f577a
JG
1796 netdev_dev->cache_valid |= VALID_IN4;
1797 netdev_dev->address = address;
1798 netdev_dev->netmask = netmask;
f1acd62b 1799 if (address.s_addr != INADDR_ANY) {
8b61709d 1800 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 1801 "SIOCSIFNETMASK", netmask);
8b61709d
BP
1802 }
1803 }
1804 return error;
1805}
1806
/* Parses 'line', which should be in the format of a line of
 * /proc/net/if_inet6: 32 hexadecimal digits of IPv6 address, four
 * hexadecimal fields that are skipped, and an interface name of at most 16
 * characters.
 *
 * Returns true if all 16 address bytes and the interface name were parsed,
 * storing them in '*in6' and 'ifname'.  Returns false otherwise, in which
 * case '*in6' and 'ifname' may have been partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *bytes = in6->s6_addr;
    int n_parsed;

#define X8 "%2"SCNx8
    n_parsed = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      bytes + 0, bytes + 1, bytes + 2, bytes + 3,
                      bytes + 4, bytes + 5, bytes + 6, bytes + 7,
                      bytes + 8, bytes + 9, bytes + 10, bytes + 11,
                      bytes + 12, bytes + 13, bytes + 14, bytes + 15,
                      ifname);

    /* 16 address bytes plus the interface name. */
    return n_parsed == 17;
}
1822
1823/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1824 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1825static int
1826netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1827{
149f577a
JG
1828 struct netdev_dev_linux *netdev_dev =
1829 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1830 if (!(netdev_dev->cache_valid & VALID_IN6)) {
8b61709d
BP
1831 FILE *file;
1832 char line[128];
1833
149f577a 1834 netdev_dev->in6 = in6addr_any;
8b61709d
BP
1835
1836 file = fopen("/proc/net/if_inet6", "r");
1837 if (file != NULL) {
1838 const char *name = netdev_get_name(netdev_);
1839 while (fgets(line, sizeof line, file)) {
2a022368 1840 struct in6_addr in6_tmp;
8b61709d 1841 char ifname[16 + 1];
2a022368 1842 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
1843 && !strcmp(name, ifname))
1844 {
2a022368 1845 netdev_dev->in6 = in6_tmp;
8b61709d
BP
1846 break;
1847 }
1848 }
1849 fclose(file);
1850 }
149f577a 1851 netdev_dev->cache_valid |= VALID_IN6;
8b61709d 1852 }
149f577a 1853 *in6 = netdev_dev->in6;
8b61709d
BP
1854 return 0;
1855}
1856
/* Initializes '*sa' as an AF_INET socket address with IPv4 address 'addr'
 * and port 0.  All of '*sa' is zeroed first, so any bytes not covered by the
 * IPv4 socket address end up zero. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(sa, 0, sizeof *sa);

    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    memcpy(sa, &in4, sizeof in4);
}
1869
1870static int
1871do_set_addr(struct netdev *netdev,
1872 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1873{
1874 struct ifreq ifr;
149f577a 1875 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
8b61709d 1876 make_in4_sockaddr(&ifr.ifr_addr, addr);
149f577a
JG
1877
1878 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1879 ioctl_name);
8b61709d
BP
1880}
1881
1882/* Adds 'router' as a default IP gateway. */
1883static int
67a4917b 1884netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
1885{
1886 struct in_addr any = { INADDR_ANY };
1887 struct rtentry rt;
1888 int error;
1889
1890 memset(&rt, 0, sizeof rt);
1891 make_in4_sockaddr(&rt.rt_dst, any);
1892 make_in4_sockaddr(&rt.rt_gateway, router);
1893 make_in4_sockaddr(&rt.rt_genmask, any);
1894 rt.rt_flags = RTF_UP | RTF_GATEWAY;
8b61709d
BP
1895 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1896 if (error) {
1897 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1898 }
1899 return error;
1900}
1901
f1acd62b
BP
1902static int
1903netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1904 char **netdev_name)
1905{
1906 static const char fn[] = "/proc/net/route";
1907 FILE *stream;
1908 char line[256];
1909 int ln;
1910
1911 *netdev_name = NULL;
1912 stream = fopen(fn, "r");
1913 if (stream == NULL) {
1914 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1915 return errno;
1916 }
1917
1918 ln = 0;
1919 while (fgets(line, sizeof line, stream)) {
1920 if (++ln >= 2) {
1921 char iface[17];
1922 uint32_t dest, gateway, mask;
1923 int refcnt, metric, mtu;
1924 unsigned int flags, use, window, irtt;
1925
1926 if (sscanf(line,
1927 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1928 " %d %u %u\n",
1929 iface, &dest, &gateway, &flags, &refcnt,
1930 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1931
d295e8e9 1932 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
1933 fn, ln, line);
1934 continue;
1935 }
1936 if (!(flags & RTF_UP)) {
1937 /* Skip routes that aren't up. */
1938 continue;
1939 }
1940
1941 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 1942 * network byte order, so we don't need need any endian
f1acd62b
BP
1943 * conversions here. */
1944 if ((dest & mask) == (host->s_addr & mask)) {
1945 if (!gateway) {
1946 /* The host is directly reachable. */
1947 next_hop->s_addr = 0;
1948 } else {
1949 /* To reach the host, we must go through a gateway. */
1950 next_hop->s_addr = gateway;
1951 }
1952 *netdev_name = xstrdup(iface);
1953 fclose(stream);
1954 return 0;
1955 }
1956 }
1957 }
1958
1959 fclose(stream);
1960 return ENXIO;
1961}
1962
8b61709d
BP
1963/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
1964 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
1965 * returns 0. Otherwise, it returns a positive errno value; in particular,
1966 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
1967static int
1968netdev_linux_arp_lookup(const struct netdev *netdev,
1969 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
1970{
1971 struct arpreq r;
c100e025 1972 struct sockaddr_in sin;
8b61709d
BP
1973 int retval;
1974
1975 memset(&r, 0, sizeof r);
c100e025
BP
1976 sin.sin_family = AF_INET;
1977 sin.sin_addr.s_addr = ip;
1978 sin.sin_port = 0;
1979 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
1980 r.arp_ha.sa_family = ARPHRD_ETHER;
1981 r.arp_flags = 0;
149f577a 1982 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d
BP
1983 COVERAGE_INC(netdev_arp_lookup);
1984 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
1985 if (!retval) {
1986 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
1987 } else if (retval != ENXIO) {
1988 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
149f577a 1989 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
8b61709d
BP
1990 }
1991 return retval;
1992}
1993
1994static int
1995nd_to_iff_flags(enum netdev_flags nd)
1996{
1997 int iff = 0;
1998 if (nd & NETDEV_UP) {
1999 iff |= IFF_UP;
2000 }
2001 if (nd & NETDEV_PROMISC) {
2002 iff |= IFF_PROMISC;
2003 }
2004 return iff;
2005}
2006
2007static int
2008iff_to_nd_flags(int iff)
2009{
2010 enum netdev_flags nd = 0;
2011 if (iff & IFF_UP) {
2012 nd |= NETDEV_UP;
2013 }
2014 if (iff & IFF_PROMISC) {
2015 nd |= NETDEV_PROMISC;
2016 }
2017 return nd;
2018}
2019
2020static int
2021netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2022 enum netdev_flags on, enum netdev_flags *old_flagsp)
2023{
2024 int old_flags, new_flags;
2025 int error;
2026
2027 error = get_flags(netdev, &old_flags);
2028 if (!error) {
2029 *old_flagsp = iff_to_nd_flags(old_flags);
2030 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2031 if (new_flags != old_flags) {
2032 error = set_flags(netdev, new_flags);
2033 }
2034 }
2035 return error;
2036}
2037
2038static void
2039poll_notify(struct list *list)
2040{
2041 struct netdev_linux_notifier *notifier;
4e8e4213 2042 LIST_FOR_EACH (notifier, node, list) {
8b61709d
BP
2043 struct netdev_notifier *n = &notifier->notifier;
2044 n->cb(n);
2045 }
2046}
2047
2048static void
46097491 2049netdev_linux_poll_cb(const struct rtnetlink_change *change,
67a4917b 2050 void *aux OVS_UNUSED)
8b61709d
BP
2051{
2052 if (change) {
2053 struct list *list = shash_find_data(&netdev_linux_notifiers,
2054 change->ifname);
2055 if (list) {
2056 poll_notify(list);
2057 }
2058 } else {
2059 struct shash_node *node;
2060 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2061 poll_notify(node->data);
2062 }
2063 }
2064}
2065
2066static int
2067netdev_linux_poll_add(struct netdev *netdev,
2068 void (*cb)(struct netdev_notifier *), void *aux,
2069 struct netdev_notifier **notifierp)
2070{
2071 const char *netdev_name = netdev_get_name(netdev);
2072 struct netdev_linux_notifier *notifier;
2073 struct list *list;
2074
2075 if (shash_is_empty(&netdev_linux_notifiers)) {
46097491 2076 int error = rtnetlink_notifier_register(&netdev_linux_poll_notifier,
8b61709d
BP
2077 netdev_linux_poll_cb, NULL);
2078 if (error) {
2079 return error;
2080 }
2081 }
2082
2083 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2084 if (!list) {
2085 list = xmalloc(sizeof *list);
2086 list_init(list);
2087 shash_add(&netdev_linux_notifiers, netdev_name, list);
2088 }
2089
2090 notifier = xmalloc(sizeof *notifier);
2091 netdev_notifier_init(&notifier->notifier, netdev, cb, aux);
2092 list_push_back(list, &notifier->node);
2093 *notifierp = &notifier->notifier;
2094 return 0;
2095}
2096
2097static void
2098netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2099{
2100 struct netdev_linux_notifier *notifier =
2101 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2102 struct list *list;
2103
2104 /* Remove 'notifier' from its list. */
2105 list = list_remove(&notifier->node);
2106 if (list_is_empty(list)) {
2107 /* The list is now empty. Remove it from the hash and free it. */
2108 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2109 shash_delete(&netdev_linux_notifiers,
2110 shash_find(&netdev_linux_notifiers, netdev_name));
2111 free(list);
2112 }
2113 free(notifier);
2114
2115 /* If that was the last notifier, unregister. */
2116 if (shash_is_empty(&netdev_linux_notifiers)) {
46097491 2117 rtnetlink_notifier_unregister(&netdev_linux_poll_notifier);
8b61709d
BP
2118 }
2119}
2120
c3827f61
BP
/* Expands to a brace-enclosed positional initializer for a 'struct
 * netdev_class'.  'NAME', 'CREATE', 'ENUMERATE', and 'SET_STATS' are
 * substituted into their slots; every other slot is the same netdev_linux_*()
 * function (or NULL) for all users of the macro. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS)  \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* reconfigure */               \
                                                                \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    ENUMERATE,                                                  \
                                                                \
    netdev_linux_recv,                                          \
    netdev_linux_recv_wait,                                     \
    netdev_linux_drain,                                         \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_stats,                                     \
    SET_STATS,                                                  \
                                                                \
    netdev_linux_get_features,                                  \
    netdev_linux_set_advertisements,                            \
    netdev_linux_get_vlan_vid,                                  \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_poll_add,                                      \
    netdev_linux_poll_remove                                    \
}
2181
2182const struct netdev_class netdev_linux_class =
2183 NETDEV_LINUX_CLASS(
2184 "system",
2185 netdev_linux_create,
2186 netdev_linux_enumerate,
98563392 2187 NULL); /* set_stats */
c3827f61
BP
2188
2189const struct netdev_class netdev_tap_class =
2190 NETDEV_LINUX_CLASS(
2191 "tap",
2192 netdev_linux_create_tap,
2193 NULL, /* enumerate */
2194 NULL); /* set_stats */
2195
2196const struct netdev_class netdev_internal_class =
2197 NETDEV_LINUX_CLASS(
2198 "internal",
2199 netdev_linux_create,
2200 NULL, /* enumerate */
2201 netdev_vport_set_stats);
8b61709d 2202\f
c1c9c9c4 2203/* HTB traffic control class. */
559843ed 2204
c1c9c9c4 2205#define HTB_N_QUEUES 0xf000
8b61709d 2206
c1c9c9c4
BP
2207struct htb {
2208 struct tc tc;
2209 unsigned int max_rate; /* In bytes/s. */
2210};
8b61709d 2211
c1c9c9c4 2212struct htb_class {
93b13be8 2213 struct tc_queue tc_queue;
c1c9c9c4
BP
2214 unsigned int min_rate; /* In bytes/s. */
2215 unsigned int max_rate; /* In bytes/s. */
2216 unsigned int burst; /* In bytes. */
2217 unsigned int priority; /* Lower values are higher priorities. */
2218};
8b61709d 2219
c1c9c9c4
BP
2220static struct htb *
2221htb_get__(const struct netdev *netdev)
2222{
2223 struct netdev_dev_linux *netdev_dev =
2224 netdev_dev_linux_cast(netdev_get_dev(netdev));
2225 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2226}
2227
2228static struct htb *
2229htb_install__(struct netdev *netdev, uint64_t max_rate)
2230{
2231 struct netdev_dev_linux *netdev_dev =
2232 netdev_dev_linux_cast(netdev_get_dev(netdev));
2233 struct htb *htb;
2234
2235 htb = xmalloc(sizeof *htb);
2236 tc_init(&htb->tc, &tc_ops_htb);
2237 htb->max_rate = max_rate;
2238
2239 netdev_dev->tc = &htb->tc;
2240
2241 return htb;
2242}
2243
2244/* Create an HTB qdisc.
2245 *
a339aa81 2246 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
2247static int
2248htb_setup_qdisc__(struct netdev *netdev)
2249{
2250 size_t opt_offset;
2251 struct tc_htb_glob opt;
2252 struct ofpbuf request;
2253 struct tcmsg *tcmsg;
2254
2255 tc_del_qdisc(netdev);
2256
2257 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2258 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
2259 if (!tcmsg) {
2260 return ENODEV;
2261 }
c1c9c9c4
BP
2262 tcmsg->tcm_handle = tc_make_handle(1, 0);
2263 tcmsg->tcm_parent = TC_H_ROOT;
2264
2265 nl_msg_put_string(&request, TCA_KIND, "htb");
2266
2267 memset(&opt, 0, sizeof opt);
2268 opt.rate2quantum = 10;
2269 opt.version = 3;
4ecf12d5 2270 opt.defcls = 1;
c1c9c9c4
BP
2271
2272 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2273 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2274 nl_msg_end_nested(&request, opt_offset);
2275
2276 return tc_transact(&request, NULL);
2277}
2278
2279/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2280 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2281static int
2282htb_setup_class__(struct netdev *netdev, unsigned int handle,
2283 unsigned int parent, struct htb_class *class)
2284{
2285 size_t opt_offset;
2286 struct tc_htb_opt opt;
2287 struct ofpbuf request;
2288 struct tcmsg *tcmsg;
2289 int error;
2290 int mtu;
2291
2292 netdev_get_mtu(netdev, &mtu);
2293
2294 memset(&opt, 0, sizeof opt);
2295 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2296 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2297 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2298 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2299 opt.prio = class->priority;
2300
2301 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
2302 if (!tcmsg) {
2303 return ENODEV;
2304 }
c1c9c9c4
BP
2305 tcmsg->tcm_handle = handle;
2306 tcmsg->tcm_parent = parent;
2307
2308 nl_msg_put_string(&request, TCA_KIND, "htb");
2309 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2310 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2311 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2312 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2313 nl_msg_end_nested(&request, opt_offset);
2314
2315 error = tc_transact(&request, NULL);
2316 if (error) {
2317 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2318 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2319 netdev_get_name(netdev),
2320 tc_get_major(handle), tc_get_minor(handle),
2321 tc_get_major(parent), tc_get_minor(parent),
2322 class->min_rate, class->max_rate,
2323 class->burst, class->priority, strerror(error));
2324 }
2325 return error;
2326}
2327
2328/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2329 * description of them into 'details'. The description complies with the
2330 * specification given in the vswitch database documentation for linux-htb
2331 * queue details. */
2332static int
2333htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2334{
2335 static const struct nl_policy tca_htb_policy[] = {
2336 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2337 .min_len = sizeof(struct tc_htb_opt) },
2338 };
2339
2340 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2341 const struct tc_htb_opt *htb;
2342
2343 if (!nl_parse_nested(nl_options, tca_htb_policy,
2344 attrs, ARRAY_SIZE(tca_htb_policy))) {
2345 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2346 return EPROTO;
2347 }
2348
2349 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2350 class->min_rate = htb->rate.rate;
2351 class->max_rate = htb->ceil.rate;
2352 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2353 class->priority = htb->prio;
2354 return 0;
2355}
2356
2357static int
2358htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2359 struct htb_class *options,
2360 struct netdev_queue_stats *stats)
2361{
2362 struct nlattr *nl_options;
2363 unsigned int handle;
2364 int error;
2365
2366 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2367 if (!error && queue_id) {
17ee3c1f
BP
2368 unsigned int major = tc_get_major(handle);
2369 unsigned int minor = tc_get_minor(handle);
2370 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2371 *queue_id = minor - 1;
c1c9c9c4
BP
2372 } else {
2373 error = EPROTO;
2374 }
2375 }
2376 if (!error && options) {
2377 error = htb_parse_tca_options__(nl_options, options);
2378 }
2379 return error;
2380}
2381
2382static void
2383htb_parse_qdisc_details__(struct netdev *netdev,
2384 const struct shash *details, struct htb_class *hc)
2385{
2386 const char *max_rate_s;
2387
2388 max_rate_s = shash_find_data(details, "max-rate");
2389 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2390 if (!hc->max_rate) {
2391 uint32_t current;
2392
2393 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2394 hc->max_rate = netdev_features_to_bps(current) / 8;
2395 }
2396 hc->min_rate = hc->max_rate;
2397 hc->burst = 0;
2398 hc->priority = 0;
2399}
2400
2401static int
2402htb_parse_class_details__(struct netdev *netdev,
2403 const struct shash *details, struct htb_class *hc)
2404{
2405 const struct htb *htb = htb_get__(netdev);
2406 const char *min_rate_s = shash_find_data(details, "min-rate");
2407 const char *max_rate_s = shash_find_data(details, "max-rate");
2408 const char *burst_s = shash_find_data(details, "burst");
2409 const char *priority_s = shash_find_data(details, "priority");
2410 int mtu;
2411
da3827b5 2412 /* min-rate. Don't allow a min-rate below 1500 bytes/s. */
c1c9c9c4
BP
2413 if (!min_rate_s) {
2414 /* min-rate is required. */
2415 return EINVAL;
2416 }
2417 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
da3827b5 2418 hc->min_rate = MAX(hc->min_rate, 1500);
c1c9c9c4
BP
2419 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2420
2421 /* max-rate */
2422 hc->max_rate = (max_rate_s
2423 ? strtoull(max_rate_s, NULL, 10) / 8
2424 : htb->max_rate);
2425 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2426 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2427
2428 /* burst
2429 *
2430 * According to hints in the documentation that I've read, it is important
2431 * that 'burst' be at least as big as the largest frame that might be
2432 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2433 * but having it a bit too small is a problem. Since netdev_get_mtu()
2434 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2435 * the MTU. We actually add 64, instead of 14, as a guard against
2436 * additional headers get tacked on somewhere that we're not aware of. */
2437 netdev_get_mtu(netdev, &mtu);
2438 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2439 hc->burst = MAX(hc->burst, mtu + 64);
2440
2441 /* priority */
2442 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2443
2444 return 0;
2445}
2446
/* Queries the class 'handle' with parent 'parent' on 'netdev' and parses the
 * reply with htb_parse_tcmsg__(), filling in 'options' and 'stats' as that
 * function does.
 *
 * Returns 0 if successful, otherwise the error from the query or the parse. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2462
2463static int
2464htb_tc_install(struct netdev *netdev, const struct shash *details)
2465{
2466 int error;
2467
2468 error = htb_setup_qdisc__(netdev);
2469 if (!error) {
2470 struct htb_class hc;
2471
2472 htb_parse_qdisc_details__(netdev, details, &hc);
2473 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2474 tc_make_handle(1, 0), &hc);
2475 if (!error) {
2476 htb_install__(netdev, hc.max_rate);
2477 }
2478 }
2479 return error;
2480}
2481
93b13be8
BP
2482static struct htb_class *
2483htb_class_cast__(const struct tc_queue *queue)
2484{
2485 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2486}
2487
c1c9c9c4
BP
2488static void
2489htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2490 const struct htb_class *hc)
2491{
2492 struct htb *htb = htb_get__(netdev);
93b13be8
BP
2493 size_t hash = hash_int(queue_id, 0);
2494 struct tc_queue *queue;
c1c9c9c4
BP
2495 struct htb_class *hcp;
2496
93b13be8
BP
2497 queue = tc_find_queue__(netdev, queue_id, hash);
2498 if (queue) {
2499 hcp = htb_class_cast__(queue);
2500 } else {
c1c9c9c4 2501 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
2502 queue = &hcp->tc_queue;
2503 queue->queue_id = queue_id;
2504 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 2505 }
93b13be8
BP
2506
2507 hcp->min_rate = hc->min_rate;
2508 hcp->max_rate = hc->max_rate;
2509 hcp->burst = hc->burst;
2510 hcp->priority = hc->priority;
c1c9c9c4
BP
2511}
2512
2513static int
2514htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2515{
c1c9c9c4
BP
2516 struct ofpbuf msg;
2517 struct nl_dump dump;
2518 struct htb_class hc;
2519 struct htb *htb;
2520
2521 /* Get qdisc options. */
2522 hc.max_rate = 0;
2523 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2524 htb = htb_install__(netdev, hc.max_rate);
2525
2526 /* Get queues. */
23a98ffe
BP
2527 if (!start_queue_dump(netdev, &dump)) {
2528 return ENODEV;
2529 }
c1c9c9c4
BP
2530 while (nl_dump_next(&dump, &msg)) {
2531 unsigned int queue_id;
2532
2533 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2534 htb_update_queue__(netdev, queue_id, &hc);
2535 }
2536 }
2537 nl_dump_done(&dump);
2538
2539 return 0;
2540}
2541
2542static void
2543htb_tc_destroy(struct tc *tc)
2544{
2545 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 2546 struct htb_class *hc, *next;
c1c9c9c4 2547
4e8e4213 2548 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 2549 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
2550 free(hc);
2551 }
2552 tc_destroy(tc);
2553 free(htb);
2554}
2555
2556static int
2557htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2558{
2559 const struct htb *htb = htb_get__(netdev);
2560 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2561 return 0;
2562}
2563
2564static int
2565htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2566{
2567 struct htb_class hc;
2568 int error;
2569
2570 htb_parse_qdisc_details__(netdev, details, &hc);
2571 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2572 tc_make_handle(1, 0), &hc);
2573 if (!error) {
2574 htb_get__(netdev)->max_rate = hc.max_rate;
2575 }
2576 return error;
2577}
2578
2579static int
93b13be8
BP
2580htb_class_get(const struct netdev *netdev OVS_UNUSED,
2581 const struct tc_queue *queue, struct shash *details)
c1c9c9c4 2582{
93b13be8 2583 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4
BP
2584
2585 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2586 if (hc->min_rate != hc->max_rate) {
2587 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2588 }
2589 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2590 if (hc->priority) {
2591 shash_add(details, "priority", xasprintf("%u", hc->priority));
2592 }
2593 return 0;
2594}
2595
2596static int
2597htb_class_set(struct netdev *netdev, unsigned int queue_id,
2598 const struct shash *details)
2599{
2600 struct htb_class hc;
2601 int error;
2602
2603 error = htb_parse_class_details__(netdev, details, &hc);
2604 if (error) {
2605 return error;
2606 }
2607
17ee3c1f 2608 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
2609 tc_make_handle(1, 0xfffe), &hc);
2610 if (error) {
2611 return error;
2612 }
2613
2614 htb_update_queue__(netdev, queue_id, &hc);
2615 return 0;
2616}
2617
2618static int
93b13be8 2619htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 2620{
93b13be8 2621 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2622 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
2623 int error;
2624
93b13be8 2625 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 2626 if (!error) {
93b13be8 2627 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 2628 free(hc);
c1c9c9c4
BP
2629 }
2630 return error;
2631}
2632
2633static int
93b13be8 2634htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
2635 struct netdev_queue_stats *stats)
2636{
93b13be8 2637 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
2638 tc_make_handle(1, 0xfffe), NULL, stats);
2639}
2640
2641static int
2642htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2643 const struct ofpbuf *nlmsg,
2644 netdev_dump_queue_stats_cb *cb, void *aux)
2645{
2646 struct netdev_queue_stats stats;
17ee3c1f 2647 unsigned int handle, major, minor;
c1c9c9c4
BP
2648 int error;
2649
2650 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2651 if (error) {
2652 return error;
2653 }
2654
17ee3c1f
BP
2655 major = tc_get_major(handle);
2656 minor = tc_get_minor(handle);
2657 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 2658 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
2659 }
2660 return 0;
2661}
2662
2663static const struct tc_ops tc_ops_htb = {
2664 "htb", /* linux_name */
2665 "linux-htb", /* ovs_name */
2666 HTB_N_QUEUES, /* n_queues */
2667 htb_tc_install,
2668 htb_tc_load,
2669 htb_tc_destroy,
2670 htb_qdisc_get,
2671 htb_qdisc_set,
2672 htb_class_get,
2673 htb_class_set,
2674 htb_class_delete,
2675 htb_class_get_stats,
2676 htb_class_dump_stats
2677};
2678\f
a339aa81
EJ
2679/* "linux-hfsc" traffic control class. */
2680
2681#define HFSC_N_QUEUES 0xf000
2682
2683struct hfsc {
2684 struct tc tc;
2685 uint32_t max_rate;
2686};
2687
2688struct hfsc_class {
2689 struct tc_queue tc_queue;
2690 uint32_t min_rate;
2691 uint32_t max_rate;
2692};
2693
2694static struct hfsc *
2695hfsc_get__(const struct netdev *netdev)
2696{
2697 struct netdev_dev_linux *netdev_dev;
2698 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2699 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2700}
2701
2702static struct hfsc_class *
2703hfsc_class_cast__(const struct tc_queue *queue)
2704{
2705 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2706}
2707
2708static struct hfsc *
2709hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2710{
2711 struct netdev_dev_linux * netdev_dev;
2712 struct hfsc *hfsc;
2713
2714 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2715 hfsc = xmalloc(sizeof *hfsc);
2716 tc_init(&hfsc->tc, &tc_ops_hfsc);
2717 hfsc->max_rate = max_rate;
2718 netdev_dev->tc = &hfsc->tc;
2719
2720 return hfsc;
2721}
2722
2723static void
2724hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2725 const struct hfsc_class *hc)
2726{
2727 size_t hash;
2728 struct hfsc *hfsc;
2729 struct hfsc_class *hcp;
2730 struct tc_queue *queue;
2731
2732 hfsc = hfsc_get__(netdev);
2733 hash = hash_int(queue_id, 0);
2734
2735 queue = tc_find_queue__(netdev, queue_id, hash);
2736 if (queue) {
2737 hcp = hfsc_class_cast__(queue);
2738 } else {
2739 hcp = xmalloc(sizeof *hcp);
2740 queue = &hcp->tc_queue;
2741 queue->queue_id = queue_id;
2742 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2743 }
2744
2745 hcp->min_rate = hc->min_rate;
2746 hcp->max_rate = hc->max_rate;
2747}
2748
2749static int
2750hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2751{
2752 const struct tc_service_curve *rsc, *fsc, *usc;
2753 static const struct nl_policy tca_hfsc_policy[] = {
2754 [TCA_HFSC_RSC] = {
2755 .type = NL_A_UNSPEC,
2756 .optional = false,
2757 .min_len = sizeof(struct tc_service_curve),
2758 },
2759 [TCA_HFSC_FSC] = {
2760 .type = NL_A_UNSPEC,
2761 .optional = false,
2762 .min_len = sizeof(struct tc_service_curve),
2763 },
2764 [TCA_HFSC_USC] = {
2765 .type = NL_A_UNSPEC,
2766 .optional = false,
2767 .min_len = sizeof(struct tc_service_curve),
2768 },
2769 };
2770 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2771
2772 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2773 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2774 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2775 return EPROTO;
2776 }
2777
2778 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2779 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2780 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2781
2782 if (rsc->m1 != 0 || rsc->d != 0 ||
2783 fsc->m1 != 0 || fsc->d != 0 ||
2784 usc->m1 != 0 || usc->d != 0) {
2785 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2786 "Non-linear service curves are not supported.");
2787 return EPROTO;
2788 }
2789
2790 if (rsc->m2 != fsc->m2) {
2791 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2792 "Real-time service curves are not supported ");
2793 return EPROTO;
2794 }
2795
2796 if (rsc->m2 > usc->m2) {
2797 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2798 "Min-rate service curve is greater than "
2799 "the max-rate service curve.");
2800 return EPROTO;
2801 }
2802
2803 class->min_rate = fsc->m2;
2804 class->max_rate = usc->m2;
2805 return 0;
2806}
2807
2808static int
2809hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2810 struct hfsc_class *options,
2811 struct netdev_queue_stats *stats)
2812{
2813 int error;
2814 unsigned int handle;
2815 struct nlattr *nl_options;
2816
2817 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2818 if (error) {
2819 return error;
2820 }
2821
2822 if (queue_id) {
2823 unsigned int major, minor;
2824
2825 major = tc_get_major(handle);
2826 minor = tc_get_minor(handle);
2827 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2828 *queue_id = minor - 1;
2829 } else {
2830 return EPROTO;
2831 }
2832 }
2833
2834 if (options) {
2835 error = hfsc_parse_tca_options__(nl_options, options);
2836 }
2837
2838 return error;
2839}
2840
/* Queries the class 'handle' with parent 'parent' on 'netdev' and parses the
 * reply with hfsc_parse_tcmsg__(), filling in 'options' and 'stats' as that
 * function does.
 *
 * Returns 0 if successful, otherwise the error from the query or the parse. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
2858
2859static void
2860hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2861 struct hfsc_class *class)
2862{
2863 uint32_t max_rate;
2864 const char *max_rate_s;
2865
2866 max_rate_s = shash_find_data(details, "max-rate");
2867 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2868
2869 if (!max_rate) {
2870 uint32_t current;
2871
2872 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2873 max_rate = netdev_features_to_bps(current) / 8;
2874 }
2875
2876 class->min_rate = max_rate;
2877 class->max_rate = max_rate;
2878}
2879
2880static int
2881hfsc_parse_class_details__(struct netdev *netdev,
2882 const struct shash *details,
2883 struct hfsc_class * class)
2884{
2885 const struct hfsc *hfsc;
2886 uint32_t min_rate, max_rate;
2887 const char *min_rate_s, *max_rate_s;
2888
2889 hfsc = hfsc_get__(netdev);
2890 min_rate_s = shash_find_data(details, "min-rate");
2891 max_rate_s = shash_find_data(details, "max-rate");
2892
2893 if (!min_rate_s) {
2894 return EINVAL;
2895 }
2896
2897 min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2898 min_rate = MAX(min_rate, 1500);
2899 min_rate = MIN(min_rate, hfsc->max_rate);
2900
2901 max_rate = (max_rate_s
2902 ? strtoull(max_rate_s, NULL, 10) / 8
2903 : hfsc->max_rate);
2904 max_rate = MAX(max_rate, min_rate);
2905 max_rate = MIN(max_rate, hfsc->max_rate);
2906
2907 class->min_rate = min_rate;
2908 class->max_rate = max_rate;
2909
2910 return 0;
2911}
2912
2913/* Create an HFSC qdisc.
2914 *
2915 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
2916static int
2917hfsc_setup_qdisc__(struct netdev * netdev)
2918{
2919 struct tcmsg *tcmsg;
2920 struct ofpbuf request;
2921 struct tc_hfsc_qopt opt;
2922
2923 tc_del_qdisc(netdev);
2924
2925 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2926 NLM_F_EXCL | NLM_F_CREATE, &request);
2927
2928 if (!tcmsg) {
2929 return ENODEV;
2930 }
2931
2932 tcmsg->tcm_handle = tc_make_handle(1, 0);
2933 tcmsg->tcm_parent = TC_H_ROOT;
2934
2935 memset(&opt, 0, sizeof opt);
2936 opt.defcls = 1;
2937
2938 nl_msg_put_string(&request, TCA_KIND, "hfsc");
2939 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
2940
2941 return tc_transact(&request, NULL);
2942}
2943
2944/* Create an HFSC class.
2945 *
2946 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
2947 * sc rate <min_rate> ul rate <max_rate>" */
2948static int
2949hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
2950 unsigned int parent, struct hfsc_class *class)
2951{
2952 int error;
2953 size_t opt_offset;
2954 struct tcmsg *tcmsg;
2955 struct ofpbuf request;
2956 struct tc_service_curve min, max;
2957
2958 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2959
2960 if (!tcmsg) {
2961 return ENODEV;
2962 }
2963
2964 tcmsg->tcm_handle = handle;
2965 tcmsg->tcm_parent = parent;
2966
2967 min.m1 = 0;
2968 min.d = 0;
2969 min.m2 = class->min_rate;
2970
2971 max.m1 = 0;
2972 max.d = 0;
2973 max.m2 = class->max_rate;
2974
2975 nl_msg_put_string(&request, TCA_KIND, "hfsc");
2976 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2977 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
2978 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
2979 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
2980 nl_msg_end_nested(&request, opt_offset);
2981
2982 error = tc_transact(&request, NULL);
2983 if (error) {
2984 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2985 "min-rate %ubps, max-rate %ubps (%s)",
2986 netdev_get_name(netdev),
2987 tc_get_major(handle), tc_get_minor(handle),
2988 tc_get_major(parent), tc_get_minor(parent),
2989 class->min_rate, class->max_rate, strerror(error));
2990 }
2991
2992 return error;
2993}
2994
2995static int
2996hfsc_tc_install(struct netdev *netdev, const struct shash *details)
2997{
2998 int error;
2999 struct hfsc_class class;
3000
3001 error = hfsc_setup_qdisc__(netdev);
3002
3003 if (error) {
3004 return error;
3005 }
3006
3007 hfsc_parse_qdisc_details__(netdev, details, &class);
3008 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3009 tc_make_handle(1, 0), &class);
3010
3011 if (error) {
3012 return error;
3013 }
3014
3015 hfsc_install__(netdev, class.max_rate);
3016 return 0;
3017}
3018
3019static int
3020hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3021{
3022 struct ofpbuf msg;
3023 struct hfsc *hfsc;
3024 struct nl_dump dump;
3025 struct hfsc_class hc;
3026
3027 hc.max_rate = 0;
3028 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3029 hfsc = hfsc_install__(netdev, hc.max_rate);
3030
3031 if (!start_queue_dump(netdev, &dump)) {
3032 return ENODEV;
3033 }
3034
3035 while (nl_dump_next(&dump, &msg)) {
3036 unsigned int queue_id;
3037
3038 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3039 hfsc_update_queue__(netdev, queue_id, &hc);
3040 }
3041 }
3042
3043 nl_dump_done(&dump);
3044 return 0;
3045}
3046
3047static void
3048hfsc_tc_destroy(struct tc *tc)
3049{
3050 struct hfsc *hfsc;
3051 struct hfsc_class *hc, *next;
3052
3053 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3054
3055 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3056 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3057 free(hc);
3058 }
3059
3060 tc_destroy(tc);
3061 free(hfsc);
3062}
3063
3064static int
3065hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3066{
3067 const struct hfsc *hfsc;
3068 hfsc = hfsc_get__(netdev);
3069 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3070 return 0;
3071}
3072
3073static int
3074hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3075{
3076 int error;
3077 struct hfsc_class class;
3078
3079 hfsc_parse_qdisc_details__(netdev, details, &class);
3080 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3081 tc_make_handle(1, 0), &class);
3082
3083 if (!error) {
3084 hfsc_get__(netdev)->max_rate = class.max_rate;
3085 }
3086
3087 return error;
3088}
3089
3090static int
3091hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3092 const struct tc_queue *queue, struct shash *details)
3093{
3094 const struct hfsc_class *hc;
3095
3096 hc = hfsc_class_cast__(queue);
3097 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3098 if (hc->min_rate != hc->max_rate) {
3099 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3100 }
3101 return 0;
3102}
3103
3104static int
3105hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3106 const struct shash *details)
3107{
3108 int error;
3109 struct hfsc_class class;
3110
3111 error = hfsc_parse_class_details__(netdev, details, &class);
3112 if (error) {
3113 return error;
3114 }
3115
3116 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3117 tc_make_handle(1, 0xfffe), &class);
3118 if (error) {
3119 return error;
3120 }
3121
3122 hfsc_update_queue__(netdev, queue_id, &class);
3123 return 0;
3124}
3125
3126static int
3127hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3128{
3129 int error;
3130 struct hfsc *hfsc;
3131 struct hfsc_class *hc;
3132
3133 hc = hfsc_class_cast__(queue);
3134 hfsc = hfsc_get__(netdev);
3135
3136 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3137 if (!error) {
3138 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3139 free(hc);
3140 }
3141 return error;
3142}
3143
3144static int
3145hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3146 struct netdev_queue_stats *stats)
3147{
3148 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3149 tc_make_handle(1, 0xfffe), NULL, stats);
3150}
3151
3152static int
3153hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3154 const struct ofpbuf *nlmsg,
3155 netdev_dump_queue_stats_cb *cb, void *aux)
3156{
3157 struct netdev_queue_stats stats;
3158 unsigned int handle, major, minor;
3159 int error;
3160
3161 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3162 if (error) {
3163 return error;
3164 }
3165
3166 major = tc_get_major(handle);
3167 minor = tc_get_minor(handle);
3168 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3169 (*cb)(minor - 1, &stats, aux);
3170 }
3171 return 0;
3172}
3173
3174static const struct tc_ops tc_ops_hfsc = {
3175 "hfsc", /* linux_name */
3176 "linux-hfsc", /* ovs_name */
3177 HFSC_N_QUEUES, /* n_queues */
3178 hfsc_tc_install, /* tc_install */
3179 hfsc_tc_load, /* tc_load */
3180 hfsc_tc_destroy, /* tc_destroy */
3181 hfsc_qdisc_get, /* qdisc_get */
3182 hfsc_qdisc_set, /* qdisc_set */
3183 hfsc_class_get, /* class_get */
3184 hfsc_class_set, /* class_set */
3185 hfsc_class_delete, /* class_delete */
3186 hfsc_class_get_stats, /* class_get_stats */
3187 hfsc_class_dump_stats /* class_dump_stats */
3188};
3189\f
c1c9c9c4
BP
3190/* "linux-default" traffic control class.
3191 *
3192 * This class represents the default, unnamed Linux qdisc. It corresponds to
3193 * the "" (empty string) QoS type in the OVS database. */
3194
3195static void
3196default_install__(struct netdev *netdev)
3197{
3198 struct netdev_dev_linux *netdev_dev =
3199 netdev_dev_linux_cast(netdev_get_dev(netdev));
3200 static struct tc *tc;
3201
3202 if (!tc) {
3203 tc = xmalloc(sizeof *tc);
3204 tc_init(tc, &tc_ops_default);
3205 }
3206 netdev_dev->tc = tc;
3207}
3208
3209static int
3210default_tc_install(struct netdev *netdev,
3211 const struct shash *details OVS_UNUSED)
3212{
3213 default_install__(netdev);
3214 return 0;
3215}
3216
3217static int
3218default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3219{
3220 default_install__(netdev);
3221 return 0;
3222}
3223
3224static const struct tc_ops tc_ops_default = {
3225 NULL, /* linux_name */
3226 "", /* ovs_name */
3227 0, /* n_queues */
3228 default_tc_install,
3229 default_tc_load,
3230 NULL, /* tc_destroy */
3231 NULL, /* qdisc_get */
3232 NULL, /* qdisc_set */
3233 NULL, /* class_get */
3234 NULL, /* class_set */
3235 NULL, /* class_delete */
3236 NULL, /* class_get_stats */
3237 NULL /* class_dump_stats */
3238};
3239\f
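/* "linux-other" traffic control class.
 *
 * tc_query_qdisc() falls back to this class when the kernel's qdisc cannot be
 * queried or parsed, or when its kind is not recognized.  Only tc_load is
 * implemented; every other operation in 'tc_ops_other' is NULL. */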
3243
3244static int
3245other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3246{
3247 struct netdev_dev_linux *netdev_dev =
3248 netdev_dev_linux_cast(netdev_get_dev(netdev));
3249 static struct tc *tc;
3250
3251 if (!tc) {
3252 tc = xmalloc(sizeof *tc);
3253 tc_init(tc, &tc_ops_other);
3254 }
3255 netdev_dev->tc = tc;
3256 return 0;
3257}
3258
3259static const struct tc_ops tc_ops_other = {
3260 NULL, /* linux_name */
3261 "linux-other", /* ovs_name */
3262 0, /* n_queues */
3263 NULL, /* tc_install */
3264 other_tc_load,
3265 NULL, /* tc_destroy */
3266 NULL, /* qdisc_get */
3267 NULL, /* qdisc_set */
3268 NULL, /* class_get */
3269 NULL, /* class_set */
3270 NULL, /* class_delete */
3271 NULL, /* class_get_stats */
3272 NULL /* class_dump_stats */
3273};
3274\f
3275/* Traffic control. */
3276
/* Number of kernel "tc" ticks per second.  Set by read_psched(). */
static double ticks_per_s;

/* Number of kernel "jiffies" per second, used when computing buffer sizes:
 * kernel qdiscs generally need to be able to buffer one jiffy's worth of
 * data.  Set by read_psched(); a value of 0 means that it has not been read
 * yet.
 *
 * Two cases arise in practice:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in
 *      the approximate range of 100 to 1024.  Then the qdisc really must be
 *      able to buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  Then the kernel has finely
 *      granular timers and no extra buffer room is needed.  (Nothing special
 *      is required to implement that: the large 'buffer_hz' is used as a
 *      divisor, so practically any rate divided by it comes out as 0, and
 *      the small integer results obtained for really high rates have no real
 *      effect anyhow.)
 */
static unsigned int buffer_hz;
3298
/* Combines 'major' and 'minor' into a single tc handle 'major':'minor', with
 * 'major' placed in the upper 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    const unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
3305
/* Extracts the major number, i.e. the upper 16 bits, from tc 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    const unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3312
/* Extracts the minor number from tc 'handle', as selected by TC_H_MIN(). */
static unsigned int
tc_get_minor(unsigned int handle)
{
    const unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3319
3320static struct tcmsg *
3321tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3322 struct ofpbuf *request)
3323{
3324 struct tcmsg *tcmsg;
3325 int ifindex;
3326 int error;
3327
3328 error = get_ifindex(netdev, &ifindex);
3329 if (error) {
3330 return NULL;
3331 }
3332
3333 ofpbuf_init(request, 512);
3334 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3335 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3336 tcmsg->tcm_family = AF_UNSPEC;
3337 tcmsg->tcm_ifindex = ifindex;
3338 /* Caller should fill in tcmsg->tcm_handle. */
3339 /* Caller should fill in tcmsg->tcm_parent. */
3340
3341 return tcmsg;
3342}
3343
3344static int
3345tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3346{
3347 int error = nl_sock_transact(rtnl_sock, request, replyp);
3348 ofpbuf_uninit(request);
3349 return error;
3350}
3351
3352static void
3353read_psched(void)
3354{
3355 /* The values in psched are not individually very meaningful, but they are
3356 * important. The tables below show some values seen in the wild.
3357 *
3358 * Some notes:
3359 *
3360 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3361 * (Before that, there are hints that it was 1000000000.)
3362 *
3363 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3364 * above.
3365 *
3366 * /proc/net/psched
3367 * -----------------------------------
3368 * [1] 000c8000 000f4240 000f4240 00000064
3369 * [2] 000003e8 00000400 000f4240 3b9aca00
3370 * [3] 000003e8 00000400 000f4240 3b9aca00
3371 * [4] 000003e8 00000400 000f4240 00000064
3372 * [5] 000003e8 00000040 000f4240 3b9aca00
3373 * [6] 000003e8 00000040 000f4240 000000f9
3374 *
3375 * a b c d ticks_per_s buffer_hz
3376 * ------- --------- ---------- ------------- ----------- -------------
3377 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3378 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3379 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3380 * [4] 1,000 1,024 1,000,000 100 976,562 100
3381 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3382 * [6] 1,000 64 1,000,000 249 15,625,000 249
3383 *
3384 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3385 * [2] 2.6.26-1-686-bigmem from Debian lenny
3386 * [3] 2.6.26-2-sparc64 from Debian lenny
3387 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3388 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3389 * [6] 2.6.34 from kernel.org on KVM
3390 */
3391 static const char fn[] = "/proc/net/psched";
3392 unsigned int a, b, c, d;
3393 FILE *stream;
3394
3395 ticks_per_s = 1.0;
3396 buffer_hz = 100;
3397
3398 stream = fopen(fn, "r");
3399 if (!stream) {
3400 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3401 return;
3402 }
3403
3404 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3405 VLOG_WARN("%s: read failed", fn);
3406 fclose(stream);
3407 return;
3408 }
3409 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3410 fclose(stream);
3411
3412 if (!a || !c) {
3413 VLOG_WARN("%s: invalid scheduler parameters", fn);
3414 return;
3415 }
3416
3417 ticks_per_s = (double) a * c / b;
3418 if (c == 1000000) {
3419 buffer_hz = d;
3420 } else {
3421 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3422 fn, a, b, c, d);
3423 }
3424 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3425}
3426
3427/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3428 * rate of 'rate' bytes per second. */
3429static unsigned int
3430tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3431{
3432 if (!buffer_hz) {
3433 read_psched();
3434 }
3435 return (rate * ticks) / ticks_per_s;
3436}
3437
3438/* Returns the number of ticks that it would take to transmit 'size' bytes at a
3439 * rate of 'rate' bytes per second. */
3440static unsigned int
3441tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3442{
3443 if (!buffer_hz) {
3444 read_psched();
3445 }
015c93a4 3446 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
3447}
3448
3449/* Returns the number of bytes that need to be reserved for qdisc buffering at
3450 * a transmission rate of 'rate' bytes per second. */
3451static unsigned int
3452tc_buffer_per_jiffy(unsigned int rate)
3453{
3454 if (!buffer_hz) {
3455 read_psched();
3456 }
3457 return rate / buffer_hz;
3458}
3459
3460/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3461 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3462 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3463 * stores NULL into it if it is absent.
3464 *
3465 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3466 * 'msg'.
3467 *
3468 * Returns 0 if successful, otherwise a positive errno value. */
3469static int
3470tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3471 struct nlattr **options)
3472{
3473 static const struct nl_policy tca_policy[] = {
3474 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3475 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3476 };
3477 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3478
3479 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3480 tca_policy, ta, ARRAY_SIZE(ta))) {
3481 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3482 goto error;
3483 }
3484
3485 if (kind) {
3486 *kind = nl_attr_get_string(ta[TCA_KIND]);
3487 }
3488
3489 if (options) {
3490 *options = ta[TCA_OPTIONS];
3491 }
3492
3493 return 0;
3494
3495error:
3496 if (kind) {
3497 *kind = NULL;
3498 }
3499 if (options) {
3500 *options = NULL;
3501 }
3502 return EPROTO;
3503}
3504
3505/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3506 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3507 * into '*options', and its queue statistics into '*stats'. Any of the output
3508 * arguments may be null.
3509 *
3510 * Returns 0 if successful, otherwise a positive errno value. */
3511static int
3512tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3513 struct nlattr **options, struct netdev_queue_stats *stats)
3514{
3515 static const struct nl_policy tca_policy[] = {
3516 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3517 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3518 };
3519 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3520
3521 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3522 tca_policy, ta, ARRAY_SIZE(ta))) {
3523 VLOG_WARN_RL(&rl, "failed to parse class message");
3524 goto error;
3525 }
3526
3527 if (handlep) {
3528 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3529 *handlep = tc->tcm_handle;
3530 }
3531
3532 if (options) {
3533 *options = ta[TCA_OPTIONS];
3534 }
3535
3536 if (stats) {
3537 const struct gnet_stats_queue *gsq;
3538 struct gnet_stats_basic gsb;
3539
3540 static const struct nl_policy stats_policy[] = {
3541 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3542 .min_len = sizeof gsb },
3543 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3544 .min_len = sizeof *gsq },
3545 };
3546 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3547
3548 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3549 sa, ARRAY_SIZE(sa))) {
3550 VLOG_WARN_RL(&rl, "failed to parse class stats");
3551 goto error;
3552 }
3553
3554 /* Alignment issues screw up the length of struct gnet_stats_basic on
3555 * some arch/bitsize combinations. Newer versions of Linux have a
3556 * struct gnet_stats_basic_packed, but we can't depend on that. The
3557 * easiest thing to do is just to make a copy. */
3558 memset(&gsb, 0, sizeof gsb);
3559 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3560 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3561 stats->tx_bytes = gsb.bytes;
3562 stats->tx_packets = gsb.packets;
3563
3564 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3565 stats->tx_errors = gsq->drops;
3566 }
3567
3568 return 0;
3569
3570error:
3571 if (options) {
3572 *options = NULL;
3573 }
3574 if (stats) {
3575 memset(stats, 0, sizeof *stats);
3576 }
3577 return EPROTO;
3578}
3579
3580/* Queries the kernel for class with identifier 'handle' and parent 'parent'
3581 * on 'netdev'. */
3582static int
3583tc_query_class(const struct netdev *netdev,
3584 unsigned int handle, unsigned int parent,
3585 struct ofpbuf **replyp)
3586{
3587 struct ofpbuf request;
3588 struct tcmsg *tcmsg;
3589 int error;
3590
3591 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
3592 if (!tcmsg) {
3593 return ENODEV;
3594 }
c1c9c9c4
BP
3595 tcmsg->tcm_handle = handle;
3596 tcmsg->tcm_parent = parent;
3597
3598 error = tc_transact(&request, replyp);
3599 if (error) {
3600 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3601 netdev_get_name(netdev),
3602 tc_get_major(handle), tc_get_minor(handle),
3603 tc_get_major(parent), tc_get_minor(parent),
3604 strerror(error));
3605 }
3606 return error;
3607}
3608
3609/* Equivalent to "tc class del dev <name> handle <handle>". */
3610static int
3611tc_delete_class(const struct netdev *netdev, unsigned int handle)
3612{
3613 struct ofpbuf request;
3614 struct tcmsg *tcmsg;
3615 int error;
3616
3617 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
3618 if (!tcmsg) {
3619 return ENODEV;
3620 }
c1c9c9c4
BP
3621 tcmsg->tcm_handle = handle;
3622 tcmsg->tcm_parent = 0;
3623
3624 error = tc_transact(&request, NULL);
3625 if (error) {
3626 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3627 netdev_get_name(netdev),
3628 tc_get_major(handle), tc_get_minor(handle),
3629 strerror(error));
3630 }
3631 return error;
3632}
3633
3634/* Equivalent to "tc qdisc del dev <name> root". */
3635static int
3636tc_del_qdisc(struct netdev *netdev)
3637{
3638 struct netdev_dev_linux *netdev_dev =
3639 netdev_dev_linux_cast(netdev_get_dev(netdev));
3640 struct ofpbuf request;
3641 struct tcmsg *tcmsg;
3642 int error;
3643
3644 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
3645 if (!tcmsg) {
3646 return ENODEV;
3647 }
c1c9c9c4
BP
3648 tcmsg->tcm_handle = tc_make_handle(1, 0);
3649 tcmsg->tcm_parent = TC_H_ROOT;
3650
3651 error = tc_transact(&request, NULL);
3652 if (error == EINVAL) {
3653 /* EINVAL probably means that the default qdisc was in use, in which
3654 * case we've accomplished our purpose. */
3655 error = 0;
3656 }
3657 if (!error && netdev_dev->tc) {
3658 if (netdev_dev->tc->ops->tc_destroy) {
3659 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3660 }
3661 netdev_dev->tc = NULL;
3662 }
3663 return error;
3664}
3665
3666/* If 'netdev''s qdisc type and parameters are not yet known, queries the
3667 * kernel to determine what they are. Returns 0 if successful, otherwise a
3668 * positive errno value. */
3669static int
3670tc_query_qdisc(const struct netdev *netdev)
3671{
3672 struct netdev_dev_linux *netdev_dev =
3673 netdev_dev_linux_cast(netdev_get_dev(netdev));
3674 struct ofpbuf request, *qdisc;
3675 const struct tc_ops *ops;
3676 struct tcmsg *tcmsg;
3677 int load_error;
3678 int error;
3679
3680 if (netdev_dev->tc) {
3681 return 0;
3682 }
3683
3684 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3685 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3686 * 2.6.35 without that fix backported to it.
3687 *
3688 * To avoid the OOPS, we must not make a request that would attempt to dump
3689 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3690 * few others. There are a few ways that I can see to do this, but most of
3691 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3692 * technique chosen here is to assume that any non-default qdisc that we
3693 * create will have a class with handle 1:0. The built-in qdiscs only have
3694 * a class with handle 0:0.
3695 *
3696 * We could check for Linux 2.6.35+ and use a more straightforward method
3697 * there. */
3698 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
3699 if (!tcmsg) {
3700 return ENODEV;
3701 }
c1c9c9c4
BP
3702 tcmsg->tcm_handle = tc_make_handle(1, 0);
3703 tcmsg->tcm_parent = 0;
3704
3705 /* Figure out what tc class to instantiate. */
3706 error = tc_transact(&request, &qdisc);
3707 if (!error) {
3708 const char *kind;
3709
3710 error = tc_parse_qdisc(qdisc, &kind, NULL);
3711 if (error) {
3712 ops = &tc_ops_other;
3713 } else {
3714 ops = tc_lookup_linux_name(kind);
3715 if (!ops) {
3716 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3717 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3718
3719 ops = &tc_ops_other;
3720 }
3721 }
3722 } else if (error == ENOENT) {
3723 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3724 * other entity that doesn't have a handle 1:0. We will assume
3725 * that it's the system default qdisc. */
3726 ops = &tc_ops_default;
3727 error = 0;
3728 } else {
3729 /* Who knows? Maybe the device got deleted. */
3730 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3731 netdev_get_name(netdev), strerror(error));
3732 ops = &tc_ops_other;
3733 }
3734
3735 /* Instantiate it. */
3736 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3737 assert((load_error == 0) == (netdev_dev->tc != NULL));
3738 ofpbuf_delete(qdisc);
3739
3740 return error ? error : load_error;
3741}
3742
3743/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3744 approximate the time to transmit packets of various lengths. For an MTU of
3745 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3746 represents two possible packet lengths; for a MTU of 513 through 1024, four
3747 possible lengths; and so on.
3748
3749 Returns, for the specified 'mtu', the number of bits that packet lengths
3750 need to be shifted right to fit within such a 256-entry table. */
3751static int
3752tc_calc_cell_log(unsigned int mtu)
3753{
3754 int cell_log;
3755
3756 if (!mtu) {
3757 mtu = ETH_PAYLOAD_MAX;
3758 }
3759 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3760
3761 for (cell_log = 0; mtu >= 256; cell_log++) {
3762 mtu >>= 1;
3763 }
3764
3765 return cell_log;
3766}
3767
3768/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3769 * of 'mtu'. */
3770static void
3771tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3772{
3773 memset(rate, 0, sizeof *rate);
3774 rate->cell_log = tc_calc_cell_log(mtu);
3775 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3776 /* rate->cell_align = 0; */ /* distro headers. */
3777 rate->mpu = ETH_TOTAL_MIN;
3778 rate->rate = Bps;
3779}
3780
3781/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3782 * attribute of the specified "type".
3783 *
3784 * See tc_calc_cell_log() above for a description of "rtab"s. */
3785static void
3786tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3787{
3788 uint32_t *rtab;
3789 unsigned int i;
3790
3791 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3792 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3793 unsigned packet_size = (i + 1) << rate->cell_log;
3794 if (packet_size < rate->mpu) {
3795 packet_size = rate->mpu;
3796 }
3797 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3798 }
3799}
3800
/* Computes the value for 'buffer' or 'cbuffer' in HTB options, in ticks, given
 * a rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  The burst actually used is never smaller than
 * one jiffy's worth of data plus 'mtu', so a 'burst_bytes' of 0 is fine when
 * no value was requested. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = MAX(burst_bytes, floor_bytes);

    return tc_bytes_to_ticks(Bps, burst);
}
3811
3812\f
3813/* Utility functions. */
3814
3815static int
3816get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3817{
3818 /* Policy for RTNLGRP_LINK messages.
3819 *
3820 * There are *many* more fields in these messages, but currently we only
3821 * care about these fields. */
3822 static const struct nl_policy rtnlgrp_link_policy[] = {
3823 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3824 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3825 .min_len = sizeof(struct rtnl_link_stats) },
3826 };
3827
3828 struct ofpbuf request;
3829 struct ofpbuf *reply;
3830 struct ifinfomsg *ifi;
3831 const struct rtnl_link_stats *rtnl_stats;
3832 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
3833 int error;
3834
3835 ofpbuf_init(&request, 0);
3836 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3837 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3838 ifi->ifi_family = PF_UNSPEC;
3839 ifi->ifi_index = ifindex;
3840 error = nl_sock_transact(rtnl_sock, &request, &reply);
3841 ofpbuf_uninit(&request);
3842 if (error) {
3843 return error;
3844 }
3845
3846 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3847 rtnlgrp_link_policy,
3848 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3849 ofpbuf_delete(reply);
3850 return EPROTO;
3851 }
3852
3853 if (!attrs[IFLA_STATS]) {
3854 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3855 ofpbuf_delete(reply);
3856 return EPROTO;
3857 }
8b61709d
BP
3858
3859 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3860 stats->rx_packets = rtnl_stats->rx_packets;
3861 stats->tx_packets = rtnl_stats->tx_packets;
3862 stats->rx_bytes = rtnl_stats->rx_bytes;
3863 stats->tx_bytes = rtnl_stats->tx_bytes;
3864 stats->rx_errors = rtnl_stats->rx_errors;
3865 stats->tx_errors = rtnl_stats->tx_errors;
3866 stats->rx_dropped = rtnl_stats->rx_dropped;
3867 stats->tx_dropped = rtnl_stats->tx_dropped;
3868 stats->multicast = rtnl_stats->multicast;
3869 stats->collisions = rtnl_stats->collisions;
3870 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3871 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3872 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3873 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3874 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3875 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3876 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3877 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3878 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3879 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3880 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3881
576e26d7
BP
3882 ofpbuf_delete(reply);
3883
8b61709d
BP
3884 return 0;
3885}
3886
3887static int
3888get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3889{
3890 static const char fn[] = "/proc/net/dev";
3891 char line[1024];
3892 FILE *stream;
3893 int ln;
3894
3895 stream = fopen(fn, "r");
3896 if (!stream) {
3897 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3898 return errno;
3899 }
3900
3901 ln = 0;
3902 while (fgets(line, sizeof line, stream)) {
3903 if (++ln >= 3) {
3904 char devname[16];
3905#define X64 "%"SCNu64
3906 if (sscanf(line,
3907 " %15[^:]:"
3908 X64 X64 X64 X64 X64 X64 X64 "%*u"
3909 X64 X64 X64 X64 X64 X64 X64 "%*u",
3910 devname,
3911 &stats->rx_bytes,
3912 &stats->rx_packets,
3913 &stats->rx_errors,
3914 &stats->rx_dropped,
3915 &stats->rx_fifo_errors,
3916 &stats->rx_frame_errors,
3917 &stats->multicast,
3918 &stats->tx_bytes,
3919 &stats->tx_packets,
3920 &stats->tx_errors,
3921 &stats->tx_dropped,
3922 &stats->tx_fifo_errors,
3923 &stats->collisions,
3924 &stats->tx_carrier_errors) != 15) {
3925 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3926 } else if (!strcmp(devname, netdev_name)) {
3927 stats->rx_length_errors = UINT64_MAX;
3928 stats->rx_over_errors = UINT64_MAX;
3929 stats->rx_crc_errors = UINT64_MAX;
3930 stats->rx_missed_errors = UINT64_MAX;
3931 stats->tx_aborted_errors = UINT64_MAX;
3932 stats->tx_heartbeat_errors = UINT64_MAX;
3933 stats->tx_window_errors = UINT64_MAX;
3934 fclose(stream);
3935 return 0;
3936 }
3937 }
3938 }
3939 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
3940 fclose(stream);
3941 return ENODEV;
3942}
c1c9c9c4 3943
8b61709d
BP
3944static int
3945get_flags(const struct netdev *netdev, int *flags)
3946{
3947 struct ifreq ifr;
3948 int error;
3949
149f577a
JG
3950 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
3951 "SIOCGIFFLAGS");
8b61709d
BP
3952 *flags = ifr.ifr_flags;
3953 return error;
3954}
3955
3956static int
3957set_flags(struct netdev *netdev, int flags)
3958{
3959 struct ifreq ifr;
3960
3961 ifr.ifr_flags = flags;
149f577a
JG
3962 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
3963 "SIOCSIFFLAGS");
8b61709d
BP
3964}
3965
3966static int
3967do_get_ifindex(const char *netdev_name)
3968{
3969 struct ifreq ifr;
3970
3971 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3972 COVERAGE_INC(netdev_get_ifindex);
3973 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
3974 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
3975 netdev_name, strerror(errno));
3976 return -errno;
3977 }
3978 return ifr.ifr_ifindex;
3979}
3980
3981static int
3982get_ifindex(const struct netdev *netdev_, int *ifindexp)
3983{
149f577a
JG
3984 struct netdev_dev_linux *netdev_dev =
3985 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 3986 *ifindexp = 0;
149f577a 3987 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
8b61709d
BP
3988 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
3989 if (ifindex < 0) {
3990 return -ifindex;
3991 }
149f577a
JG
3992 netdev_dev->cache_valid |= VALID_IFINDEX;
3993 netdev_dev->ifindex = ifindex;
8b61709d 3994 }
149f577a 3995 *ifindexp = netdev_dev->ifindex;
8b61709d
BP
3996 return 0;
3997}
3998
3999static int
4000get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4001{
4002 struct ifreq ifr;
4003 int hwaddr_family;
4004
4005 memset(&ifr, 0, sizeof ifr);
4006 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4007 COVERAGE_INC(netdev_get_hwaddr);
4008 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4009 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4010 netdev_name, strerror(errno));
4011 return errno;
4012 }
4013 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4014 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4015 VLOG_WARN("%s device has unknown hardware address family %d",
4016 netdev_name, hwaddr_family);
4017 }
4018 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4019 return 0;
4020}
4021
4022static int
4023set_etheraddr(const char *netdev_name, int hwaddr_family,
4024 const uint8_t mac[ETH_ADDR_LEN])
4025{
4026 struct ifreq ifr;
4027
4028 memset(&ifr, 0, sizeof ifr);
4029 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4030 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4031 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4032 COVERAGE_INC(netdev_set_hwaddr);
4033 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4034 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4035 netdev_name, strerror(errno));
4036 return errno;
4037 }
4038 return 0;
4039}
4040
4041static int
0b0544d7 4042netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
4043 int cmd, const char *cmd_name)
4044{
4045 struct ifreq ifr;
4046
4047 memset(&ifr, 0, sizeof ifr);
0b0544d7 4048 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
4049 ifr.ifr_data = (caddr_t) ecmd;
4050
4051 ecmd->cmd = cmd;
4052 COVERAGE_INC(netdev_ethtool);
4053 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4054 return 0;
4055 } else {
4056 if (errno != EOPNOTSUPP) {
4057 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
0b0544d7 4058 "failed: %s", cmd_name, name, strerror(errno));
8b61709d
BP
4059 } else {
4060 /* The device doesn't support this operation. That's pretty
4061 * common, so there's no point in logging anything. */
4062 }
4063 return errno;
4064 }
4065}
4066
4067static int
149f577a
JG
4068netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4069 const char *cmd_name)
8b61709d 4070{
149f577a 4071 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
8b61709d 4072 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
149f577a
JG
4073 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4074 strerror(errno));
8b61709d
BP
4075 return errno;
4076 }
4077 return 0;
4078}
f1acd62b
BP
4079
4080static int
4081netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4082 int cmd, const char *cmd_name)
4083{
4084 struct ifreq ifr;
4085 int error;
4086
4087 ifr.ifr_addr.sa_family = AF_INET;
149f577a 4088 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b
BP
4089 if (!error) {
4090 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4091 *ip = sin->sin_addr;
4092 }
4093 return error;
4094}