]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
netdev-linux: Remove counter double-increments.
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
149f577a 2 * Copyright (c) 2009, 2010 Nicira Networks.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
8b61709d 18#include <assert.h>
e9e28be3 19#include <errno.h>
8b61709d
BP
20#include <fcntl.h>
21#include <arpa/inet.h>
22#include <inttypes.h>
c1c9c9c4 23#include <linux/gen_stats.h>
8b61709d 24#include <linux/if_tun.h>
a740f0de 25#include <linux/ip.h>
8b61709d
BP
26#include <linux/types.h>
27#include <linux/ethtool.h>
6f42c8ea 28#include <linux/pkt_sched.h>
e9e28be3 29#include <linux/rtnetlink.h>
8b61709d
BP
30#include <linux/sockios.h>
31#include <linux/version.h>
32#include <sys/types.h>
33#include <sys/ioctl.h>
34#include <sys/socket.h>
35#include <netpacket/packet.h>
36#include <net/ethernet.h>
37#include <net/if.h>
a740f0de 38#include <linux/if_tunnel.h>
8b61709d
BP
39#include <net/if_arp.h>
40#include <net/if_packet.h>
41#include <net/route.h>
42#include <netinet/in.h>
e9e28be3 43#include <poll.h>
8b61709d
BP
44#include <stdlib.h>
45#include <string.h>
46#include <unistd.h>
e9e28be3
BP
47
48#include "coverage.h"
8b61709d
BP
49#include "dynamic-string.h"
50#include "fatal-signal.h"
93b13be8
BP
51#include "hash.h"
52#include "hmap.h"
8b61709d 53#include "netdev-provider.h"
7fbef77a 54#include "netdev-vport.h"
e9e28be3
BP
55#include "netlink.h"
56#include "ofpbuf.h"
8b61709d
BP
57#include "openflow/openflow.h"
58#include "packets.h"
59#include "poll-loop.h"
559843ed 60#include "rtnetlink.h"
8b61709d
BP
61#include "socket-util.h"
62#include "shash.h"
63#include "svec.h"
e9e28be3 64#include "vlog.h"
5136ce49 65
d98e6007 66VLOG_DEFINE_THIS_MODULE(netdev_linux);
8b61709d
BP
67\f
68/* These were introduced in Linux 2.6.14, so they might be missing if we have
69 * old headers. */
70#ifndef ADVERTISED_Pause
71#define ADVERTISED_Pause (1 << 13)
72#endif
73#ifndef ADVERTISED_Asym_Pause
74#define ADVERTISED_Asym_Pause (1 << 14)
75#endif
76
c1c9c9c4
BP
77/* This was introduced in Linux 2.6.25, so it might be missing if we have old
78 * headers. */
79#ifndef TC_RTAB_SIZE
80#define TC_RTAB_SIZE 1024
81#endif
82
149f577a 83static struct rtnetlink_notifier netdev_linux_cache_notifier;
46415c90 84static int cache_notifier_refcount;
8b61709d
BP
85
/* Flag bits stored in 'cache_valid' of struct netdev_dev_linux.  A set bit
 * means that the corresponding cached member currently holds a usable
 * value. */
enum {
    VALID_IFINDEX          = 0x001,
    VALID_ETHERADDR        = 0x002,
    VALID_IN4              = 0x004,
    VALID_IN6              = 0x008,
    VALID_MTU              = 0x010,
    VALID_CARRIER          = 0x020,
    VALID_IS_PSEUDO        = 0x040,   /* Covers is_internal and is_tap. */
    VALID_POLICING         = 0x080,
    VALID_HAVE_VPORT_STATS = 0x100
};
97
149f577a
JG
/* Per-device state for a tap device. */
struct tap_state {
    int fd;         /* Descriptor from opening the tun/tap control device. */
    bool opened;    /* True once a netdev_open() caller has been handed 'fd'. */
};
c1c9c9c4
BP
102\f
103/* Traffic control. */
104
105/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
106 * network device.
107 *
108 * Each TC implementation subclasses this with whatever additional data it
109 * needs. */
c1c9c9c4
BP
110struct tc {
111 const struct tc_ops *ops;
93b13be8
BP
112 struct hmap queues; /* Contains "struct tc_queue"s.
113 * Read by generic TC layer.
114 * Written only by TC implementation. */
115};
c1c9c9c4 116
93b13be8
BP
117/* One traffic control queue.
118 *
119 * Each TC implementation subclasses this with whatever additional data it
120 * needs. */
121struct tc_queue {
122 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
123 unsigned int queue_id; /* OpenFlow queue ID. */
c1c9c9c4
BP
124};
125
126/* A particular kind of traffic control. Each implementation generally maps to
127 * one particular Linux qdisc class.
128 *
129 * The functions below return 0 if successful or a positive errno value on
130 * failure, except where otherwise noted. All of them must be provided, except
131 * where otherwise noted. */
132struct tc_ops {
133 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
134 * This is null for tc_ops_default and tc_ops_other, for which there are no
135 * appropriate values. */
136 const char *linux_name;
137
138 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
139 const char *ovs_name;
140
141 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
142 * queues. The queues are numbered 0 through n_queues - 1. */
143 unsigned int n_queues;
144
145 /* Called to install this TC class on 'netdev'. The implementation should
146 * make the Netlink calls required to set up 'netdev' with the right qdisc
147 * and configure it according to 'details'. The implementation may assume
148 * that the current qdisc is the default; that is, there is no need for it
149 * to delete the current qdisc before installing itself.
150 *
151 * The contents of 'details' should be documented as valid for 'ovs_name'
152 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
153 * (which is built as ovs-vswitchd.conf.db(8)).
154 *
155 * This function must return 0 if and only if it sets 'netdev->tc' to an
156 * initialized 'struct tc'.
157 *
158 * (This function is null for tc_ops_other, which cannot be installed. For
159 * other TC classes it should always be nonnull.) */
160 int (*tc_install)(struct netdev *netdev, const struct shash *details);
161
162 /* Called when the netdev code determines (through a Netlink query) that
163 * this TC class's qdisc is installed on 'netdev', but we didn't install
164 * it ourselves and so don't know any of the details.
165 *
166 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
167 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
168 * implementation should parse the other attributes of 'nlmsg' as
169 * necessary to determine its configuration. If necessary it should also
170 * use Netlink queries to determine the configuration of queues on
171 * 'netdev'.
172 *
173 * This function must return 0 if and only if it sets 'netdev->tc' to an
174 * initialized 'struct tc'. */
175 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
176
177 /* Destroys the data structures allocated by the implementation as part of
178 * 'tc'. (This includes destroying 'tc->queues' by calling
179 * tc_destroy(tc).
180 *
181 * The implementation should not need to perform any Netlink calls. If
182 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
183 * (But it may not be desirable.)
184 *
185 * This function may be null if 'tc' is trivial. */
186 void (*tc_destroy)(struct tc *tc);
187
188 /* Retrieves details of 'netdev->tc' configuration into 'details'.
189 *
190 * The implementation should not need to perform any Netlink calls, because
191 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
192 * cached the configuration.
193 *
194 * The contents of 'details' should be documented as valid for 'ovs_name'
195 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
196 * (which is built as ovs-vswitchd.conf.db(8)).
197 *
198 * This function may be null if 'tc' is not configurable.
199 */
200 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
201
202 /* Reconfigures 'netdev->tc' according to 'details', performing any
203 * required Netlink calls to complete the reconfiguration.
204 *
205 * The contents of 'details' should be documented as valid for 'ovs_name'
206 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
207 * (which is built as ovs-vswitchd.conf.db(8)).
208 *
209 * This function may be null if 'tc' is not configurable.
210 */
211 int (*qdisc_set)(struct netdev *, const struct shash *details);
212
93b13be8
BP
213 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
214 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
215 *
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "Queue" table in
218 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
219 *
220 * The implementation should not need to perform any Netlink calls, because
221 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
222 * cached the queue configuration.
223 *
224 * This function may be null if 'tc' does not have queues ('n_queues' is
225 * 0). */
93b13be8 226 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
227 struct shash *details);
228
229 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
230 * 'details', perfoming any required Netlink calls to complete the
231 * reconfiguration. The caller ensures that 'queue_id' is less than
232 * 'n_queues'.
233 *
234 * The contents of 'details' should be documented as valid for 'ovs_name'
235 * in the "other_config" column in the "Queue" table in
236 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
237 *
238 * This function may be null if 'tc' does not have queues or its queues are
239 * not configurable. */
240 int (*class_set)(struct netdev *, unsigned int queue_id,
241 const struct shash *details);
242
93b13be8
BP
243 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
244 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
245 *
246 * This function may be null if 'tc' does not have queues or its queues
247 * cannot be deleted. */
93b13be8 248 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 249
93b13be8
BP
250 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
251 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
252 *
253 * On success, initializes '*stats'.
254 *
255 * This function may be null if 'tc' does not have queues or if it cannot
256 * report queue statistics. */
93b13be8
BP
257 int (*class_get_stats)(const struct netdev *netdev,
258 const struct tc_queue *queue,
c1c9c9c4
BP
259 struct netdev_queue_stats *stats);
260
261 /* Extracts queue stats from 'nlmsg', which is a response to a
262 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
263 *
264 * This function may be null if 'tc' does not have queues or if it cannot
265 * report queue statistics. */
266 int (*class_dump_stats)(const struct netdev *netdev,
267 const struct ofpbuf *nlmsg,
268 netdev_dump_queue_stats_cb *cb, void *aux);
269};
270
271static void
272tc_init(struct tc *tc, const struct tc_ops *ops)
273{
274 tc->ops = ops;
93b13be8 275 hmap_init(&tc->queues);
c1c9c9c4
BP
276}
277
278static void
279tc_destroy(struct tc *tc)
280{
93b13be8 281 hmap_destroy(&tc->queues);
c1c9c9c4
BP
282}
283
284static const struct tc_ops tc_ops_htb;
a339aa81 285static const struct tc_ops tc_ops_hfsc;
c1c9c9c4
BP
286static const struct tc_ops tc_ops_default;
287static const struct tc_ops tc_ops_other;
288
289static const struct tc_ops *tcs[] = {
290 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 291 &tc_ops_hfsc, /* Hierarchical fair service curve. */
c1c9c9c4
BP
292 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
293 &tc_ops_other, /* Some other qdisc. */
294 NULL
295};
149f577a 296
c1c9c9c4
BP
297static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
298static unsigned int tc_get_major(unsigned int handle);
299static unsigned int tc_get_minor(unsigned int handle);
300
301static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
302static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
303static unsigned int tc_buffer_per_jiffy(unsigned int rate);
304
305static struct tcmsg *tc_make_request(const struct netdev *, int type,
306 unsigned int flags, struct ofpbuf *);
307static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
308
309static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
310 struct nlattr **options);
311static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
312 struct nlattr **options,
313 struct netdev_queue_stats *);
314static int tc_query_class(const struct netdev *,
315 unsigned int handle, unsigned int parent,
316 struct ofpbuf **replyp);
317static int tc_delete_class(const struct netdev *, unsigned int handle);
318
319static int tc_del_qdisc(struct netdev *netdev);
320static int tc_query_qdisc(const struct netdev *netdev);
321
322static int tc_calc_cell_log(unsigned int mtu);
323static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
324static void tc_put_rtab(struct ofpbuf *, uint16_t type,
325 const struct tc_ratespec *rate);
326static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
327\f
149f577a
JG
328struct netdev_dev_linux {
329 struct netdev_dev netdev_dev;
330
8b61709d 331 struct shash_node *shash_node;
149f577a 332 unsigned int cache_valid;
8b61709d 333
8722022c
BP
334 /* The following are figured out "on demand" only. They are only valid
335 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d
BP
336 int ifindex;
337 uint8_t etheraddr[ETH_ADDR_LEN];
f1acd62b 338 struct in_addr address, netmask;
8b61709d
BP
339 struct in6_addr in6;
340 int mtu;
341 int carrier;
8722022c
BP
342 bool is_internal; /* Is this an openvswitch internal device? */
343 bool is_tap; /* Is this a tuntap device? */
80a86fbe
BP
344 uint32_t kbits_rate; /* Policing data. */
345 uint32_t kbits_burst;
7fbef77a 346 bool have_vport_stats;
c1c9c9c4 347 struct tc *tc;
149f577a
JG
348
349 union {
350 struct tap_state tap;
351 } state;
8b61709d
BP
352};
353
149f577a
JG
354struct netdev_linux {
355 struct netdev netdev;
5b7448ed 356 int fd;
149f577a 357};
8b61709d 358
8b61709d
BP
359/* An AF_INET socket (used for ioctl operations). */
360static int af_inet_sock = -1;
361
ff4ed3c9
BP
362/* A Netlink routing socket that is not subscribed to any multicast groups. */
363static struct nl_sock *rtnl_sock;
364
8b61709d
BP
365struct netdev_linux_notifier {
366 struct netdev_notifier notifier;
367 struct list node;
368};
369
370static struct shash netdev_linux_notifiers =
371 SHASH_INITIALIZER(&netdev_linux_notifiers);
46097491 372static struct rtnetlink_notifier netdev_linux_poll_notifier;
8b61709d
BP
373
374/* This is set pretty low because we probably won't learn anything from the
375 * additional log messages. */
376static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
377
15b3596a 378static int netdev_linux_init(void);
6f643e49 379
0b0544d7 380static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 381 int cmd, const char *cmd_name);
149f577a
JG
382static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
383 const char *cmd_name);
f1acd62b
BP
384static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
385 int cmd, const char *cmd_name);
8b61709d
BP
386static int get_flags(const struct netdev *, int *flagsp);
387static int set_flags(struct netdev *, int flags);
388static int do_get_ifindex(const char *netdev_name);
389static int get_ifindex(const struct netdev *, int *ifindexp);
390static int do_set_addr(struct netdev *netdev,
391 int ioctl_nr, const char *ioctl_name,
392 struct in_addr addr);
393static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
394static int set_etheraddr(const char *netdev_name, int hwaddr_family,
395 const uint8_t[ETH_ADDR_LEN]);
396static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
397static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
398
15b3596a
JG
399static bool
400is_netdev_linux_class(const struct netdev_class *netdev_class)
401{
402 return netdev_class->init == netdev_linux_init;
403}
404
149f577a
JG
405static struct netdev_dev_linux *
406netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
6c88d577 407{
15b3596a
JG
408 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
409 assert(is_netdev_linux_class(netdev_class));
410
149f577a 411 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
6c88d577
JP
412}
413
8b61709d
BP
414static struct netdev_linux *
415netdev_linux_cast(const struct netdev *netdev)
416{
15b3596a
JG
417 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
418 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
419 assert(is_netdev_linux_class(netdev_class));
420
8b61709d
BP
421 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
422}
ff4ed3c9 423\f
8b61709d
BP
424static int
425netdev_linux_init(void)
426{
427 static int status = -1;
428 if (status < 0) {
ff4ed3c9 429 /* Create AF_INET socket. */
8b61709d
BP
430 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
431 status = af_inet_sock >= 0 ? 0 : errno;
432 if (status) {
433 VLOG_ERR("failed to create inet socket: %s", strerror(status));
434 }
ff4ed3c9
BP
435
436 /* Create rtnetlink socket. */
437 if (!status) {
438 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
439 if (status) {
440 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
441 strerror(status));
442 }
443 }
8b61709d
BP
444 }
445 return status;
446}
447
/* Periodic work for this netdev provider: lets the rtnetlink notifier
 * run. */
static void
netdev_linux_run(void)
{
    rtnetlink_notifier_run();
}
453
/* Counterpart of netdev_linux_run(): lets the rtnetlink notifier register
 * what it needs to wait for. */
static void
netdev_linux_wait(void)
{
    rtnetlink_notifier_wait();
}
459
460static void
46097491 461netdev_linux_cache_cb(const struct rtnetlink_change *change,
67a4917b 462 void *aux OVS_UNUSED)
8b61709d 463{
149f577a 464 struct netdev_dev_linux *dev;
8b61709d 465 if (change) {
46415c90
JG
466 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
467 if (base_dev) {
15b3596a
JG
468 const struct netdev_class *netdev_class =
469 netdev_dev_get_class(base_dev);
470
471 if (is_netdev_linux_class(netdev_class)) {
472 dev = netdev_dev_linux_cast(base_dev);
473 dev->cache_valid = 0;
474 }
8b61709d
BP
475 }
476 } else {
46415c90 477 struct shash device_shash;
8b61709d 478 struct shash_node *node;
46415c90
JG
479
480 shash_init(&device_shash);
481 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
482 SHASH_FOR_EACH (node, &device_shash) {
149f577a
JG
483 dev = node->data;
484 dev->cache_valid = 0;
8b61709d 485 }
46415c90 486 shash_destroy(&device_shash);
8b61709d
BP
487 }
488}
489
149f577a 490/* Creates the netdev device of 'type' with 'name'. */
8b61709d 491static int
b8dcf5e9
BP
492netdev_linux_create_system(const struct netdev_class *class OVS_UNUSED,
493 const char *name, const struct shash *args,
494 struct netdev_dev **netdev_devp)
6c88d577 495{
149f577a
JG
496 struct netdev_dev_linux *netdev_dev;
497 int error;
6c88d577
JP
498
499 if (!shash_is_empty(args)) {
149f577a 500 VLOG_WARN("%s: arguments for system devices should be empty", name);
6c88d577
JP
501 }
502
46415c90 503 if (!cache_notifier_refcount) {
149f577a
JG
504 error = rtnetlink_notifier_register(&netdev_linux_cache_notifier,
505 netdev_linux_cache_cb, NULL);
506 if (error) {
507 return error;
508 }
509 }
46415c90 510 cache_notifier_refcount++;
6c88d577 511
149f577a 512 netdev_dev = xzalloc(sizeof *netdev_dev);
149f577a 513 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_linux_class);
46415c90 514
149f577a 515 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
516 return 0;
517}
518
5b7448ed
JG
519/* For most types of netdevs we open the device for each call of
520 * netdev_open(). However, this is not the case with tap devices,
521 * since it is only possible to open the device once. In this
522 * situation we share a single file descriptor, and consequently
523 * buffers, across all readers. Therefore once data is read it will
524 * be unavailable to other reads for tap devices. */
a740f0de 525static int
b8dcf5e9
BP
526netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
527 const char *name, const struct shash *args,
528 struct netdev_dev **netdev_devp)
a740f0de 529{
149f577a 530 struct netdev_dev_linux *netdev_dev;
a740f0de
JG
531 struct tap_state *state;
532 static const char tap_dev[] = "/dev/net/tun";
533 struct ifreq ifr;
534 int error;
535
536 if (!shash_is_empty(args)) {
149f577a 537 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
6c88d577
JP
538 }
539
149f577a
JG
540 netdev_dev = xzalloc(sizeof *netdev_dev);
541 state = &netdev_dev->state.tap;
a740f0de 542
6c88d577 543 /* Open tap device. */
149f577a
JG
544 state->fd = open(tap_dev, O_RDWR);
545 if (state->fd < 0) {
6c88d577
JP
546 error = errno;
547 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
548 goto error;
549 }
550
551 /* Create tap device. */
552 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
553 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
149f577a 554 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
6c88d577
JP
555 VLOG_WARN("%s: creating tap device failed: %s", name,
556 strerror(errno));
557 error = errno;
558 goto error;
559 }
560
561 /* Make non-blocking. */
149f577a 562 error = set_nonblocking(state->fd);
a740f0de
JG
563 if (error) {
564 goto error;
565 }
566
149f577a
JG
567 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
568 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
569 return 0;
570
571error:
149f577a 572 free(netdev_dev);
a740f0de
JG
573 return error;
574}
575
a740f0de 576static void
149f577a 577destroy_tap(struct netdev_dev_linux *netdev_dev)
a740f0de 578{
149f577a
JG
579 struct tap_state *state = &netdev_dev->state.tap;
580
581 if (state->fd >= 0) {
582 close(state->fd);
a740f0de
JG
583 }
584}
585
149f577a 586/* Destroys the netdev device 'netdev_dev_'. */
6c88d577 587static void
149f577a 588netdev_linux_destroy(struct netdev_dev *netdev_dev_)
6c88d577 589{
149f577a
JG
590 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
591 const char *type = netdev_dev_get_type(netdev_dev_);
6c88d577 592
c1c9c9c4
BP
593 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
594 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
595 }
596
149f577a 597 if (!strcmp(type, "system")) {
46415c90 598 cache_notifier_refcount--;
149f577a 599
46415c90 600 if (!cache_notifier_refcount) {
149f577a
JG
601 rtnetlink_notifier_unregister(&netdev_linux_cache_notifier);
602 }
603 } else if (!strcmp(type, "tap")) {
604 destroy_tap(netdev_dev);
6c88d577 605 }
149f577a 606
658797c8 607 free(netdev_dev);
6c88d577
JP
608}
609
8b61709d 610static int
5b7448ed 611netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
149f577a 612 struct netdev **netdevp)
8b61709d 613{
5b7448ed 614 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
8b61709d
BP
615 struct netdev_linux *netdev;
616 enum netdev_flags flags;
617 int error;
618
619 /* Allocate network device. */
ec6fde61 620 netdev = xzalloc(sizeof *netdev);
49a6a163 621 netdev->fd = -1;
5b7448ed 622 netdev_init(&netdev->netdev, netdev_dev_);
8b61709d
BP
623
624 error = netdev_get_flags(&netdev->netdev, &flags);
625 if (error == ENODEV) {
626 goto error;
627 }
628
61b999dd
JG
629 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
630 !netdev_dev->state.tap.opened) {
631
632 /* We assume that the first user of the tap device is the primary user
633 * and give them the tap FD. Subsequent users probably just expect
634 * this to be a system device so open it normally to avoid send/receive
635 * directions appearing to be reversed. */
5b7448ed 636 netdev->fd = netdev_dev->state.tap.fd;
61b999dd 637 netdev_dev->state.tap.opened = true;
5b7448ed 638 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
8b61709d
BP
639 struct sockaddr_ll sll;
640 int protocol;
641 int ifindex;
642
643 /* Create file descriptor. */
644 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
645 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
646 : ethertype);
5b7448ed
JG
647 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
648 if (netdev->fd < 0) {
8b61709d
BP
649 error = errno;
650 goto error;
651 }
8b61709d
BP
652
653 /* Set non-blocking mode. */
5b7448ed 654 error = set_nonblocking(netdev->fd);
8b61709d
BP
655 if (error) {
656 goto error;
657 }
658
659 /* Get ethernet device index. */
660 error = get_ifindex(&netdev->netdev, &ifindex);
661 if (error) {
662 goto error;
663 }
664
665 /* Bind to specific ethernet device. */
666 memset(&sll, 0, sizeof sll);
667 sll.sll_family = AF_PACKET;
668 sll.sll_ifindex = ifindex;
5b7448ed 669 if (bind(netdev->fd,
8b61709d
BP
670 (struct sockaddr *) &sll, sizeof sll) < 0) {
671 error = errno;
5b7448ed 672 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
149f577a 673 strerror(error));
8b61709d
BP
674 goto error;
675 }
676
677 /* Between the socket() and bind() calls above, the socket receives all
678 * packets of the requested type on all system interfaces. We do not
679 * want to receive that data, but there is no way to avoid it. So we
680 * must now drain out the receive queue. */
5b7448ed 681 error = drain_rcvbuf(netdev->fd);
8b61709d
BP
682 if (error) {
683 goto error;
684 }
685 }
686
687 *netdevp = &netdev->netdev;
688 return 0;
689
690error:
149f577a 691 netdev_uninit(&netdev->netdev, true);
8b61709d
BP
692 return error;
693}
694
695/* Closes and destroys 'netdev'. */
696static void
697netdev_linux_close(struct netdev *netdev_)
698{
699 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
700
49a6a163 701 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
5b7448ed 702 close(netdev->fd);
8b61709d
BP
703 }
704 free(netdev);
705}
e9e28be3 706
8b61709d
BP
/* Appends the names of all known network devices to 'svec', which the caller
 * must already have initialized.
 *
 * Returns 0 on success.  If the list of devices cannot be obtained, logs a
 * warning and returns the errno value reported by if_nameindex(). */
static int
netdev_linux_enumerate(struct svec *svec)
{
    struct if_nameindex *names;

    names = if_nameindex();
    if (names) {
        size_t i;

        for (i = 0; names[i].if_name != NULL; i++) {
            svec_add(svec, names[i].if_name);
        }
        if_freenameindex(names);
        return 0;
    } else {
        /* Save errno before logging, which may overwrite it. */
        int error = errno;

        VLOG_WARN("could not obtain list of network device names: %s",
                  strerror(error));
        return error;
    }
}
728
729static int
730netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
731{
732 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
733
5b7448ed 734 if (netdev->fd < 0) {
8b61709d 735 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
c0e5f6ca 736 return -EAGAIN;
8b61709d
BP
737 }
738
739 for (;;) {
5b7448ed 740 ssize_t retval = read(netdev->fd, data, size);
8b61709d
BP
741 if (retval >= 0) {
742 return retval;
743 } else if (errno != EINTR) {
744 if (errno != EAGAIN) {
745 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
746 strerror(errno), netdev_get_name(netdev_));
747 }
c0e5f6ca 748 return -errno;
8b61709d
BP
749 }
750 }
751}
752
753/* Registers with the poll loop to wake up from the next call to poll_block()
754 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
755static void
756netdev_linux_recv_wait(struct netdev *netdev_)
757{
758 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed
JG
759 if (netdev->fd >= 0) {
760 poll_fd_wait(netdev->fd, POLLIN);
8b61709d
BP
761 }
762}
763
764/* Discards all packets waiting to be received from 'netdev'. */
765static int
766netdev_linux_drain(struct netdev *netdev_)
767{
768 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 769 if (netdev->fd < 0) {
8b61709d 770 return 0;
5b7448ed 771 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
8b61709d 772 struct ifreq ifr;
149f577a 773 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
8b61709d
BP
774 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
775 if (error) {
776 return error;
777 }
5b7448ed 778 drain_fd(netdev->fd, ifr.ifr_qlen);
8b61709d
BP
779 return 0;
780 } else {
5b7448ed 781 return drain_rcvbuf(netdev->fd);
8b61709d
BP
782 }
783}
784
785/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
786 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
787 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
788 * the packet is too big or too small to transmit on the device.
789 *
790 * The caller retains ownership of 'buffer' in all cases.
791 *
792 * The kernel maintains a packet transmission queue, so the caller is not
793 * expected to do additional queuing of packets. */
794static int
795netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
796{
797 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
798
799 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
800 */
5b7448ed 801 if (netdev->fd < 0) {
8b61709d
BP
802 return EPIPE;
803 }
804
805 for (;;) {
5b7448ed 806 ssize_t retval = write(netdev->fd, data, size);
8b61709d
BP
807 if (retval < 0) {
808 /* The Linux AF_PACKET implementation never blocks waiting for room
809 * for packets, instead returning ENOBUFS. Translate this into
810 * EAGAIN for the caller. */
811 if (errno == ENOBUFS) {
812 return EAGAIN;
813 } else if (errno == EINTR) {
814 continue;
815 } else if (errno != EAGAIN) {
816 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
817 netdev_get_name(netdev_), strerror(errno));
818 }
819 return errno;
820 } else if (retval != size) {
821 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
822 "%zu) on %s", retval, size, netdev_get_name(netdev_));
823 return EMSGSIZE;
824 } else {
825 return 0;
826 }
827 }
828}
829
830/* Registers with the poll loop to wake up from the next call to poll_block()
831 * when the packet transmission queue has sufficient room to transmit a packet
832 * with netdev_send().
833 *
834 * The kernel maintains a packet transmission queue, so the client is not
835 * expected to do additional queuing of packets. Thus, this function is
836 * unlikely to ever be used. It is included for completeness. */
837static void
838netdev_linux_send_wait(struct netdev *netdev_)
839{
840 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 841 if (netdev->fd < 0) {
8b61709d 842 /* Nothing to do. */
5b7448ed
JG
843 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
844 poll_fd_wait(netdev->fd, POLLOUT);
8b61709d
BP
845 } else {
846 /* TAP device always accepts packets.*/
847 poll_immediate_wake();
848 }
849}
850
851/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
852 * otherwise a positive errno value. */
853static int
854netdev_linux_set_etheraddr(struct netdev *netdev_,
855 const uint8_t mac[ETH_ADDR_LEN])
856{
149f577a
JG
857 struct netdev_dev_linux *netdev_dev =
858 netdev_dev_linux_cast(netdev_get_dev(netdev_));
eb395f2e
BP
859 int error;
860
149f577a
JG
861 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
862 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
eb395f2e
BP
863 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
864 if (!error) {
149f577a
JG
865 netdev_dev->cache_valid |= VALID_ETHERADDR;
866 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e
BP
867 }
868 } else {
869 error = 0;
8b61709d
BP
870 }
871 return error;
872}
873
874/* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
875 * free the returned buffer. */
876static int
877netdev_linux_get_etheraddr(const struct netdev *netdev_,
878 uint8_t mac[ETH_ADDR_LEN])
879{
149f577a
JG
880 struct netdev_dev_linux *netdev_dev =
881 netdev_dev_linux_cast(netdev_get_dev(netdev_));
882 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
8b61709d 883 int error = get_etheraddr(netdev_get_name(netdev_),
149f577a 884 netdev_dev->etheraddr);
8b61709d
BP
885 if (error) {
886 return error;
887 }
149f577a 888 netdev_dev->cache_valid |= VALID_ETHERADDR;
8b61709d 889 }
149f577a 890 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
8b61709d
BP
891 return 0;
892}
893
894/* Returns the maximum size of transmitted (and received) packets on 'netdev',
895 * in bytes, not including the hardware header; thus, this is typically 1500
896 * bytes for Ethernet devices. */
897static int
898netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
899{
149f577a
JG
900 struct netdev_dev_linux *netdev_dev =
901 netdev_dev_linux_cast(netdev_get_dev(netdev_));
902 if (!(netdev_dev->cache_valid & VALID_MTU)) {
8b61709d
BP
903 struct ifreq ifr;
904 int error;
905
149f577a
JG
906 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
907 SIOCGIFMTU, "SIOCGIFMTU");
8b61709d
BP
908 if (error) {
909 return error;
910 }
149f577a
JG
911 netdev_dev->mtu = ifr.ifr_mtu;
912 netdev_dev->cache_valid |= VALID_MTU;
8b61709d 913 }
149f577a 914 *mtup = netdev_dev->mtu;
8b61709d
BP
915 return 0;
916}
917
9ab3d9a3
BP
/* Returns the ifindex of 'netdev', if successful, as a positive number.
 * On failure, returns a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        /* get_ifindex() reports a positive error; negate it for callers. */
        return -error;
    }
    return ifindex;
}
928
8b61709d
BP
929static int
930netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
931{
149f577a
JG
932 struct netdev_dev_linux *netdev_dev =
933 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
934 int error = 0;
935 char *fn = NULL;
936 int fd = -1;
937
149f577a 938 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
8b61709d
BP
939 char line[8];
940 int retval;
941
149f577a
JG
942 fn = xasprintf("/sys/class/net/%s/carrier",
943 netdev_get_name(netdev_));
8b61709d
BP
944 fd = open(fn, O_RDONLY);
945 if (fd < 0) {
946 error = errno;
947 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
948 goto exit;
949 }
950
951 retval = read(fd, line, sizeof line);
952 if (retval < 0) {
953 error = errno;
954 if (error == EINVAL) {
955 /* This is the normal return value when we try to check carrier
956 * if the network device is not up. */
957 } else {
958 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
959 }
960 goto exit;
961 } else if (retval == 0) {
962 error = EPROTO;
963 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
964 goto exit;
965 }
966
967 if (line[0] != '0' && line[0] != '1') {
968 error = EPROTO;
969 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
970 fn, line[0]);
971 goto exit;
972 }
149f577a
JG
973 netdev_dev->carrier = line[0] != '0';
974 netdev_dev->cache_valid |= VALID_CARRIER;
8b61709d 975 }
149f577a 976 *carrier = netdev_dev->carrier;
8b61709d
BP
977 error = 0;
978
979exit:
980 if (fd >= 0) {
981 close(fd);
982 }
983 free(fn);
984 return error;
985}
986
987/* Check whether we can we use RTM_GETLINK to get network device statistics.
988 * In pre-2.6.19 kernels, this was only available if wireless extensions were
989 * enabled. */
990static bool
991check_for_working_netlink_stats(void)
992{
993 /* Decide on the netdev_get_stats() implementation to use. Netlink is
994 * preferable, so if that works, we'll use it. */
995 int ifindex = do_get_ifindex("lo");
996 if (ifindex < 0) {
997 VLOG_WARN("failed to get ifindex for lo, "
998 "obtaining netdev stats from proc");
999 return false;
1000 } else {
1001 struct netdev_stats stats;
1002 int error = get_stats_via_netlink(ifindex, &stats);
1003 if (!error) {
1004 VLOG_DBG("obtaining netdev stats via rtnetlink");
1005 return true;
1006 } else {
1007 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1008 "via proc (you are probably running a pre-2.6.19 "
1009 "kernel)", strerror(error));
1010 return false;
1011 }
1012 }
1013}
1014
8722022c
BP
1015/* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1016static void
1017netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1018{
1019 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1020 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1021 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
d295e8e9 1022
8722022c
BP
1023 netdev_dev->is_tap = !strcmp(type, "tap");
1024 netdev_dev->is_internal = false;
1025 if (!netdev_dev->is_tap) {
1026 struct ethtool_drvinfo drvinfo;
1027 int error;
1028
1029 memset(&drvinfo, 0, sizeof drvinfo);
1030 error = netdev_linux_do_ethtool(name,
1031 (struct ethtool_cmd *)&drvinfo,
1032 ETHTOOL_GDRVINFO,
1033 "ETHTOOL_GDRVINFO");
1034
1035 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1036 netdev_dev->is_internal = true;
1037 }
1038 }
1039
1040 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1041 }
1042}
1043
92df599c
JG
/* Exchanges the values stored in '*a' and '*b'.
 *
 * A temporary is used instead of the XOR trick so that the call is also
 * correct when 'a' and 'b' point to the same object (the XOR form would zero
 * the value in that case). */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;

    *a = *b;
    *b = tmp;
}
1051
7fbef77a 1052/* Retrieves current device stats for 'netdev'. */
8b61709d 1053static int
149f577a
JG
1054netdev_linux_get_stats(const struct netdev *netdev_,
1055 struct netdev_stats *stats)
8b61709d 1056{
149f577a
JG
1057 struct netdev_dev_linux *netdev_dev =
1058 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1059 static int use_netlink_stats = -1;
1060 int error;
1061
7fbef77a
JG
1062 if (netdev_dev->have_vport_stats ||
1063 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1064
1065 error = netdev_vport_get_stats(netdev_, stats);
1066 netdev_dev->have_vport_stats = !error;
1067 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
8b61709d 1068 }
8b61709d 1069
7fbef77a
JG
1070 if (!netdev_dev->have_vport_stats) {
1071 if (use_netlink_stats < 0) {
1072 use_netlink_stats = check_for_working_netlink_stats();
1073 }
1074 if (use_netlink_stats) {
1075 int ifindex;
1076
1077 error = get_ifindex(netdev_, &ifindex);
1078 if (!error) {
1079 error = get_stats_via_netlink(ifindex, stats);
1080 }
1081 } else {
1082 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
8b61709d 1083 }
8b61709d 1084 }
fe6b0e03
JG
1085
1086 /* If this port is an internal port then the transmit and receive stats
1087 * will appear to be swapped relative to the other ports since we are the
1088 * one sending the data, not a remote computer. For consistency, we swap
7fbef77a
JG
1089 * them back here. This does not apply if we are getting stats from the
1090 * vport layer because it always tracks stats from the perspective of the
1091 * switch. */
92df599c 1092 netdev_linux_update_is_pseudo(netdev_dev);
7fbef77a
JG
1093 if (!error && !netdev_dev->have_vport_stats &&
1094 (netdev_dev->is_internal || netdev_dev->is_tap)) {
92df599c
JG
1095 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1096 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1097 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1098 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1099 stats->rx_length_errors = 0;
1100 stats->rx_over_errors = 0;
1101 stats->rx_crc_errors = 0;
1102 stats->rx_frame_errors = 0;
1103 stats->rx_fifo_errors = 0;
1104 stats->rx_missed_errors = 0;
1105 stats->tx_aborted_errors = 0;
1106 stats->tx_carrier_errors = 0;
1107 stats->tx_fifo_errors = 0;
1108 stats->tx_heartbeat_errors = 0;
1109 stats->tx_window_errors = 0;
1110 }
1111
8b61709d
BP
1112 return error;
1113}
1114
1115/* Stores the features supported by 'netdev' into each of '*current',
1116 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1117 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
7671589a 1118 * successful, otherwise a positive errno value. */
8b61709d
BP
1119static int
1120netdev_linux_get_features(struct netdev *netdev,
1121 uint32_t *current, uint32_t *advertised,
1122 uint32_t *supported, uint32_t *peer)
1123{
1124 struct ethtool_cmd ecmd;
1125 int error;
1126
1127 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1128 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1129 ETHTOOL_GSET, "ETHTOOL_GSET");
1130 if (error) {
1131 return error;
1132 }
1133
1134 /* Supported features. */
1135 *supported = 0;
1136 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1137 *supported |= OFPPF_10MB_HD;
1138 }
1139 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1140 *supported |= OFPPF_10MB_FD;
1141 }
1142 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1143 *supported |= OFPPF_100MB_HD;
1144 }
1145 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1146 *supported |= OFPPF_100MB_FD;
1147 }
1148 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1149 *supported |= OFPPF_1GB_HD;
1150 }
1151 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1152 *supported |= OFPPF_1GB_FD;
1153 }
1154 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1155 *supported |= OFPPF_10GB_FD;
1156 }
1157 if (ecmd.supported & SUPPORTED_TP) {
1158 *supported |= OFPPF_COPPER;
1159 }
1160 if (ecmd.supported & SUPPORTED_FIBRE) {
1161 *supported |= OFPPF_FIBER;
1162 }
1163 if (ecmd.supported & SUPPORTED_Autoneg) {
1164 *supported |= OFPPF_AUTONEG;
1165 }
1166 if (ecmd.supported & SUPPORTED_Pause) {
1167 *supported |= OFPPF_PAUSE;
1168 }
1169 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1170 *supported |= OFPPF_PAUSE_ASYM;
1171 }
1172
1173 /* Advertised features. */
1174 *advertised = 0;
1175 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1176 *advertised |= OFPPF_10MB_HD;
1177 }
1178 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1179 *advertised |= OFPPF_10MB_FD;
1180 }
1181 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1182 *advertised |= OFPPF_100MB_HD;
1183 }
1184 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1185 *advertised |= OFPPF_100MB_FD;
1186 }
1187 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1188 *advertised |= OFPPF_1GB_HD;
1189 }
1190 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1191 *advertised |= OFPPF_1GB_FD;
1192 }
1193 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1194 *advertised |= OFPPF_10GB_FD;
1195 }
1196 if (ecmd.advertising & ADVERTISED_TP) {
1197 *advertised |= OFPPF_COPPER;
1198 }
1199 if (ecmd.advertising & ADVERTISED_FIBRE) {
1200 *advertised |= OFPPF_FIBER;
1201 }
1202 if (ecmd.advertising & ADVERTISED_Autoneg) {
1203 *advertised |= OFPPF_AUTONEG;
1204 }
1205 if (ecmd.advertising & ADVERTISED_Pause) {
1206 *advertised |= OFPPF_PAUSE;
1207 }
1208 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1209 *advertised |= OFPPF_PAUSE_ASYM;
1210 }
1211
1212 /* Current settings. */
1213 if (ecmd.speed == SPEED_10) {
1214 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1215 } else if (ecmd.speed == SPEED_100) {
1216 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1217 } else if (ecmd.speed == SPEED_1000) {
1218 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1219 } else if (ecmd.speed == SPEED_10000) {
1220 *current = OFPPF_10GB_FD;
1221 } else {
1222 *current = 0;
1223 }
1224
1225 if (ecmd.port == PORT_TP) {
1226 *current |= OFPPF_COPPER;
1227 } else if (ecmd.port == PORT_FIBRE) {
1228 *current |= OFPPF_FIBER;
1229 }
1230
1231 if (ecmd.autoneg) {
1232 *current |= OFPPF_AUTONEG;
1233 }
1234
1235 /* Peer advertisements. */
1236 *peer = 0; /* XXX */
1237
1238 return 0;
1239}
1240
1241/* Set the features advertised by 'netdev' to 'advertise'. */
1242static int
1243netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1244{
1245 struct ethtool_cmd ecmd;
1246 int error;
1247
1248 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1249 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1250 ETHTOOL_GSET, "ETHTOOL_GSET");
1251 if (error) {
1252 return error;
1253 }
1254
1255 ecmd.advertising = 0;
1256 if (advertise & OFPPF_10MB_HD) {
1257 ecmd.advertising |= ADVERTISED_10baseT_Half;
1258 }
1259 if (advertise & OFPPF_10MB_FD) {
1260 ecmd.advertising |= ADVERTISED_10baseT_Full;
1261 }
1262 if (advertise & OFPPF_100MB_HD) {
1263 ecmd.advertising |= ADVERTISED_100baseT_Half;
1264 }
1265 if (advertise & OFPPF_100MB_FD) {
1266 ecmd.advertising |= ADVERTISED_100baseT_Full;
1267 }
1268 if (advertise & OFPPF_1GB_HD) {
1269 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1270 }
1271 if (advertise & OFPPF_1GB_FD) {
1272 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1273 }
1274 if (advertise & OFPPF_10GB_FD) {
1275 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1276 }
1277 if (advertise & OFPPF_COPPER) {
1278 ecmd.advertising |= ADVERTISED_TP;
1279 }
1280 if (advertise & OFPPF_FIBER) {
1281 ecmd.advertising |= ADVERTISED_FIBRE;
1282 }
1283 if (advertise & OFPPF_AUTONEG) {
1284 ecmd.advertising |= ADVERTISED_Autoneg;
1285 }
1286 if (advertise & OFPPF_PAUSE) {
1287 ecmd.advertising |= ADVERTISED_Pause;
1288 }
1289 if (advertise & OFPPF_PAUSE_ASYM) {
1290 ecmd.advertising |= ADVERTISED_Asym_Pause;
1291 }
0b0544d7 1292 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1293 ETHTOOL_SSET, "ETHTOOL_SSET");
1294}
1295
1296/* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1297 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1298 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1299 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1300 * sets '*vlan_vid' to -1. */
1301static int
1302netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1303{
1304 const char *netdev_name = netdev_get_name(netdev);
1305 struct ds line = DS_EMPTY_INITIALIZER;
1306 FILE *stream = NULL;
1307 int error;
1308 char *fn;
1309
1310 COVERAGE_INC(netdev_get_vlan_vid);
1311 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1312 stream = fopen(fn, "r");
1313 if (!stream) {
1314 error = errno;
1315 goto done;
1316 }
1317
1318 if (ds_get_line(&line, stream)) {
1319 if (ferror(stream)) {
1320 error = errno;
1321 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1322 } else {
1323 error = EPROTO;
1324 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1325 }
1326 goto done;
1327 }
1328
1329 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1330 error = EPROTO;
1331 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1332 fn, ds_cstr(&line));
1333 goto done;
1334 }
1335
1336 error = 0;
1337
1338done:
1339 free(fn);
1340 if (stream) {
1341 fclose(stream);
1342 }
1343 ds_destroy(&line);
1344 if (error) {
1345 *vlan_vid = -1;
1346 }
1347 return error;
1348}
1349
1350#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1351#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
8b61709d 1352
8e460221 1353/* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
6f42c8ea
BP
1354 * positive errno value.
1355 *
1356 * This function is equivalent to running
1357 * /sbin/tc qdisc del dev %s handle ffff: ingress
1358 * but it is much, much faster.
1359 */
8e460221
BP
1360static int
1361netdev_linux_remove_policing(struct netdev *netdev)
1362{
80a86fbe
BP
1363 struct netdev_dev_linux *netdev_dev =
1364 netdev_dev_linux_cast(netdev_get_dev(netdev));
8e460221 1365 const char *netdev_name = netdev_get_name(netdev);
8e460221 1366
6f42c8ea 1367 struct ofpbuf request;
6f42c8ea 1368 struct tcmsg *tcmsg;
6f42c8ea
BP
1369 int error;
1370
c1c9c9c4 1371 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
1372 if (!tcmsg) {
1373 return ENODEV;
1374 }
c1c9c9c4 1375 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
6f42c8ea
BP
1376 tcmsg->tcm_parent = TC_H_INGRESS;
1377 nl_msg_put_string(&request, TCA_KIND, "ingress");
1378 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
c1c9c9c4
BP
1379
1380 error = tc_transact(&request, NULL);
4d10512c 1381 if (error && error != ENOENT && error != EINVAL) {
6f42c8ea
BP
1382 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1383 netdev_name, strerror(error));
1384 return error;
1385 }
1386
80a86fbe
BP
1387 netdev_dev->kbits_rate = 0;
1388 netdev_dev->kbits_burst = 0;
1389 netdev_dev->cache_valid |= VALID_POLICING;
8e460221
BP
1390 return 0;
1391}
1392
8b61709d
BP
1393/* Attempts to set input rate limiting (policing) policy. */
1394static int
1395netdev_linux_set_policing(struct netdev *netdev,
1396 uint32_t kbits_rate, uint32_t kbits_burst)
1397{
80a86fbe
BP
1398 struct netdev_dev_linux *netdev_dev =
1399 netdev_dev_linux_cast(netdev_get_dev(netdev));
8b61709d
BP
1400 const char *netdev_name = netdev_get_name(netdev);
1401 char command[1024];
1402
1403 COVERAGE_INC(netdev_set_policing);
8e460221 1404
80a86fbe
BP
1405 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1406 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1407 : kbits_burst); /* Stick with user-specified value. */
1408
1409 if (netdev_dev->cache_valid & VALID_POLICING
1410 && netdev_dev->kbits_rate == kbits_rate
1411 && netdev_dev->kbits_burst == kbits_burst) {
1412 /* Assume that settings haven't changed since we last set them. */
1413 return 0;
1414 }
1415
8e460221 1416 netdev_linux_remove_policing(netdev);
8b61709d 1417 if (kbits_rate) {
8b61709d
BP
1418 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1419 if (system(command) != 0) {
1420 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1421 return -1;
1422 }
1423
1424 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1425 kbits_rate, kbits_burst);
1426 if (system(command) != 0) {
1427 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1428 netdev_name);
1429 return -1;
1430 }
80a86fbe
BP
1431
1432 netdev_dev->kbits_rate = kbits_rate;
1433 netdev_dev->kbits_burst = kbits_burst;
1434 netdev_dev->cache_valid |= VALID_POLICING;
8b61709d
BP
1435 }
1436
1437 return 0;
1438}
1439
c1c9c9c4
BP
1440static int
1441netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1442 struct svec *types)
1443{
1444 const struct tc_ops **opsp;
1445
1446 for (opsp = tcs; *opsp != NULL; opsp++) {
1447 const struct tc_ops *ops = *opsp;
1448 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1449 svec_add(types, ops->ovs_name);
1450 }
1451 }
1452 return 0;
1453}
1454
1455static const struct tc_ops *
1456tc_lookup_ovs_name(const char *name)
1457{
1458 const struct tc_ops **opsp;
1459
1460 for (opsp = tcs; *opsp != NULL; opsp++) {
1461 const struct tc_ops *ops = *opsp;
1462 if (!strcmp(name, ops->ovs_name)) {
1463 return ops;
1464 }
1465 }
1466 return NULL;
1467}
1468
1469static const struct tc_ops *
1470tc_lookup_linux_name(const char *name)
1471{
1472 const struct tc_ops **opsp;
1473
1474 for (opsp = tcs; *opsp != NULL; opsp++) {
1475 const struct tc_ops *ops = *opsp;
1476 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1477 return ops;
1478 }
1479 }
1480 return NULL;
1481}
1482
93b13be8
BP
1483static struct tc_queue *
1484tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1485 size_t hash)
1486{
1487 struct netdev_dev_linux *netdev_dev =
1488 netdev_dev_linux_cast(netdev_get_dev(netdev));
1489 struct tc_queue *queue;
1490
4e8e4213 1491 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
93b13be8
BP
1492 if (queue->queue_id == queue_id) {
1493 return queue;
1494 }
1495 }
1496 return NULL;
1497}
1498
/* Returns the queue on 'netdev' whose ID is 'queue_id', or a null pointer if
 * there is none.  The bucket is chosen with hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1504
c1c9c9c4
BP
1505static int
1506netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1507 const char *type,
1508 struct netdev_qos_capabilities *caps)
1509{
1510 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1511 if (!ops) {
1512 return EOPNOTSUPP;
1513 }
1514 caps->n_queues = ops->n_queues;
1515 return 0;
1516}
1517
1518static int
1519netdev_linux_get_qos(const struct netdev *netdev,
1520 const char **typep, struct shash *details)
1521{
1522 struct netdev_dev_linux *netdev_dev =
1523 netdev_dev_linux_cast(netdev_get_dev(netdev));
1524 int error;
1525
1526 error = tc_query_qdisc(netdev);
1527 if (error) {
1528 return error;
1529 }
1530
1531 *typep = netdev_dev->tc->ops->ovs_name;
1532 return (netdev_dev->tc->ops->qdisc_get
1533 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1534 : 0);
1535}
1536
1537static int
1538netdev_linux_set_qos(struct netdev *netdev,
1539 const char *type, const struct shash *details)
1540{
1541 struct netdev_dev_linux *netdev_dev =
1542 netdev_dev_linux_cast(netdev_get_dev(netdev));
1543 const struct tc_ops *new_ops;
1544 int error;
1545
1546 new_ops = tc_lookup_ovs_name(type);
1547 if (!new_ops || !new_ops->tc_install) {
1548 return EOPNOTSUPP;
1549 }
1550
1551 error = tc_query_qdisc(netdev);
1552 if (error) {
1553 return error;
1554 }
1555
1556 if (new_ops == netdev_dev->tc->ops) {
1557 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1558 } else {
1559 /* Delete existing qdisc. */
1560 error = tc_del_qdisc(netdev);
1561 if (error) {
1562 return error;
1563 }
1564 assert(netdev_dev->tc == NULL);
1565
1566 /* Install new qdisc. */
1567 error = new_ops->tc_install(netdev, details);
1568 assert((error == 0) == (netdev_dev->tc != NULL));
1569
1570 return error;
1571 }
1572}
1573
1574static int
1575netdev_linux_get_queue(const struct netdev *netdev,
1576 unsigned int queue_id, struct shash *details)
1577{
1578 struct netdev_dev_linux *netdev_dev =
1579 netdev_dev_linux_cast(netdev_get_dev(netdev));
1580 int error;
1581
1582 error = tc_query_qdisc(netdev);
1583 if (error) {
1584 return error;
93b13be8
BP
1585 } else {
1586 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1587 return (queue
1588 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1589 : ENOENT);
c1c9c9c4 1590 }
c1c9c9c4
BP
1591}
1592
1593static int
1594netdev_linux_set_queue(struct netdev *netdev,
1595 unsigned int queue_id, const struct shash *details)
1596{
1597 struct netdev_dev_linux *netdev_dev =
1598 netdev_dev_linux_cast(netdev_get_dev(netdev));
1599 int error;
1600
1601 error = tc_query_qdisc(netdev);
1602 if (error) {
1603 return error;
1604 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1605 || !netdev_dev->tc->ops->class_set) {
1606 return EINVAL;
1607 }
1608
1609 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1610}
1611
1612static int
1613netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1614{
1615 struct netdev_dev_linux *netdev_dev =
1616 netdev_dev_linux_cast(netdev_get_dev(netdev));
1617 int error;
1618
1619 error = tc_query_qdisc(netdev);
1620 if (error) {
1621 return error;
1622 } else if (!netdev_dev->tc->ops->class_delete) {
1623 return EINVAL;
93b13be8
BP
1624 } else {
1625 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1626 return (queue
1627 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1628 : ENOENT);
c1c9c9c4 1629 }
c1c9c9c4
BP
1630}
1631
1632static int
1633netdev_linux_get_queue_stats(const struct netdev *netdev,
1634 unsigned int queue_id,
1635 struct netdev_queue_stats *stats)
1636{
1637 struct netdev_dev_linux *netdev_dev =
1638 netdev_dev_linux_cast(netdev_get_dev(netdev));
1639 int error;
1640
1641 error = tc_query_qdisc(netdev);
1642 if (error) {
1643 return error;
c1c9c9c4
BP
1644 } else if (!netdev_dev->tc->ops->class_get_stats) {
1645 return EOPNOTSUPP;
93b13be8
BP
1646 } else {
1647 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1648 return (queue
1649 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1650 : ENOENT);
c1c9c9c4 1651 }
c1c9c9c4
BP
1652}
1653
23a98ffe 1654static bool
c1c9c9c4
BP
1655start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1656{
1657 struct ofpbuf request;
1658 struct tcmsg *tcmsg;
1659
1660 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
1661 if (!tcmsg) {
1662 return false;
1663 }
3c4de644 1664 tcmsg->tcm_parent = 0;
c1c9c9c4
BP
1665 nl_dump_start(dump, rtnl_sock, &request);
1666 ofpbuf_uninit(&request);
23a98ffe 1667 return true;
c1c9c9c4
BP
1668}
1669
1670static int
1671netdev_linux_dump_queues(const struct netdev *netdev,
1672 netdev_dump_queues_cb *cb, void *aux)
1673{
1674 struct netdev_dev_linux *netdev_dev =
1675 netdev_dev_linux_cast(netdev_get_dev(netdev));
93b13be8 1676 struct tc_queue *queue;
c1c9c9c4
BP
1677 struct shash details;
1678 int last_error;
c1c9c9c4
BP
1679 int error;
1680
1681 error = tc_query_qdisc(netdev);
1682 if (error) {
1683 return error;
1684 } else if (!netdev_dev->tc->ops->class_get) {
1685 return EOPNOTSUPP;
1686 }
1687
1688 last_error = 0;
1689 shash_init(&details);
4e8e4213 1690 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
c1c9c9c4
BP
1691 shash_clear(&details);
1692
93b13be8 1693 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
c1c9c9c4 1694 if (!error) {
93b13be8 1695 (*cb)(queue->queue_id, &details, aux);
c1c9c9c4
BP
1696 } else {
1697 last_error = error;
1698 }
1699 }
1700 shash_destroy(&details);
1701
1702 return last_error;
1703}
1704
1705static int
1706netdev_linux_dump_queue_stats(const struct netdev *netdev,
1707 netdev_dump_queue_stats_cb *cb, void *aux)
1708{
1709 struct netdev_dev_linux *netdev_dev =
1710 netdev_dev_linux_cast(netdev_get_dev(netdev));
1711 struct nl_dump dump;
1712 struct ofpbuf msg;
1713 int last_error;
1714 int error;
1715
1716 error = tc_query_qdisc(netdev);
1717 if (error) {
1718 return error;
1719 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1720 return EOPNOTSUPP;
1721 }
1722
1723 last_error = 0;
23a98ffe
BP
1724 if (!start_queue_dump(netdev, &dump)) {
1725 return ENODEV;
1726 }
c1c9c9c4
BP
1727 while (nl_dump_next(&dump, &msg)) {
1728 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1729 if (error) {
1730 last_error = error;
1731 }
1732 }
1733
1734 error = nl_dump_done(&dump);
1735 return error ? error : last_error;
1736}
1737
8b61709d 1738static int
f1acd62b
BP
1739netdev_linux_get_in4(const struct netdev *netdev_,
1740 struct in_addr *address, struct in_addr *netmask)
8b61709d 1741{
149f577a
JG
1742 struct netdev_dev_linux *netdev_dev =
1743 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1744
1745 if (!(netdev_dev->cache_valid & VALID_IN4)) {
8b61709d
BP
1746 int error;
1747
149f577a 1748 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
8b61709d
BP
1749 SIOCGIFADDR, "SIOCGIFADDR");
1750 if (error) {
1751 return error;
1752 }
1753
149f577a 1754 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
f1acd62b
BP
1755 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1756 if (error) {
1757 return error;
1758 }
1759
149f577a 1760 netdev_dev->cache_valid |= VALID_IN4;
8b61709d 1761 }
149f577a
JG
1762 *address = netdev_dev->address;
1763 *netmask = netdev_dev->netmask;
f1acd62b 1764 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
8b61709d
BP
1765}
1766
8b61709d 1767static int
f1acd62b
BP
1768netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1769 struct in_addr netmask)
8b61709d 1770{
149f577a
JG
1771 struct netdev_dev_linux *netdev_dev =
1772 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1773 int error;
1774
f1acd62b 1775 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 1776 if (!error) {
149f577a
JG
1777 netdev_dev->cache_valid |= VALID_IN4;
1778 netdev_dev->address = address;
1779 netdev_dev->netmask = netmask;
f1acd62b 1780 if (address.s_addr != INADDR_ANY) {
8b61709d 1781 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 1782 "SIOCSIFNETMASK", netmask);
8b61709d
BP
1783 }
1784 }
1785 return error;
1786}
1787
/* Parses 'line', which should have the layout of a /proc/net/if_inet6 entry:
 * 32 hex digits of IPv6 address, four hex fields that are skipped, and an
 * interface name of at most 16 characters.
 *
 * On success stores the address in '*in6' and the name in 'ifname' and
 * returns true.  Returns false if 'line' does not match; '*in6' and 'ifname'
 * may then have been partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    int n_fields;

#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &s6[0], &s6[1], &s6[2], &s6[3],
                      &s6[4], &s6[5], &s6[6], &s6[7],
                      &s6[8], &s6[9], &s6[10], &s6[11],
                      &s6[12], &s6[13], &s6[14], &s6[15],
                      ifname);

    /* 16 address bytes plus the interface name. */
    return n_fields == 17;
}
1803
1804/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1805 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1806static int
1807netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1808{
149f577a
JG
1809 struct netdev_dev_linux *netdev_dev =
1810 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1811 if (!(netdev_dev->cache_valid & VALID_IN6)) {
8b61709d
BP
1812 FILE *file;
1813 char line[128];
1814
149f577a 1815 netdev_dev->in6 = in6addr_any;
8b61709d
BP
1816
1817 file = fopen("/proc/net/if_inet6", "r");
1818 if (file != NULL) {
1819 const char *name = netdev_get_name(netdev_);
1820 while (fgets(line, sizeof line, file)) {
2a022368 1821 struct in6_addr in6_tmp;
8b61709d 1822 char ifname[16 + 1];
2a022368 1823 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
1824 && !strcmp(name, ifname))
1825 {
2a022368 1826 netdev_dev->in6 = in6_tmp;
8b61709d
BP
1827 break;
1828 }
1829 }
1830 fclose(file);
1831 }
149f577a 1832 netdev_dev->cache_valid |= VALID_IN6;
8b61709d 1833 }
149f577a 1834 *in6 = netdev_dev->in6;
8b61709d
BP
1835 return 0;
1836}
1837
/* Fills '*sa' with an AF_INET socket address that has IPv4 address 'addr' and
 * port 0.  Every byte of '*sa' is overwritten. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Start from all-zeros so no stale bytes survive in either struct. */
    memset(sa, 0, sizeof *sa);
    memset(&in4, 0, sizeof in4);

    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memcpy(sa, &in4, sizeof in4);
}
1850
1851static int
1852do_set_addr(struct netdev *netdev,
1853 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1854{
1855 struct ifreq ifr;
149f577a 1856 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
8b61709d 1857 make_in4_sockaddr(&ifr.ifr_addr, addr);
149f577a
JG
1858
1859 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1860 ioctl_name);
8b61709d
BP
1861}
1862
1863/* Adds 'router' as a default IP gateway. */
1864static int
67a4917b 1865netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
1866{
1867 struct in_addr any = { INADDR_ANY };
1868 struct rtentry rt;
1869 int error;
1870
1871 memset(&rt, 0, sizeof rt);
1872 make_in4_sockaddr(&rt.rt_dst, any);
1873 make_in4_sockaddr(&rt.rt_gateway, router);
1874 make_in4_sockaddr(&rt.rt_genmask, any);
1875 rt.rt_flags = RTF_UP | RTF_GATEWAY;
8b61709d
BP
1876 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1877 if (error) {
1878 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1879 }
1880 return error;
1881}
1882
f1acd62b
BP
1883static int
1884netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1885 char **netdev_name)
1886{
1887 static const char fn[] = "/proc/net/route";
1888 FILE *stream;
1889 char line[256];
1890 int ln;
1891
1892 *netdev_name = NULL;
1893 stream = fopen(fn, "r");
1894 if (stream == NULL) {
1895 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1896 return errno;
1897 }
1898
1899 ln = 0;
1900 while (fgets(line, sizeof line, stream)) {
1901 if (++ln >= 2) {
1902 char iface[17];
1903 uint32_t dest, gateway, mask;
1904 int refcnt, metric, mtu;
1905 unsigned int flags, use, window, irtt;
1906
1907 if (sscanf(line,
1908 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1909 " %d %u %u\n",
1910 iface, &dest, &gateway, &flags, &refcnt,
1911 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1912
d295e8e9 1913 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
1914 fn, ln, line);
1915 continue;
1916 }
1917 if (!(flags & RTF_UP)) {
1918 /* Skip routes that aren't up. */
1919 continue;
1920 }
1921
1922 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 1923 * network byte order, so we don't need need any endian
f1acd62b
BP
1924 * conversions here. */
1925 if ((dest & mask) == (host->s_addr & mask)) {
1926 if (!gateway) {
1927 /* The host is directly reachable. */
1928 next_hop->s_addr = 0;
1929 } else {
1930 /* To reach the host, we must go through a gateway. */
1931 next_hop->s_addr = gateway;
1932 }
1933 *netdev_name = xstrdup(iface);
1934 fclose(stream);
1935 return 0;
1936 }
1937 }
1938 }
1939
1940 fclose(stream);
1941 return ENXIO;
1942}
1943
8b61709d
BP
1944/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
1945 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
1946 * returns 0. Otherwise, it returns a positive errno value; in particular,
1947 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
1948static int
1949netdev_linux_arp_lookup(const struct netdev *netdev,
1950 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
1951{
1952 struct arpreq r;
c100e025 1953 struct sockaddr_in sin;
8b61709d
BP
1954 int retval;
1955
1956 memset(&r, 0, sizeof r);
c100e025
BP
1957 sin.sin_family = AF_INET;
1958 sin.sin_addr.s_addr = ip;
1959 sin.sin_port = 0;
1960 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
1961 r.arp_ha.sa_family = ARPHRD_ETHER;
1962 r.arp_flags = 0;
149f577a 1963 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d
BP
1964 COVERAGE_INC(netdev_arp_lookup);
1965 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
1966 if (!retval) {
1967 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
1968 } else if (retval != ENXIO) {
1969 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
149f577a 1970 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
8b61709d
BP
1971 }
1972 return retval;
1973}
1974
1975static int
1976nd_to_iff_flags(enum netdev_flags nd)
1977{
1978 int iff = 0;
1979 if (nd & NETDEV_UP) {
1980 iff |= IFF_UP;
1981 }
1982 if (nd & NETDEV_PROMISC) {
1983 iff |= IFF_PROMISC;
1984 }
1985 return iff;
1986}
1987
1988static int
1989iff_to_nd_flags(int iff)
1990{
1991 enum netdev_flags nd = 0;
1992 if (iff & IFF_UP) {
1993 nd |= NETDEV_UP;
1994 }
1995 if (iff & IFF_PROMISC) {
1996 nd |= NETDEV_PROMISC;
1997 }
1998 return nd;
1999}
2000
2001static int
2002netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2003 enum netdev_flags on, enum netdev_flags *old_flagsp)
2004{
2005 int old_flags, new_flags;
2006 int error;
2007
2008 error = get_flags(netdev, &old_flags);
2009 if (!error) {
2010 *old_flagsp = iff_to_nd_flags(old_flags);
2011 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2012 if (new_flags != old_flags) {
2013 error = set_flags(netdev, new_flags);
2014 }
2015 }
2016 return error;
2017}
2018
2019static void
2020poll_notify(struct list *list)
2021{
2022 struct netdev_linux_notifier *notifier;
4e8e4213 2023 LIST_FOR_EACH (notifier, node, list) {
8b61709d
BP
2024 struct netdev_notifier *n = &notifier->notifier;
2025 n->cb(n);
2026 }
2027}
2028
2029static void
46097491 2030netdev_linux_poll_cb(const struct rtnetlink_change *change,
67a4917b 2031 void *aux OVS_UNUSED)
8b61709d
BP
2032{
2033 if (change) {
2034 struct list *list = shash_find_data(&netdev_linux_notifiers,
2035 change->ifname);
2036 if (list) {
2037 poll_notify(list);
2038 }
2039 } else {
2040 struct shash_node *node;
2041 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2042 poll_notify(node->data);
2043 }
2044 }
2045}
2046
2047static int
2048netdev_linux_poll_add(struct netdev *netdev,
2049 void (*cb)(struct netdev_notifier *), void *aux,
2050 struct netdev_notifier **notifierp)
2051{
2052 const char *netdev_name = netdev_get_name(netdev);
2053 struct netdev_linux_notifier *notifier;
2054 struct list *list;
2055
2056 if (shash_is_empty(&netdev_linux_notifiers)) {
46097491 2057 int error = rtnetlink_notifier_register(&netdev_linux_poll_notifier,
8b61709d
BP
2058 netdev_linux_poll_cb, NULL);
2059 if (error) {
2060 return error;
2061 }
2062 }
2063
2064 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2065 if (!list) {
2066 list = xmalloc(sizeof *list);
2067 list_init(list);
2068 shash_add(&netdev_linux_notifiers, netdev_name, list);
2069 }
2070
2071 notifier = xmalloc(sizeof *notifier);
2072 netdev_notifier_init(&notifier->notifier, netdev, cb, aux);
2073 list_push_back(list, &notifier->node);
2074 *notifierp = &notifier->notifier;
2075 return 0;
2076}
2077
2078static void
2079netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2080{
2081 struct netdev_linux_notifier *notifier =
2082 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2083 struct list *list;
2084
2085 /* Remove 'notifier' from its list. */
2086 list = list_remove(&notifier->node);
2087 if (list_is_empty(list)) {
2088 /* The list is now empty. Remove it from the hash and free it. */
2089 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2090 shash_delete(&netdev_linux_notifiers,
2091 shash_find(&netdev_linux_notifiers, netdev_name));
2092 free(list);
2093 }
2094 free(notifier);
2095
2096 /* If that was the last notifier, unregister. */
2097 if (shash_is_empty(&netdev_linux_notifiers)) {
46097491 2098 rtnetlink_notifier_unregister(&netdev_linux_poll_notifier);
8b61709d
BP
2099 }
2100}
2101
2102const struct netdev_class netdev_linux_class = {
149f577a 2103 "system",
8b61709d
BP
2104
2105 netdev_linux_init,
2106 netdev_linux_run,
2107 netdev_linux_wait,
2108
a740f0de 2109 netdev_linux_create_system,
6c88d577
JP
2110 netdev_linux_destroy,
2111 NULL, /* reconfigure */
2112
8b61709d
BP
2113 netdev_linux_open,
2114 netdev_linux_close,
2115
2116 netdev_linux_enumerate,
2117
2118 netdev_linux_recv,
2119 netdev_linux_recv_wait,
2120 netdev_linux_drain,
2121
2122 netdev_linux_send,
2123 netdev_linux_send_wait,
2124
2125 netdev_linux_set_etheraddr,
2126 netdev_linux_get_etheraddr,
2127 netdev_linux_get_mtu,
9ab3d9a3 2128 netdev_linux_get_ifindex,
8b61709d
BP
2129 netdev_linux_get_carrier,
2130 netdev_linux_get_stats,
f4b6076a 2131 netdev_vport_set_stats,
8b61709d
BP
2132
2133 netdev_linux_get_features,
2134 netdev_linux_set_advertisements,
2135 netdev_linux_get_vlan_vid,
c1c9c9c4 2136
8b61709d 2137 netdev_linux_set_policing,
c1c9c9c4
BP
2138 netdev_linux_get_qos_types,
2139 netdev_linux_get_qos_capabilities,
2140 netdev_linux_get_qos,
2141 netdev_linux_set_qos,
2142 netdev_linux_get_queue,
2143 netdev_linux_set_queue,
2144 netdev_linux_delete_queue,
2145 netdev_linux_get_queue_stats,
2146 netdev_linux_dump_queues,
2147 netdev_linux_dump_queue_stats,
8b61709d
BP
2148
2149 netdev_linux_get_in4,
2150 netdev_linux_set_in4,
2151 netdev_linux_get_in6,
2152 netdev_linux_add_router,
f1acd62b 2153 netdev_linux_get_next_hop,
8b61709d
BP
2154 netdev_linux_arp_lookup,
2155
2156 netdev_linux_update_flags,
2157
2158 netdev_linux_poll_add,
2159 netdev_linux_poll_remove,
2160};
2161
2162const struct netdev_class netdev_tap_class = {
149f577a 2163 "tap",
8b61709d
BP
2164
2165 netdev_linux_init,
149f577a
JG
2166 netdev_linux_run,
2167 netdev_linux_wait,
8b61709d 2168
a740f0de 2169 netdev_linux_create_tap,
6c88d577
JP
2170 netdev_linux_destroy,
2171 NULL, /* reconfigure */
2172
8b61709d
BP
2173 netdev_linux_open,
2174 netdev_linux_close,
2175
149f577a 2176 NULL, /* enumerate */
8b61709d
BP
2177
2178 netdev_linux_recv,
2179 netdev_linux_recv_wait,
2180 netdev_linux_drain,
2181
2182 netdev_linux_send,
2183 netdev_linux_send_wait,
2184
2185 netdev_linux_set_etheraddr,
2186 netdev_linux_get_etheraddr,
2187 netdev_linux_get_mtu,
9ab3d9a3 2188 netdev_linux_get_ifindex,
8b61709d
BP
2189 netdev_linux_get_carrier,
2190 netdev_linux_get_stats,
8722022c 2191 NULL, /* set_stats */
8b61709d
BP
2192
2193 netdev_linux_get_features,
2194 netdev_linux_set_advertisements,
a740f0de 2195 netdev_linux_get_vlan_vid,
c1c9c9c4 2196
a740f0de 2197 netdev_linux_set_policing,
c1c9c9c4
BP
2198 netdev_linux_get_qos_types,
2199 netdev_linux_get_qos_capabilities,
2200 netdev_linux_get_qos,
2201 netdev_linux_set_qos,
2202 netdev_linux_get_queue,
2203 netdev_linux_set_queue,
2204 netdev_linux_delete_queue,
2205 netdev_linux_get_queue_stats,
2206 netdev_linux_dump_queues,
2207 netdev_linux_dump_queue_stats,
a740f0de
JG
2208
2209 netdev_linux_get_in4,
2210 netdev_linux_set_in4,
2211 netdev_linux_get_in6,
2212 netdev_linux_add_router,
2213 netdev_linux_get_next_hop,
2214 netdev_linux_arp_lookup,
2215
2216 netdev_linux_update_flags,
2217
2218 netdev_linux_poll_add,
2219 netdev_linux_poll_remove,
2220};
8b61709d 2221\f
c1c9c9c4 2222/* HTB traffic control class. */
559843ed 2223
c1c9c9c4 2224#define HTB_N_QUEUES 0xf000
8b61709d 2225
c1c9c9c4
BP
2226struct htb {
2227 struct tc tc;
2228 unsigned int max_rate; /* In bytes/s. */
2229};
8b61709d 2230
c1c9c9c4 2231struct htb_class {
93b13be8 2232 struct tc_queue tc_queue;
c1c9c9c4
BP
2233 unsigned int min_rate; /* In bytes/s. */
2234 unsigned int max_rate; /* In bytes/s. */
2235 unsigned int burst; /* In bytes. */
2236 unsigned int priority; /* Lower values are higher priorities. */
2237};
8b61709d 2238
c1c9c9c4
BP
2239static struct htb *
2240htb_get__(const struct netdev *netdev)
2241{
2242 struct netdev_dev_linux *netdev_dev =
2243 netdev_dev_linux_cast(netdev_get_dev(netdev));
2244 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2245}
2246
2247static struct htb *
2248htb_install__(struct netdev *netdev, uint64_t max_rate)
2249{
2250 struct netdev_dev_linux *netdev_dev =
2251 netdev_dev_linux_cast(netdev_get_dev(netdev));
2252 struct htb *htb;
2253
2254 htb = xmalloc(sizeof *htb);
2255 tc_init(&htb->tc, &tc_ops_htb);
2256 htb->max_rate = max_rate;
2257
2258 netdev_dev->tc = &htb->tc;
2259
2260 return htb;
2261}
2262
2263/* Create an HTB qdisc.
2264 *
a339aa81 2265 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
2266static int
2267htb_setup_qdisc__(struct netdev *netdev)
2268{
2269 size_t opt_offset;
2270 struct tc_htb_glob opt;
2271 struct ofpbuf request;
2272 struct tcmsg *tcmsg;
2273
2274 tc_del_qdisc(netdev);
2275
2276 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2277 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
2278 if (!tcmsg) {
2279 return ENODEV;
2280 }
c1c9c9c4
BP
2281 tcmsg->tcm_handle = tc_make_handle(1, 0);
2282 tcmsg->tcm_parent = TC_H_ROOT;
2283
2284 nl_msg_put_string(&request, TCA_KIND, "htb");
2285
2286 memset(&opt, 0, sizeof opt);
2287 opt.rate2quantum = 10;
2288 opt.version = 3;
4ecf12d5 2289 opt.defcls = 1;
c1c9c9c4
BP
2290
2291 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2292 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2293 nl_msg_end_nested(&request, opt_offset);
2294
2295 return tc_transact(&request, NULL);
2296}
2297
2298/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2299 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2300static int
2301htb_setup_class__(struct netdev *netdev, unsigned int handle,
2302 unsigned int parent, struct htb_class *class)
2303{
2304 size_t opt_offset;
2305 struct tc_htb_opt opt;
2306 struct ofpbuf request;
2307 struct tcmsg *tcmsg;
2308 int error;
2309 int mtu;
2310
2311 netdev_get_mtu(netdev, &mtu);
2312
2313 memset(&opt, 0, sizeof opt);
2314 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2315 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2316 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2317 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2318 opt.prio = class->priority;
2319
2320 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
2321 if (!tcmsg) {
2322 return ENODEV;
2323 }
c1c9c9c4
BP
2324 tcmsg->tcm_handle = handle;
2325 tcmsg->tcm_parent = parent;
2326
2327 nl_msg_put_string(&request, TCA_KIND, "htb");
2328 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2329 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2330 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2331 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2332 nl_msg_end_nested(&request, opt_offset);
2333
2334 error = tc_transact(&request, NULL);
2335 if (error) {
2336 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2337 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2338 netdev_get_name(netdev),
2339 tc_get_major(handle), tc_get_minor(handle),
2340 tc_get_major(parent), tc_get_minor(parent),
2341 class->min_rate, class->max_rate,
2342 class->burst, class->priority, strerror(error));
2343 }
2344 return error;
2345}
2346
2347/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2348 * description of them into 'details'. The description complies with the
2349 * specification given in the vswitch database documentation for linux-htb
2350 * queue details. */
2351static int
2352htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2353{
2354 static const struct nl_policy tca_htb_policy[] = {
2355 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2356 .min_len = sizeof(struct tc_htb_opt) },
2357 };
2358
2359 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2360 const struct tc_htb_opt *htb;
2361
2362 if (!nl_parse_nested(nl_options, tca_htb_policy,
2363 attrs, ARRAY_SIZE(tca_htb_policy))) {
2364 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2365 return EPROTO;
2366 }
2367
2368 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2369 class->min_rate = htb->rate.rate;
2370 class->max_rate = htb->ceil.rate;
2371 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2372 class->priority = htb->prio;
2373 return 0;
2374}
2375
2376static int
2377htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2378 struct htb_class *options,
2379 struct netdev_queue_stats *stats)
2380{
2381 struct nlattr *nl_options;
2382 unsigned int handle;
2383 int error;
2384
2385 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2386 if (!error && queue_id) {
17ee3c1f
BP
2387 unsigned int major = tc_get_major(handle);
2388 unsigned int minor = tc_get_minor(handle);
2389 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2390 *queue_id = minor - 1;
c1c9c9c4
BP
2391 } else {
2392 error = EPROTO;
2393 }
2394 }
2395 if (!error && options) {
2396 error = htb_parse_tca_options__(nl_options, options);
2397 }
2398 return error;
2399}
2400
2401static void
2402htb_parse_qdisc_details__(struct netdev *netdev,
2403 const struct shash *details, struct htb_class *hc)
2404{
2405 const char *max_rate_s;
2406
2407 max_rate_s = shash_find_data(details, "max-rate");
2408 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2409 if (!hc->max_rate) {
2410 uint32_t current;
2411
2412 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2413 hc->max_rate = netdev_features_to_bps(current) / 8;
2414 }
2415 hc->min_rate = hc->max_rate;
2416 hc->burst = 0;
2417 hc->priority = 0;
2418}
2419
2420static int
2421htb_parse_class_details__(struct netdev *netdev,
2422 const struct shash *details, struct htb_class *hc)
2423{
2424 const struct htb *htb = htb_get__(netdev);
2425 const char *min_rate_s = shash_find_data(details, "min-rate");
2426 const char *max_rate_s = shash_find_data(details, "max-rate");
2427 const char *burst_s = shash_find_data(details, "burst");
2428 const char *priority_s = shash_find_data(details, "priority");
2429 int mtu;
2430
da3827b5 2431 /* min-rate. Don't allow a min-rate below 1500 bytes/s. */
c1c9c9c4
BP
2432 if (!min_rate_s) {
2433 /* min-rate is required. */
2434 return EINVAL;
2435 }
2436 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
da3827b5 2437 hc->min_rate = MAX(hc->min_rate, 1500);
c1c9c9c4
BP
2438 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2439
2440 /* max-rate */
2441 hc->max_rate = (max_rate_s
2442 ? strtoull(max_rate_s, NULL, 10) / 8
2443 : htb->max_rate);
2444 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2445 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2446
2447 /* burst
2448 *
2449 * According to hints in the documentation that I've read, it is important
2450 * that 'burst' be at least as big as the largest frame that might be
2451 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2452 * but having it a bit too small is a problem. Since netdev_get_mtu()
2453 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2454 * the MTU. We actually add 64, instead of 14, as a guard against
2455 * additional headers get tacked on somewhere that we're not aware of. */
2456 netdev_get_mtu(netdev, &mtu);
2457 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2458 hc->burst = MAX(hc->burst, mtu + 64);
2459
2460 /* priority */
2461 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2462
2463 return 0;
2464}
2465
/* Queries the kernel for class 'handle' (child of 'parent') on 'netdev' and
 * parses the reply into 'options' and 'stats', either of which may be null.
 *
 * Returns 0 on success, otherwise the error from tc_query_class() or from
 * parsing the reply. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2481
2482static int
2483htb_tc_install(struct netdev *netdev, const struct shash *details)
2484{
2485 int error;
2486
2487 error = htb_setup_qdisc__(netdev);
2488 if (!error) {
2489 struct htb_class hc;
2490
2491 htb_parse_qdisc_details__(netdev, details, &hc);
2492 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2493 tc_make_handle(1, 0), &hc);
2494 if (!error) {
2495 htb_install__(netdev, hc.max_rate);
2496 }
2497 }
2498 return error;
2499}
2500
93b13be8
BP
2501static struct htb_class *
2502htb_class_cast__(const struct tc_queue *queue)
2503{
2504 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2505}
2506
c1c9c9c4
BP
2507static void
2508htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2509 const struct htb_class *hc)
2510{
2511 struct htb *htb = htb_get__(netdev);
93b13be8
BP
2512 size_t hash = hash_int(queue_id, 0);
2513 struct tc_queue *queue;
c1c9c9c4
BP
2514 struct htb_class *hcp;
2515
93b13be8
BP
2516 queue = tc_find_queue__(netdev, queue_id, hash);
2517 if (queue) {
2518 hcp = htb_class_cast__(queue);
2519 } else {
c1c9c9c4 2520 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
2521 queue = &hcp->tc_queue;
2522 queue->queue_id = queue_id;
2523 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 2524 }
93b13be8
BP
2525
2526 hcp->min_rate = hc->min_rate;
2527 hcp->max_rate = hc->max_rate;
2528 hcp->burst = hc->burst;
2529 hcp->priority = hc->priority;
c1c9c9c4
BP
2530}
2531
2532static int
2533htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2534{
c1c9c9c4
BP
2535 struct ofpbuf msg;
2536 struct nl_dump dump;
2537 struct htb_class hc;
2538 struct htb *htb;
2539
2540 /* Get qdisc options. */
2541 hc.max_rate = 0;
2542 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2543 htb = htb_install__(netdev, hc.max_rate);
2544
2545 /* Get queues. */
23a98ffe
BP
2546 if (!start_queue_dump(netdev, &dump)) {
2547 return ENODEV;
2548 }
c1c9c9c4
BP
2549 while (nl_dump_next(&dump, &msg)) {
2550 unsigned int queue_id;
2551
2552 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2553 htb_update_queue__(netdev, queue_id, &hc);
2554 }
2555 }
2556 nl_dump_done(&dump);
2557
2558 return 0;
2559}
2560
2561static void
2562htb_tc_destroy(struct tc *tc)
2563{
2564 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 2565 struct htb_class *hc, *next;
c1c9c9c4 2566
4e8e4213 2567 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 2568 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
2569 free(hc);
2570 }
2571 tc_destroy(tc);
2572 free(htb);
2573}
2574
2575static int
2576htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2577{
2578 const struct htb *htb = htb_get__(netdev);
2579 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2580 return 0;
2581}
2582
2583static int
2584htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2585{
2586 struct htb_class hc;
2587 int error;
2588
2589 htb_parse_qdisc_details__(netdev, details, &hc);
2590 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2591 tc_make_handle(1, 0), &hc);
2592 if (!error) {
2593 htb_get__(netdev)->max_rate = hc.max_rate;
2594 }
2595 return error;
2596}
2597
2598static int
93b13be8
BP
2599htb_class_get(const struct netdev *netdev OVS_UNUSED,
2600 const struct tc_queue *queue, struct shash *details)
c1c9c9c4 2601{
93b13be8 2602 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4
BP
2603
2604 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2605 if (hc->min_rate != hc->max_rate) {
2606 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2607 }
2608 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2609 if (hc->priority) {
2610 shash_add(details, "priority", xasprintf("%u", hc->priority));
2611 }
2612 return 0;
2613}
2614
2615static int
2616htb_class_set(struct netdev *netdev, unsigned int queue_id,
2617 const struct shash *details)
2618{
2619 struct htb_class hc;
2620 int error;
2621
2622 error = htb_parse_class_details__(netdev, details, &hc);
2623 if (error) {
2624 return error;
2625 }
2626
17ee3c1f 2627 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
2628 tc_make_handle(1, 0xfffe), &hc);
2629 if (error) {
2630 return error;
2631 }
2632
2633 htb_update_queue__(netdev, queue_id, &hc);
2634 return 0;
2635}
2636
2637static int
93b13be8 2638htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 2639{
93b13be8 2640 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2641 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
2642 int error;
2643
93b13be8 2644 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 2645 if (!error) {
93b13be8 2646 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 2647 free(hc);
c1c9c9c4
BP
2648 }
2649 return error;
2650}
2651
2652static int
93b13be8 2653htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
2654 struct netdev_queue_stats *stats)
2655{
93b13be8 2656 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
2657 tc_make_handle(1, 0xfffe), NULL, stats);
2658}
2659
2660static int
2661htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2662 const struct ofpbuf *nlmsg,
2663 netdev_dump_queue_stats_cb *cb, void *aux)
2664{
2665 struct netdev_queue_stats stats;
17ee3c1f 2666 unsigned int handle, major, minor;
c1c9c9c4
BP
2667 int error;
2668
2669 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2670 if (error) {
2671 return error;
2672 }
2673
17ee3c1f
BP
2674 major = tc_get_major(handle);
2675 minor = tc_get_minor(handle);
2676 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 2677 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
2678 }
2679 return 0;
2680}
2681
2682static const struct tc_ops tc_ops_htb = {
2683 "htb", /* linux_name */
2684 "linux-htb", /* ovs_name */
2685 HTB_N_QUEUES, /* n_queues */
2686 htb_tc_install,
2687 htb_tc_load,
2688 htb_tc_destroy,
2689 htb_qdisc_get,
2690 htb_qdisc_set,
2691 htb_class_get,
2692 htb_class_set,
2693 htb_class_delete,
2694 htb_class_get_stats,
2695 htb_class_dump_stats
2696};
2697\f
a339aa81
EJ
2698/* "linux-hfsc" traffic control class. */
2699
2700#define HFSC_N_QUEUES 0xf000
2701
2702struct hfsc {
2703 struct tc tc;
2704 uint32_t max_rate;
2705};
2706
2707struct hfsc_class {
2708 struct tc_queue tc_queue;
2709 uint32_t min_rate;
2710 uint32_t max_rate;
2711};
2712
2713static struct hfsc *
2714hfsc_get__(const struct netdev *netdev)
2715{
2716 struct netdev_dev_linux *netdev_dev;
2717 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2718 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2719}
2720
2721static struct hfsc_class *
2722hfsc_class_cast__(const struct tc_queue *queue)
2723{
2724 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2725}
2726
2727static struct hfsc *
2728hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2729{
2730 struct netdev_dev_linux * netdev_dev;
2731 struct hfsc *hfsc;
2732
2733 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2734 hfsc = xmalloc(sizeof *hfsc);
2735 tc_init(&hfsc->tc, &tc_ops_hfsc);
2736 hfsc->max_rate = max_rate;
2737 netdev_dev->tc = &hfsc->tc;
2738
2739 return hfsc;
2740}
2741
2742static void
2743hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2744 const struct hfsc_class *hc)
2745{
2746 size_t hash;
2747 struct hfsc *hfsc;
2748 struct hfsc_class *hcp;
2749 struct tc_queue *queue;
2750
2751 hfsc = hfsc_get__(netdev);
2752 hash = hash_int(queue_id, 0);
2753
2754 queue = tc_find_queue__(netdev, queue_id, hash);
2755 if (queue) {
2756 hcp = hfsc_class_cast__(queue);
2757 } else {
2758 hcp = xmalloc(sizeof *hcp);
2759 queue = &hcp->tc_queue;
2760 queue->queue_id = queue_id;
2761 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2762 }
2763
2764 hcp->min_rate = hc->min_rate;
2765 hcp->max_rate = hc->max_rate;
2766}
2767
2768static int
2769hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2770{
2771 const struct tc_service_curve *rsc, *fsc, *usc;
2772 static const struct nl_policy tca_hfsc_policy[] = {
2773 [TCA_HFSC_RSC] = {
2774 .type = NL_A_UNSPEC,
2775 .optional = false,
2776 .min_len = sizeof(struct tc_service_curve),
2777 },
2778 [TCA_HFSC_FSC] = {
2779 .type = NL_A_UNSPEC,
2780 .optional = false,
2781 .min_len = sizeof(struct tc_service_curve),
2782 },
2783 [TCA_HFSC_USC] = {
2784 .type = NL_A_UNSPEC,
2785 .optional = false,
2786 .min_len = sizeof(struct tc_service_curve),
2787 },
2788 };
2789 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2790
2791 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2792 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2793 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2794 return EPROTO;
2795 }
2796
2797 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2798 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2799 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2800
2801 if (rsc->m1 != 0 || rsc->d != 0 ||
2802 fsc->m1 != 0 || fsc->d != 0 ||
2803 usc->m1 != 0 || usc->d != 0) {
2804 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2805 "Non-linear service curves are not supported.");
2806 return EPROTO;
2807 }
2808
2809 if (rsc->m2 != fsc->m2) {
2810 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2811 "Real-time service curves are not supported ");
2812 return EPROTO;
2813 }
2814
2815 if (rsc->m2 > usc->m2) {
2816 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2817 "Min-rate service curve is greater than "
2818 "the max-rate service curve.");
2819 return EPROTO;
2820 }
2821
2822 class->min_rate = fsc->m2;
2823 class->max_rate = usc->m2;
2824 return 0;
2825}
2826
2827static int
2828hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2829 struct hfsc_class *options,
2830 struct netdev_queue_stats *stats)
2831{
2832 int error;
2833 unsigned int handle;
2834 struct nlattr *nl_options;
2835
2836 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2837 if (error) {
2838 return error;
2839 }
2840
2841 if (queue_id) {
2842 unsigned int major, minor;
2843
2844 major = tc_get_major(handle);
2845 minor = tc_get_minor(handle);
2846 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2847 *queue_id = minor - 1;
2848 } else {
2849 return EPROTO;
2850 }
2851 }
2852
2853 if (options) {
2854 error = hfsc_parse_tca_options__(nl_options, options);
2855 }
2856
2857 return error;
2858}
2859
/* Queries the kernel for class 'handle' (child of 'parent') on 'netdev' and
 * parses the reply into 'options' and 'stats', either of which may be null.
 *
 * Returns 0 on success, otherwise the error from tc_query_class() or from
 * parsing the reply. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
2877
2878static void
2879hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2880 struct hfsc_class *class)
2881{
2882 uint32_t max_rate;
2883 const char *max_rate_s;
2884
2885 max_rate_s = shash_find_data(details, "max-rate");
2886 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2887
2888 if (!max_rate) {
2889 uint32_t current;
2890
2891 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2892 max_rate = netdev_features_to_bps(current) / 8;
2893 }
2894
2895 class->min_rate = max_rate;
2896 class->max_rate = max_rate;
2897}
2898
2899static int
2900hfsc_parse_class_details__(struct netdev *netdev,
2901 const struct shash *details,
2902 struct hfsc_class * class)
2903{
2904 const struct hfsc *hfsc;
2905 uint32_t min_rate, max_rate;
2906 const char *min_rate_s, *max_rate_s;
2907
2908 hfsc = hfsc_get__(netdev);
2909 min_rate_s = shash_find_data(details, "min-rate");
2910 max_rate_s = shash_find_data(details, "max-rate");
2911
2912 if (!min_rate_s) {
2913 return EINVAL;
2914 }
2915
2916 min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2917 min_rate = MAX(min_rate, 1500);
2918 min_rate = MIN(min_rate, hfsc->max_rate);
2919
2920 max_rate = (max_rate_s
2921 ? strtoull(max_rate_s, NULL, 10) / 8
2922 : hfsc->max_rate);
2923 max_rate = MAX(max_rate, min_rate);
2924 max_rate = MIN(max_rate, hfsc->max_rate);
2925
2926 class->min_rate = min_rate;
2927 class->max_rate = max_rate;
2928
2929 return 0;
2930}
2931
2932/* Create an HFSC qdisc.
2933 *
2934 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
2935static int
2936hfsc_setup_qdisc__(struct netdev * netdev)
2937{
2938 struct tcmsg *tcmsg;
2939 struct ofpbuf request;
2940 struct tc_hfsc_qopt opt;
2941
2942 tc_del_qdisc(netdev);
2943
2944 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2945 NLM_F_EXCL | NLM_F_CREATE, &request);
2946
2947 if (!tcmsg) {
2948 return ENODEV;
2949 }
2950
2951 tcmsg->tcm_handle = tc_make_handle(1, 0);
2952 tcmsg->tcm_parent = TC_H_ROOT;
2953
2954 memset(&opt, 0, sizeof opt);
2955 opt.defcls = 1;
2956
2957 nl_msg_put_string(&request, TCA_KIND, "hfsc");
2958 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
2959
2960 return tc_transact(&request, NULL);
2961}
2962
2963/* Create an HFSC class.
2964 *
2965 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
2966 * sc rate <min_rate> ul rate <max_rate>" */
2967static int
2968hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
2969 unsigned int parent, struct hfsc_class *class)
2970{
2971 int error;
2972 size_t opt_offset;
2973 struct tcmsg *tcmsg;
2974 struct ofpbuf request;
2975 struct tc_service_curve min, max;
2976
2977 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2978
2979 if (!tcmsg) {
2980 return ENODEV;
2981 }
2982
2983 tcmsg->tcm_handle = handle;
2984 tcmsg->tcm_parent = parent;
2985
2986 min.m1 = 0;
2987 min.d = 0;
2988 min.m2 = class->min_rate;
2989
2990 max.m1 = 0;
2991 max.d = 0;
2992 max.m2 = class->max_rate;
2993
2994 nl_msg_put_string(&request, TCA_KIND, "hfsc");
2995 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2996 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
2997 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
2998 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
2999 nl_msg_end_nested(&request, opt_offset);
3000
3001 error = tc_transact(&request, NULL);
3002 if (error) {
3003 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3004 "min-rate %ubps, max-rate %ubps (%s)",
3005 netdev_get_name(netdev),
3006 tc_get_major(handle), tc_get_minor(handle),
3007 tc_get_major(parent), tc_get_minor(parent),
3008 class->min_rate, class->max_rate, strerror(error));
3009 }
3010
3011 return error;
3012}
3013
3014static int
3015hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3016{
3017 int error;
3018 struct hfsc_class class;
3019
3020 error = hfsc_setup_qdisc__(netdev);
3021
3022 if (error) {
3023 return error;
3024 }
3025
3026 hfsc_parse_qdisc_details__(netdev, details, &class);
3027 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3028 tc_make_handle(1, 0), &class);
3029
3030 if (error) {
3031 return error;
3032 }
3033
3034 hfsc_install__(netdev, class.max_rate);
3035 return 0;
3036}
3037
3038static int
3039hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3040{
3041 struct ofpbuf msg;
3042 struct hfsc *hfsc;
3043 struct nl_dump dump;
3044 struct hfsc_class hc;
3045
3046 hc.max_rate = 0;
3047 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3048 hfsc = hfsc_install__(netdev, hc.max_rate);
3049
3050 if (!start_queue_dump(netdev, &dump)) {
3051 return ENODEV;
3052 }
3053
3054 while (nl_dump_next(&dump, &msg)) {
3055 unsigned int queue_id;
3056
3057 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3058 hfsc_update_queue__(netdev, queue_id, &hc);
3059 }
3060 }
3061
3062 nl_dump_done(&dump);
3063 return 0;
3064}
3065
3066static void
3067hfsc_tc_destroy(struct tc *tc)
3068{
3069 struct hfsc *hfsc;
3070 struct hfsc_class *hc, *next;
3071
3072 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3073
3074 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3075 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3076 free(hc);
3077 }
3078
3079 tc_destroy(tc);
3080 free(hfsc);
3081}
3082
3083static int
3084hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3085{
3086 const struct hfsc *hfsc;
3087 hfsc = hfsc_get__(netdev);
3088 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3089 return 0;
3090}
3091
3092static int
3093hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3094{
3095 int error;
3096 struct hfsc_class class;
3097
3098 hfsc_parse_qdisc_details__(netdev, details, &class);
3099 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3100 tc_make_handle(1, 0), &class);
3101
3102 if (!error) {
3103 hfsc_get__(netdev)->max_rate = class.max_rate;
3104 }
3105
3106 return error;
3107}
3108
3109static int
3110hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3111 const struct tc_queue *queue, struct shash *details)
3112{
3113 const struct hfsc_class *hc;
3114
3115 hc = hfsc_class_cast__(queue);
3116 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3117 if (hc->min_rate != hc->max_rate) {
3118 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3119 }
3120 return 0;
3121}
3122
3123static int
3124hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3125 const struct shash *details)
3126{
3127 int error;
3128 struct hfsc_class class;
3129
3130 error = hfsc_parse_class_details__(netdev, details, &class);
3131 if (error) {
3132 return error;
3133 }
3134
3135 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3136 tc_make_handle(1, 0xfffe), &class);
3137 if (error) {
3138 return error;
3139 }
3140
3141 hfsc_update_queue__(netdev, queue_id, &class);
3142 return 0;
3143}
3144
3145static int
3146hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3147{
3148 int error;
3149 struct hfsc *hfsc;
3150 struct hfsc_class *hc;
3151
3152 hc = hfsc_class_cast__(queue);
3153 hfsc = hfsc_get__(netdev);
3154
3155 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3156 if (!error) {
3157 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3158 free(hc);
3159 }
3160 return error;
3161}
3162
3163static int
3164hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3165 struct netdev_queue_stats *stats)
3166{
3167 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3168 tc_make_handle(1, 0xfffe), NULL, stats);
3169}
3170
3171static int
3172hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3173 const struct ofpbuf *nlmsg,
3174 netdev_dump_queue_stats_cb *cb, void *aux)
3175{
3176 struct netdev_queue_stats stats;
3177 unsigned int handle, major, minor;
3178 int error;
3179
3180 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3181 if (error) {
3182 return error;
3183 }
3184
3185 major = tc_get_major(handle);
3186 minor = tc_get_minor(handle);
3187 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3188 (*cb)(minor - 1, &stats, aux);
3189 }
3190 return 0;
3191}
3192
3193static const struct tc_ops tc_ops_hfsc = {
3194 "hfsc", /* linux_name */
3195 "linux-hfsc", /* ovs_name */
3196 HFSC_N_QUEUES, /* n_queues */
3197 hfsc_tc_install, /* tc_install */
3198 hfsc_tc_load, /* tc_load */
3199 hfsc_tc_destroy, /* tc_destroy */
3200 hfsc_qdisc_get, /* qdisc_get */
3201 hfsc_qdisc_set, /* qdisc_set */
3202 hfsc_class_get, /* class_get */
3203 hfsc_class_set, /* class_set */
3204 hfsc_class_delete, /* class_delete */
3205 hfsc_class_get_stats, /* class_get_stats */
3206 hfsc_class_dump_stats /* class_dump_stats */
3207};
3208\f
c1c9c9c4
BP
3209/* "linux-default" traffic control class.
3210 *
3211 * This class represents the default, unnamed Linux qdisc. It corresponds to
3212 * the "" (empty string) QoS type in the OVS database. */
3213
3214static void
3215default_install__(struct netdev *netdev)
3216{
3217 struct netdev_dev_linux *netdev_dev =
3218 netdev_dev_linux_cast(netdev_get_dev(netdev));
3219 static struct tc *tc;
3220
3221 if (!tc) {
3222 tc = xmalloc(sizeof *tc);
3223 tc_init(tc, &tc_ops_default);
3224 }
3225 netdev_dev->tc = tc;
3226}
3227
3228static int
3229default_tc_install(struct netdev *netdev,
3230 const struct shash *details OVS_UNUSED)
3231{
3232 default_install__(netdev);
3233 return 0;
3234}
3235
3236static int
3237default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3238{
3239 default_install__(netdev);
3240 return 0;
3241}
3242
3243static const struct tc_ops tc_ops_default = {
3244 NULL, /* linux_name */
3245 "", /* ovs_name */
3246 0, /* n_queues */
3247 default_tc_install,
3248 default_tc_load,
3249 NULL, /* tc_destroy */
3250 NULL, /* qdisc_get */
3251 NULL, /* qdisc_set */
3252 NULL, /* class_get */
3253 NULL, /* class_set */
3254 NULL, /* class_delete */
3255 NULL, /* class_get_stats */
3256 NULL /* class_dump_stats */
3257};
3258\f
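/* "linux-other" traffic control class.
 *
 * This class stands in for qdiscs that cannot be mapped to any other class:
 * tc_query_qdisc() falls back to it when the qdisc reply cannot be parsed,
 * when the qdisc kind is unknown, or when the query fails for a reason other
 * than ENOENT.  It has no install callback and no queues. */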
3262
3263static int
3264other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3265{
3266 struct netdev_dev_linux *netdev_dev =
3267 netdev_dev_linux_cast(netdev_get_dev(netdev));
3268 static struct tc *tc;
3269
3270 if (!tc) {
3271 tc = xmalloc(sizeof *tc);
3272 tc_init(tc, &tc_ops_other);
3273 }
3274 netdev_dev->tc = tc;
3275 return 0;
3276}
3277
3278static const struct tc_ops tc_ops_other = {
3279 NULL, /* linux_name */
3280 "linux-other", /* ovs_name */
3281 0, /* n_queues */
3282 NULL, /* tc_install */
3283 other_tc_load,
3284 NULL, /* tc_destroy */
3285 NULL, /* qdisc_get */
3286 NULL, /* qdisc_set */
3287 NULL, /* class_get */
3288 NULL, /* class_set */
3289 NULL, /* class_delete */
3290 NULL, /* class_get_stats */
3291 NULL /* class_dump_stats */
3292};
3293\f
3294/* Traffic control. */
3295
3296/* Number of kernel "tc" ticks per second. */
3297static double ticks_per_s;
3298
3299/* Number of kernel "jiffies" per second. This is used for the purpose of
3300 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3301 * one jiffy's worth of data.
3302 *
3303 * There are two possibilities here:
3304 *
3305 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3306 * approximate range of 100 to 1024. That means that we really need to
3307 * make sure that the qdisc can buffer that much data.
3308 *
3309 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3310 * has finely granular timers and there's no need to fudge additional room
3311 * for buffers. (There's no extra effort needed to implement that: the
3312 * large 'buffer_hz' is used as a divisor, so practically any number will
3313 * come out as 0 in the division. Small integer results in the case of
3314 * really high dividends won't have any real effect anyhow.)
3315 */
3316static unsigned int buffer_hz;
3317
/* Combines 'major' and 'minor' into the tc handle 'major':'minor'.  'major'
 * occupies the upper 16 bits of the result and 'minor' the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
3324
/* Extracts the major number (the upper 16 bits) of tc 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3331
/* Extracts the minor number (the lower 16 bits) of tc 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3338
/* Starts a traffic control Netlink request of the given 'type' for 'netdev'
 * in 'request', with 'flags' ORed into NLM_F_REQUEST.
 *
 * On success, initializes 'request' and returns a pointer to the zeroed
 * struct tcmsg appended to it, with the family and interface index filled in.
 * The caller must still set 'tcm_handle' and 'tcm_parent'.
 *
 * Returns NULL, leaving 'request' uninitialized, if the interface index of
 * 'netdev' cannot be determined. */
static struct tcmsg *
tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
                struct ofpbuf *request)
{
    struct tcmsg *tcmsg;
    int ifindex;

    if (get_ifindex(netdev, &ifindex)) {
        return NULL;
    }

    ofpbuf_init(request, 512);
    nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);

    tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
    tcmsg->tcm_ifindex = ifindex;
    tcmsg->tcm_family = AF_UNSPEC;
    return tcmsg;
}
3362
3363static int
3364tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3365{
3366 int error = nl_sock_transact(rtnl_sock, request, replyp);
3367 ofpbuf_uninit(request);
3368 return error;
3369}
3370
3371static void
3372read_psched(void)
3373{
3374 /* The values in psched are not individually very meaningful, but they are
3375 * important. The tables below show some values seen in the wild.
3376 *
3377 * Some notes:
3378 *
3379 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3380 * (Before that, there are hints that it was 1000000000.)
3381 *
3382 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3383 * above.
3384 *
3385 * /proc/net/psched
3386 * -----------------------------------
3387 * [1] 000c8000 000f4240 000f4240 00000064
3388 * [2] 000003e8 00000400 000f4240 3b9aca00
3389 * [3] 000003e8 00000400 000f4240 3b9aca00
3390 * [4] 000003e8 00000400 000f4240 00000064
3391 * [5] 000003e8 00000040 000f4240 3b9aca00
3392 * [6] 000003e8 00000040 000f4240 000000f9
3393 *
3394 * a b c d ticks_per_s buffer_hz
3395 * ------- --------- ---------- ------------- ----------- -------------
3396 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3397 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3398 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3399 * [4] 1,000 1,024 1,000,000 100 976,562 100
3400 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3401 * [6] 1,000 64 1,000,000 249 15,625,000 249
3402 *
3403 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3404 * [2] 2.6.26-1-686-bigmem from Debian lenny
3405 * [3] 2.6.26-2-sparc64 from Debian lenny
3406 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3407 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3408 * [6] 2.6.34 from kernel.org on KVM
3409 */
3410 static const char fn[] = "/proc/net/psched";
3411 unsigned int a, b, c, d;
3412 FILE *stream;
3413
3414 ticks_per_s = 1.0;
3415 buffer_hz = 100;
3416
3417 stream = fopen(fn, "r");
3418 if (!stream) {
3419 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3420 return;
3421 }
3422
3423 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3424 VLOG_WARN("%s: read failed", fn);
3425 fclose(stream);
3426 return;
3427 }
3428 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3429 fclose(stream);
3430
3431 if (!a || !c) {
3432 VLOG_WARN("%s: invalid scheduler parameters", fn);
3433 return;
3434 }
3435
3436 ticks_per_s = (double) a * c / b;
3437 if (c == 1000000) {
3438 buffer_hz = d;
3439 } else {
3440 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3441 fn, a, b, c, d);
3442 }
3443 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3444}
3445
3446/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3447 * rate of 'rate' bytes per second. */
3448static unsigned int
3449tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3450{
3451 if (!buffer_hz) {
3452 read_psched();
3453 }
3454 return (rate * ticks) / ticks_per_s;
3455}
3456
3457/* Returns the number of ticks that it would take to transmit 'size' bytes at a
3458 * rate of 'rate' bytes per second. */
3459static unsigned int
3460tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3461{
3462 if (!buffer_hz) {
3463 read_psched();
3464 }
015c93a4 3465 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
3466}
3467
3468/* Returns the number of bytes that need to be reserved for qdisc buffering at
3469 * a transmission rate of 'rate' bytes per second. */
3470static unsigned int
3471tc_buffer_per_jiffy(unsigned int rate)
3472{
3473 if (!buffer_hz) {
3474 read_psched();
3475 }
3476 return rate / buffer_hz;
3477}
3478
3479/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3480 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3481 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3482 * stores NULL into it if it is absent.
3483 *
3484 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3485 * 'msg'.
3486 *
3487 * Returns 0 if successful, otherwise a positive errno value. */
3488static int
3489tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3490 struct nlattr **options)
3491{
3492 static const struct nl_policy tca_policy[] = {
3493 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3494 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3495 };
3496 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3497
3498 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3499 tca_policy, ta, ARRAY_SIZE(ta))) {
3500 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3501 goto error;
3502 }
3503
3504 if (kind) {
3505 *kind = nl_attr_get_string(ta[TCA_KIND]);
3506 }
3507
3508 if (options) {
3509 *options = ta[TCA_OPTIONS];
3510 }
3511
3512 return 0;
3513
3514error:
3515 if (kind) {
3516 *kind = NULL;
3517 }
3518 if (options) {
3519 *options = NULL;
3520 }
3521 return EPROTO;
3522}
3523
3524/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3525 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3526 * into '*options', and its queue statistics into '*stats'. Any of the output
3527 * arguments may be null.
3528 *
3529 * Returns 0 if successful, otherwise a positive errno value. */
3530static int
3531tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3532 struct nlattr **options, struct netdev_queue_stats *stats)
3533{
3534 static const struct nl_policy tca_policy[] = {
3535 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3536 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3537 };
3538 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3539
3540 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3541 tca_policy, ta, ARRAY_SIZE(ta))) {
3542 VLOG_WARN_RL(&rl, "failed to parse class message");
3543 goto error;
3544 }
3545
3546 if (handlep) {
3547 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3548 *handlep = tc->tcm_handle;
3549 }
3550
3551 if (options) {
3552 *options = ta[TCA_OPTIONS];
3553 }
3554
3555 if (stats) {
3556 const struct gnet_stats_queue *gsq;
3557 struct gnet_stats_basic gsb;
3558
3559 static const struct nl_policy stats_policy[] = {
3560 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3561 .min_len = sizeof gsb },
3562 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3563 .min_len = sizeof *gsq },
3564 };
3565 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3566
3567 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3568 sa, ARRAY_SIZE(sa))) {
3569 VLOG_WARN_RL(&rl, "failed to parse class stats");
3570 goto error;
3571 }
3572
3573 /* Alignment issues screw up the length of struct gnet_stats_basic on
3574 * some arch/bitsize combinations. Newer versions of Linux have a
3575 * struct gnet_stats_basic_packed, but we can't depend on that. The
3576 * easiest thing to do is just to make a copy. */
3577 memset(&gsb, 0, sizeof gsb);
3578 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3579 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3580 stats->tx_bytes = gsb.bytes;
3581 stats->tx_packets = gsb.packets;
3582
3583 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3584 stats->tx_errors = gsq->drops;
3585 }
3586
3587 return 0;
3588
3589error:
3590 if (options) {
3591 *options = NULL;
3592 }
3593 if (stats) {
3594 memset(stats, 0, sizeof *stats);
3595 }
3596 return EPROTO;
3597}
3598
3599/* Queries the kernel for class with identifier 'handle' and parent 'parent'
3600 * on 'netdev'. */
3601static int
3602tc_query_class(const struct netdev *netdev,
3603 unsigned int handle, unsigned int parent,
3604 struct ofpbuf **replyp)
3605{
3606 struct ofpbuf request;
3607 struct tcmsg *tcmsg;
3608 int error;
3609
3610 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
3611 if (!tcmsg) {
3612 return ENODEV;
3613 }
c1c9c9c4
BP
3614 tcmsg->tcm_handle = handle;
3615 tcmsg->tcm_parent = parent;
3616
3617 error = tc_transact(&request, replyp);
3618 if (error) {
3619 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3620 netdev_get_name(netdev),
3621 tc_get_major(handle), tc_get_minor(handle),
3622 tc_get_major(parent), tc_get_minor(parent),
3623 strerror(error));
3624 }
3625 return error;
3626}
3627
3628/* Equivalent to "tc class del dev <name> handle <handle>". */
3629static int
3630tc_delete_class(const struct netdev *netdev, unsigned int handle)
3631{
3632 struct ofpbuf request;
3633 struct tcmsg *tcmsg;
3634 int error;
3635
3636 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
3637 if (!tcmsg) {
3638 return ENODEV;
3639 }
c1c9c9c4
BP
3640 tcmsg->tcm_handle = handle;
3641 tcmsg->tcm_parent = 0;
3642
3643 error = tc_transact(&request, NULL);
3644 if (error) {
3645 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3646 netdev_get_name(netdev),
3647 tc_get_major(handle), tc_get_minor(handle),
3648 strerror(error));
3649 }
3650 return error;
3651}
3652
3653/* Equivalent to "tc qdisc del dev <name> root". */
3654static int
3655tc_del_qdisc(struct netdev *netdev)
3656{
3657 struct netdev_dev_linux *netdev_dev =
3658 netdev_dev_linux_cast(netdev_get_dev(netdev));
3659 struct ofpbuf request;
3660 struct tcmsg *tcmsg;
3661 int error;
3662
3663 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
3664 if (!tcmsg) {
3665 return ENODEV;
3666 }
c1c9c9c4
BP
3667 tcmsg->tcm_handle = tc_make_handle(1, 0);
3668 tcmsg->tcm_parent = TC_H_ROOT;
3669
3670 error = tc_transact(&request, NULL);
3671 if (error == EINVAL) {
3672 /* EINVAL probably means that the default qdisc was in use, in which
3673 * case we've accomplished our purpose. */
3674 error = 0;
3675 }
3676 if (!error && netdev_dev->tc) {
3677 if (netdev_dev->tc->ops->tc_destroy) {
3678 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3679 }
3680 netdev_dev->tc = NULL;
3681 }
3682 return error;
3683}
3684
3685/* If 'netdev''s qdisc type and parameters are not yet known, queries the
3686 * kernel to determine what they are. Returns 0 if successful, otherwise a
3687 * positive errno value. */
3688static int
3689tc_query_qdisc(const struct netdev *netdev)
3690{
3691 struct netdev_dev_linux *netdev_dev =
3692 netdev_dev_linux_cast(netdev_get_dev(netdev));
3693 struct ofpbuf request, *qdisc;
3694 const struct tc_ops *ops;
3695 struct tcmsg *tcmsg;
3696 int load_error;
3697 int error;
3698
3699 if (netdev_dev->tc) {
3700 return 0;
3701 }
3702
3703 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3704 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3705 * 2.6.35 without that fix backported to it.
3706 *
3707 * To avoid the OOPS, we must not make a request that would attempt to dump
3708 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3709 * few others. There are a few ways that I can see to do this, but most of
3710 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3711 * technique chosen here is to assume that any non-default qdisc that we
3712 * create will have a class with handle 1:0. The built-in qdiscs only have
3713 * a class with handle 0:0.
3714 *
3715 * We could check for Linux 2.6.35+ and use a more straightforward method
3716 * there. */
3717 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
3718 if (!tcmsg) {
3719 return ENODEV;
3720 }
c1c9c9c4
BP
3721 tcmsg->tcm_handle = tc_make_handle(1, 0);
3722 tcmsg->tcm_parent = 0;
3723
3724 /* Figure out what tc class to instantiate. */
3725 error = tc_transact(&request, &qdisc);
3726 if (!error) {
3727 const char *kind;
3728
3729 error = tc_parse_qdisc(qdisc, &kind, NULL);
3730 if (error) {
3731 ops = &tc_ops_other;
3732 } else {
3733 ops = tc_lookup_linux_name(kind);
3734 if (!ops) {
3735 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3736 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3737
3738 ops = &tc_ops_other;
3739 }
3740 }
3741 } else if (error == ENOENT) {
3742 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3743 * other entity that doesn't have a handle 1:0. We will assume
3744 * that it's the system default qdisc. */
3745 ops = &tc_ops_default;
3746 error = 0;
3747 } else {
3748 /* Who knows? Maybe the device got deleted. */
3749 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3750 netdev_get_name(netdev), strerror(error));
3751 ops = &tc_ops_other;
3752 }
3753
3754 /* Instantiate it. */
3755 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3756 assert((load_error == 0) == (netdev_dev->tc != NULL));
3757 ofpbuf_delete(qdisc);
3758
3759 return error ? error : load_error;
3760}
3761
3762/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3763 approximate the time to transmit packets of various lengths. For an MTU of
3764 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3765 represents two possible packet lengths; for a MTU of 513 through 1024, four
3766 possible lengths; and so on.
3767
3768 Returns, for the specified 'mtu', the number of bits that packet lengths
3769 need to be shifted right to fit within such a 256-entry table. */
3770static int
3771tc_calc_cell_log(unsigned int mtu)
3772{
3773 int cell_log;
3774
3775 if (!mtu) {
3776 mtu = ETH_PAYLOAD_MAX;
3777 }
3778 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3779
3780 for (cell_log = 0; mtu >= 256; cell_log++) {
3781 mtu >>= 1;
3782 }
3783
3784 return cell_log;
3785}
3786
3787/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3788 * of 'mtu'. */
3789static void
3790tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3791{
3792 memset(rate, 0, sizeof *rate);
3793 rate->cell_log = tc_calc_cell_log(mtu);
3794 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3795 /* rate->cell_align = 0; */ /* distro headers. */
3796 rate->mpu = ETH_TOTAL_MIN;
3797 rate->rate = Bps;
3798}
3799
/* Appends to 'msg' a Netlink attribute of the given 'type' that holds an
 * "rtab" table for 'rate'.
 *
 * Entry i of the table is the number of ticks needed to send a packet of
 * (i + 1) << cell_log bytes, but never fewer than 'rate->mpu' bytes, at
 * 'rate->rate' bytes per second.
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *table = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    unsigned int cell;

    for (cell = 0; cell < TC_RTAB_SIZE / sizeof *table; cell++) {
        unsigned int len = (cell + 1) << rate->cell_log;

        table[cell] = tc_bytes_to_ticks(rate->rate,
                                        len < rate->mpu ? rate->mpu : len);
    }
}
3819
/* Computes the value for 'buffer' or 'cbuffer' in HTB options, given a rate of
 * 'Bps' bytes per second, the specified 'mtu', and a user-requested burst size
 * of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of 0 is
 * fine.)
 *
 * The burst is raised, if necessary, to one jiffy's worth of data plus one
 * MTU, and the result is expressed in ticks. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes;

    if (burst < floor_bytes) {
        burst = floor_bytes;
    }
    return tc_bytes_to_ticks(Bps, burst);
}
3830
3831\f
3832/* Utility functions. */
3833
3834static int
3835get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3836{
3837 /* Policy for RTNLGRP_LINK messages.
3838 *
3839 * There are *many* more fields in these messages, but currently we only
3840 * care about these fields. */
3841 static const struct nl_policy rtnlgrp_link_policy[] = {
3842 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3843 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3844 .min_len = sizeof(struct rtnl_link_stats) },
3845 };
3846
3847 struct ofpbuf request;
3848 struct ofpbuf *reply;
3849 struct ifinfomsg *ifi;
3850 const struct rtnl_link_stats *rtnl_stats;
3851 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
3852 int error;
3853
3854 ofpbuf_init(&request, 0);
3855 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3856 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3857 ifi->ifi_family = PF_UNSPEC;
3858 ifi->ifi_index = ifindex;
3859 error = nl_sock_transact(rtnl_sock, &request, &reply);
3860 ofpbuf_uninit(&request);
3861 if (error) {
3862 return error;
3863 }
3864
3865 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3866 rtnlgrp_link_policy,
3867 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3868 ofpbuf_delete(reply);
3869 return EPROTO;
3870 }
3871
3872 if (!attrs[IFLA_STATS]) {
3873 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3874 ofpbuf_delete(reply);
3875 return EPROTO;
3876 }
8b61709d
BP
3877
3878 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3879 stats->rx_packets = rtnl_stats->rx_packets;
3880 stats->tx_packets = rtnl_stats->tx_packets;
3881 stats->rx_bytes = rtnl_stats->rx_bytes;
3882 stats->tx_bytes = rtnl_stats->tx_bytes;
3883 stats->rx_errors = rtnl_stats->rx_errors;
3884 stats->tx_errors = rtnl_stats->tx_errors;
3885 stats->rx_dropped = rtnl_stats->rx_dropped;
3886 stats->tx_dropped = rtnl_stats->tx_dropped;
3887 stats->multicast = rtnl_stats->multicast;
3888 stats->collisions = rtnl_stats->collisions;
3889 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3890 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3891 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3892 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3893 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3894 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3895 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3896 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3897 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3898 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3899 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3900
576e26d7
BP
3901 ofpbuf_delete(reply);
3902
8b61709d
BP
3903 return 0;
3904}
3905
3906static int
3907get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3908{
3909 static const char fn[] = "/proc/net/dev";
3910 char line[1024];
3911 FILE *stream;
3912 int ln;
3913
3914 stream = fopen(fn, "r");
3915 if (!stream) {
3916 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3917 return errno;
3918 }
3919
3920 ln = 0;
3921 while (fgets(line, sizeof line, stream)) {
3922 if (++ln >= 3) {
3923 char devname[16];
3924#define X64 "%"SCNu64
3925 if (sscanf(line,
3926 " %15[^:]:"
3927 X64 X64 X64 X64 X64 X64 X64 "%*u"
3928 X64 X64 X64 X64 X64 X64 X64 "%*u",
3929 devname,
3930 &stats->rx_bytes,
3931 &stats->rx_packets,
3932 &stats->rx_errors,
3933 &stats->rx_dropped,
3934 &stats->rx_fifo_errors,
3935 &stats->rx_frame_errors,
3936 &stats->multicast,
3937 &stats->tx_bytes,
3938 &stats->tx_packets,
3939 &stats->tx_errors,
3940 &stats->tx_dropped,
3941 &stats->tx_fifo_errors,
3942 &stats->collisions,
3943 &stats->tx_carrier_errors) != 15) {
3944 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3945 } else if (!strcmp(devname, netdev_name)) {
3946 stats->rx_length_errors = UINT64_MAX;
3947 stats->rx_over_errors = UINT64_MAX;
3948 stats->rx_crc_errors = UINT64_MAX;
3949 stats->rx_missed_errors = UINT64_MAX;
3950 stats->tx_aborted_errors = UINT64_MAX;
3951 stats->tx_heartbeat_errors = UINT64_MAX;
3952 stats->tx_window_errors = UINT64_MAX;
3953 fclose(stream);
3954 return 0;
3955 }
3956 }
3957 }
3958 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
3959 fclose(stream);
3960 return ENODEV;
3961}
c1c9c9c4 3962
8b61709d
BP
3963static int
3964get_flags(const struct netdev *netdev, int *flags)
3965{
3966 struct ifreq ifr;
3967 int error;
3968
149f577a
JG
3969 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
3970 "SIOCGIFFLAGS");
8b61709d
BP
3971 *flags = ifr.ifr_flags;
3972 return error;
3973}
3974
3975static int
3976set_flags(struct netdev *netdev, int flags)
3977{
3978 struct ifreq ifr;
3979
3980 ifr.ifr_flags = flags;
149f577a
JG
3981 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
3982 "SIOCSIFFLAGS");
8b61709d
BP
3983}
3984
3985static int
3986do_get_ifindex(const char *netdev_name)
3987{
3988 struct ifreq ifr;
3989
3990 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3991 COVERAGE_INC(netdev_get_ifindex);
3992 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
3993 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
3994 netdev_name, strerror(errno));
3995 return -errno;
3996 }
3997 return ifr.ifr_ifindex;
3998}
3999
4000static int
4001get_ifindex(const struct netdev *netdev_, int *ifindexp)
4002{
149f577a
JG
4003 struct netdev_dev_linux *netdev_dev =
4004 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 4005 *ifindexp = 0;
149f577a 4006 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
8b61709d
BP
4007 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4008 if (ifindex < 0) {
4009 return -ifindex;
4010 }
149f577a
JG
4011 netdev_dev->cache_valid |= VALID_IFINDEX;
4012 netdev_dev->ifindex = ifindex;
8b61709d 4013 }
149f577a 4014 *ifindexp = netdev_dev->ifindex;
8b61709d
BP
4015 return 0;
4016}
4017
4018static int
4019get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4020{
4021 struct ifreq ifr;
4022 int hwaddr_family;
4023
4024 memset(&ifr, 0, sizeof ifr);
4025 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4026 COVERAGE_INC(netdev_get_hwaddr);
4027 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4028 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4029 netdev_name, strerror(errno));
4030 return errno;
4031 }
4032 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4033 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4034 VLOG_WARN("%s device has unknown hardware address family %d",
4035 netdev_name, hwaddr_family);
4036 }
4037 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4038 return 0;
4039}
4040
4041static int
4042set_etheraddr(const char *netdev_name, int hwaddr_family,
4043 const uint8_t mac[ETH_ADDR_LEN])
4044{
4045 struct ifreq ifr;
4046
4047 memset(&ifr, 0, sizeof ifr);
4048 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4049 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4050 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4051 COVERAGE_INC(netdev_set_hwaddr);
4052 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4053 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4054 netdev_name, strerror(errno));
4055 return errno;
4056 }
4057 return 0;
4058}
4059
4060static int
0b0544d7 4061netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
4062 int cmd, const char *cmd_name)
4063{
4064 struct ifreq ifr;
4065
4066 memset(&ifr, 0, sizeof ifr);
0b0544d7 4067 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
4068 ifr.ifr_data = (caddr_t) ecmd;
4069
4070 ecmd->cmd = cmd;
4071 COVERAGE_INC(netdev_ethtool);
4072 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4073 return 0;
4074 } else {
4075 if (errno != EOPNOTSUPP) {
4076 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
0b0544d7 4077 "failed: %s", cmd_name, name, strerror(errno));
8b61709d
BP
4078 } else {
4079 /* The device doesn't support this operation. That's pretty
4080 * common, so there's no point in logging anything. */
4081 }
4082 return errno;
4083 }
4084}
4085
4086static int
149f577a
JG
4087netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4088 const char *cmd_name)
8b61709d 4089{
149f577a 4090 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
8b61709d 4091 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
149f577a
JG
4092 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4093 strerror(errno));
8b61709d
BP
4094 return errno;
4095 }
4096 return 0;
4097}
f1acd62b
BP
4098
4099static int
4100netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4101 int cmd, const char *cmd_name)
4102{
4103 struct ifreq ifr;
4104 int error;
4105
4106 ifr.ifr_addr.sa_family = AF_INET;
149f577a 4107 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b
BP
4108 if (!error) {
4109 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4110 *ip = sin->sin_addr;
4111 }
4112 return error;
4113}