]> git.proxmox.com Git - ovs.git/blob - lib/netdev-linux.c
ovs-ofctl: Fix small typo about nw_tos in man page.
[ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include <assert.h>
19 #include <errno.h>
20 #include <fcntl.h>
21 #include <arpa/inet.h>
22 #include <inttypes.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
25 #include <linux/ip.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/mii.h>
29 #include <linux/pkt_sched.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/sockios.h>
32 #include <linux/version.h>
33 #include <sys/types.h>
34 #include <sys/ioctl.h>
35 #include <sys/socket.h>
36 #include <netpacket/packet.h>
37 #include <net/ethernet.h>
38 #include <net/if.h>
39 #include <linux/if_tunnel.h>
40 #include <net/if_arp.h>
41 #include <net/if_packet.h>
42 #include <net/route.h>
43 #include <netinet/in.h>
44 #include <poll.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <unistd.h>
48
49 #include "coverage.h"
50 #include "dpif-linux.h"
51 #include "dynamic-string.h"
52 #include "fatal-signal.h"
53 #include "hash.h"
54 #include "hmap.h"
55 #include "netdev-provider.h"
56 #include "netdev-vport.h"
57 #include "netlink.h"
58 #include "netlink-socket.h"
59 #include "ofpbuf.h"
60 #include "openflow/openflow.h"
61 #include "packets.h"
62 #include "poll-loop.h"
63 #include "rtnetlink.h"
64 #include "rtnetlink-link.h"
65 #include "socket-util.h"
66 #include "shash.h"
67 #include "svec.h"
68 #include "vlog.h"
69
70 VLOG_DEFINE_THIS_MODULE(netdev_linux);
71
72 COVERAGE_DEFINE(netdev_get_vlan_vid);
73 COVERAGE_DEFINE(netdev_set_policing);
74 COVERAGE_DEFINE(netdev_arp_lookup);
75 COVERAGE_DEFINE(netdev_get_ifindex);
76 COVERAGE_DEFINE(netdev_get_hwaddr);
77 COVERAGE_DEFINE(netdev_set_hwaddr);
78 COVERAGE_DEFINE(netdev_ethtool);
79 \f
80 /* These were introduced in Linux 2.6.14, so they might be missing if we have
81 * old headers. */
82 #ifndef ADVERTISED_Pause
83 #define ADVERTISED_Pause (1 << 13)
84 #endif
85 #ifndef ADVERTISED_Asym_Pause
86 #define ADVERTISED_Asym_Pause (1 << 14)
87 #endif
88
89 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
90 * headers. */
91 #ifndef TC_RTAB_SIZE
92 #define TC_RTAB_SIZE 1024
93 #endif
94
95 static struct rtnetlink_notifier netdev_linux_cache_notifier;
96 static int cache_notifier_refcount;
97
/* Bit flags for the 'cache_valid' member of struct netdev_dev_linux.  A set
 * bit means that the corresponding cached member(s) of the device hold valid
 * data; clearing 'cache_valid' forces them to be fetched again on demand. */
enum {
    VALID_IFINDEX = 1 << 0,
    VALID_ETHERADDR = 1 << 1,
    VALID_IN4 = 1 << 2,
    VALID_IN6 = 1 << 3,
    VALID_MTU = 1 << 4,
    VALID_CARRIER = 1 << 5,
    VALID_IS_PSEUDO = 1 << 6,       /* Represents is_internal and is_tap. */
    VALID_POLICING = 1 << 7,
    VALID_HAVE_VPORT_STATS = 1 << 8
};
109
/* Per-device state for tap devices. */
struct tap_state {
    int fd;                     /* File descriptor obtained by opening
                                 * /dev/net/tun for this device. */
    bool opened;                /* True once 'fd' has been handed to the first
                                 * netdev opened on this device. */
};
114 \f
115 /* Traffic control. */
116
117 /* An instance of a traffic control class. Always associated with a particular
118 * network device.
119 *
120 * Each TC implementation subclasses this with whatever additional data it
121 * needs. */
122 struct tc {
123 const struct tc_ops *ops;
124 struct hmap queues; /* Contains "struct tc_queue"s.
125 * Read by generic TC layer.
126 * Written only by TC implementation. */
127 };
128
129 /* One traffic control queue.
130 *
131 * Each TC implementation subclasses this with whatever additional data it
132 * needs. */
133 struct tc_queue {
134 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
135 unsigned int queue_id; /* OpenFlow queue ID. */
136 };
137
138 /* A particular kind of traffic control. Each implementation generally maps to
139 * one particular Linux qdisc class.
140 *
141 * The functions below return 0 if successful or a positive errno value on
142 * failure, except where otherwise noted. All of them must be provided, except
143 * where otherwise noted. */
144 struct tc_ops {
145 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
146 * This is null for tc_ops_default and tc_ops_other, for which there are no
147 * appropriate values. */
148 const char *linux_name;
149
150 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
151 const char *ovs_name;
152
153 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
154 * queues. The queues are numbered 0 through n_queues - 1. */
155 unsigned int n_queues;
156
157 /* Called to install this TC class on 'netdev'. The implementation should
158 * make the Netlink calls required to set up 'netdev' with the right qdisc
159 * and configure it according to 'details'. The implementation may assume
160 * that the current qdisc is the default; that is, there is no need for it
161 * to delete the current qdisc before installing itself.
162 *
163 * The contents of 'details' should be documented as valid for 'ovs_name'
164 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
165 * (which is built as ovs-vswitchd.conf.db(8)).
166 *
167 * This function must return 0 if and only if it sets 'netdev->tc' to an
168 * initialized 'struct tc'.
169 *
170 * (This function is null for tc_ops_other, which cannot be installed. For
171 * other TC classes it should always be nonnull.) */
172 int (*tc_install)(struct netdev *netdev, const struct shash *details);
173
174 /* Called when the netdev code determines (through a Netlink query) that
175 * this TC class's qdisc is installed on 'netdev', but we didn't install
176 * it ourselves and so don't know any of the details.
177 *
178 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
179 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
180 * implementation should parse the other attributes of 'nlmsg' as
181 * necessary to determine its configuration. If necessary it should also
182 * use Netlink queries to determine the configuration of queues on
183 * 'netdev'.
184 *
185 * This function must return 0 if and only if it sets 'netdev->tc' to an
186 * initialized 'struct tc'. */
187 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
188
189 /* Destroys the data structures allocated by the implementation as part of
190 * 'tc'. (This includes destroying 'tc->queues' by calling
191 * tc_destroy(tc).)
192 *
193 * The implementation should not need to perform any Netlink calls. If
194 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
195 * (But it may not be desirable.)
196 *
197 * This function may be null if 'tc' is trivial. */
198 void (*tc_destroy)(struct tc *tc);
199
200 /* Retrieves details of 'netdev->tc' configuration into 'details'.
201 *
202 * The implementation should not need to perform any Netlink calls, because
203 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
204 * cached the configuration.
205 *
206 * The contents of 'details' should be documented as valid for 'ovs_name'
207 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
208 * (which is built as ovs-vswitchd.conf.db(8)).
209 *
210 * This function may be null if 'tc' is not configurable.
211 */
212 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
213
214 /* Reconfigures 'netdev->tc' according to 'details', performing any
215 * required Netlink calls to complete the reconfiguration.
216 *
217 * The contents of 'details' should be documented as valid for 'ovs_name'
218 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
219 * (which is built as ovs-vswitchd.conf.db(8)).
220 *
221 * This function may be null if 'tc' is not configurable.
222 */
223 int (*qdisc_set)(struct netdev *, const struct shash *details);
224
225 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
226 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
227 *
228 * The contents of 'details' should be documented as valid for 'ovs_name'
229 * in the "other_config" column in the "Queue" table in
230 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
231 *
232 * The implementation should not need to perform any Netlink calls, because
233 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
234 * cached the queue configuration.
235 *
236 * This function may be null if 'tc' does not have queues ('n_queues' is
237 * 0). */
238 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
239 struct shash *details);
240
241 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
242 * 'details', performing any required Netlink calls to complete the
243 * reconfiguration. The caller ensures that 'queue_id' is less than
244 * 'n_queues'.
245 *
246 * The contents of 'details' should be documented as valid for 'ovs_name'
247 * in the "other_config" column in the "Queue" table in
248 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
249 *
250 * This function may be null if 'tc' does not have queues or its queues are
251 * not configurable. */
252 int (*class_set)(struct netdev *, unsigned int queue_id,
253 const struct shash *details);
254
255 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
256 * tc_queue's within 'netdev->tc->queues'.
257 *
258 * This function may be null if 'tc' does not have queues or its queues
259 * cannot be deleted. */
260 int (*class_delete)(struct netdev *, struct tc_queue *queue);
261
262 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
263 * 'struct tc_queue's within 'netdev->tc->queues'.
264 *
265 * On success, initializes '*stats'.
266 *
267 * This function may be null if 'tc' does not have queues or if it cannot
268 * report queue statistics. */
269 int (*class_get_stats)(const struct netdev *netdev,
270 const struct tc_queue *queue,
271 struct netdev_queue_stats *stats);
272
273 /* Extracts queue stats from 'nlmsg', which is a response to a
274 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
275 *
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
278 int (*class_dump_stats)(const struct netdev *netdev,
279 const struct ofpbuf *nlmsg,
280 netdev_dump_queue_stats_cb *cb, void *aux);
281 };
282
283 static void
284 tc_init(struct tc *tc, const struct tc_ops *ops)
285 {
286 tc->ops = ops;
287 hmap_init(&tc->queues);
288 }
289
290 static void
291 tc_destroy(struct tc *tc)
292 {
293 hmap_destroy(&tc->queues);
294 }
295
296 static const struct tc_ops tc_ops_htb;
297 static const struct tc_ops tc_ops_hfsc;
298 static const struct tc_ops tc_ops_default;
299 static const struct tc_ops tc_ops_other;
300
301 static const struct tc_ops *tcs[] = {
302 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
303 &tc_ops_hfsc, /* Hierarchical fair service curve. */
304 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
305 &tc_ops_other, /* Some other qdisc. */
306 NULL
307 };
308
309 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
310 static unsigned int tc_get_major(unsigned int handle);
311 static unsigned int tc_get_minor(unsigned int handle);
312
313 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
314 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
315 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
316
317 static struct tcmsg *tc_make_request(const struct netdev *, int type,
318 unsigned int flags, struct ofpbuf *);
319 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
320
321 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
322 struct nlattr **options);
323 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
324 struct nlattr **options,
325 struct netdev_queue_stats *);
326 static int tc_query_class(const struct netdev *,
327 unsigned int handle, unsigned int parent,
328 struct ofpbuf **replyp);
329 static int tc_delete_class(const struct netdev *, unsigned int handle);
330
331 static int tc_del_qdisc(struct netdev *netdev);
332 static int tc_query_qdisc(const struct netdev *netdev);
333
334 static int tc_calc_cell_log(unsigned int mtu);
335 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
336 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
337 const struct tc_ratespec *rate);
338 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
339 \f
340 struct netdev_dev_linux {
341 struct netdev_dev netdev_dev;
342
343 struct shash_node *shash_node;
344 unsigned int cache_valid;
345
346 /* The following are figured out "on demand" only. They are only valid
347 * when the corresponding VALID_* bit in 'cache_valid' is set. */
348 int ifindex;
349 uint8_t etheraddr[ETH_ADDR_LEN];
350 struct in_addr address, netmask;
351 struct in6_addr in6;
352 int mtu;
353 int carrier;
354 bool is_internal; /* Is this an openvswitch internal device? */
355 bool is_tap; /* Is this a tuntap device? */
356 uint32_t kbits_rate; /* Policing data. */
357 uint32_t kbits_burst;
358 bool have_vport_stats;
359 struct tc *tc;
360
361 union {
362 struct tap_state tap;
363 } state;
364 };
365
/* One open handle on a Linux network device. */
struct netdev_linux {
    struct netdev netdev;       /* Generic netdev; must stay embedded so that
                                 * netdev_linux_cast() can recover this
                                 * structure from it. */
    int fd;                     /* -1 if no descriptor is attached, otherwise
                                 * the shared tap fd or a packet socket. */
};
370
371 /* An AF_INET socket (used for ioctl operations). */
372 static int af_inet_sock = -1;
373
374 /* A Netlink routing socket that is not subscribed to any multicast groups. */
375 static struct nl_sock *rtnl_sock;
376
377 struct netdev_linux_notifier {
378 struct netdev_notifier notifier;
379 struct list node;
380 };
381
382 static struct shash netdev_linux_notifiers =
383 SHASH_INITIALIZER(&netdev_linux_notifiers);
384 static struct rtnetlink_notifier netdev_linux_poll_notifier;
385
386 /* This is set pretty low because we probably won't learn anything from the
387 * additional log messages. */
388 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
389
390 static int netdev_linux_init(void);
391
392 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
393 int cmd, const char *cmd_name);
394 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
395 const char *cmd_name);
396 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
397 int cmd, const char *cmd_name);
398 static int get_flags(const struct netdev *, int *flagsp);
399 static int set_flags(struct netdev *, int flags);
400 static int do_get_ifindex(const char *netdev_name);
401 static int get_ifindex(const struct netdev *, int *ifindexp);
402 static int do_set_addr(struct netdev *netdev,
403 int ioctl_nr, const char *ioctl_name,
404 struct in_addr addr);
405 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
406 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
407 const uint8_t[ETH_ADDR_LEN]);
408 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
409 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
410
411 static bool
412 is_netdev_linux_class(const struct netdev_class *netdev_class)
413 {
414 return netdev_class->init == netdev_linux_init;
415 }
416
417 static struct netdev_dev_linux *
418 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
419 {
420 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
421 assert(is_netdev_linux_class(netdev_class));
422
423 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
424 }
425
426 static struct netdev_linux *
427 netdev_linux_cast(const struct netdev *netdev)
428 {
429 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
430 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
431 assert(is_netdev_linux_class(netdev_class));
432
433 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
434 }
435 \f
436 static int
437 netdev_linux_init(void)
438 {
439 static int status = -1;
440 if (status < 0) {
441 /* Create AF_INET socket. */
442 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
443 status = af_inet_sock >= 0 ? 0 : errno;
444 if (status) {
445 VLOG_ERR("failed to create inet socket: %s", strerror(status));
446 }
447
448 /* Create rtnetlink socket. */
449 if (!status) {
450 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
451 if (status) {
452 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
453 strerror(status));
454 }
455 }
456 }
457 return status;
458 }
459
/* Periodic processing for this provider: lets the rtnetlink link notifier do
 * its work. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_notifier_run();
}
465
/* Arranges for the poll loop to wake up when the rtnetlink link notifier has
 * work for netdev_linux_run() to do. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_notifier_wait();
}
471
472 static void
473 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
474 void *aux OVS_UNUSED)
475 {
476 struct netdev_dev_linux *dev;
477 if (change) {
478 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
479 if (base_dev) {
480 const struct netdev_class *netdev_class =
481 netdev_dev_get_class(base_dev);
482
483 if (is_netdev_linux_class(netdev_class)) {
484 dev = netdev_dev_linux_cast(base_dev);
485 dev->cache_valid = 0;
486 }
487 }
488 } else {
489 struct shash device_shash;
490 struct shash_node *node;
491
492 shash_init(&device_shash);
493 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
494 SHASH_FOR_EACH (node, &device_shash) {
495 dev = node->data;
496 dev->cache_valid = 0;
497 }
498 shash_destroy(&device_shash);
499 }
500 }
501
502 /* Creates system and internal devices. */
503 static int
504 netdev_linux_create(const struct netdev_class *class,
505 const char *name, const struct shash *args,
506 struct netdev_dev **netdev_devp)
507 {
508 struct netdev_dev_linux *netdev_dev;
509 int error;
510
511 if (!shash_is_empty(args)) {
512 VLOG_WARN("%s: arguments for %s devices should be empty",
513 name, class->type);
514 }
515
516 if (!cache_notifier_refcount) {
517 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
518 netdev_linux_cache_cb, NULL);
519 if (error) {
520 return error;
521 }
522 }
523 cache_notifier_refcount++;
524
525 netdev_dev = xzalloc(sizeof *netdev_dev);
526 netdev_dev_init(&netdev_dev->netdev_dev, name, args, class);
527
528 *netdev_devp = &netdev_dev->netdev_dev;
529 return 0;
530 }
531
532 /* For most types of netdevs we open the device for each call of
533 * netdev_open(). However, this is not the case with tap devices,
534 * since it is only possible to open the device once. In this
535 * situation we share a single file descriptor, and consequently
536 * buffers, across all readers. Therefore once data is read it will
537 * be unavailable to other reads for tap devices. */
538 static int
539 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
540 const char *name, const struct shash *args,
541 struct netdev_dev **netdev_devp)
542 {
543 struct netdev_dev_linux *netdev_dev;
544 struct tap_state *state;
545 static const char tap_dev[] = "/dev/net/tun";
546 struct ifreq ifr;
547 int error;
548
549 if (!shash_is_empty(args)) {
550 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
551 }
552
553 netdev_dev = xzalloc(sizeof *netdev_dev);
554 state = &netdev_dev->state.tap;
555
556 /* Open tap device. */
557 state->fd = open(tap_dev, O_RDWR);
558 if (state->fd < 0) {
559 error = errno;
560 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
561 goto error;
562 }
563
564 /* Create tap device. */
565 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
566 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
567 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
568 VLOG_WARN("%s: creating tap device failed: %s", name,
569 strerror(errno));
570 error = errno;
571 goto error;
572 }
573
574 /* Make non-blocking. */
575 error = set_nonblocking(state->fd);
576 if (error) {
577 goto error;
578 }
579
580 netdev_dev_init(&netdev_dev->netdev_dev, name, args, &netdev_tap_class);
581 *netdev_devp = &netdev_dev->netdev_dev;
582 return 0;
583
584 error:
585 free(netdev_dev);
586 return error;
587 }
588
589 static void
590 destroy_tap(struct netdev_dev_linux *netdev_dev)
591 {
592 struct tap_state *state = &netdev_dev->state.tap;
593
594 if (state->fd >= 0) {
595 close(state->fd);
596 }
597 }
598
599 /* Destroys the netdev device 'netdev_dev_'. */
600 static void
601 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
602 {
603 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
604 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
605
606 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
607 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
608 }
609
610 if (class == &netdev_linux_class || class == &netdev_internal_class) {
611 cache_notifier_refcount--;
612
613 if (!cache_notifier_refcount) {
614 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
615 }
616 } else if (class == &netdev_tap_class) {
617 destroy_tap(netdev_dev);
618 } else {
619 NOT_REACHED();
620 }
621
622 free(netdev_dev);
623 }
624
625 static int
626 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
627 struct netdev **netdevp)
628 {
629 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
630 struct netdev_linux *netdev;
631 enum netdev_flags flags;
632 int error;
633
634 /* Allocate network device. */
635 netdev = xzalloc(sizeof *netdev);
636 netdev->fd = -1;
637 netdev_init(&netdev->netdev, netdev_dev_);
638
639 /* Verify that the device really exists, by attempting to read its flags.
640 * (The flags might be cached, in which case this won't actually do an
641 * ioctl.)
642 *
643 * Don't do this for "internal" netdevs, though, because those have to be
644 * created as netdev objects before they exist in the kernel, because
645 * creating them in the kernel happens by passing a netdev object to
646 * dpif_port_add(). */
647 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
648 error = netdev_get_flags(&netdev->netdev, &flags);
649 if (error == ENODEV) {
650 goto error;
651 }
652 }
653
654 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
655 !netdev_dev->state.tap.opened) {
656
657 /* We assume that the first user of the tap device is the primary user
658 * and give them the tap FD. Subsequent users probably just expect
659 * this to be a system device so open it normally to avoid send/receive
660 * directions appearing to be reversed. */
661 netdev->fd = netdev_dev->state.tap.fd;
662 netdev_dev->state.tap.opened = true;
663 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
664 struct sockaddr_ll sll;
665 int protocol;
666 int ifindex;
667
668 /* Create file descriptor. */
669 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
670 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
671 : ethertype);
672 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
673 if (netdev->fd < 0) {
674 error = errno;
675 goto error;
676 }
677
678 /* Set non-blocking mode. */
679 error = set_nonblocking(netdev->fd);
680 if (error) {
681 goto error;
682 }
683
684 /* Get ethernet device index. */
685 error = get_ifindex(&netdev->netdev, &ifindex);
686 if (error) {
687 goto error;
688 }
689
690 /* Bind to specific ethernet device. */
691 memset(&sll, 0, sizeof sll);
692 sll.sll_family = AF_PACKET;
693 sll.sll_ifindex = ifindex;
694 if (bind(netdev->fd,
695 (struct sockaddr *) &sll, sizeof sll) < 0) {
696 error = errno;
697 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
698 strerror(error));
699 goto error;
700 }
701
702 /* Between the socket() and bind() calls above, the socket receives all
703 * packets of the requested type on all system interfaces. We do not
704 * want to receive that data, but there is no way to avoid it. So we
705 * must now drain out the receive queue. */
706 error = drain_rcvbuf(netdev->fd);
707 if (error) {
708 goto error;
709 }
710 }
711
712 *netdevp = &netdev->netdev;
713 return 0;
714
715 error:
716 netdev_uninit(&netdev->netdev, true);
717 return error;
718 }
719
720 /* Closes and destroys 'netdev'. */
721 static void
722 netdev_linux_close(struct netdev *netdev_)
723 {
724 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
725
726 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
727 close(netdev->fd);
728 }
729 free(netdev);
730 }
731
/* Adds to 'svec' the names of all known network devices, as reported by
 * if_nameindex().
 *
 * Returns 0 on success.  If the list cannot be obtained, logs a warning and
 * returns the positive errno value left by if_nameindex(). */
static int
netdev_linux_enumerate(struct svec *svec)
{
    struct if_nameindex *names;
    size_t i;

    names = if_nameindex();
    if (!names) {
        /* Save errno before logging, because logging may change it (possibly
         * to 0, which would make this failure look like success). */
        int error = errno;

        VLOG_WARN("could not obtain list of network device names: %s",
                  strerror(error));
        return error;
    }

    for (i = 0; names[i].if_name != NULL; i++) {
        svec_add(svec, names[i].if_name);
    }
    if_freenameindex(names);
    return 0;
}
753
754 static int
755 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
756 {
757 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
758
759 if (netdev->fd < 0) {
760 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
761 return -EAGAIN;
762 }
763
764 for (;;) {
765 ssize_t retval = read(netdev->fd, data, size);
766 if (retval >= 0) {
767 return retval;
768 } else if (errno != EINTR) {
769 if (errno != EAGAIN) {
770 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
771 strerror(errno), netdev_get_name(netdev_));
772 }
773 return -errno;
774 }
775 }
776 }
777
778 /* Registers with the poll loop to wake up from the next call to poll_block()
779 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
780 static void
781 netdev_linux_recv_wait(struct netdev *netdev_)
782 {
783 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
784 if (netdev->fd >= 0) {
785 poll_fd_wait(netdev->fd, POLLIN);
786 }
787 }
788
789 /* Discards all packets waiting to be received from 'netdev'. */
790 static int
791 netdev_linux_drain(struct netdev *netdev_)
792 {
793 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
794 if (netdev->fd < 0) {
795 return 0;
796 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
797 struct ifreq ifr;
798 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
799 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
800 if (error) {
801 return error;
802 }
803 drain_fd(netdev->fd, ifr.ifr_qlen);
804 return 0;
805 } else {
806 return drain_rcvbuf(netdev->fd);
807 }
808 }
809
810 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
811 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
812 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
813 * the packet is too big or too small to transmit on the device.
814 *
815 * The caller retains ownership of 'buffer' in all cases.
816 *
817 * The kernel maintains a packet transmission queue, so the caller is not
818 * expected to do additional queuing of packets. */
819 static int
820 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
821 {
822 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
823
824 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
825 */
826 if (netdev->fd < 0) {
827 return EPIPE;
828 }
829
830 for (;;) {
831 ssize_t retval = write(netdev->fd, data, size);
832 if (retval < 0) {
833 /* The Linux AF_PACKET implementation never blocks waiting for room
834 * for packets, instead returning ENOBUFS. Translate this into
835 * EAGAIN for the caller. */
836 if (errno == ENOBUFS) {
837 return EAGAIN;
838 } else if (errno == EINTR) {
839 continue;
840 } else if (errno != EAGAIN) {
841 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
842 netdev_get_name(netdev_), strerror(errno));
843 }
844 return errno;
845 } else if (retval != size) {
846 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
847 "%zu) on %s", retval, size, netdev_get_name(netdev_));
848 return EMSGSIZE;
849 } else {
850 return 0;
851 }
852 }
853 }
854
855 /* Registers with the poll loop to wake up from the next call to poll_block()
856 * when the packet transmission queue has sufficient room to transmit a packet
857 * with netdev_send().
858 *
859 * The kernel maintains a packet transmission queue, so the client is not
860 * expected to do additional queuing of packets. Thus, this function is
861 * unlikely to ever be used. It is included for completeness. */
862 static void
863 netdev_linux_send_wait(struct netdev *netdev_)
864 {
865 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
866 if (netdev->fd < 0) {
867 /* Nothing to do. */
868 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
869 poll_fd_wait(netdev->fd, POLLOUT);
870 } else {
871 /* TAP device always accepts packets.*/
872 poll_immediate_wake();
873 }
874 }
875
876 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
877 * otherwise a positive errno value. */
878 static int
879 netdev_linux_set_etheraddr(struct netdev *netdev_,
880 const uint8_t mac[ETH_ADDR_LEN])
881 {
882 struct netdev_dev_linux *netdev_dev =
883 netdev_dev_linux_cast(netdev_get_dev(netdev_));
884 int error;
885
886 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
887 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
888 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
889 if (!error) {
890 netdev_dev->cache_valid |= VALID_ETHERADDR;
891 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
892 }
893 } else {
894 error = 0;
895 }
896 return error;
897 }
898
899 /* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
900 * free the returned buffer. */
901 static int
902 netdev_linux_get_etheraddr(const struct netdev *netdev_,
903 uint8_t mac[ETH_ADDR_LEN])
904 {
905 struct netdev_dev_linux *netdev_dev =
906 netdev_dev_linux_cast(netdev_get_dev(netdev_));
907 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
908 int error = get_etheraddr(netdev_get_name(netdev_),
909 netdev_dev->etheraddr);
910 if (error) {
911 return error;
912 }
913 netdev_dev->cache_valid |= VALID_ETHERADDR;
914 }
915 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
916 return 0;
917 }
918
919 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
920 * in bytes, not including the hardware header; thus, this is typically 1500
921 * bytes for Ethernet devices. */
922 static int
923 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
924 {
925 struct netdev_dev_linux *netdev_dev =
926 netdev_dev_linux_cast(netdev_get_dev(netdev_));
927 if (!(netdev_dev->cache_valid & VALID_MTU)) {
928 struct ifreq ifr;
929 int error;
930
931 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
932 SIOCGIFMTU, "SIOCGIFMTU");
933 if (error) {
934 return error;
935 }
936 netdev_dev->mtu = ifr.ifr_mtu;
937 netdev_dev->cache_valid |= VALID_MTU;
938 }
939 *mtup = netdev_dev->mtu;
940 return 0;
941 }
942
/* Returns the ifindex of 'netdev', if successful, as a positive number.
 * On failure, returns a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
953
954 static int
955 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
956 {
957 struct netdev_dev_linux *netdev_dev =
958 netdev_dev_linux_cast(netdev_get_dev(netdev_));
959 int error = 0;
960 char *fn = NULL;
961 int fd = -1;
962
963 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
964 char line[8];
965 int retval;
966
967 fn = xasprintf("/sys/class/net/%s/carrier",
968 netdev_get_name(netdev_));
969 fd = open(fn, O_RDONLY);
970 if (fd < 0) {
971 error = errno;
972 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
973 goto exit;
974 }
975
976 retval = read(fd, line, sizeof line);
977 if (retval < 0) {
978 error = errno;
979 if (error == EINVAL) {
980 /* This is the normal return value when we try to check carrier
981 * if the network device is not up. */
982 } else {
983 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
984 }
985 goto exit;
986 } else if (retval == 0) {
987 error = EPROTO;
988 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
989 goto exit;
990 }
991
992 if (line[0] != '0' && line[0] != '1') {
993 error = EPROTO;
994 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
995 fn, line[0]);
996 goto exit;
997 }
998 netdev_dev->carrier = line[0] != '0';
999 netdev_dev->cache_valid |= VALID_CARRIER;
1000 }
1001 *carrier = netdev_dev->carrier;
1002 error = 0;
1003
1004 exit:
1005 if (fd >= 0) {
1006 close(fd);
1007 }
1008 free(fn);
1009 return error;
1010 }
1011
1012 static int
1013 netdev_linux_do_miimon(const struct netdev *netdev, int cmd,
1014 const char *cmd_name, struct mii_ioctl_data *data)
1015 {
1016 struct ifreq ifr;
1017 int error;
1018
1019 memset(&ifr, 0, sizeof ifr);
1020 memcpy(&ifr.ifr_data, data, sizeof *data);
1021 error = netdev_linux_do_ioctl(netdev_get_name(netdev),
1022 &ifr, cmd, cmd_name);
1023 memcpy(data, &ifr.ifr_data, sizeof *data);
1024
1025 return error;
1026 }
1027
1028 static int
1029 netdev_linux_get_miimon(const struct netdev *netdev, bool *miimon)
1030 {
1031 const char *name = netdev_get_name(netdev);
1032 struct mii_ioctl_data data;
1033 int error;
1034
1035 *miimon = false;
1036
1037 memset(&data, 0, sizeof data);
1038 error = netdev_linux_do_miimon(netdev, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1039 if (!error) {
1040 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1041 data.reg_num = MII_BMSR;
1042 error = netdev_linux_do_miimon(netdev, SIOCGMIIREG, "SIOCGMIIREG",
1043 &data);
1044
1045 if (!error) {
1046 *miimon = !!(data.val_out & BMSR_LSTATUS);
1047 } else {
1048 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1049 }
1050 } else {
1051 struct ethtool_cmd ecmd;
1052
1053 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1054 name);
1055
1056 memset(&ecmd, 0, sizeof ecmd);
1057 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1058 "ETHTOOL_GLINK");
1059 if (!error) {
1060 struct ethtool_value eval;
1061
1062 memcpy(&eval, &ecmd, sizeof eval);
1063 *miimon = !!eval.data;
1064 } else {
1065 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1066 }
1067 }
1068
1069 return error;
1070 }
1071
1072 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1073 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1074 * enabled. */
1075 static bool
1076 check_for_working_netlink_stats(void)
1077 {
1078 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1079 * preferable, so if that works, we'll use it. */
1080 int ifindex = do_get_ifindex("lo");
1081 if (ifindex < 0) {
1082 VLOG_WARN("failed to get ifindex for lo, "
1083 "obtaining netdev stats from proc");
1084 return false;
1085 } else {
1086 struct netdev_stats stats;
1087 int error = get_stats_via_netlink(ifindex, &stats);
1088 if (!error) {
1089 VLOG_DBG("obtaining netdev stats via rtnetlink");
1090 return true;
1091 } else {
1092 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1093 "via proc (you are probably running a pre-2.6.19 "
1094 "kernel)", strerror(error));
1095 return false;
1096 }
1097 }
1098 }
1099
1100 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1101 static void
1102 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1103 {
1104 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1105 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1106 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1107
1108 netdev_dev->is_tap = !strcmp(type, "tap");
1109 netdev_dev->is_internal = (!netdev_dev->is_tap
1110 && dpif_linux_is_internal_device(name));
1111 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1112 }
1113 }
1114
/* Exchanges the values of '*a' and '*b'.
 *
 * A temporary is used rather than the XOR trick so that the function is also
 * correct when 'a' and 'b' point to the same object (an XOR swap would zero
 * it). */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;

    *a = *b;
    *b = tmp;
}
1122
1123 /* Retrieves current device stats for 'netdev'. */
1124 static int
1125 netdev_linux_get_stats(const struct netdev *netdev_,
1126 struct netdev_stats *stats)
1127 {
1128 struct netdev_dev_linux *netdev_dev =
1129 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1130 static int use_netlink_stats = -1;
1131 int error;
1132
1133 if (netdev_dev->have_vport_stats ||
1134 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1135
1136 error = netdev_vport_get_stats(netdev_, stats);
1137 netdev_dev->have_vport_stats = !error;
1138 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1139 }
1140
1141 if (!netdev_dev->have_vport_stats) {
1142 if (use_netlink_stats < 0) {
1143 use_netlink_stats = check_for_working_netlink_stats();
1144 }
1145 if (use_netlink_stats) {
1146 int ifindex;
1147
1148 error = get_ifindex(netdev_, &ifindex);
1149 if (!error) {
1150 error = get_stats_via_netlink(ifindex, stats);
1151 }
1152 } else {
1153 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1154 }
1155 }
1156
1157 /* If this port is an internal port then the transmit and receive stats
1158 * will appear to be swapped relative to the other ports since we are the
1159 * one sending the data, not a remote computer. For consistency, we swap
1160 * them back here. This does not apply if we are getting stats from the
1161 * vport layer because it always tracks stats from the perspective of the
1162 * switch. */
1163 netdev_linux_update_is_pseudo(netdev_dev);
1164 if (!error && !netdev_dev->have_vport_stats &&
1165 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1166 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1167 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1168 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1169 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1170 stats->rx_length_errors = 0;
1171 stats->rx_over_errors = 0;
1172 stats->rx_crc_errors = 0;
1173 stats->rx_frame_errors = 0;
1174 stats->rx_fifo_errors = 0;
1175 stats->rx_missed_errors = 0;
1176 stats->tx_aborted_errors = 0;
1177 stats->tx_carrier_errors = 0;
1178 stats->tx_fifo_errors = 0;
1179 stats->tx_heartbeat_errors = 0;
1180 stats->tx_window_errors = 0;
1181 }
1182
1183 return error;
1184 }
1185
1186 /* Stores the features supported by 'netdev' into each of '*current',
1187 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1188 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1189 * successful, otherwise a positive errno value. */
1190 static int
1191 netdev_linux_get_features(const struct netdev *netdev,
1192 uint32_t *current, uint32_t *advertised,
1193 uint32_t *supported, uint32_t *peer)
1194 {
1195 struct ethtool_cmd ecmd;
1196 int error;
1197
1198 memset(&ecmd, 0, sizeof ecmd);
1199 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1200 ETHTOOL_GSET, "ETHTOOL_GSET");
1201 if (error) {
1202 return error;
1203 }
1204
1205 /* Supported features. */
1206 *supported = 0;
1207 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1208 *supported |= OFPPF_10MB_HD;
1209 }
1210 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1211 *supported |= OFPPF_10MB_FD;
1212 }
1213 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1214 *supported |= OFPPF_100MB_HD;
1215 }
1216 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1217 *supported |= OFPPF_100MB_FD;
1218 }
1219 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1220 *supported |= OFPPF_1GB_HD;
1221 }
1222 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1223 *supported |= OFPPF_1GB_FD;
1224 }
1225 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1226 *supported |= OFPPF_10GB_FD;
1227 }
1228 if (ecmd.supported & SUPPORTED_TP) {
1229 *supported |= OFPPF_COPPER;
1230 }
1231 if (ecmd.supported & SUPPORTED_FIBRE) {
1232 *supported |= OFPPF_FIBER;
1233 }
1234 if (ecmd.supported & SUPPORTED_Autoneg) {
1235 *supported |= OFPPF_AUTONEG;
1236 }
1237 if (ecmd.supported & SUPPORTED_Pause) {
1238 *supported |= OFPPF_PAUSE;
1239 }
1240 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1241 *supported |= OFPPF_PAUSE_ASYM;
1242 }
1243
1244 /* Advertised features. */
1245 *advertised = 0;
1246 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1247 *advertised |= OFPPF_10MB_HD;
1248 }
1249 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1250 *advertised |= OFPPF_10MB_FD;
1251 }
1252 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1253 *advertised |= OFPPF_100MB_HD;
1254 }
1255 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1256 *advertised |= OFPPF_100MB_FD;
1257 }
1258 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1259 *advertised |= OFPPF_1GB_HD;
1260 }
1261 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1262 *advertised |= OFPPF_1GB_FD;
1263 }
1264 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1265 *advertised |= OFPPF_10GB_FD;
1266 }
1267 if (ecmd.advertising & ADVERTISED_TP) {
1268 *advertised |= OFPPF_COPPER;
1269 }
1270 if (ecmd.advertising & ADVERTISED_FIBRE) {
1271 *advertised |= OFPPF_FIBER;
1272 }
1273 if (ecmd.advertising & ADVERTISED_Autoneg) {
1274 *advertised |= OFPPF_AUTONEG;
1275 }
1276 if (ecmd.advertising & ADVERTISED_Pause) {
1277 *advertised |= OFPPF_PAUSE;
1278 }
1279 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1280 *advertised |= OFPPF_PAUSE_ASYM;
1281 }
1282
1283 /* Current settings. */
1284 if (ecmd.speed == SPEED_10) {
1285 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1286 } else if (ecmd.speed == SPEED_100) {
1287 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1288 } else if (ecmd.speed == SPEED_1000) {
1289 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1290 } else if (ecmd.speed == SPEED_10000) {
1291 *current = OFPPF_10GB_FD;
1292 } else {
1293 *current = 0;
1294 }
1295
1296 if (ecmd.port == PORT_TP) {
1297 *current |= OFPPF_COPPER;
1298 } else if (ecmd.port == PORT_FIBRE) {
1299 *current |= OFPPF_FIBER;
1300 }
1301
1302 if (ecmd.autoneg) {
1303 *current |= OFPPF_AUTONEG;
1304 }
1305
1306 /* Peer advertisements. */
1307 *peer = 0; /* XXX */
1308
1309 return 0;
1310 }
1311
1312 /* Set the features advertised by 'netdev' to 'advertise'. */
1313 static int
1314 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1315 {
1316 struct ethtool_cmd ecmd;
1317 int error;
1318
1319 memset(&ecmd, 0, sizeof ecmd);
1320 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1321 ETHTOOL_GSET, "ETHTOOL_GSET");
1322 if (error) {
1323 return error;
1324 }
1325
1326 ecmd.advertising = 0;
1327 if (advertise & OFPPF_10MB_HD) {
1328 ecmd.advertising |= ADVERTISED_10baseT_Half;
1329 }
1330 if (advertise & OFPPF_10MB_FD) {
1331 ecmd.advertising |= ADVERTISED_10baseT_Full;
1332 }
1333 if (advertise & OFPPF_100MB_HD) {
1334 ecmd.advertising |= ADVERTISED_100baseT_Half;
1335 }
1336 if (advertise & OFPPF_100MB_FD) {
1337 ecmd.advertising |= ADVERTISED_100baseT_Full;
1338 }
1339 if (advertise & OFPPF_1GB_HD) {
1340 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1341 }
1342 if (advertise & OFPPF_1GB_FD) {
1343 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1344 }
1345 if (advertise & OFPPF_10GB_FD) {
1346 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1347 }
1348 if (advertise & OFPPF_COPPER) {
1349 ecmd.advertising |= ADVERTISED_TP;
1350 }
1351 if (advertise & OFPPF_FIBER) {
1352 ecmd.advertising |= ADVERTISED_FIBRE;
1353 }
1354 if (advertise & OFPPF_AUTONEG) {
1355 ecmd.advertising |= ADVERTISED_Autoneg;
1356 }
1357 if (advertise & OFPPF_PAUSE) {
1358 ecmd.advertising |= ADVERTISED_Pause;
1359 }
1360 if (advertise & OFPPF_PAUSE_ASYM) {
1361 ecmd.advertising |= ADVERTISED_Asym_Pause;
1362 }
1363 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1364 ETHTOOL_SSET, "ETHTOOL_SSET");
1365 }
1366
1367 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1368 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1369 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1370 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1371 * sets '*vlan_vid' to -1. */
1372 static int
1373 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1374 {
1375 const char *netdev_name = netdev_get_name(netdev);
1376 struct ds line = DS_EMPTY_INITIALIZER;
1377 FILE *stream = NULL;
1378 int error;
1379 char *fn;
1380
1381 COVERAGE_INC(netdev_get_vlan_vid);
1382 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1383 stream = fopen(fn, "r");
1384 if (!stream) {
1385 error = errno;
1386 goto done;
1387 }
1388
1389 if (ds_get_line(&line, stream)) {
1390 if (ferror(stream)) {
1391 error = errno;
1392 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1393 } else {
1394 error = EPROTO;
1395 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1396 }
1397 goto done;
1398 }
1399
1400 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1401 error = EPROTO;
1402 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1403 fn, ds_cstr(&line));
1404 goto done;
1405 }
1406
1407 error = 0;
1408
1409 done:
1410 free(fn);
1411 if (stream) {
1412 fclose(stream);
1413 }
1414 ds_destroy(&line);
1415 if (error) {
1416 *vlan_vid = -1;
1417 }
1418 return error;
1419 }
1420
/* Shell command templates for installing ingress policing with tc(8).
 * POLICE_ADD_CMD takes the device name; POLICE_CONFIG_CMD takes the device
 * name, then the rate and the burst, both formatted with %d. */
#define POLICE_ADD_CMD \
    "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD                                           \
    "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 "  \
    "u32 match ip src 0.0.0.0/0 "                                   \
    "police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1423
1424 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1425 * positive errno value.
1426 *
1427 * This function is equivalent to running
1428 * /sbin/tc qdisc del dev %s handle ffff: ingress
1429 * but it is much, much faster.
1430 */
1431 static int
1432 netdev_linux_remove_policing(struct netdev *netdev)
1433 {
1434 struct netdev_dev_linux *netdev_dev =
1435 netdev_dev_linux_cast(netdev_get_dev(netdev));
1436 const char *netdev_name = netdev_get_name(netdev);
1437
1438 struct ofpbuf request;
1439 struct tcmsg *tcmsg;
1440 int error;
1441
1442 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1443 if (!tcmsg) {
1444 return ENODEV;
1445 }
1446 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1447 tcmsg->tcm_parent = TC_H_INGRESS;
1448 nl_msg_put_string(&request, TCA_KIND, "ingress");
1449 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1450
1451 error = tc_transact(&request, NULL);
1452 if (error && error != ENOENT && error != EINVAL) {
1453 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1454 netdev_name, strerror(error));
1455 return error;
1456 }
1457
1458 netdev_dev->kbits_rate = 0;
1459 netdev_dev->kbits_burst = 0;
1460 netdev_dev->cache_valid |= VALID_POLICING;
1461 return 0;
1462 }
1463
1464 /* Attempts to set input rate limiting (policing) policy. */
1465 static int
1466 netdev_linux_set_policing(struct netdev *netdev,
1467 uint32_t kbits_rate, uint32_t kbits_burst)
1468 {
1469 struct netdev_dev_linux *netdev_dev =
1470 netdev_dev_linux_cast(netdev_get_dev(netdev));
1471 const char *netdev_name = netdev_get_name(netdev);
1472 char command[1024];
1473
1474 COVERAGE_INC(netdev_set_policing);
1475
1476 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1477 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1478 : kbits_burst); /* Stick with user-specified value. */
1479
1480 if (netdev_dev->cache_valid & VALID_POLICING
1481 && netdev_dev->kbits_rate == kbits_rate
1482 && netdev_dev->kbits_burst == kbits_burst) {
1483 /* Assume that settings haven't changed since we last set them. */
1484 return 0;
1485 }
1486
1487 netdev_linux_remove_policing(netdev);
1488 if (kbits_rate) {
1489 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1490 if (system(command) != 0) {
1491 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1492 return -1;
1493 }
1494
1495 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1496 kbits_rate, kbits_burst);
1497 if (system(command) != 0) {
1498 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1499 netdev_name);
1500 return -1;
1501 }
1502
1503 netdev_dev->kbits_rate = kbits_rate;
1504 netdev_dev->kbits_burst = kbits_burst;
1505 netdev_dev->cache_valid |= VALID_POLICING;
1506 }
1507
1508 return 0;
1509 }
1510
1511 static int
1512 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1513 struct svec *types)
1514 {
1515 const struct tc_ops **opsp;
1516
1517 for (opsp = tcs; *opsp != NULL; opsp++) {
1518 const struct tc_ops *ops = *opsp;
1519 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1520 svec_add(types, ops->ovs_name);
1521 }
1522 }
1523 return 0;
1524 }
1525
1526 static const struct tc_ops *
1527 tc_lookup_ovs_name(const char *name)
1528 {
1529 const struct tc_ops **opsp;
1530
1531 for (opsp = tcs; *opsp != NULL; opsp++) {
1532 const struct tc_ops *ops = *opsp;
1533 if (!strcmp(name, ops->ovs_name)) {
1534 return ops;
1535 }
1536 }
1537 return NULL;
1538 }
1539
1540 static const struct tc_ops *
1541 tc_lookup_linux_name(const char *name)
1542 {
1543 const struct tc_ops **opsp;
1544
1545 for (opsp = tcs; *opsp != NULL; opsp++) {
1546 const struct tc_ops *ops = *opsp;
1547 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1548 return ops;
1549 }
1550 }
1551 return NULL;
1552 }
1553
1554 static struct tc_queue *
1555 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1556 size_t hash)
1557 {
1558 struct netdev_dev_linux *netdev_dev =
1559 netdev_dev_linux_cast(netdev_get_dev(netdev));
1560 struct tc_queue *queue;
1561
1562 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1563 if (queue->queue_id == queue_id) {
1564 return queue;
1565 }
1566 }
1567 return NULL;
1568 }
1569
/* Returns the queue of 'netdev' whose ID is 'queue_id', or NULL if there is
 * none.  The queue map is keyed by hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1575
1576 static int
1577 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1578 const char *type,
1579 struct netdev_qos_capabilities *caps)
1580 {
1581 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1582 if (!ops) {
1583 return EOPNOTSUPP;
1584 }
1585 caps->n_queues = ops->n_queues;
1586 return 0;
1587 }
1588
1589 static int
1590 netdev_linux_get_qos(const struct netdev *netdev,
1591 const char **typep, struct shash *details)
1592 {
1593 struct netdev_dev_linux *netdev_dev =
1594 netdev_dev_linux_cast(netdev_get_dev(netdev));
1595 int error;
1596
1597 error = tc_query_qdisc(netdev);
1598 if (error) {
1599 return error;
1600 }
1601
1602 *typep = netdev_dev->tc->ops->ovs_name;
1603 return (netdev_dev->tc->ops->qdisc_get
1604 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1605 : 0);
1606 }
1607
1608 static int
1609 netdev_linux_set_qos(struct netdev *netdev,
1610 const char *type, const struct shash *details)
1611 {
1612 struct netdev_dev_linux *netdev_dev =
1613 netdev_dev_linux_cast(netdev_get_dev(netdev));
1614 const struct tc_ops *new_ops;
1615 int error;
1616
1617 new_ops = tc_lookup_ovs_name(type);
1618 if (!new_ops || !new_ops->tc_install) {
1619 return EOPNOTSUPP;
1620 }
1621
1622 error = tc_query_qdisc(netdev);
1623 if (error) {
1624 return error;
1625 }
1626
1627 if (new_ops == netdev_dev->tc->ops) {
1628 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1629 } else {
1630 /* Delete existing qdisc. */
1631 error = tc_del_qdisc(netdev);
1632 if (error) {
1633 return error;
1634 }
1635 assert(netdev_dev->tc == NULL);
1636
1637 /* Install new qdisc. */
1638 error = new_ops->tc_install(netdev, details);
1639 assert((error == 0) == (netdev_dev->tc != NULL));
1640
1641 return error;
1642 }
1643 }
1644
1645 static int
1646 netdev_linux_get_queue(const struct netdev *netdev,
1647 unsigned int queue_id, struct shash *details)
1648 {
1649 struct netdev_dev_linux *netdev_dev =
1650 netdev_dev_linux_cast(netdev_get_dev(netdev));
1651 int error;
1652
1653 error = tc_query_qdisc(netdev);
1654 if (error) {
1655 return error;
1656 } else {
1657 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1658 return (queue
1659 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1660 : ENOENT);
1661 }
1662 }
1663
1664 static int
1665 netdev_linux_set_queue(struct netdev *netdev,
1666 unsigned int queue_id, const struct shash *details)
1667 {
1668 struct netdev_dev_linux *netdev_dev =
1669 netdev_dev_linux_cast(netdev_get_dev(netdev));
1670 int error;
1671
1672 error = tc_query_qdisc(netdev);
1673 if (error) {
1674 return error;
1675 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1676 || !netdev_dev->tc->ops->class_set) {
1677 return EINVAL;
1678 }
1679
1680 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1681 }
1682
1683 static int
1684 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1685 {
1686 struct netdev_dev_linux *netdev_dev =
1687 netdev_dev_linux_cast(netdev_get_dev(netdev));
1688 int error;
1689
1690 error = tc_query_qdisc(netdev);
1691 if (error) {
1692 return error;
1693 } else if (!netdev_dev->tc->ops->class_delete) {
1694 return EINVAL;
1695 } else {
1696 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1697 return (queue
1698 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1699 : ENOENT);
1700 }
1701 }
1702
1703 static int
1704 netdev_linux_get_queue_stats(const struct netdev *netdev,
1705 unsigned int queue_id,
1706 struct netdev_queue_stats *stats)
1707 {
1708 struct netdev_dev_linux *netdev_dev =
1709 netdev_dev_linux_cast(netdev_get_dev(netdev));
1710 int error;
1711
1712 error = tc_query_qdisc(netdev);
1713 if (error) {
1714 return error;
1715 } else if (!netdev_dev->tc->ops->class_get_stats) {
1716 return EOPNOTSUPP;
1717 } else {
1718 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1719 return (queue
1720 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1721 : ENOENT);
1722 }
1723 }
1724
1725 static bool
1726 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1727 {
1728 struct ofpbuf request;
1729 struct tcmsg *tcmsg;
1730
1731 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1732 if (!tcmsg) {
1733 return false;
1734 }
1735 tcmsg->tcm_parent = 0;
1736 nl_dump_start(dump, rtnl_sock, &request);
1737 ofpbuf_uninit(&request);
1738 return true;
1739 }
1740
1741 static int
1742 netdev_linux_dump_queues(const struct netdev *netdev,
1743 netdev_dump_queues_cb *cb, void *aux)
1744 {
1745 struct netdev_dev_linux *netdev_dev =
1746 netdev_dev_linux_cast(netdev_get_dev(netdev));
1747 struct tc_queue *queue;
1748 struct shash details;
1749 int last_error;
1750 int error;
1751
1752 error = tc_query_qdisc(netdev);
1753 if (error) {
1754 return error;
1755 } else if (!netdev_dev->tc->ops->class_get) {
1756 return EOPNOTSUPP;
1757 }
1758
1759 last_error = 0;
1760 shash_init(&details);
1761 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1762 shash_clear(&details);
1763
1764 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1765 if (!error) {
1766 (*cb)(queue->queue_id, &details, aux);
1767 } else {
1768 last_error = error;
1769 }
1770 }
1771 shash_destroy(&details);
1772
1773 return last_error;
1774 }
1775
1776 static int
1777 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1778 netdev_dump_queue_stats_cb *cb, void *aux)
1779 {
1780 struct netdev_dev_linux *netdev_dev =
1781 netdev_dev_linux_cast(netdev_get_dev(netdev));
1782 struct nl_dump dump;
1783 struct ofpbuf msg;
1784 int last_error;
1785 int error;
1786
1787 error = tc_query_qdisc(netdev);
1788 if (error) {
1789 return error;
1790 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1791 return EOPNOTSUPP;
1792 }
1793
1794 last_error = 0;
1795 if (!start_queue_dump(netdev, &dump)) {
1796 return ENODEV;
1797 }
1798 while (nl_dump_next(&dump, &msg)) {
1799 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1800 if (error) {
1801 last_error = error;
1802 }
1803 }
1804
1805 error = nl_dump_done(&dump);
1806 return error ? error : last_error;
1807 }
1808
1809 static int
1810 netdev_linux_get_in4(const struct netdev *netdev_,
1811 struct in_addr *address, struct in_addr *netmask)
1812 {
1813 struct netdev_dev_linux *netdev_dev =
1814 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1815
1816 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1817 int error;
1818
1819 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1820 SIOCGIFADDR, "SIOCGIFADDR");
1821 if (error) {
1822 return error;
1823 }
1824
1825 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1826 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1827 if (error) {
1828 return error;
1829 }
1830
1831 netdev_dev->cache_valid |= VALID_IN4;
1832 }
1833 *address = netdev_dev->address;
1834 *netmask = netdev_dev->netmask;
1835 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1836 }
1837
1838 static int
1839 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1840 struct in_addr netmask)
1841 {
1842 struct netdev_dev_linux *netdev_dev =
1843 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1844 int error;
1845
1846 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1847 if (!error) {
1848 netdev_dev->cache_valid |= VALID_IN4;
1849 netdev_dev->address = address;
1850 netdev_dev->netmask = netmask;
1851 if (address.s_addr != INADDR_ANY) {
1852 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1853 "SIOCSIFNETMASK", netmask);
1854 }
1855 }
1856 return error;
1857 }
1858
/* Parses 'line', in the format of a line of /proc/net/if_inet6: 32 hex digits
 * of IPv6 address, four hex fields that are ignored, and an interface name of
 * at most 16 characters.  On success, stores the address in '*in6' and the
 * name in 'ifname' and returns true.  Returns false if 'line' does not match,
 * in which case '*in6' and 'ifname' may have been partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *octets = in6->s6_addr;
    int n_converted;

    /* One address byte: exactly two hex digits. */
#define X8 "%2"SCNx8
    n_converted = sscanf(line,
                         " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         octets + 0, octets + 1, octets + 2, octets + 3,
                         octets + 4, octets + 5, octets + 6, octets + 7,
                         octets + 8, octets + 9, octets + 10, octets + 11,
                         octets + 12, octets + 13, octets + 14, octets + 15,
                         ifname);

    /* 16 address bytes plus the name; suppressed fields are not counted. */
    return n_converted == 17;
}
1874
1875 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1876 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1877 static int
1878 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1879 {
1880 struct netdev_dev_linux *netdev_dev =
1881 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1882 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1883 FILE *file;
1884 char line[128];
1885
1886 netdev_dev->in6 = in6addr_any;
1887
1888 file = fopen("/proc/net/if_inet6", "r");
1889 if (file != NULL) {
1890 const char *name = netdev_get_name(netdev_);
1891 while (fgets(line, sizeof line, file)) {
1892 struct in6_addr in6_tmp;
1893 char ifname[16 + 1];
1894 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1895 && !strcmp(name, ifname))
1896 {
1897 netdev_dev->in6 = in6_tmp;
1898 break;
1899 }
1900 }
1901 fclose(file);
1902 }
1903 netdev_dev->cache_valid |= VALID_IN6;
1904 }
1905 *in6 = netdev_dev->in6;
1906 return 0;
1907 }
1908
/* Initializes '*sa' as an AF_INET socket address for 'addr' with port 0.
 * All other bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Build the IPv4 form in a scratch structure first... */
    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    /* ...then overlay it on a cleared generic sockaddr. */
    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
1921
1922 static int
1923 do_set_addr(struct netdev *netdev,
1924 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1925 {
1926 struct ifreq ifr;
1927 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1928 make_in4_sockaddr(&ifr.ifr_addr, addr);
1929
1930 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1931 ioctl_name);
1932 }
1933
1934 /* Adds 'router' as a default IP gateway. */
1935 static int
1936 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1937 {
1938 struct in_addr any = { INADDR_ANY };
1939 struct rtentry rt;
1940 int error;
1941
1942 memset(&rt, 0, sizeof rt);
1943 make_in4_sockaddr(&rt.rt_dst, any);
1944 make_in4_sockaddr(&rt.rt_gateway, router);
1945 make_in4_sockaddr(&rt.rt_genmask, any);
1946 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1947 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1948 if (error) {
1949 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1950 }
1951 return error;
1952 }
1953
1954 static int
1955 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1956 char **netdev_name)
1957 {
1958 static const char fn[] = "/proc/net/route";
1959 FILE *stream;
1960 char line[256];
1961 int ln;
1962
1963 *netdev_name = NULL;
1964 stream = fopen(fn, "r");
1965 if (stream == NULL) {
1966 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1967 return errno;
1968 }
1969
1970 ln = 0;
1971 while (fgets(line, sizeof line, stream)) {
1972 if (++ln >= 2) {
1973 char iface[17];
1974 uint32_t dest, gateway, mask;
1975 int refcnt, metric, mtu;
1976 unsigned int flags, use, window, irtt;
1977
1978 if (sscanf(line,
1979 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1980 " %d %u %u\n",
1981 iface, &dest, &gateway, &flags, &refcnt,
1982 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1983
1984 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
1985 fn, ln, line);
1986 continue;
1987 }
1988 if (!(flags & RTF_UP)) {
1989 /* Skip routes that aren't up. */
1990 continue;
1991 }
1992
1993 /* The output of 'dest', 'mask', and 'gateway' were given in
1994 * network byte order, so we don't need need any endian
1995 * conversions here. */
1996 if ((dest & mask) == (host->s_addr & mask)) {
1997 if (!gateway) {
1998 /* The host is directly reachable. */
1999 next_hop->s_addr = 0;
2000 } else {
2001 /* To reach the host, we must go through a gateway. */
2002 next_hop->s_addr = gateway;
2003 }
2004 *netdev_name = xstrdup(iface);
2005 fclose(stream);
2006 return 0;
2007 }
2008 }
2009 }
2010
2011 fclose(stream);
2012 return ENXIO;
2013 }
2014
/* Queries the driver information of 'netdev' with ETHTOOL_GDRVINFO and, on
 * success, adds malloc()'d copies of the driver name, driver version, and
 * firmware version to 'sh' under the keys "driver_name", "driver_version",
 * and "firmware_version".  Returns the result of
 * netdev_linux_do_ethtool(); 'sh' is not modified on failure. */
static int
netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
{
    struct ethtool_drvinfo info;
    int retval;

    memset(&info, 0, sizeof info);
    retval = netdev_linux_do_ethtool(netdev_get_name(netdev),
                                     (struct ethtool_cmd *)&info,
                                     ETHTOOL_GDRVINFO,
                                     "ETHTOOL_GDRVINFO");
    if (retval) {
        return retval;
    }

    shash_add(sh, "driver_name", xstrdup(info.driver));
    shash_add(sh, "driver_version", xstrdup(info.version));
    shash_add(sh, "firmware_version", xstrdup(info.fw_version));
    return retval;
}
2034
2035 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2036 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2037 * returns 0. Otherwise, it returns a positive errno value; in particular,
2038 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2039 static int
2040 netdev_linux_arp_lookup(const struct netdev *netdev,
2041 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
2042 {
2043 struct arpreq r;
2044 struct sockaddr_in sin;
2045 int retval;
2046
2047 memset(&r, 0, sizeof r);
2048 sin.sin_family = AF_INET;
2049 sin.sin_addr.s_addr = ip;
2050 sin.sin_port = 0;
2051 memcpy(&r.arp_pa, &sin, sizeof sin);
2052 r.arp_ha.sa_family = ARPHRD_ETHER;
2053 r.arp_flags = 0;
2054 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2055 COVERAGE_INC(netdev_arp_lookup);
2056 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2057 if (!retval) {
2058 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2059 } else if (retval != ENXIO) {
2060 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2061 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
2062 }
2063 return retval;
2064 }
2065
2066 static int
2067 nd_to_iff_flags(enum netdev_flags nd)
2068 {
2069 int iff = 0;
2070 if (nd & NETDEV_UP) {
2071 iff |= IFF_UP;
2072 }
2073 if (nd & NETDEV_PROMISC) {
2074 iff |= IFF_PROMISC;
2075 }
2076 return iff;
2077 }
2078
2079 static int
2080 iff_to_nd_flags(int iff)
2081 {
2082 enum netdev_flags nd = 0;
2083 if (iff & IFF_UP) {
2084 nd |= NETDEV_UP;
2085 }
2086 if (iff & IFF_PROMISC) {
2087 nd |= NETDEV_PROMISC;
2088 }
2089 return nd;
2090 }
2091
2092 static int
2093 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2094 enum netdev_flags on, enum netdev_flags *old_flagsp)
2095 {
2096 int old_flags, new_flags;
2097 int error;
2098
2099 error = get_flags(netdev, &old_flags);
2100 if (!error) {
2101 *old_flagsp = iff_to_nd_flags(old_flags);
2102 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2103 if (new_flags != old_flags) {
2104 error = set_flags(netdev, new_flags);
2105 }
2106 }
2107 return error;
2108 }
2109
2110 static void
2111 poll_notify(struct list *list)
2112 {
2113 struct netdev_linux_notifier *notifier;
2114 LIST_FOR_EACH (notifier, node, list) {
2115 struct netdev_notifier *n = &notifier->notifier;
2116 n->cb(n);
2117 }
2118 }
2119
2120 static void
2121 netdev_linux_poll_cb(const struct rtnetlink_link_change *change,
2122 void *aux OVS_UNUSED)
2123 {
2124 if (change) {
2125 struct list *list = shash_find_data(&netdev_linux_notifiers,
2126 change->ifname);
2127 if (list) {
2128 poll_notify(list);
2129 }
2130 } else {
2131 struct shash_node *node;
2132 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2133 poll_notify(node->data);
2134 }
2135 }
2136 }
2137
2138 static int
2139 netdev_linux_poll_add(struct netdev *netdev,
2140 void (*cb)(struct netdev_notifier *), void *aux,
2141 struct netdev_notifier **notifierp)
2142 {
2143 const char *netdev_name = netdev_get_name(netdev);
2144 struct netdev_linux_notifier *notifier;
2145 struct list *list;
2146
2147 if (shash_is_empty(&netdev_linux_notifiers)) {
2148 int error;
2149 error = rtnetlink_link_notifier_register(&netdev_linux_poll_notifier,
2150 netdev_linux_poll_cb, NULL);
2151 if (error) {
2152 return error;
2153 }
2154 }
2155
2156 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2157 if (!list) {
2158 list = xmalloc(sizeof *list);
2159 list_init(list);
2160 shash_add(&netdev_linux_notifiers, netdev_name, list);
2161 }
2162
2163 notifier = xmalloc(sizeof *notifier);
2164 netdev_notifier_init(&notifier->notifier, netdev, cb, aux);
2165 list_push_back(list, &notifier->node);
2166 *notifierp = &notifier->notifier;
2167 return 0;
2168 }
2169
2170 static void
2171 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2172 {
2173 struct netdev_linux_notifier *notifier =
2174 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2175 struct list *list;
2176
2177 /* Remove 'notifier' from its list. */
2178 list = list_remove(&notifier->node);
2179 if (list_is_empty(list)) {
2180 /* The list is now empty. Remove it from the hash and free it. */
2181 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2182 shash_delete(&netdev_linux_notifiers,
2183 shash_find(&netdev_linux_notifiers, netdev_name));
2184 free(list);
2185 }
2186 free(notifier);
2187
2188 /* If that was the last notifier, unregister. */
2189 if (shash_is_empty(&netdev_linux_notifiers)) {
2190 rtnetlink_link_notifier_unregister(&netdev_linux_poll_notifier);
2191 }
2192 }
2193
/* Expands to a brace-enclosed initializer for a 'struct netdev_class'.
 *
 * NAME, CREATE, ENUMERATE and SET_STATS are substituted at their respective
 * positions; every other position is filled with the netdev_linux_*
 * implementation listed here (or NULL for set_config).  The order of the
 * entries is significant because the initializer is positional. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS)      \
{                                                                   \
    NAME,                                                           \
                                                                    \
    netdev_linux_init,                                              \
    netdev_linux_run,                                               \
    netdev_linux_wait,                                              \
                                                                    \
    CREATE,                                                         \
    netdev_linux_destroy,                                           \
    NULL,                       /* set_config */                    \
                                                                    \
    netdev_linux_open,                                              \
    netdev_linux_close,                                             \
                                                                    \
    ENUMERATE,                                                      \
                                                                    \
    netdev_linux_recv,                                              \
    netdev_linux_recv_wait,                                         \
    netdev_linux_drain,                                             \
                                                                    \
    netdev_linux_send,                                              \
    netdev_linux_send_wait,                                         \
                                                                    \
    netdev_linux_set_etheraddr,                                     \
    netdev_linux_get_etheraddr,                                     \
    netdev_linux_get_mtu,                                           \
    netdev_linux_get_ifindex,                                       \
    netdev_linux_get_carrier,                                       \
    netdev_linux_get_miimon,                                        \
    netdev_linux_get_stats,                                         \
    SET_STATS,                                                      \
                                                                    \
    netdev_linux_get_features,                                      \
    netdev_linux_set_advertisements,                                \
    netdev_linux_get_vlan_vid,                                      \
                                                                    \
    netdev_linux_set_policing,                                      \
    netdev_linux_get_qos_types,                                     \
    netdev_linux_get_qos_capabilities,                              \
    netdev_linux_get_qos,                                           \
    netdev_linux_set_qos,                                           \
    netdev_linux_get_queue,                                         \
    netdev_linux_set_queue,                                         \
    netdev_linux_delete_queue,                                      \
    netdev_linux_get_queue_stats,                                   \
    netdev_linux_dump_queues,                                       \
    netdev_linux_dump_queue_stats,                                  \
                                                                    \
    netdev_linux_get_in4,                                           \
    netdev_linux_set_in4,                                           \
    netdev_linux_get_in6,                                           \
    netdev_linux_add_router,                                        \
    netdev_linux_get_next_hop,                                      \
    netdev_linux_get_status,                                        \
    netdev_linux_arp_lookup,                                        \
                                                                    \
    netdev_linux_update_flags,                                      \
                                                                    \
    netdev_linux_poll_add,                                          \
    netdev_linux_poll_remove                                        \
}
2256
2257 const struct netdev_class netdev_linux_class =
2258 NETDEV_LINUX_CLASS(
2259 "system",
2260 netdev_linux_create,
2261 netdev_linux_enumerate,
2262 NULL); /* set_stats */
2263
2264 const struct netdev_class netdev_tap_class =
2265 NETDEV_LINUX_CLASS(
2266 "tap",
2267 netdev_linux_create_tap,
2268 NULL, /* enumerate */
2269 NULL); /* set_stats */
2270
2271 const struct netdev_class netdev_internal_class =
2272 NETDEV_LINUX_CLASS(
2273 "internal",
2274 netdev_linux_create,
2275 NULL, /* enumerate */
2276 netdev_vport_set_stats);
2277 \f
2278 /* HTB traffic control class. */
2279
2280 #define HTB_N_QUEUES 0xf000
2281
2282 struct htb {
2283 struct tc tc;
2284 unsigned int max_rate; /* In bytes/s. */
2285 };
2286
2287 struct htb_class {
2288 struct tc_queue tc_queue;
2289 unsigned int min_rate; /* In bytes/s. */
2290 unsigned int max_rate; /* In bytes/s. */
2291 unsigned int burst; /* In bytes. */
2292 unsigned int priority; /* Lower values are higher priorities. */
2293 };
2294
2295 static struct htb *
2296 htb_get__(const struct netdev *netdev)
2297 {
2298 struct netdev_dev_linux *netdev_dev =
2299 netdev_dev_linux_cast(netdev_get_dev(netdev));
2300 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2301 }
2302
2303 static struct htb *
2304 htb_install__(struct netdev *netdev, uint64_t max_rate)
2305 {
2306 struct netdev_dev_linux *netdev_dev =
2307 netdev_dev_linux_cast(netdev_get_dev(netdev));
2308 struct htb *htb;
2309
2310 htb = xmalloc(sizeof *htb);
2311 tc_init(&htb->tc, &tc_ops_htb);
2312 htb->max_rate = max_rate;
2313
2314 netdev_dev->tc = &htb->tc;
2315
2316 return htb;
2317 }
2318
2319 /* Create an HTB qdisc.
2320 *
2321 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2322 static int
2323 htb_setup_qdisc__(struct netdev *netdev)
2324 {
2325 size_t opt_offset;
2326 struct tc_htb_glob opt;
2327 struct ofpbuf request;
2328 struct tcmsg *tcmsg;
2329
2330 tc_del_qdisc(netdev);
2331
2332 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2333 NLM_F_EXCL | NLM_F_CREATE, &request);
2334 if (!tcmsg) {
2335 return ENODEV;
2336 }
2337 tcmsg->tcm_handle = tc_make_handle(1, 0);
2338 tcmsg->tcm_parent = TC_H_ROOT;
2339
2340 nl_msg_put_string(&request, TCA_KIND, "htb");
2341
2342 memset(&opt, 0, sizeof opt);
2343 opt.rate2quantum = 10;
2344 opt.version = 3;
2345 opt.defcls = 1;
2346
2347 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2348 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2349 nl_msg_end_nested(&request, opt_offset);
2350
2351 return tc_transact(&request, NULL);
2352 }
2353
2354 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2355 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2356 static int
2357 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2358 unsigned int parent, struct htb_class *class)
2359 {
2360 size_t opt_offset;
2361 struct tc_htb_opt opt;
2362 struct ofpbuf request;
2363 struct tcmsg *tcmsg;
2364 int error;
2365 int mtu;
2366
2367 netdev_get_mtu(netdev, &mtu);
2368
2369 memset(&opt, 0, sizeof opt);
2370 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2371 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2372 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2373 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2374 opt.prio = class->priority;
2375
2376 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2377 if (!tcmsg) {
2378 return ENODEV;
2379 }
2380 tcmsg->tcm_handle = handle;
2381 tcmsg->tcm_parent = parent;
2382
2383 nl_msg_put_string(&request, TCA_KIND, "htb");
2384 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2385 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2386 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2387 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2388 nl_msg_end_nested(&request, opt_offset);
2389
2390 error = tc_transact(&request, NULL);
2391 if (error) {
2392 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2393 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2394 netdev_get_name(netdev),
2395 tc_get_major(handle), tc_get_minor(handle),
2396 tc_get_major(parent), tc_get_minor(parent),
2397 class->min_rate, class->max_rate,
2398 class->burst, class->priority, strerror(error));
2399 }
2400 return error;
2401 }
2402
2403 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2404 * description of them into 'details'. The description complies with the
2405 * specification given in the vswitch database documentation for linux-htb
2406 * queue details. */
2407 static int
2408 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2409 {
2410 static const struct nl_policy tca_htb_policy[] = {
2411 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2412 .min_len = sizeof(struct tc_htb_opt) },
2413 };
2414
2415 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2416 const struct tc_htb_opt *htb;
2417
2418 if (!nl_parse_nested(nl_options, tca_htb_policy,
2419 attrs, ARRAY_SIZE(tca_htb_policy))) {
2420 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2421 return EPROTO;
2422 }
2423
2424 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2425 class->min_rate = htb->rate.rate;
2426 class->max_rate = htb->ceil.rate;
2427 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2428 class->priority = htb->prio;
2429 return 0;
2430 }
2431
2432 static int
2433 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2434 struct htb_class *options,
2435 struct netdev_queue_stats *stats)
2436 {
2437 struct nlattr *nl_options;
2438 unsigned int handle;
2439 int error;
2440
2441 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2442 if (!error && queue_id) {
2443 unsigned int major = tc_get_major(handle);
2444 unsigned int minor = tc_get_minor(handle);
2445 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2446 *queue_id = minor - 1;
2447 } else {
2448 error = EPROTO;
2449 }
2450 }
2451 if (!error && options) {
2452 error = htb_parse_tca_options__(nl_options, options);
2453 }
2454 return error;
2455 }
2456
2457 static void
2458 htb_parse_qdisc_details__(struct netdev *netdev,
2459 const struct shash *details, struct htb_class *hc)
2460 {
2461 const char *max_rate_s;
2462
2463 max_rate_s = shash_find_data(details, "max-rate");
2464 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2465 if (!hc->max_rate) {
2466 uint32_t current;
2467
2468 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2469 hc->max_rate = netdev_features_to_bps(current) / 8;
2470 }
2471 hc->min_rate = hc->max_rate;
2472 hc->burst = 0;
2473 hc->priority = 0;
2474 }
2475
2476 static int
2477 htb_parse_class_details__(struct netdev *netdev,
2478 const struct shash *details, struct htb_class *hc)
2479 {
2480 const struct htb *htb = htb_get__(netdev);
2481 const char *min_rate_s = shash_find_data(details, "min-rate");
2482 const char *max_rate_s = shash_find_data(details, "max-rate");
2483 const char *burst_s = shash_find_data(details, "burst");
2484 const char *priority_s = shash_find_data(details, "priority");
2485 int mtu;
2486
2487 /* min-rate. Don't allow a min-rate below 1500 bytes/s. */
2488 if (!min_rate_s) {
2489 /* min-rate is required. */
2490 return EINVAL;
2491 }
2492 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2493 hc->min_rate = MAX(hc->min_rate, 1500);
2494 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2495
2496 /* max-rate */
2497 hc->max_rate = (max_rate_s
2498 ? strtoull(max_rate_s, NULL, 10) / 8
2499 : htb->max_rate);
2500 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2501 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2502
2503 /* burst
2504 *
2505 * According to hints in the documentation that I've read, it is important
2506 * that 'burst' be at least as big as the largest frame that might be
2507 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2508 * but having it a bit too small is a problem. Since netdev_get_mtu()
2509 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2510 * the MTU. We actually add 64, instead of 14, as a guard against
2511 * additional headers get tacked on somewhere that we're not aware of. */
2512 netdev_get_mtu(netdev, &mtu);
2513 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2514 hc->burst = MAX(hc->burst, mtu + 64);
2515
2516 /* priority */
2517 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2518
2519 return 0;
2520 }
2521
2522 static int
2523 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2524 unsigned int parent, struct htb_class *options,
2525 struct netdev_queue_stats *stats)
2526 {
2527 struct ofpbuf *reply;
2528 int error;
2529
2530 error = tc_query_class(netdev, handle, parent, &reply);
2531 if (!error) {
2532 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2533 ofpbuf_delete(reply);
2534 }
2535 return error;
2536 }
2537
2538 static int
2539 htb_tc_install(struct netdev *netdev, const struct shash *details)
2540 {
2541 int error;
2542
2543 error = htb_setup_qdisc__(netdev);
2544 if (!error) {
2545 struct htb_class hc;
2546
2547 htb_parse_qdisc_details__(netdev, details, &hc);
2548 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2549 tc_make_handle(1, 0), &hc);
2550 if (!error) {
2551 htb_install__(netdev, hc.max_rate);
2552 }
2553 }
2554 return error;
2555 }
2556
2557 static struct htb_class *
2558 htb_class_cast__(const struct tc_queue *queue)
2559 {
2560 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2561 }
2562
2563 static void
2564 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2565 const struct htb_class *hc)
2566 {
2567 struct htb *htb = htb_get__(netdev);
2568 size_t hash = hash_int(queue_id, 0);
2569 struct tc_queue *queue;
2570 struct htb_class *hcp;
2571
2572 queue = tc_find_queue__(netdev, queue_id, hash);
2573 if (queue) {
2574 hcp = htb_class_cast__(queue);
2575 } else {
2576 hcp = xmalloc(sizeof *hcp);
2577 queue = &hcp->tc_queue;
2578 queue->queue_id = queue_id;
2579 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2580 }
2581
2582 hcp->min_rate = hc->min_rate;
2583 hcp->max_rate = hc->max_rate;
2584 hcp->burst = hc->burst;
2585 hcp->priority = hc->priority;
2586 }
2587
2588 static int
2589 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2590 {
2591 struct ofpbuf msg;
2592 struct nl_dump dump;
2593 struct htb_class hc;
2594 struct htb *htb;
2595
2596 /* Get qdisc options. */
2597 hc.max_rate = 0;
2598 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2599 htb = htb_install__(netdev, hc.max_rate);
2600
2601 /* Get queues. */
2602 if (!start_queue_dump(netdev, &dump)) {
2603 return ENODEV;
2604 }
2605 while (nl_dump_next(&dump, &msg)) {
2606 unsigned int queue_id;
2607
2608 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2609 htb_update_queue__(netdev, queue_id, &hc);
2610 }
2611 }
2612 nl_dump_done(&dump);
2613
2614 return 0;
2615 }
2616
2617 static void
2618 htb_tc_destroy(struct tc *tc)
2619 {
2620 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2621 struct htb_class *hc, *next;
2622
2623 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2624 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2625 free(hc);
2626 }
2627 tc_destroy(tc);
2628 free(htb);
2629 }
2630
2631 static int
2632 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2633 {
2634 const struct htb *htb = htb_get__(netdev);
2635 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2636 return 0;
2637 }
2638
2639 static int
2640 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2641 {
2642 struct htb_class hc;
2643 int error;
2644
2645 htb_parse_qdisc_details__(netdev, details, &hc);
2646 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2647 tc_make_handle(1, 0), &hc);
2648 if (!error) {
2649 htb_get__(netdev)->max_rate = hc.max_rate;
2650 }
2651 return error;
2652 }
2653
2654 static int
2655 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2656 const struct tc_queue *queue, struct shash *details)
2657 {
2658 const struct htb_class *hc = htb_class_cast__(queue);
2659
2660 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2661 if (hc->min_rate != hc->max_rate) {
2662 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2663 }
2664 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2665 if (hc->priority) {
2666 shash_add(details, "priority", xasprintf("%u", hc->priority));
2667 }
2668 return 0;
2669 }
2670
2671 static int
2672 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2673 const struct shash *details)
2674 {
2675 struct htb_class hc;
2676 int error;
2677
2678 error = htb_parse_class_details__(netdev, details, &hc);
2679 if (error) {
2680 return error;
2681 }
2682
2683 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2684 tc_make_handle(1, 0xfffe), &hc);
2685 if (error) {
2686 return error;
2687 }
2688
2689 htb_update_queue__(netdev, queue_id, &hc);
2690 return 0;
2691 }
2692
2693 static int
2694 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2695 {
2696 struct htb_class *hc = htb_class_cast__(queue);
2697 struct htb *htb = htb_get__(netdev);
2698 int error;
2699
2700 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2701 if (!error) {
2702 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2703 free(hc);
2704 }
2705 return error;
2706 }
2707
2708 static int
2709 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2710 struct netdev_queue_stats *stats)
2711 {
2712 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2713 tc_make_handle(1, 0xfffe), NULL, stats);
2714 }
2715
2716 static int
2717 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2718 const struct ofpbuf *nlmsg,
2719 netdev_dump_queue_stats_cb *cb, void *aux)
2720 {
2721 struct netdev_queue_stats stats;
2722 unsigned int handle, major, minor;
2723 int error;
2724
2725 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2726 if (error) {
2727 return error;
2728 }
2729
2730 major = tc_get_major(handle);
2731 minor = tc_get_minor(handle);
2732 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2733 (*cb)(minor - 1, &stats, aux);
2734 }
2735 return 0;
2736 }
2737
2738 static const struct tc_ops tc_ops_htb = {
2739 "htb", /* linux_name */
2740 "linux-htb", /* ovs_name */
2741 HTB_N_QUEUES, /* n_queues */
2742 htb_tc_install,
2743 htb_tc_load,
2744 htb_tc_destroy,
2745 htb_qdisc_get,
2746 htb_qdisc_set,
2747 htb_class_get,
2748 htb_class_set,
2749 htb_class_delete,
2750 htb_class_get_stats,
2751 htb_class_dump_stats
2752 };
2753 \f
2754 /* "linux-hfsc" traffic control class. */
2755
2756 #define HFSC_N_QUEUES 0xf000
2757
2758 struct hfsc {
2759 struct tc tc;
2760 uint32_t max_rate;
2761 };
2762
2763 struct hfsc_class {
2764 struct tc_queue tc_queue;
2765 uint32_t min_rate;
2766 uint32_t max_rate;
2767 };
2768
2769 static struct hfsc *
2770 hfsc_get__(const struct netdev *netdev)
2771 {
2772 struct netdev_dev_linux *netdev_dev;
2773 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2774 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2775 }
2776
2777 static struct hfsc_class *
2778 hfsc_class_cast__(const struct tc_queue *queue)
2779 {
2780 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2781 }
2782
2783 static struct hfsc *
2784 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2785 {
2786 struct netdev_dev_linux * netdev_dev;
2787 struct hfsc *hfsc;
2788
2789 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2790 hfsc = xmalloc(sizeof *hfsc);
2791 tc_init(&hfsc->tc, &tc_ops_hfsc);
2792 hfsc->max_rate = max_rate;
2793 netdev_dev->tc = &hfsc->tc;
2794
2795 return hfsc;
2796 }
2797
2798 static void
2799 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2800 const struct hfsc_class *hc)
2801 {
2802 size_t hash;
2803 struct hfsc *hfsc;
2804 struct hfsc_class *hcp;
2805 struct tc_queue *queue;
2806
2807 hfsc = hfsc_get__(netdev);
2808 hash = hash_int(queue_id, 0);
2809
2810 queue = tc_find_queue__(netdev, queue_id, hash);
2811 if (queue) {
2812 hcp = hfsc_class_cast__(queue);
2813 } else {
2814 hcp = xmalloc(sizeof *hcp);
2815 queue = &hcp->tc_queue;
2816 queue->queue_id = queue_id;
2817 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2818 }
2819
2820 hcp->min_rate = hc->min_rate;
2821 hcp->max_rate = hc->max_rate;
2822 }
2823
2824 static int
2825 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2826 {
2827 const struct tc_service_curve *rsc, *fsc, *usc;
2828 static const struct nl_policy tca_hfsc_policy[] = {
2829 [TCA_HFSC_RSC] = {
2830 .type = NL_A_UNSPEC,
2831 .optional = false,
2832 .min_len = sizeof(struct tc_service_curve),
2833 },
2834 [TCA_HFSC_FSC] = {
2835 .type = NL_A_UNSPEC,
2836 .optional = false,
2837 .min_len = sizeof(struct tc_service_curve),
2838 },
2839 [TCA_HFSC_USC] = {
2840 .type = NL_A_UNSPEC,
2841 .optional = false,
2842 .min_len = sizeof(struct tc_service_curve),
2843 },
2844 };
2845 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2846
2847 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2848 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2849 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2850 return EPROTO;
2851 }
2852
2853 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2854 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2855 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2856
2857 if (rsc->m1 != 0 || rsc->d != 0 ||
2858 fsc->m1 != 0 || fsc->d != 0 ||
2859 usc->m1 != 0 || usc->d != 0) {
2860 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2861 "Non-linear service curves are not supported.");
2862 return EPROTO;
2863 }
2864
2865 if (rsc->m2 != fsc->m2) {
2866 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2867 "Real-time service curves are not supported ");
2868 return EPROTO;
2869 }
2870
2871 if (rsc->m2 > usc->m2) {
2872 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2873 "Min-rate service curve is greater than "
2874 "the max-rate service curve.");
2875 return EPROTO;
2876 }
2877
2878 class->min_rate = fsc->m2;
2879 class->max_rate = usc->m2;
2880 return 0;
2881 }
2882
2883 static int
2884 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2885 struct hfsc_class *options,
2886 struct netdev_queue_stats *stats)
2887 {
2888 int error;
2889 unsigned int handle;
2890 struct nlattr *nl_options;
2891
2892 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2893 if (error) {
2894 return error;
2895 }
2896
2897 if (queue_id) {
2898 unsigned int major, minor;
2899
2900 major = tc_get_major(handle);
2901 minor = tc_get_minor(handle);
2902 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2903 *queue_id = minor - 1;
2904 } else {
2905 return EPROTO;
2906 }
2907 }
2908
2909 if (options) {
2910 error = hfsc_parse_tca_options__(nl_options, options);
2911 }
2912
2913 return error;
2914 }
2915
2916 static int
2917 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2918 unsigned int parent, struct hfsc_class *options,
2919 struct netdev_queue_stats *stats)
2920 {
2921 int error;
2922 struct ofpbuf *reply;
2923
2924 error = tc_query_class(netdev, handle, parent, &reply);
2925 if (error) {
2926 return error;
2927 }
2928
2929 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2930 ofpbuf_delete(reply);
2931 return error;
2932 }
2933
2934 static void
2935 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2936 struct hfsc_class *class)
2937 {
2938 uint32_t max_rate;
2939 const char *max_rate_s;
2940
2941 max_rate_s = shash_find_data(details, "max-rate");
2942 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2943
2944 if (!max_rate) {
2945 uint32_t current;
2946
2947 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2948 max_rate = netdev_features_to_bps(current) / 8;
2949 }
2950
2951 class->min_rate = max_rate;
2952 class->max_rate = max_rate;
2953 }
2954
2955 static int
2956 hfsc_parse_class_details__(struct netdev *netdev,
2957 const struct shash *details,
2958 struct hfsc_class * class)
2959 {
2960 const struct hfsc *hfsc;
2961 uint32_t min_rate, max_rate;
2962 const char *min_rate_s, *max_rate_s;
2963
2964 hfsc = hfsc_get__(netdev);
2965 min_rate_s = shash_find_data(details, "min-rate");
2966 max_rate_s = shash_find_data(details, "max-rate");
2967
2968 if (!min_rate_s) {
2969 return EINVAL;
2970 }
2971
2972 min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2973 min_rate = MAX(min_rate, 1500);
2974 min_rate = MIN(min_rate, hfsc->max_rate);
2975
2976 max_rate = (max_rate_s
2977 ? strtoull(max_rate_s, NULL, 10) / 8
2978 : hfsc->max_rate);
2979 max_rate = MAX(max_rate, min_rate);
2980 max_rate = MIN(max_rate, hfsc->max_rate);
2981
2982 class->min_rate = min_rate;
2983 class->max_rate = max_rate;
2984
2985 return 0;
2986 }
2987
2988 /* Create an HFSC qdisc.
2989 *
2990 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
2991 static int
2992 hfsc_setup_qdisc__(struct netdev * netdev)
2993 {
2994 struct tcmsg *tcmsg;
2995 struct ofpbuf request;
2996 struct tc_hfsc_qopt opt;
2997
2998 tc_del_qdisc(netdev);
2999
3000 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3001 NLM_F_EXCL | NLM_F_CREATE, &request);
3002
3003 if (!tcmsg) {
3004 return ENODEV;
3005 }
3006
3007 tcmsg->tcm_handle = tc_make_handle(1, 0);
3008 tcmsg->tcm_parent = TC_H_ROOT;
3009
3010 memset(&opt, 0, sizeof opt);
3011 opt.defcls = 1;
3012
3013 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3014 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3015
3016 return tc_transact(&request, NULL);
3017 }
3018
3019 /* Create an HFSC class.
3020 *
3021 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3022 * sc rate <min_rate> ul rate <max_rate>" */
3023 static int
3024 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3025 unsigned int parent, struct hfsc_class *class)
3026 {
3027 int error;
3028 size_t opt_offset;
3029 struct tcmsg *tcmsg;
3030 struct ofpbuf request;
3031 struct tc_service_curve min, max;
3032
3033 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3034
3035 if (!tcmsg) {
3036 return ENODEV;
3037 }
3038
3039 tcmsg->tcm_handle = handle;
3040 tcmsg->tcm_parent = parent;
3041
3042 min.m1 = 0;
3043 min.d = 0;
3044 min.m2 = class->min_rate;
3045
3046 max.m1 = 0;
3047 max.d = 0;
3048 max.m2 = class->max_rate;
3049
3050 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3051 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3052 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3053 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3054 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3055 nl_msg_end_nested(&request, opt_offset);
3056
3057 error = tc_transact(&request, NULL);
3058 if (error) {
3059 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3060 "min-rate %ubps, max-rate %ubps (%s)",
3061 netdev_get_name(netdev),
3062 tc_get_major(handle), tc_get_minor(handle),
3063 tc_get_major(parent), tc_get_minor(parent),
3064 class->min_rate, class->max_rate, strerror(error));
3065 }
3066
3067 return error;
3068 }
3069
3070 static int
3071 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3072 {
3073 int error;
3074 struct hfsc_class class;
3075
3076 error = hfsc_setup_qdisc__(netdev);
3077
3078 if (error) {
3079 return error;
3080 }
3081
3082 hfsc_parse_qdisc_details__(netdev, details, &class);
3083 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3084 tc_make_handle(1, 0), &class);
3085
3086 if (error) {
3087 return error;
3088 }
3089
3090 hfsc_install__(netdev, class.max_rate);
3091 return 0;
3092 }
3093
3094 static int
3095 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3096 {
3097 struct ofpbuf msg;
3098 struct hfsc *hfsc;
3099 struct nl_dump dump;
3100 struct hfsc_class hc;
3101
3102 hc.max_rate = 0;
3103 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3104 hfsc = hfsc_install__(netdev, hc.max_rate);
3105
3106 if (!start_queue_dump(netdev, &dump)) {
3107 return ENODEV;
3108 }
3109
3110 while (nl_dump_next(&dump, &msg)) {
3111 unsigned int queue_id;
3112
3113 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3114 hfsc_update_queue__(netdev, queue_id, &hc);
3115 }
3116 }
3117
3118 nl_dump_done(&dump);
3119 return 0;
3120 }
3121
3122 static void
3123 hfsc_tc_destroy(struct tc *tc)
3124 {
3125 struct hfsc *hfsc;
3126 struct hfsc_class *hc, *next;
3127
3128 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3129
3130 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3131 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3132 free(hc);
3133 }
3134
3135 tc_destroy(tc);
3136 free(hfsc);
3137 }
3138
3139 static int
3140 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3141 {
3142 const struct hfsc *hfsc;
3143 hfsc = hfsc_get__(netdev);
3144 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3145 return 0;
3146 }
3147
3148 static int
3149 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3150 {
3151 int error;
3152 struct hfsc_class class;
3153
3154 hfsc_parse_qdisc_details__(netdev, details, &class);
3155 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3156 tc_make_handle(1, 0), &class);
3157
3158 if (!error) {
3159 hfsc_get__(netdev)->max_rate = class.max_rate;
3160 }
3161
3162 return error;
3163 }
3164
3165 static int
3166 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3167 const struct tc_queue *queue, struct shash *details)
3168 {
3169 const struct hfsc_class *hc;
3170
3171 hc = hfsc_class_cast__(queue);
3172 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3173 if (hc->min_rate != hc->max_rate) {
3174 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3175 }
3176 return 0;
3177 }
3178
3179 static int
3180 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3181 const struct shash *details)
3182 {
3183 int error;
3184 struct hfsc_class class;
3185
3186 error = hfsc_parse_class_details__(netdev, details, &class);
3187 if (error) {
3188 return error;
3189 }
3190
3191 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3192 tc_make_handle(1, 0xfffe), &class);
3193 if (error) {
3194 return error;
3195 }
3196
3197 hfsc_update_queue__(netdev, queue_id, &class);
3198 return 0;
3199 }
3200
3201 static int
3202 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3203 {
3204 int error;
3205 struct hfsc *hfsc;
3206 struct hfsc_class *hc;
3207
3208 hc = hfsc_class_cast__(queue);
3209 hfsc = hfsc_get__(netdev);
3210
3211 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3212 if (!error) {
3213 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3214 free(hc);
3215 }
3216 return error;
3217 }
3218
3219 static int
3220 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3221 struct netdev_queue_stats *stats)
3222 {
3223 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3224 tc_make_handle(1, 0xfffe), NULL, stats);
3225 }
3226
3227 static int
3228 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3229 const struct ofpbuf *nlmsg,
3230 netdev_dump_queue_stats_cb *cb, void *aux)
3231 {
3232 struct netdev_queue_stats stats;
3233 unsigned int handle, major, minor;
3234 int error;
3235
3236 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3237 if (error) {
3238 return error;
3239 }
3240
3241 major = tc_get_major(handle);
3242 minor = tc_get_minor(handle);
3243 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3244 (*cb)(minor - 1, &stats, aux);
3245 }
3246 return 0;
3247 }
3248
/* Operations table for the "linux-hfsc" traffic control class.  Entries are
 * positional; the trailing comment on each line names the slot it fills. */
static const struct tc_ops tc_ops_hfsc = {
    "hfsc",                     /* linux_name */
    "linux-hfsc",               /* ovs_name */
    HFSC_N_QUEUES,              /* n_queues */
    hfsc_tc_install,            /* tc_install */
    hfsc_tc_load,               /* tc_load */
    hfsc_tc_destroy,            /* tc_destroy */
    hfsc_qdisc_get,             /* qdisc_get */
    hfsc_qdisc_set,             /* qdisc_set */
    hfsc_class_get,             /* class_get */
    hfsc_class_set,             /* class_set */
    hfsc_class_delete,          /* class_delete */
    hfsc_class_get_stats,       /* class_get_stats */
    hfsc_class_dump_stats       /* class_dump_stats */
};
3264 \f
3265 /* "linux-default" traffic control class.
3266 *
3267 * This class represents the default, unnamed Linux qdisc. It corresponds to
3268 * the "" (empty string) QoS type in the OVS database. */
3269
3270 static void
3271 default_install__(struct netdev *netdev)
3272 {
3273 struct netdev_dev_linux *netdev_dev =
3274 netdev_dev_linux_cast(netdev_get_dev(netdev));
3275 static struct tc *tc;
3276
3277 if (!tc) {
3278 tc = xmalloc(sizeof *tc);
3279 tc_init(tc, &tc_ops_default);
3280 }
3281 netdev_dev->tc = tc;
3282 }
3283
3284 static int
3285 default_tc_install(struct netdev *netdev,
3286 const struct shash *details OVS_UNUSED)
3287 {
3288 default_install__(netdev);
3289 return 0;
3290 }
3291
3292 static int
3293 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3294 {
3295 default_install__(netdev);
3296 return 0;
3297 }
3298
/* Operations table for the "linux-default" traffic control class (the empty
 * OVS QoS type).  Only installation and loading are implemented. */
static const struct tc_ops tc_ops_default = {
    NULL,                       /* linux_name */
    "",                         /* ovs_name */
    0,                          /* n_queues */
    default_tc_install,         /* tc_install */
    default_tc_load,            /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
3314 \f
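/* "linux-other" traffic control class.
 *
 * This class stands in for a qdisc that could not be matched to one of the
 * known classes, or whose query or reply parsing failed (see
 * tc_query_qdisc()).  It can only be loaded, not installed or configured. */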
3318
3319 static int
3320 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3321 {
3322 struct netdev_dev_linux *netdev_dev =
3323 netdev_dev_linux_cast(netdev_get_dev(netdev));
3324 static struct tc *tc;
3325
3326 if (!tc) {
3327 tc = xmalloc(sizeof *tc);
3328 tc_init(tc, &tc_ops_other);
3329 }
3330 netdev_dev->tc = tc;
3331 return 0;
3332 }
3333
/* Operations table for the "linux-other" traffic control class.  Only loading
 * is implemented. */
static const struct tc_ops tc_ops_other = {
    NULL,                       /* linux_name */
    "linux-other",              /* ovs_name */
    0,                          /* n_queues */
    NULL,                       /* tc_install */
    other_tc_load,              /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
3349 \f
3350 /* Traffic control. */
3351
/* Number of kernel "tc" ticks per second.  Filled in by read_psched(). */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 *
 * A value of 0 means that read_psched() has not run yet; the tc_*() conversion
 * helpers check for that and call read_psched() on first use.
 */
static unsigned int buffer_hz;
3373
/* Combines 'major' and 'minor' into the tc handle 'major':'minor', with
 * 'major' in the upper 16 bits and 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
3380
/* Extracts the major number (the upper 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3387
/* Extracts the minor number (the lower 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3394
3395 static struct tcmsg *
3396 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3397 struct ofpbuf *request)
3398 {
3399 struct tcmsg *tcmsg;
3400 int ifindex;
3401 int error;
3402
3403 error = get_ifindex(netdev, &ifindex);
3404 if (error) {
3405 return NULL;
3406 }
3407
3408 ofpbuf_init(request, 512);
3409 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3410 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3411 tcmsg->tcm_family = AF_UNSPEC;
3412 tcmsg->tcm_ifindex = ifindex;
3413 /* Caller should fill in tcmsg->tcm_handle. */
3414 /* Caller should fill in tcmsg->tcm_parent. */
3415
3416 return tcmsg;
3417 }
3418
3419 static int
3420 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3421 {
3422 int error = nl_sock_transact(rtnl_sock, request, replyp);
3423 ofpbuf_uninit(request);
3424 return error;
3425 }
3426
3427 static void
3428 read_psched(void)
3429 {
3430 /* The values in psched are not individually very meaningful, but they are
3431 * important. The tables below show some values seen in the wild.
3432 *
3433 * Some notes:
3434 *
3435 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3436 * (Before that, there are hints that it was 1000000000.)
3437 *
3438 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3439 * above.
3440 *
3441 * /proc/net/psched
3442 * -----------------------------------
3443 * [1] 000c8000 000f4240 000f4240 00000064
3444 * [2] 000003e8 00000400 000f4240 3b9aca00
3445 * [3] 000003e8 00000400 000f4240 3b9aca00
3446 * [4] 000003e8 00000400 000f4240 00000064
3447 * [5] 000003e8 00000040 000f4240 3b9aca00
3448 * [6] 000003e8 00000040 000f4240 000000f9
3449 *
3450 * a b c d ticks_per_s buffer_hz
3451 * ------- --------- ---------- ------------- ----------- -------------
3452 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3453 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3454 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3455 * [4] 1,000 1,024 1,000,000 100 976,562 100
3456 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3457 * [6] 1,000 64 1,000,000 249 15,625,000 249
3458 *
3459 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3460 * [2] 2.6.26-1-686-bigmem from Debian lenny
3461 * [3] 2.6.26-2-sparc64 from Debian lenny
3462 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3463 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3464 * [6] 2.6.34 from kernel.org on KVM
3465 */
3466 static const char fn[] = "/proc/net/psched";
3467 unsigned int a, b, c, d;
3468 FILE *stream;
3469
3470 ticks_per_s = 1.0;
3471 buffer_hz = 100;
3472
3473 stream = fopen(fn, "r");
3474 if (!stream) {
3475 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3476 return;
3477 }
3478
3479 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3480 VLOG_WARN("%s: read failed", fn);
3481 fclose(stream);
3482 return;
3483 }
3484 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3485 fclose(stream);
3486
3487 if (!a || !c) {
3488 VLOG_WARN("%s: invalid scheduler parameters", fn);
3489 return;
3490 }
3491
3492 ticks_per_s = (double) a * c / b;
3493 if (c == 1000000) {
3494 buffer_hz = d;
3495 } else {
3496 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3497 fn, a, b, c, d);
3498 }
3499 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3500 }
3501
3502 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3503 * rate of 'rate' bytes per second. */
3504 static unsigned int
3505 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3506 {
3507 if (!buffer_hz) {
3508 read_psched();
3509 }
3510 return (rate * ticks) / ticks_per_s;
3511 }
3512
3513 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3514 * rate of 'rate' bytes per second. */
3515 static unsigned int
3516 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3517 {
3518 if (!buffer_hz) {
3519 read_psched();
3520 }
3521 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3522 }
3523
3524 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3525 * a transmission rate of 'rate' bytes per second. */
3526 static unsigned int
3527 tc_buffer_per_jiffy(unsigned int rate)
3528 {
3529 if (!buffer_hz) {
3530 read_psched();
3531 }
3532 return rate / buffer_hz;
3533 }
3534
3535 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3536 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3537 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3538 * stores NULL into it if it is absent.
3539 *
3540 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3541 * 'msg'.
3542 *
3543 * Returns 0 if successful, otherwise a positive errno value. */
3544 static int
3545 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3546 struct nlattr **options)
3547 {
3548 static const struct nl_policy tca_policy[] = {
3549 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3550 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3551 };
3552 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3553
3554 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3555 tca_policy, ta, ARRAY_SIZE(ta))) {
3556 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3557 goto error;
3558 }
3559
3560 if (kind) {
3561 *kind = nl_attr_get_string(ta[TCA_KIND]);
3562 }
3563
3564 if (options) {
3565 *options = ta[TCA_OPTIONS];
3566 }
3567
3568 return 0;
3569
3570 error:
3571 if (kind) {
3572 *kind = NULL;
3573 }
3574 if (options) {
3575 *options = NULL;
3576 }
3577 return EPROTO;
3578 }
3579
3580 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3581 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3582 * into '*options', and its queue statistics into '*stats'. Any of the output
3583 * arguments may be null.
3584 *
3585 * Returns 0 if successful, otherwise a positive errno value. */
3586 static int
3587 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3588 struct nlattr **options, struct netdev_queue_stats *stats)
3589 {
3590 static const struct nl_policy tca_policy[] = {
3591 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3592 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3593 };
3594 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3595
3596 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3597 tca_policy, ta, ARRAY_SIZE(ta))) {
3598 VLOG_WARN_RL(&rl, "failed to parse class message");
3599 goto error;
3600 }
3601
3602 if (handlep) {
3603 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3604 *handlep = tc->tcm_handle;
3605 }
3606
3607 if (options) {
3608 *options = ta[TCA_OPTIONS];
3609 }
3610
3611 if (stats) {
3612 const struct gnet_stats_queue *gsq;
3613 struct gnet_stats_basic gsb;
3614
3615 static const struct nl_policy stats_policy[] = {
3616 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3617 .min_len = sizeof gsb },
3618 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3619 .min_len = sizeof *gsq },
3620 };
3621 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3622
3623 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3624 sa, ARRAY_SIZE(sa))) {
3625 VLOG_WARN_RL(&rl, "failed to parse class stats");
3626 goto error;
3627 }
3628
3629 /* Alignment issues screw up the length of struct gnet_stats_basic on
3630 * some arch/bitsize combinations. Newer versions of Linux have a
3631 * struct gnet_stats_basic_packed, but we can't depend on that. The
3632 * easiest thing to do is just to make a copy. */
3633 memset(&gsb, 0, sizeof gsb);
3634 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3635 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3636 stats->tx_bytes = gsb.bytes;
3637 stats->tx_packets = gsb.packets;
3638
3639 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3640 stats->tx_errors = gsq->drops;
3641 }
3642
3643 return 0;
3644
3645 error:
3646 if (options) {
3647 *options = NULL;
3648 }
3649 if (stats) {
3650 memset(stats, 0, sizeof *stats);
3651 }
3652 return EPROTO;
3653 }
3654
3655 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3656 * on 'netdev'. */
3657 static int
3658 tc_query_class(const struct netdev *netdev,
3659 unsigned int handle, unsigned int parent,
3660 struct ofpbuf **replyp)
3661 {
3662 struct ofpbuf request;
3663 struct tcmsg *tcmsg;
3664 int error;
3665
3666 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3667 if (!tcmsg) {
3668 return ENODEV;
3669 }
3670 tcmsg->tcm_handle = handle;
3671 tcmsg->tcm_parent = parent;
3672
3673 error = tc_transact(&request, replyp);
3674 if (error) {
3675 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3676 netdev_get_name(netdev),
3677 tc_get_major(handle), tc_get_minor(handle),
3678 tc_get_major(parent), tc_get_minor(parent),
3679 strerror(error));
3680 }
3681 return error;
3682 }
3683
3684 /* Equivalent to "tc class del dev <name> handle <handle>". */
3685 static int
3686 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3687 {
3688 struct ofpbuf request;
3689 struct tcmsg *tcmsg;
3690 int error;
3691
3692 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3693 if (!tcmsg) {
3694 return ENODEV;
3695 }
3696 tcmsg->tcm_handle = handle;
3697 tcmsg->tcm_parent = 0;
3698
3699 error = tc_transact(&request, NULL);
3700 if (error) {
3701 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3702 netdev_get_name(netdev),
3703 tc_get_major(handle), tc_get_minor(handle),
3704 strerror(error));
3705 }
3706 return error;
3707 }
3708
3709 /* Equivalent to "tc qdisc del dev <name> root". */
3710 static int
3711 tc_del_qdisc(struct netdev *netdev)
3712 {
3713 struct netdev_dev_linux *netdev_dev =
3714 netdev_dev_linux_cast(netdev_get_dev(netdev));
3715 struct ofpbuf request;
3716 struct tcmsg *tcmsg;
3717 int error;
3718
3719 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3720 if (!tcmsg) {
3721 return ENODEV;
3722 }
3723 tcmsg->tcm_handle = tc_make_handle(1, 0);
3724 tcmsg->tcm_parent = TC_H_ROOT;
3725
3726 error = tc_transact(&request, NULL);
3727 if (error == EINVAL) {
3728 /* EINVAL probably means that the default qdisc was in use, in which
3729 * case we've accomplished our purpose. */
3730 error = 0;
3731 }
3732 if (!error && netdev_dev->tc) {
3733 if (netdev_dev->tc->ops->tc_destroy) {
3734 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3735 }
3736 netdev_dev->tc = NULL;
3737 }
3738 return error;
3739 }
3740
3741 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3742 * kernel to determine what they are. Returns 0 if successful, otherwise a
3743 * positive errno value. */
3744 static int
3745 tc_query_qdisc(const struct netdev *netdev)
3746 {
3747 struct netdev_dev_linux *netdev_dev =
3748 netdev_dev_linux_cast(netdev_get_dev(netdev));
3749 struct ofpbuf request, *qdisc;
3750 const struct tc_ops *ops;
3751 struct tcmsg *tcmsg;
3752 int load_error;
3753 int error;
3754
3755 if (netdev_dev->tc) {
3756 return 0;
3757 }
3758
3759 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3760 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3761 * 2.6.35 without that fix backported to it.
3762 *
3763 * To avoid the OOPS, we must not make a request that would attempt to dump
3764 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3765 * few others. There are a few ways that I can see to do this, but most of
3766 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3767 * technique chosen here is to assume that any non-default qdisc that we
3768 * create will have a class with handle 1:0. The built-in qdiscs only have
3769 * a class with handle 0:0.
3770 *
3771 * We could check for Linux 2.6.35+ and use a more straightforward method
3772 * there. */
3773 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3774 if (!tcmsg) {
3775 return ENODEV;
3776 }
3777 tcmsg->tcm_handle = tc_make_handle(1, 0);
3778 tcmsg->tcm_parent = 0;
3779
3780 /* Figure out what tc class to instantiate. */
3781 error = tc_transact(&request, &qdisc);
3782 if (!error) {
3783 const char *kind;
3784
3785 error = tc_parse_qdisc(qdisc, &kind, NULL);
3786 if (error) {
3787 ops = &tc_ops_other;
3788 } else {
3789 ops = tc_lookup_linux_name(kind);
3790 if (!ops) {
3791 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3792 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3793
3794 ops = &tc_ops_other;
3795 }
3796 }
3797 } else if (error == ENOENT) {
3798 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3799 * other entity that doesn't have a handle 1:0. We will assume
3800 * that it's the system default qdisc. */
3801 ops = &tc_ops_default;
3802 error = 0;
3803 } else {
3804 /* Who knows? Maybe the device got deleted. */
3805 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3806 netdev_get_name(netdev), strerror(error));
3807 ops = &tc_ops_other;
3808 }
3809
3810 /* Instantiate it. */
3811 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3812 assert((load_error == 0) == (netdev_dev->tc != NULL));
3813 ofpbuf_delete(qdisc);
3814
3815 return error ? error : load_error;
3816 }
3817
3818 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3819 approximate the time to transmit packets of various lengths. For an MTU of
3820 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3821 represents two possible packet lengths; for a MTU of 513 through 1024, four
3822 possible lengths; and so on.
3823
3824 Returns, for the specified 'mtu', the number of bits that packet lengths
3825 need to be shifted right to fit within such a 256-entry table. */
3826 static int
3827 tc_calc_cell_log(unsigned int mtu)
3828 {
3829 int cell_log;
3830
3831 if (!mtu) {
3832 mtu = ETH_PAYLOAD_MAX;
3833 }
3834 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3835
3836 for (cell_log = 0; mtu >= 256; cell_log++) {
3837 mtu >>= 1;
3838 }
3839
3840 return cell_log;
3841 }
3842
3843 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3844 * of 'mtu'. */
3845 static void
3846 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3847 {
3848 memset(rate, 0, sizeof *rate);
3849 rate->cell_log = tc_calc_cell_log(mtu);
3850 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3851 /* rate->cell_align = 0; */ /* distro headers. */
3852 rate->mpu = ETH_TOTAL_MIN;
3853 rate->rate = Bps;
3854 }
3855
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * Entry i holds the number of ticks needed to transmit a packet of
 * (i + 1) << rate->cell_log bytes, with sizes below 'rate->mpu' rounded up to
 * 'rate->mpu'.
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *cell = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    uint32_t *const end = cell + TC_RTAB_SIZE / sizeof *cell;
    unsigned int n;

    for (n = 1; cell < end; n++, cell++) {
        unsigned int len = n << rate->cell_log;

        if (len < rate->mpu) {
            len = rate->mpu;
        }
        *cell = tc_bytes_to_ticks(rate->rate, len);
    }
}
3875
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst used is the larger of 'burst_bytes' and one jiffy's worth of data
 * plus 'mtu'; the result is that burst expressed in ticks. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > floor_bytes ? burst_bytes : floor_bytes;

    return tc_bytes_to_ticks(Bps, burst);
}
3886
3887 \f
3888 /* Utility functions. */
3889
3890 static int
3891 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3892 {
3893 /* Policy for RTNLGRP_LINK messages.
3894 *
3895 * There are *many* more fields in these messages, but currently we only
3896 * care about these fields. */
3897 static const struct nl_policy rtnlgrp_link_policy[] = {
3898 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3899 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3900 .min_len = sizeof(struct rtnl_link_stats) },
3901 };
3902
3903 struct ofpbuf request;
3904 struct ofpbuf *reply;
3905 struct ifinfomsg *ifi;
3906 const struct rtnl_link_stats *rtnl_stats;
3907 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
3908 int error;
3909
3910 ofpbuf_init(&request, 0);
3911 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3912 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3913 ifi->ifi_family = PF_UNSPEC;
3914 ifi->ifi_index = ifindex;
3915 error = nl_sock_transact(rtnl_sock, &request, &reply);
3916 ofpbuf_uninit(&request);
3917 if (error) {
3918 return error;
3919 }
3920
3921 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3922 rtnlgrp_link_policy,
3923 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3924 ofpbuf_delete(reply);
3925 return EPROTO;
3926 }
3927
3928 if (!attrs[IFLA_STATS]) {
3929 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3930 ofpbuf_delete(reply);
3931 return EPROTO;
3932 }
3933
3934 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3935 stats->rx_packets = rtnl_stats->rx_packets;
3936 stats->tx_packets = rtnl_stats->tx_packets;
3937 stats->rx_bytes = rtnl_stats->rx_bytes;
3938 stats->tx_bytes = rtnl_stats->tx_bytes;
3939 stats->rx_errors = rtnl_stats->rx_errors;
3940 stats->tx_errors = rtnl_stats->tx_errors;
3941 stats->rx_dropped = rtnl_stats->rx_dropped;
3942 stats->tx_dropped = rtnl_stats->tx_dropped;
3943 stats->multicast = rtnl_stats->multicast;
3944 stats->collisions = rtnl_stats->collisions;
3945 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3946 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3947 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3948 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3949 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3950 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3951 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3952 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3953 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3954 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3955 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3956
3957 ofpbuf_delete(reply);
3958
3959 return 0;
3960 }
3961
3962 static int
3963 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3964 {
3965 static const char fn[] = "/proc/net/dev";
3966 char line[1024];
3967 FILE *stream;
3968 int ln;
3969
3970 stream = fopen(fn, "r");
3971 if (!stream) {
3972 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3973 return errno;
3974 }
3975
3976 ln = 0;
3977 while (fgets(line, sizeof line, stream)) {
3978 if (++ln >= 3) {
3979 char devname[16];
3980 #define X64 "%"SCNu64
3981 if (sscanf(line,
3982 " %15[^:]:"
3983 X64 X64 X64 X64 X64 X64 X64 "%*u"
3984 X64 X64 X64 X64 X64 X64 X64 "%*u",
3985 devname,
3986 &stats->rx_bytes,
3987 &stats->rx_packets,
3988 &stats->rx_errors,
3989 &stats->rx_dropped,
3990 &stats->rx_fifo_errors,
3991 &stats->rx_frame_errors,
3992 &stats->multicast,
3993 &stats->tx_bytes,
3994 &stats->tx_packets,
3995 &stats->tx_errors,
3996 &stats->tx_dropped,
3997 &stats->tx_fifo_errors,
3998 &stats->collisions,
3999 &stats->tx_carrier_errors) != 15) {
4000 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4001 } else if (!strcmp(devname, netdev_name)) {
4002 stats->rx_length_errors = UINT64_MAX;
4003 stats->rx_over_errors = UINT64_MAX;
4004 stats->rx_crc_errors = UINT64_MAX;
4005 stats->rx_missed_errors = UINT64_MAX;
4006 stats->tx_aborted_errors = UINT64_MAX;
4007 stats->tx_heartbeat_errors = UINT64_MAX;
4008 stats->tx_window_errors = UINT64_MAX;
4009 fclose(stream);
4010 return 0;
4011 }
4012 }
4013 }
4014 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4015 fclose(stream);
4016 return ENODEV;
4017 }
4018
4019 static int
4020 get_flags(const struct netdev *netdev, int *flags)
4021 {
4022 struct ifreq ifr;
4023 int error;
4024
4025 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4026 "SIOCGIFFLAGS");
4027 *flags = ifr.ifr_flags;
4028 return error;
4029 }
4030
4031 static int
4032 set_flags(struct netdev *netdev, int flags)
4033 {
4034 struct ifreq ifr;
4035
4036 ifr.ifr_flags = flags;
4037 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4038 "SIOCSIFFLAGS");
4039 }
4040
4041 static int
4042 do_get_ifindex(const char *netdev_name)
4043 {
4044 struct ifreq ifr;
4045
4046 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4047 COVERAGE_INC(netdev_get_ifindex);
4048 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4049 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4050 netdev_name, strerror(errno));
4051 return -errno;
4052 }
4053 return ifr.ifr_ifindex;
4054 }
4055
4056 static int
4057 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4058 {
4059 struct netdev_dev_linux *netdev_dev =
4060 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4061 *ifindexp = 0;
4062 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4063 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4064 if (ifindex < 0) {
4065 return -ifindex;
4066 }
4067 netdev_dev->cache_valid |= VALID_IFINDEX;
4068 netdev_dev->ifindex = ifindex;
4069 }
4070 *ifindexp = netdev_dev->ifindex;
4071 return 0;
4072 }
4073
4074 static int
4075 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4076 {
4077 struct ifreq ifr;
4078 int hwaddr_family;
4079
4080 memset(&ifr, 0, sizeof ifr);
4081 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4082 COVERAGE_INC(netdev_get_hwaddr);
4083 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4084 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4085 netdev_name, strerror(errno));
4086 return errno;
4087 }
4088 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4089 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4090 VLOG_WARN("%s device has unknown hardware address family %d",
4091 netdev_name, hwaddr_family);
4092 }
4093 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4094 return 0;
4095 }
4096
4097 static int
4098 set_etheraddr(const char *netdev_name, int hwaddr_family,
4099 const uint8_t mac[ETH_ADDR_LEN])
4100 {
4101 struct ifreq ifr;
4102
4103 memset(&ifr, 0, sizeof ifr);
4104 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4105 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4106 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4107 COVERAGE_INC(netdev_set_hwaddr);
4108 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4109 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4110 netdev_name, strerror(errno));
4111 return errno;
4112 }
4113 return 0;
4114 }
4115
4116 static int
4117 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4118 int cmd, const char *cmd_name)
4119 {
4120 struct ifreq ifr;
4121
4122 memset(&ifr, 0, sizeof ifr);
4123 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4124 ifr.ifr_data = (caddr_t) ecmd;
4125
4126 ecmd->cmd = cmd;
4127 COVERAGE_INC(netdev_ethtool);
4128 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4129 return 0;
4130 } else {
4131 if (errno != EOPNOTSUPP) {
4132 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4133 "failed: %s", cmd_name, name, strerror(errno));
4134 } else {
4135 /* The device doesn't support this operation. That's pretty
4136 * common, so there's no point in logging anything. */
4137 }
4138 return errno;
4139 }
4140 }
4141
4142 static int
4143 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4144 const char *cmd_name)
4145 {
4146 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4147 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4148 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4149 strerror(errno));
4150 return errno;
4151 }
4152 return 0;
4153 }
4154
4155 static int
4156 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4157 int cmd, const char *cmd_name)
4158 {
4159 struct ifreq ifr;
4160 int error;
4161
4162 ifr.ifr_addr.sa_family = AF_INET;
4163 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4164 if (!error) {
4165 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4166 *ip = sin->sin_addr;
4167 }
4168 return error;
4169 }