]> git.proxmox.com Git - ovs.git/blob - lib/netdev-linux.c
vswitchd: New column "link_resets".
[ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <assert.h>
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <arpa/inet.h>
25 #include <inttypes.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/ip.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
41 #include <net/if.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
46 #include <poll.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <unistd.h>
50
51 #include "coverage.h"
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
55 #include "hash.h"
56 #include "hmap.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
62 #include "ofpbuf.h"
63 #include "openflow/openflow.h"
64 #include "packets.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
67 #include "socket-util.h"
68 #include "shash.h"
69 #include "sset.h"
70 #include "timer.h"
71 #include "vlog.h"
72
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
74
75 COVERAGE_DEFINE(netdev_get_vlan_vid);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
82 \f
/* ADVERTISED_Pause and ADVERTISED_Asym_Pause first appeared in Linux 2.6.14.
 * Supply them here when building against older kernel headers. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause                (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause           (1 << 14)
#endif

/* ETHTOOL_GFLAGS and ETHTOOL_SFLAGS first appeared in Linux 2.6.24.  Supply
 * them here when building against older kernel headers. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS       0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS       0x00000026 /* Set flags bitmap(ethtool_value) */
#endif

/* TC_RTAB_SIZE first appeared in Linux 2.6.25.  Supply it here when building
 * against older kernel headers. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif
106
/* rtnetlink link-change notifier that feeds netdev_linux_cache_cb().  It is
 * created by netdev_linux_create() when 'cache_notifier_refcount' is zero and
 * destroyed by netdev_linux_destroy() when the count drops back to zero. */
static int cache_notifier_refcount;
static struct nln_notifier *netdev_linux_cache_notifier;
109
/* Bits for 'cache_valid' in struct netdev_dev_linux.  A set bit means that
 * the corresponding cached member of that structure is up to date. */
enum {
    VALID_IFINDEX          = 1 << 0,
    VALID_ETHERADDR        = 1 << 1,
    VALID_IN4              = 1 << 2,
    VALID_IN6              = 1 << 3,
    VALID_MTU              = 1 << 4,
    VALID_POLICING         = 1 << 5,
    VALID_HAVE_VPORT_STATS = 1 << 6
};
119
/* Per-device state that only tap devices use. */
struct tap_state {
    int fd;                     /* Descriptor obtained from /dev/net/tun. */
    bool opened;                /* True once one netdev_linux_open() caller
                                 * has been handed 'fd'. */
};
124 \f
125 /* Traffic control. */
126
127 /* An instance of a traffic control class. Always associated with a particular
128 * network device.
129 *
130 * Each TC implementation subclasses this with whatever additional data it
131 * needs. */
132 struct tc {
133 const struct tc_ops *ops;
134 struct hmap queues; /* Contains "struct tc_queue"s.
135 * Read by generic TC layer.
136 * Written only by TC implementation. */
137 };
138
139 /* One traffic control queue.
140 *
141 * Each TC implementation subclasses this with whatever additional data it
142 * needs. */
143 struct tc_queue {
144 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
145 unsigned int queue_id; /* OpenFlow queue ID. */
146 };
147
148 /* A particular kind of traffic control. Each implementation generally maps to
149 * one particular Linux qdisc class.
150 *
151 * The functions below return 0 if successful or a positive errno value on
152 * failure, except where otherwise noted. All of them must be provided, except
153 * where otherwise noted. */
154 struct tc_ops {
155 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
156 * This is null for tc_ops_default and tc_ops_other, for which there are no
157 * appropriate values. */
158 const char *linux_name;
159
160 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
161 const char *ovs_name;
162
163 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
164 * queues. The queues are numbered 0 through n_queues - 1. */
165 unsigned int n_queues;
166
167 /* Called to install this TC class on 'netdev'. The implementation should
168 * make the Netlink calls required to set up 'netdev' with the right qdisc
169 * and configure it according to 'details'. The implementation may assume
170 * that the current qdisc is the default; that is, there is no need for it
171 * to delete the current qdisc before installing itself.
172 *
173 * The contents of 'details' should be documented as valid for 'ovs_name'
174 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
175 * (which is built as ovs-vswitchd.conf.db(8)).
176 *
177 * This function must return 0 if and only if it sets 'netdev->tc' to an
178 * initialized 'struct tc'.
179 *
180 * (This function is null for tc_ops_other, which cannot be installed. For
181 * other TC classes it should always be nonnull.) */
182 int (*tc_install)(struct netdev *netdev, const struct shash *details);
183
184 /* Called when the netdev code determines (through a Netlink query) that
185 * this TC class's qdisc is installed on 'netdev', but we didn't install
186 * it ourselves and so don't know any of the details.
187 *
188 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
189 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
190 * implementation should parse the other attributes of 'nlmsg' as
191 * necessary to determine its configuration. If necessary it should also
192 * use Netlink queries to determine the configuration of queues on
193 * 'netdev'.
194 *
195 * This function must return 0 if and only if it sets 'netdev->tc' to an
196 * initialized 'struct tc'. */
197 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
198
199 /* Destroys the data structures allocated by the implementation as part of
200 * 'tc'. (This includes destroying 'tc->queues' by calling
201 * tc_destroy(tc).
202 *
203 * The implementation should not need to perform any Netlink calls. If
204 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
205 * (But it may not be desirable.)
206 *
207 * This function may be null if 'tc' is trivial. */
208 void (*tc_destroy)(struct tc *tc);
209
210 /* Retrieves details of 'netdev->tc' configuration into 'details'.
211 *
212 * The implementation should not need to perform any Netlink calls, because
213 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
214 * cached the configuration.
215 *
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
219 *
220 * This function may be null if 'tc' is not configurable.
221 */
222 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
223
224 /* Reconfigures 'netdev->tc' according to 'details', performing any
225 * required Netlink calls to complete the reconfiguration.
226 *
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
229 * (which is built as ovs-vswitchd.conf.db(8)).
230 *
231 * This function may be null if 'tc' is not configurable.
232 */
233 int (*qdisc_set)(struct netdev *, const struct shash *details);
234
235 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
236 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
237 *
238 * The contents of 'details' should be documented as valid for 'ovs_name'
239 * in the "other_config" column in the "Queue" table in
240 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
241 *
242 * The implementation should not need to perform any Netlink calls, because
243 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
244 * cached the queue configuration.
245 *
246 * This function may be null if 'tc' does not have queues ('n_queues' is
247 * 0). */
248 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
249 struct shash *details);
250
251 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
252 * 'details', perfoming any required Netlink calls to complete the
253 * reconfiguration. The caller ensures that 'queue_id' is less than
254 * 'n_queues'.
255 *
256 * The contents of 'details' should be documented as valid for 'ovs_name'
257 * in the "other_config" column in the "Queue" table in
258 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
259 *
260 * This function may be null if 'tc' does not have queues or its queues are
261 * not configurable. */
262 int (*class_set)(struct netdev *, unsigned int queue_id,
263 const struct shash *details);
264
265 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
266 * tc_queue's within 'netdev->tc->queues'.
267 *
268 * This function may be null if 'tc' does not have queues or its queues
269 * cannot be deleted. */
270 int (*class_delete)(struct netdev *, struct tc_queue *queue);
271
272 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
273 * 'struct tc_queue's within 'netdev->tc->queues'.
274 *
275 * On success, initializes '*stats'.
276 *
277 * This function may be null if 'tc' does not have queues or if it cannot
278 * report queue statistics. */
279 int (*class_get_stats)(const struct netdev *netdev,
280 const struct tc_queue *queue,
281 struct netdev_queue_stats *stats);
282
283 /* Extracts queue stats from 'nlmsg', which is a response to a
284 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
285 *
286 * This function may be null if 'tc' does not have queues or if it cannot
287 * report queue statistics. */
288 int (*class_dump_stats)(const struct netdev *netdev,
289 const struct ofpbuf *nlmsg,
290 netdev_dump_queue_stats_cb *cb, void *aux);
291 };
292
293 static void
294 tc_init(struct tc *tc, const struct tc_ops *ops)
295 {
296 tc->ops = ops;
297 hmap_init(&tc->queues);
298 }
299
300 static void
301 tc_destroy(struct tc *tc)
302 {
303 hmap_destroy(&tc->queues);
304 }
305
306 static const struct tc_ops tc_ops_htb;
307 static const struct tc_ops tc_ops_hfsc;
308 static const struct tc_ops tc_ops_default;
309 static const struct tc_ops tc_ops_other;
310
311 static const struct tc_ops *tcs[] = {
312 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
313 &tc_ops_hfsc, /* Hierarchical fair service curve. */
314 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
315 &tc_ops_other, /* Some other qdisc. */
316 NULL
317 };
318
319 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
320 static unsigned int tc_get_major(unsigned int handle);
321 static unsigned int tc_get_minor(unsigned int handle);
322
323 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
324 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
325 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
326
327 static struct tcmsg *tc_make_request(const struct netdev *, int type,
328 unsigned int flags, struct ofpbuf *);
329 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
330
331 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
332 struct nlattr **options);
333 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
334 struct nlattr **options,
335 struct netdev_queue_stats *);
336 static int tc_query_class(const struct netdev *,
337 unsigned int handle, unsigned int parent,
338 struct ofpbuf **replyp);
339 static int tc_delete_class(const struct netdev *, unsigned int handle);
340
341 static int tc_del_qdisc(struct netdev *netdev);
342 static int tc_query_qdisc(const struct netdev *netdev);
343
344 static int tc_calc_cell_log(unsigned int mtu);
345 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
346 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
347 const struct tc_ratespec *rate);
348 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
349 \f
350 struct netdev_dev_linux {
351 struct netdev_dev netdev_dev;
352
353 struct shash_node *shash_node;
354 unsigned int cache_valid;
355 unsigned int change_seq;
356
357 bool miimon; /* Link status of last poll. */
358 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
359 struct timer miimon_timer;
360
361 /* The following are figured out "on demand" only. They are only valid
362 * when the corresponding VALID_* bit in 'cache_valid' is set. */
363 int ifindex;
364 uint8_t etheraddr[ETH_ADDR_LEN];
365 struct in_addr address, netmask;
366 struct in6_addr in6;
367 int mtu;
368 bool carrier;
369 long long int carrier_resets;
370 uint32_t kbits_rate; /* Policing data. */
371 uint32_t kbits_burst;
372 bool have_vport_stats;
373 struct tc *tc;
374
375 union {
376 struct tap_state tap;
377 } state;
378 };
379
380 struct netdev_linux {
381 struct netdev netdev;
382 int fd;
383 };
384
385 /* Sockets used for ioctl operations. */
386 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
387
388 /* A Netlink routing socket that is not subscribed to any multicast groups. */
389 static struct nl_sock *rtnl_sock;
390
391 /* This is set pretty low because we probably won't learn anything from the
392 * additional log messages. */
393 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
394
395 static int netdev_linux_init(void);
396
397 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
398 int cmd, const char *cmd_name);
399 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
400 const char *cmd_name);
401 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
402 int cmd, const char *cmd_name);
403 static int get_flags(const struct netdev *, int *flagsp);
404 static int set_flags(struct netdev *, int flags);
405 static int do_get_ifindex(const char *netdev_name);
406 static int get_ifindex(const struct netdev *, int *ifindexp);
407 static int do_set_addr(struct netdev *netdev,
408 int ioctl_nr, const char *ioctl_name,
409 struct in_addr addr);
410 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
411 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
412 const uint8_t[ETH_ADDR_LEN]);
413 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
414 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
415 static int get_carrier_via_sysfs(const char *name, bool *carrier);
416 static int af_packet_sock(void);
417 static void netdev_linux_miimon_run(void);
418 static void netdev_linux_miimon_wait(void);
419
420 static bool
421 is_netdev_linux_class(const struct netdev_class *netdev_class)
422 {
423 return netdev_class->init == netdev_linux_init;
424 }
425
426 static struct netdev_dev_linux *
427 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
428 {
429 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
430 assert(is_netdev_linux_class(netdev_class));
431
432 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
433 }
434
435 static struct netdev_linux *
436 netdev_linux_cast(const struct netdev *netdev)
437 {
438 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
439 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
440 assert(is_netdev_linux_class(netdev_class));
441
442 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
443 }
444 \f
445 static int
446 netdev_linux_init(void)
447 {
448 static int status = -1;
449 if (status < 0) {
450 /* Create AF_INET socket. */
451 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
452 status = af_inet_sock >= 0 ? 0 : errno;
453 if (status) {
454 VLOG_ERR("failed to create inet socket: %s", strerror(status));
455 }
456
457 /* Create rtnetlink socket. */
458 if (!status) {
459 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
460 if (status) {
461 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
462 strerror(status));
463 }
464 }
465 }
466 return status;
467 }
468
/* Periodic work for this file: runs the rtnetlink link notifier, then the
 * miimon poller. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();

    netdev_linux_miimon_run();
}
475
/* Wait-side counterpart of netdev_linux_run(): calls the wait function of
 * the rtnetlink link notifier, then that of the miimon poller. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();

    netdev_linux_miimon_wait();
}
482
483 static void
484 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
485 {
486 dev->change_seq++;
487 if (!dev->change_seq) {
488 dev->change_seq++;
489 }
490 dev->cache_valid = 0;
491 }
492
493 static void
494 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
495 void *aux OVS_UNUSED)
496 {
497 struct netdev_dev_linux *dev;
498 if (change) {
499 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
500 if (base_dev) {
501 const struct netdev_class *netdev_class =
502 netdev_dev_get_class(base_dev);
503
504 if (is_netdev_linux_class(netdev_class)) {
505 dev = netdev_dev_linux_cast(base_dev);
506
507 if (dev->carrier != change->running) {
508 dev->carrier = change->running;
509 dev->carrier_resets++;
510 }
511
512 netdev_dev_linux_changed(dev);
513 }
514 }
515 } else {
516 struct shash device_shash;
517 struct shash_node *node;
518
519 shash_init(&device_shash);
520 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
521 SHASH_FOR_EACH (node, &device_shash) {
522 bool carrier;
523
524 dev = node->data;
525
526 get_carrier_via_sysfs(node->name, &carrier);
527 if (dev->carrier != carrier) {
528 dev->carrier = carrier;
529 dev->carrier_resets++;
530 }
531
532 netdev_dev_linux_changed(dev);
533 }
534 shash_destroy(&device_shash);
535 }
536 }
537
538 /* Creates system and internal devices. */
539 static int
540 netdev_linux_create(const struct netdev_class *class, const char *name,
541 struct netdev_dev **netdev_devp)
542 {
543 struct netdev_dev_linux *netdev_dev;
544
545 if (!cache_notifier_refcount) {
546 assert(!netdev_linux_cache_notifier);
547
548 netdev_linux_cache_notifier =
549 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
550
551 if (!netdev_linux_cache_notifier) {
552 return EINVAL;
553 }
554 }
555 cache_notifier_refcount++;
556
557 netdev_dev = xzalloc(sizeof *netdev_dev);
558 netdev_dev->change_seq = 1;
559 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
560 get_carrier_via_sysfs(name, &netdev_dev->carrier);
561
562 *netdev_devp = &netdev_dev->netdev_dev;
563 return 0;
564 }
565
566 /* For most types of netdevs we open the device for each call of
567 * netdev_open(). However, this is not the case with tap devices,
568 * since it is only possible to open the device once. In this
569 * situation we share a single file descriptor, and consequently
570 * buffers, across all readers. Therefore once data is read it will
571 * be unavailable to other reads for tap devices. */
572 static int
573 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
574 const char *name, struct netdev_dev **netdev_devp)
575 {
576 struct netdev_dev_linux *netdev_dev;
577 struct tap_state *state;
578 static const char tap_dev[] = "/dev/net/tun";
579 struct ifreq ifr;
580 int error;
581
582 netdev_dev = xzalloc(sizeof *netdev_dev);
583 state = &netdev_dev->state.tap;
584
585 /* Open tap device. */
586 state->fd = open(tap_dev, O_RDWR);
587 if (state->fd < 0) {
588 error = errno;
589 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
590 goto error;
591 }
592
593 /* Create tap device. */
594 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
595 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
596 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
597 VLOG_WARN("%s: creating tap device failed: %s", name,
598 strerror(errno));
599 error = errno;
600 goto error;
601 }
602
603 /* Make non-blocking. */
604 error = set_nonblocking(state->fd);
605 if (error) {
606 goto error;
607 }
608
609 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
610 *netdev_devp = &netdev_dev->netdev_dev;
611 return 0;
612
613 error:
614 free(netdev_dev);
615 return error;
616 }
617
618 static void
619 destroy_tap(struct netdev_dev_linux *netdev_dev)
620 {
621 struct tap_state *state = &netdev_dev->state.tap;
622
623 if (state->fd >= 0) {
624 close(state->fd);
625 }
626 }
627
628 /* Destroys the netdev device 'netdev_dev_'. */
629 static void
630 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
631 {
632 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
633 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
634
635 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
636 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
637 }
638
639 if (class == &netdev_linux_class || class == &netdev_internal_class) {
640 cache_notifier_refcount--;
641
642 if (!cache_notifier_refcount) {
643 assert(netdev_linux_cache_notifier);
644 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
645 netdev_linux_cache_notifier = NULL;
646 }
647 } else if (class == &netdev_tap_class) {
648 destroy_tap(netdev_dev);
649 } else {
650 NOT_REACHED();
651 }
652
653 free(netdev_dev);
654 }
655
656 static int
657 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
658 {
659 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
660 struct netdev_linux *netdev;
661 enum netdev_flags flags;
662 int error;
663
664 /* Allocate network device. */
665 netdev = xzalloc(sizeof *netdev);
666 netdev->fd = -1;
667 netdev_init(&netdev->netdev, netdev_dev_);
668
669 /* Verify that the device really exists, by attempting to read its flags.
670 * (The flags might be cached, in which case this won't actually do an
671 * ioctl.)
672 *
673 * Don't do this for "internal" netdevs, though, because those have to be
674 * created as netdev objects before they exist in the kernel, because
675 * creating them in the kernel happens by passing a netdev object to
676 * dpif_port_add(). */
677 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
678 error = netdev_get_flags(&netdev->netdev, &flags);
679 if (error == ENODEV) {
680 goto error;
681 }
682 }
683
684 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
685 !netdev_dev->state.tap.opened) {
686
687 /* We assume that the first user of the tap device is the primary user
688 * and give them the tap FD. Subsequent users probably just expect
689 * this to be a system device so open it normally to avoid send/receive
690 * directions appearing to be reversed. */
691 netdev->fd = netdev_dev->state.tap.fd;
692 netdev_dev->state.tap.opened = true;
693 }
694
695 *netdevp = &netdev->netdev;
696 return 0;
697
698 error:
699 netdev_uninit(&netdev->netdev, true);
700 return error;
701 }
702
703 /* Closes and destroys 'netdev'. */
704 static void
705 netdev_linux_close(struct netdev *netdev_)
706 {
707 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
708
709 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
710 close(netdev->fd);
711 }
712 free(netdev);
713 }
714
715 static int
716 netdev_linux_listen(struct netdev *netdev_)
717 {
718 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
719 struct sockaddr_ll sll;
720 int ifindex;
721 int error;
722 int fd;
723
724 if (netdev->fd >= 0) {
725 return 0;
726 }
727
728 /* Create file descriptor. */
729 fd = socket(PF_PACKET, SOCK_RAW, 0);
730 if (fd < 0) {
731 error = errno;
732 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
733 goto error;
734 }
735
736 /* Set non-blocking mode. */
737 error = set_nonblocking(fd);
738 if (error) {
739 goto error;
740 }
741
742 /* Get ethernet device index. */
743 error = get_ifindex(&netdev->netdev, &ifindex);
744 if (error) {
745 goto error;
746 }
747
748 /* Bind to specific ethernet device. */
749 memset(&sll, 0, sizeof sll);
750 sll.sll_family = AF_PACKET;
751 sll.sll_ifindex = ifindex;
752 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
753 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
754 error = errno;
755 VLOG_ERR("%s: failed to bind raw socket (%s)",
756 netdev_get_name(netdev_), strerror(error));
757 goto error;
758 }
759
760 netdev->fd = fd;
761 return 0;
762
763 error:
764 if (fd >= 0) {
765 close(fd);
766 }
767 return error;
768 }
769
770 static int
771 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
772 {
773 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
774
775 if (netdev->fd < 0) {
776 /* Device is not listening. */
777 return -EAGAIN;
778 }
779
780 for (;;) {
781 ssize_t retval = read(netdev->fd, data, size);
782 if (retval >= 0) {
783 return retval;
784 } else if (errno != EINTR) {
785 if (errno != EAGAIN) {
786 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
787 strerror(errno), netdev_get_name(netdev_));
788 }
789 return -errno;
790 }
791 }
792 }
793
794 /* Registers with the poll loop to wake up from the next call to poll_block()
795 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
796 static void
797 netdev_linux_recv_wait(struct netdev *netdev_)
798 {
799 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
800 if (netdev->fd >= 0) {
801 poll_fd_wait(netdev->fd, POLLIN);
802 }
803 }
804
805 /* Discards all packets waiting to be received from 'netdev'. */
806 static int
807 netdev_linux_drain(struct netdev *netdev_)
808 {
809 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
810 if (netdev->fd < 0) {
811 return 0;
812 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
813 struct ifreq ifr;
814 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
815 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
816 if (error) {
817 return error;
818 }
819 drain_fd(netdev->fd, ifr.ifr_qlen);
820 return 0;
821 } else {
822 return drain_rcvbuf(netdev->fd);
823 }
824 }
825
826 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
827 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
828 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
829 * the packet is too big or too small to transmit on the device.
830 *
831 * The caller retains ownership of 'buffer' in all cases.
832 *
833 * The kernel maintains a packet transmission queue, so the caller is not
834 * expected to do additional queuing of packets. */
835 static int
836 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
837 {
838 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
839 for (;;) {
840 ssize_t retval;
841
842 if (netdev->fd < 0) {
843 /* Use our AF_PACKET socket to send to this device. */
844 struct sockaddr_ll sll;
845 struct msghdr msg;
846 struct iovec iov;
847 int ifindex;
848 int error;
849 int sock;
850
851 sock = af_packet_sock();
852 if (sock < 0) {
853 return sock;
854 }
855
856 error = get_ifindex(netdev_, &ifindex);
857 if (error) {
858 return error;
859 }
860
861 /* We don't bother setting most fields in sockaddr_ll because the
862 * kernel ignores them for SOCK_RAW. */
863 memset(&sll, 0, sizeof sll);
864 sll.sll_family = AF_PACKET;
865 sll.sll_ifindex = ifindex;
866
867 iov.iov_base = (void *) data;
868 iov.iov_len = size;
869
870 msg.msg_name = &sll;
871 msg.msg_namelen = sizeof sll;
872 msg.msg_iov = &iov;
873 msg.msg_iovlen = 1;
874 msg.msg_control = NULL;
875 msg.msg_controllen = 0;
876 msg.msg_flags = 0;
877
878 retval = sendmsg(sock, &msg, 0);
879 } else {
880 /* Use the netdev's own fd to send to this device. This is
881 * essential for tap devices, because packets sent to a tap device
882 * with an AF_PACKET socket will loop back to be *received* again
883 * on the tap device. */
884 retval = write(netdev->fd, data, size);
885 }
886
887 if (retval < 0) {
888 /* The Linux AF_PACKET implementation never blocks waiting for room
889 * for packets, instead returning ENOBUFS. Translate this into
890 * EAGAIN for the caller. */
891 if (errno == ENOBUFS) {
892 return EAGAIN;
893 } else if (errno == EINTR) {
894 continue;
895 } else if (errno != EAGAIN) {
896 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
897 netdev_get_name(netdev_), strerror(errno));
898 }
899 return errno;
900 } else if (retval != size) {
901 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
902 "%zu) on %s", retval, size, netdev_get_name(netdev_));
903 return EMSGSIZE;
904 } else {
905 return 0;
906 }
907 }
908 }
909
910 /* Registers with the poll loop to wake up from the next call to poll_block()
911 * when the packet transmission queue has sufficient room to transmit a packet
912 * with netdev_send().
913 *
914 * The kernel maintains a packet transmission queue, so the client is not
915 * expected to do additional queuing of packets. Thus, this function is
916 * unlikely to ever be used. It is included for completeness. */
917 static void
918 netdev_linux_send_wait(struct netdev *netdev_)
919 {
920 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
921 if (netdev->fd < 0) {
922 /* Nothing to do. */
923 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
924 poll_fd_wait(netdev->fd, POLLOUT);
925 } else {
926 /* TAP device always accepts packets.*/
927 poll_immediate_wake();
928 }
929 }
930
931 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
932 * otherwise a positive errno value. */
933 static int
934 netdev_linux_set_etheraddr(struct netdev *netdev_,
935 const uint8_t mac[ETH_ADDR_LEN])
936 {
937 struct netdev_dev_linux *netdev_dev =
938 netdev_dev_linux_cast(netdev_get_dev(netdev_));
939 int error;
940
941 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
942 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
943 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
944 if (!error) {
945 netdev_dev->cache_valid |= VALID_ETHERADDR;
946 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
947 }
948 } else {
949 error = 0;
950 }
951 return error;
952 }
953
954 /* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
955 * free the returned buffer. */
956 static int
957 netdev_linux_get_etheraddr(const struct netdev *netdev_,
958 uint8_t mac[ETH_ADDR_LEN])
959 {
960 struct netdev_dev_linux *netdev_dev =
961 netdev_dev_linux_cast(netdev_get_dev(netdev_));
962 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
963 int error = get_etheraddr(netdev_get_name(netdev_),
964 netdev_dev->etheraddr);
965 if (error) {
966 return error;
967 }
968 netdev_dev->cache_valid |= VALID_ETHERADDR;
969 }
970 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
971 return 0;
972 }
973
974 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
975 * in bytes, not including the hardware header; thus, this is typically 1500
976 * bytes for Ethernet devices. */
977 static int
978 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
979 {
980 struct netdev_dev_linux *netdev_dev =
981 netdev_dev_linux_cast(netdev_get_dev(netdev_));
982 if (!(netdev_dev->cache_valid & VALID_MTU)) {
983 struct ifreq ifr;
984 int error;
985
986 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
987 SIOCGIFMTU, "SIOCGIFMTU");
988 if (error) {
989 return error;
990 }
991 netdev_dev->mtu = ifr.ifr_mtu;
992 netdev_dev->cache_valid |= VALID_MTU;
993 }
994 *mtup = netdev_dev->mtu;
995 return 0;
996 }
997
998 /* Sets the maximum size of transmitted (MTU) for given device using linux
999 * networking ioctl interface.
1000 */
1001 static int
1002 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1003 {
1004 struct netdev_dev_linux *netdev_dev =
1005 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1006 struct ifreq ifr;
1007 int error;
1008
1009 ifr.ifr_mtu = mtu;
1010 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1011 SIOCSIFMTU, "SIOCSIFMTU");
1012 if (error) {
1013 return error;
1014 }
1015
1016 netdev_dev->mtu = ifr.ifr_mtu;
1017 netdev_dev->cache_valid |= VALID_MTU;
1018 return 0;
1019 }
1020
/* Looks up the ifindex of 'netdev' with get_ifindex().  Returns the ifindex
 * on success.  On failure, returns the negation of get_ifindex()'s error
 * code, i.e. a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1031
1032 static int
1033 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1034 {
1035 struct netdev_dev_linux *netdev_dev =
1036 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1037
1038 if (netdev_dev->miimon_interval > 0) {
1039 *carrier = netdev_dev->miimon;
1040 } else {
1041 *carrier = netdev_dev->carrier;
1042 }
1043
1044 return 0;
1045 }
1046
1047 static long long int
1048 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1049 {
1050 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1051 }
1052
1053 static int
1054 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1055 struct mii_ioctl_data *data)
1056 {
1057 struct ifreq ifr;
1058 int error;
1059
1060 memset(&ifr, 0, sizeof ifr);
1061 memcpy(&ifr.ifr_data, data, sizeof *data);
1062 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1063 memcpy(data, &ifr.ifr_data, sizeof *data);
1064
1065 return error;
1066 }
1067
1068 static int
1069 netdev_linux_get_miimon(const char *name, bool *miimon)
1070 {
1071 struct mii_ioctl_data data;
1072 int error;
1073
1074 *miimon = false;
1075
1076 memset(&data, 0, sizeof data);
1077 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1078 if (!error) {
1079 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1080 data.reg_num = MII_BMSR;
1081 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1082 &data);
1083
1084 if (!error) {
1085 *miimon = !!(data.val_out & BMSR_LSTATUS);
1086 } else {
1087 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1088 }
1089 } else {
1090 struct ethtool_cmd ecmd;
1091
1092 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1093 name);
1094
1095 memset(&ecmd, 0, sizeof ecmd);
1096 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1097 "ETHTOOL_GLINK");
1098 if (!error) {
1099 struct ethtool_value eval;
1100
1101 memcpy(&eval, &ecmd, sizeof eval);
1102 *miimon = !!eval.data;
1103 } else {
1104 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1105 }
1106 }
1107
1108 return error;
1109 }
1110
1111 static int
1112 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1113 long long int interval)
1114 {
1115 struct netdev_dev_linux *netdev_dev;
1116
1117 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1118
1119 interval = interval > 0 ? MAX(interval, 100) : 0;
1120 if (netdev_dev->miimon_interval != interval) {
1121 netdev_dev->miimon_interval = interval;
1122 timer_set_expired(&netdev_dev->miimon_timer);
1123 }
1124
1125 return 0;
1126 }
1127
1128 static void
1129 netdev_linux_miimon_run(void)
1130 {
1131 struct shash device_shash;
1132 struct shash_node *node;
1133
1134 shash_init(&device_shash);
1135 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1136 SHASH_FOR_EACH (node, &device_shash) {
1137 struct netdev_dev_linux *dev = node->data;
1138 bool miimon;
1139
1140 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1141 continue;
1142 }
1143
1144 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1145 if (miimon != dev->miimon) {
1146 dev->miimon = miimon;
1147 netdev_dev_linux_changed(dev);
1148 }
1149
1150 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1151 }
1152
1153 shash_destroy(&device_shash);
1154 }
1155
1156 static void
1157 netdev_linux_miimon_wait(void)
1158 {
1159 struct shash device_shash;
1160 struct shash_node *node;
1161
1162 shash_init(&device_shash);
1163 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1164 SHASH_FOR_EACH (node, &device_shash) {
1165 struct netdev_dev_linux *dev = node->data;
1166
1167 if (dev->miimon_interval > 0) {
1168 timer_wait(&dev->miimon_timer);
1169 }
1170 }
1171 shash_destroy(&device_shash);
1172 }
1173
1174 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1175 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1176 * enabled. */
1177 static bool
1178 check_for_working_netlink_stats(void)
1179 {
1180 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1181 * preferable, so if that works, we'll use it. */
1182 int ifindex = do_get_ifindex("lo");
1183 if (ifindex < 0) {
1184 VLOG_WARN("failed to get ifindex for lo, "
1185 "obtaining netdev stats from proc");
1186 return false;
1187 } else {
1188 struct netdev_stats stats;
1189 int error = get_stats_via_netlink(ifindex, &stats);
1190 if (!error) {
1191 VLOG_DBG("obtaining netdev stats via rtnetlink");
1192 return true;
1193 } else {
1194 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1195 "via proc (you are probably running a pre-2.6.19 "
1196 "kernel)", strerror(error));
1197 return false;
1198 }
1199 }
1200 }
1201
/* Exchanges the values stored in '*a' and '*b'.  Passing the same pointer for
 * both leaves the value unchanged. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_a = *a;
    const uint64_t old_b = *b;

    *a = old_b;
    *b = old_a;
}
1209
1210 static void
1211 get_stats_via_vport(const struct netdev *netdev_,
1212 struct netdev_stats *stats)
1213 {
1214 struct netdev_dev_linux *netdev_dev =
1215 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1216
1217 if (netdev_dev->have_vport_stats ||
1218 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1219 int error;
1220
1221 error = netdev_vport_get_stats(netdev_, stats);
1222 if (error) {
1223 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed %d",
1224 netdev_get_name(netdev_), error);
1225 }
1226 netdev_dev->have_vport_stats = !error;
1227 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1228 }
1229 }
1230
1231 static int
1232 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1233 struct netdev_stats *stats)
1234 {
1235 static int use_netlink_stats = -1;
1236 int error;
1237
1238 if (use_netlink_stats < 0) {
1239 use_netlink_stats = check_for_working_netlink_stats();
1240 }
1241
1242 if (use_netlink_stats) {
1243 int ifindex;
1244
1245 error = get_ifindex(netdev_, &ifindex);
1246 if (!error) {
1247 error = get_stats_via_netlink(ifindex, stats);
1248 }
1249 } else {
1250 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1251 }
1252
1253 if (error) {
1254 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1255 netdev_get_name(netdev_), error);
1256 }
1257 return error;
1258
1259 }
1260
1261 /* Retrieves current device stats for 'netdev-linux'. */
1262 static int
1263 netdev_linux_get_stats(const struct netdev *netdev_,
1264 struct netdev_stats *stats)
1265 {
1266 struct netdev_dev_linux *netdev_dev =
1267 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1268 struct netdev_stats dev_stats;
1269 int error;
1270
1271 get_stats_via_vport(netdev_, stats);
1272
1273 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1274
1275 if (error) {
1276 if (!netdev_dev->have_vport_stats) {
1277 return error;
1278 } else {
1279 return 0;
1280 }
1281 }
1282
1283 if (!netdev_dev->have_vport_stats) {
1284 /* stats not available from OVS then use ioctl stats. */
1285 *stats = dev_stats;
1286 } else {
1287 stats->rx_errors += dev_stats.rx_errors;
1288 stats->tx_errors += dev_stats.tx_errors;
1289 stats->rx_dropped += dev_stats.rx_dropped;
1290 stats->tx_dropped += dev_stats.tx_dropped;
1291 stats->multicast += dev_stats.multicast;
1292 stats->collisions += dev_stats.collisions;
1293 stats->rx_length_errors += dev_stats.rx_length_errors;
1294 stats->rx_over_errors += dev_stats.rx_over_errors;
1295 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1296 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1297 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1298 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1299 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1300 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1301 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1302 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1303 stats->tx_window_errors += dev_stats.tx_window_errors;
1304 }
1305 return 0;
1306 }
1307
1308 /* Retrieves current device stats for 'netdev-tap' netdev or
1309 * netdev-internal. */
1310 static int
1311 netdev_pseudo_get_stats(const struct netdev *netdev_,
1312 struct netdev_stats *stats)
1313 {
1314 struct netdev_dev_linux *netdev_dev =
1315 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1316 struct netdev_stats dev_stats;
1317 int error;
1318
1319 get_stats_via_vport(netdev_, stats);
1320
1321 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1322 if (error) {
1323 if (!netdev_dev->have_vport_stats) {
1324 return error;
1325 } else {
1326 return 0;
1327 }
1328 }
1329
1330 /* If this port is an internal port then the transmit and receive stats
1331 * will appear to be swapped relative to the other ports since we are the
1332 * one sending the data, not a remote computer. For consistency, we swap
1333 * them back here. This does not apply if we are getting stats from the
1334 * vport layer because it always tracks stats from the perspective of the
1335 * switch. */
1336 if (!netdev_dev->have_vport_stats) {
1337 *stats = dev_stats;
1338 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1339 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1340 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1341 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1342 stats->rx_length_errors = 0;
1343 stats->rx_over_errors = 0;
1344 stats->rx_crc_errors = 0;
1345 stats->rx_frame_errors = 0;
1346 stats->rx_fifo_errors = 0;
1347 stats->rx_missed_errors = 0;
1348 stats->tx_aborted_errors = 0;
1349 stats->tx_carrier_errors = 0;
1350 stats->tx_fifo_errors = 0;
1351 stats->tx_heartbeat_errors = 0;
1352 stats->tx_window_errors = 0;
1353 } else {
1354 stats->rx_dropped += dev_stats.tx_dropped;
1355 stats->tx_dropped += dev_stats.rx_dropped;
1356
1357 stats->rx_errors += dev_stats.tx_errors;
1358 stats->tx_errors += dev_stats.rx_errors;
1359
1360 stats->multicast += dev_stats.multicast;
1361 stats->collisions += dev_stats.collisions;
1362 }
1363 return 0;
1364 }
1365
1366 /* Stores the features supported by 'netdev' into each of '*current',
1367 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1368 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1369 * successful, otherwise a positive errno value. */
1370 static int
1371 netdev_linux_get_features(const struct netdev *netdev,
1372 uint32_t *current, uint32_t *advertised,
1373 uint32_t *supported, uint32_t *peer)
1374 {
1375 struct ethtool_cmd ecmd;
1376 int error;
1377
1378 memset(&ecmd, 0, sizeof ecmd);
1379 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1380 ETHTOOL_GSET, "ETHTOOL_GSET");
1381 if (error) {
1382 return error;
1383 }
1384
1385 /* Supported features. */
1386 *supported = 0;
1387 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1388 *supported |= OFPPF_10MB_HD;
1389 }
1390 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1391 *supported |= OFPPF_10MB_FD;
1392 }
1393 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1394 *supported |= OFPPF_100MB_HD;
1395 }
1396 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1397 *supported |= OFPPF_100MB_FD;
1398 }
1399 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1400 *supported |= OFPPF_1GB_HD;
1401 }
1402 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1403 *supported |= OFPPF_1GB_FD;
1404 }
1405 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1406 *supported |= OFPPF_10GB_FD;
1407 }
1408 if (ecmd.supported & SUPPORTED_TP) {
1409 *supported |= OFPPF_COPPER;
1410 }
1411 if (ecmd.supported & SUPPORTED_FIBRE) {
1412 *supported |= OFPPF_FIBER;
1413 }
1414 if (ecmd.supported & SUPPORTED_Autoneg) {
1415 *supported |= OFPPF_AUTONEG;
1416 }
1417 if (ecmd.supported & SUPPORTED_Pause) {
1418 *supported |= OFPPF_PAUSE;
1419 }
1420 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1421 *supported |= OFPPF_PAUSE_ASYM;
1422 }
1423
1424 /* Advertised features. */
1425 *advertised = 0;
1426 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1427 *advertised |= OFPPF_10MB_HD;
1428 }
1429 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1430 *advertised |= OFPPF_10MB_FD;
1431 }
1432 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1433 *advertised |= OFPPF_100MB_HD;
1434 }
1435 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1436 *advertised |= OFPPF_100MB_FD;
1437 }
1438 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1439 *advertised |= OFPPF_1GB_HD;
1440 }
1441 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1442 *advertised |= OFPPF_1GB_FD;
1443 }
1444 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1445 *advertised |= OFPPF_10GB_FD;
1446 }
1447 if (ecmd.advertising & ADVERTISED_TP) {
1448 *advertised |= OFPPF_COPPER;
1449 }
1450 if (ecmd.advertising & ADVERTISED_FIBRE) {
1451 *advertised |= OFPPF_FIBER;
1452 }
1453 if (ecmd.advertising & ADVERTISED_Autoneg) {
1454 *advertised |= OFPPF_AUTONEG;
1455 }
1456 if (ecmd.advertising & ADVERTISED_Pause) {
1457 *advertised |= OFPPF_PAUSE;
1458 }
1459 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1460 *advertised |= OFPPF_PAUSE_ASYM;
1461 }
1462
1463 /* Current settings. */
1464 if (ecmd.speed == SPEED_10) {
1465 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1466 } else if (ecmd.speed == SPEED_100) {
1467 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1468 } else if (ecmd.speed == SPEED_1000) {
1469 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1470 } else if (ecmd.speed == SPEED_10000) {
1471 *current = OFPPF_10GB_FD;
1472 } else {
1473 *current = 0;
1474 }
1475
1476 if (ecmd.port == PORT_TP) {
1477 *current |= OFPPF_COPPER;
1478 } else if (ecmd.port == PORT_FIBRE) {
1479 *current |= OFPPF_FIBER;
1480 }
1481
1482 if (ecmd.autoneg) {
1483 *current |= OFPPF_AUTONEG;
1484 }
1485
1486 /* Peer advertisements. */
1487 *peer = 0; /* XXX */
1488
1489 return 0;
1490 }
1491
1492 /* Set the features advertised by 'netdev' to 'advertise'. */
1493 static int
1494 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1495 {
1496 struct ethtool_cmd ecmd;
1497 int error;
1498
1499 memset(&ecmd, 0, sizeof ecmd);
1500 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1501 ETHTOOL_GSET, "ETHTOOL_GSET");
1502 if (error) {
1503 return error;
1504 }
1505
1506 ecmd.advertising = 0;
1507 if (advertise & OFPPF_10MB_HD) {
1508 ecmd.advertising |= ADVERTISED_10baseT_Half;
1509 }
1510 if (advertise & OFPPF_10MB_FD) {
1511 ecmd.advertising |= ADVERTISED_10baseT_Full;
1512 }
1513 if (advertise & OFPPF_100MB_HD) {
1514 ecmd.advertising |= ADVERTISED_100baseT_Half;
1515 }
1516 if (advertise & OFPPF_100MB_FD) {
1517 ecmd.advertising |= ADVERTISED_100baseT_Full;
1518 }
1519 if (advertise & OFPPF_1GB_HD) {
1520 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1521 }
1522 if (advertise & OFPPF_1GB_FD) {
1523 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1524 }
1525 if (advertise & OFPPF_10GB_FD) {
1526 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1527 }
1528 if (advertise & OFPPF_COPPER) {
1529 ecmd.advertising |= ADVERTISED_TP;
1530 }
1531 if (advertise & OFPPF_FIBER) {
1532 ecmd.advertising |= ADVERTISED_FIBRE;
1533 }
1534 if (advertise & OFPPF_AUTONEG) {
1535 ecmd.advertising |= ADVERTISED_Autoneg;
1536 }
1537 if (advertise & OFPPF_PAUSE) {
1538 ecmd.advertising |= ADVERTISED_Pause;
1539 }
1540 if (advertise & OFPPF_PAUSE_ASYM) {
1541 ecmd.advertising |= ADVERTISED_Asym_Pause;
1542 }
1543 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1544 ETHTOOL_SSET, "ETHTOOL_SSET");
1545 }
1546
1547 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1548 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1549 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1550 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1551 * sets '*vlan_vid' to -1. */
1552 static int
1553 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1554 {
1555 const char *netdev_name = netdev_get_name(netdev);
1556 struct ds line = DS_EMPTY_INITIALIZER;
1557 FILE *stream = NULL;
1558 int error;
1559 char *fn;
1560
1561 COVERAGE_INC(netdev_get_vlan_vid);
1562 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1563 stream = fopen(fn, "r");
1564 if (!stream) {
1565 error = errno;
1566 goto done;
1567 }
1568
1569 if (ds_get_line(&line, stream)) {
1570 if (ferror(stream)) {
1571 error = errno;
1572 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1573 } else {
1574 error = EPROTO;
1575 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1576 }
1577 goto done;
1578 }
1579
1580 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1581 error = EPROTO;
1582 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1583 fn, ds_cstr(&line));
1584 goto done;
1585 }
1586
1587 error = 0;
1588
1589 done:
1590 free(fn);
1591 if (stream) {
1592 fclose(stream);
1593 }
1594 ds_destroy(&line);
1595 if (error) {
1596 *vlan_vid = -1;
1597 }
1598 return error;
1599 }
1600
/* Shell command templates used to install ingress policing.
 *
 * POLICE_ADD_CMD takes the device name and creates the ingress qdisc.
 *
 * POLICE_CONFIG_CMD takes the device name, the rate in kbit, and the burst in
 * k, in that order, and attaches the policing filter.  The two numeric
 * arguments must be uint32_t, which is why they are formatted with PRIu32
 * rather than %d: a value above INT_MAX would otherwise be printed as a
 * negative number. */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %"PRIu32"kbit burst %"PRIu32"k mtu 65535 drop flowid :1"
1603
1604 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1605 * positive errno value.
1606 *
1607 * This function is equivalent to running
1608 * /sbin/tc qdisc del dev %s handle ffff: ingress
1609 * but it is much, much faster.
1610 */
1611 static int
1612 netdev_linux_remove_policing(struct netdev *netdev)
1613 {
1614 struct netdev_dev_linux *netdev_dev =
1615 netdev_dev_linux_cast(netdev_get_dev(netdev));
1616 const char *netdev_name = netdev_get_name(netdev);
1617
1618 struct ofpbuf request;
1619 struct tcmsg *tcmsg;
1620 int error;
1621
1622 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1623 if (!tcmsg) {
1624 return ENODEV;
1625 }
1626 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1627 tcmsg->tcm_parent = TC_H_INGRESS;
1628 nl_msg_put_string(&request, TCA_KIND, "ingress");
1629 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1630
1631 error = tc_transact(&request, NULL);
1632 if (error && error != ENOENT && error != EINVAL) {
1633 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1634 netdev_name, strerror(error));
1635 return error;
1636 }
1637
1638 netdev_dev->kbits_rate = 0;
1639 netdev_dev->kbits_burst = 0;
1640 netdev_dev->cache_valid |= VALID_POLICING;
1641 return 0;
1642 }
1643
1644 /* Attempts to set input rate limiting (policing) policy. */
1645 static int
1646 netdev_linux_set_policing(struct netdev *netdev,
1647 uint32_t kbits_rate, uint32_t kbits_burst)
1648 {
1649 struct netdev_dev_linux *netdev_dev =
1650 netdev_dev_linux_cast(netdev_get_dev(netdev));
1651 const char *netdev_name = netdev_get_name(netdev);
1652 char command[1024];
1653
1654 COVERAGE_INC(netdev_set_policing);
1655
1656 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1657 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1658 : kbits_burst); /* Stick with user-specified value. */
1659
1660 if (netdev_dev->cache_valid & VALID_POLICING
1661 && netdev_dev->kbits_rate == kbits_rate
1662 && netdev_dev->kbits_burst == kbits_burst) {
1663 /* Assume that settings haven't changed since we last set them. */
1664 return 0;
1665 }
1666
1667 netdev_linux_remove_policing(netdev);
1668 if (kbits_rate) {
1669 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1670 if (system(command) != 0) {
1671 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1672 return -1;
1673 }
1674
1675 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1676 kbits_rate, kbits_burst);
1677 if (system(command) != 0) {
1678 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1679 netdev_name);
1680 return -1;
1681 }
1682
1683 netdev_dev->kbits_rate = kbits_rate;
1684 netdev_dev->kbits_burst = kbits_burst;
1685 netdev_dev->cache_valid |= VALID_POLICING;
1686 }
1687
1688 return 0;
1689 }
1690
1691 static int
1692 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1693 struct sset *types)
1694 {
1695 const struct tc_ops **opsp;
1696
1697 for (opsp = tcs; *opsp != NULL; opsp++) {
1698 const struct tc_ops *ops = *opsp;
1699 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1700 sset_add(types, ops->ovs_name);
1701 }
1702 }
1703 return 0;
1704 }
1705
1706 static const struct tc_ops *
1707 tc_lookup_ovs_name(const char *name)
1708 {
1709 const struct tc_ops **opsp;
1710
1711 for (opsp = tcs; *opsp != NULL; opsp++) {
1712 const struct tc_ops *ops = *opsp;
1713 if (!strcmp(name, ops->ovs_name)) {
1714 return ops;
1715 }
1716 }
1717 return NULL;
1718 }
1719
1720 static const struct tc_ops *
1721 tc_lookup_linux_name(const char *name)
1722 {
1723 const struct tc_ops **opsp;
1724
1725 for (opsp = tcs; *opsp != NULL; opsp++) {
1726 const struct tc_ops *ops = *opsp;
1727 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1728 return ops;
1729 }
1730 }
1731 return NULL;
1732 }
1733
1734 static struct tc_queue *
1735 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1736 size_t hash)
1737 {
1738 struct netdev_dev_linux *netdev_dev =
1739 netdev_dev_linux_cast(netdev_get_dev(netdev));
1740 struct tc_queue *queue;
1741
1742 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1743 if (queue->queue_id == queue_id) {
1744 return queue;
1745 }
1746 }
1747 return NULL;
1748 }
1749
/* Returns the queue of 'netdev' with the given 'queue_id', or NULL if there
 * is none.  The queue ID is hashed with hash_int() and basis 0. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1755
1756 static int
1757 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1758 const char *type,
1759 struct netdev_qos_capabilities *caps)
1760 {
1761 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1762 if (!ops) {
1763 return EOPNOTSUPP;
1764 }
1765 caps->n_queues = ops->n_queues;
1766 return 0;
1767 }
1768
1769 static int
1770 netdev_linux_get_qos(const struct netdev *netdev,
1771 const char **typep, struct shash *details)
1772 {
1773 struct netdev_dev_linux *netdev_dev =
1774 netdev_dev_linux_cast(netdev_get_dev(netdev));
1775 int error;
1776
1777 error = tc_query_qdisc(netdev);
1778 if (error) {
1779 return error;
1780 }
1781
1782 *typep = netdev_dev->tc->ops->ovs_name;
1783 return (netdev_dev->tc->ops->qdisc_get
1784 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1785 : 0);
1786 }
1787
1788 static int
1789 netdev_linux_set_qos(struct netdev *netdev,
1790 const char *type, const struct shash *details)
1791 {
1792 struct netdev_dev_linux *netdev_dev =
1793 netdev_dev_linux_cast(netdev_get_dev(netdev));
1794 const struct tc_ops *new_ops;
1795 int error;
1796
1797 new_ops = tc_lookup_ovs_name(type);
1798 if (!new_ops || !new_ops->tc_install) {
1799 return EOPNOTSUPP;
1800 }
1801
1802 error = tc_query_qdisc(netdev);
1803 if (error) {
1804 return error;
1805 }
1806
1807 if (new_ops == netdev_dev->tc->ops) {
1808 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1809 } else {
1810 /* Delete existing qdisc. */
1811 error = tc_del_qdisc(netdev);
1812 if (error) {
1813 return error;
1814 }
1815 assert(netdev_dev->tc == NULL);
1816
1817 /* Install new qdisc. */
1818 error = new_ops->tc_install(netdev, details);
1819 assert((error == 0) == (netdev_dev->tc != NULL));
1820
1821 return error;
1822 }
1823 }
1824
1825 static int
1826 netdev_linux_get_queue(const struct netdev *netdev,
1827 unsigned int queue_id, struct shash *details)
1828 {
1829 struct netdev_dev_linux *netdev_dev =
1830 netdev_dev_linux_cast(netdev_get_dev(netdev));
1831 int error;
1832
1833 error = tc_query_qdisc(netdev);
1834 if (error) {
1835 return error;
1836 } else {
1837 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1838 return (queue
1839 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1840 : ENOENT);
1841 }
1842 }
1843
1844 static int
1845 netdev_linux_set_queue(struct netdev *netdev,
1846 unsigned int queue_id, const struct shash *details)
1847 {
1848 struct netdev_dev_linux *netdev_dev =
1849 netdev_dev_linux_cast(netdev_get_dev(netdev));
1850 int error;
1851
1852 error = tc_query_qdisc(netdev);
1853 if (error) {
1854 return error;
1855 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1856 || !netdev_dev->tc->ops->class_set) {
1857 return EINVAL;
1858 }
1859
1860 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1861 }
1862
1863 static int
1864 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1865 {
1866 struct netdev_dev_linux *netdev_dev =
1867 netdev_dev_linux_cast(netdev_get_dev(netdev));
1868 int error;
1869
1870 error = tc_query_qdisc(netdev);
1871 if (error) {
1872 return error;
1873 } else if (!netdev_dev->tc->ops->class_delete) {
1874 return EINVAL;
1875 } else {
1876 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1877 return (queue
1878 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1879 : ENOENT);
1880 }
1881 }
1882
1883 static int
1884 netdev_linux_get_queue_stats(const struct netdev *netdev,
1885 unsigned int queue_id,
1886 struct netdev_queue_stats *stats)
1887 {
1888 struct netdev_dev_linux *netdev_dev =
1889 netdev_dev_linux_cast(netdev_get_dev(netdev));
1890 int error;
1891
1892 error = tc_query_qdisc(netdev);
1893 if (error) {
1894 return error;
1895 } else if (!netdev_dev->tc->ops->class_get_stats) {
1896 return EOPNOTSUPP;
1897 } else {
1898 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1899 return (queue
1900 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1901 : ENOENT);
1902 }
1903 }
1904
1905 static bool
1906 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1907 {
1908 struct ofpbuf request;
1909 struct tcmsg *tcmsg;
1910
1911 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1912 if (!tcmsg) {
1913 return false;
1914 }
1915 tcmsg->tcm_parent = 0;
1916 nl_dump_start(dump, rtnl_sock, &request);
1917 ofpbuf_uninit(&request);
1918 return true;
1919 }
1920
1921 static int
1922 netdev_linux_dump_queues(const struct netdev *netdev,
1923 netdev_dump_queues_cb *cb, void *aux)
1924 {
1925 struct netdev_dev_linux *netdev_dev =
1926 netdev_dev_linux_cast(netdev_get_dev(netdev));
1927 struct tc_queue *queue;
1928 struct shash details;
1929 int last_error;
1930 int error;
1931
1932 error = tc_query_qdisc(netdev);
1933 if (error) {
1934 return error;
1935 } else if (!netdev_dev->tc->ops->class_get) {
1936 return EOPNOTSUPP;
1937 }
1938
1939 last_error = 0;
1940 shash_init(&details);
1941 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1942 shash_clear(&details);
1943
1944 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1945 if (!error) {
1946 (*cb)(queue->queue_id, &details, aux);
1947 } else {
1948 last_error = error;
1949 }
1950 }
1951 shash_destroy(&details);
1952
1953 return last_error;
1954 }
1955
1956 static int
1957 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1958 netdev_dump_queue_stats_cb *cb, void *aux)
1959 {
1960 struct netdev_dev_linux *netdev_dev =
1961 netdev_dev_linux_cast(netdev_get_dev(netdev));
1962 struct nl_dump dump;
1963 struct ofpbuf msg;
1964 int last_error;
1965 int error;
1966
1967 error = tc_query_qdisc(netdev);
1968 if (error) {
1969 return error;
1970 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1971 return EOPNOTSUPP;
1972 }
1973
1974 last_error = 0;
1975 if (!start_queue_dump(netdev, &dump)) {
1976 return ENODEV;
1977 }
1978 while (nl_dump_next(&dump, &msg)) {
1979 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1980 if (error) {
1981 last_error = error;
1982 }
1983 }
1984
1985 error = nl_dump_done(&dump);
1986 return error ? error : last_error;
1987 }
1988
1989 static int
1990 netdev_linux_get_in4(const struct netdev *netdev_,
1991 struct in_addr *address, struct in_addr *netmask)
1992 {
1993 struct netdev_dev_linux *netdev_dev =
1994 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1995
1996 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1997 int error;
1998
1999 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2000 SIOCGIFADDR, "SIOCGIFADDR");
2001 if (error) {
2002 return error;
2003 }
2004
2005 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2006 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2007 if (error) {
2008 return error;
2009 }
2010
2011 netdev_dev->cache_valid |= VALID_IN4;
2012 }
2013 *address = netdev_dev->address;
2014 *netmask = netdev_dev->netmask;
2015 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2016 }
2017
2018 static int
2019 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2020 struct in_addr netmask)
2021 {
2022 struct netdev_dev_linux *netdev_dev =
2023 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2024 int error;
2025
2026 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2027 if (!error) {
2028 netdev_dev->cache_valid |= VALID_IN4;
2029 netdev_dev->address = address;
2030 netdev_dev->netmask = netmask;
2031 if (address.s_addr != INADDR_ANY) {
2032 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2033 "SIOCSIFNETMASK", netmask);
2034 }
2035 }
2036 return error;
2037 }
2038
/* Parses 'line', in the format of a line of /proc/net/if_inet6: 16 address
 * bytes written as 32 hex digits, four hex fields that are skipped, and an
 * interface name of at most 16 characters.  Stores the address in '*in6' and
 * the name in 'ifname'.
 *
 * Returns true only if all 17 stored fields were converted.  On a false
 * return, '*in6' and 'ifname' may have been partly written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *bytes = in6->s6_addr;
    int n_converted;

#define X8 "%2"SCNx8
    n_converted = sscanf(line,
                         " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         bytes + 0, bytes + 1, bytes + 2, bytes + 3,
                         bytes + 4, bytes + 5, bytes + 6, bytes + 7,
                         bytes + 8, bytes + 9, bytes + 10, bytes + 11,
                         bytes + 12, bytes + 13, bytes + 14, bytes + 15,
                         ifname);

    /* 16 address bytes plus the interface name. */
    return n_converted == 17;
}
2054
2055 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2056 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2057 static int
2058 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2059 {
2060 struct netdev_dev_linux *netdev_dev =
2061 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2062 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2063 FILE *file;
2064 char line[128];
2065
2066 netdev_dev->in6 = in6addr_any;
2067
2068 file = fopen("/proc/net/if_inet6", "r");
2069 if (file != NULL) {
2070 const char *name = netdev_get_name(netdev_);
2071 while (fgets(line, sizeof line, file)) {
2072 struct in6_addr in6_tmp;
2073 char ifname[16 + 1];
2074 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2075 && !strcmp(name, ifname))
2076 {
2077 netdev_dev->in6 = in6_tmp;
2078 break;
2079 }
2080 }
2081 fclose(file);
2082 }
2083 netdev_dev->cache_valid |= VALID_IN6;
2084 }
2085 *in6 = netdev_dev->in6;
2086 return 0;
2087 }
2088
/* Initializes '*sa' as an AF_INET socket address with IPv4 address 'addr' and
 * port 0.  Every byte of '*sa' is overwritten. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Build the IPv4 form in a zeroed temporary. */
    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    /* Clear the generic form before copying the IPv4 form over it. */
    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2101
2102 static int
2103 do_set_addr(struct netdev *netdev,
2104 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2105 {
2106 struct ifreq ifr;
2107 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2108 make_in4_sockaddr(&ifr.ifr_addr, addr);
2109
2110 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2111 ioctl_name);
2112 }
2113
2114 /* Adds 'router' as a default IP gateway. */
2115 static int
2116 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2117 {
2118 struct in_addr any = { INADDR_ANY };
2119 struct rtentry rt;
2120 int error;
2121
2122 memset(&rt, 0, sizeof rt);
2123 make_in4_sockaddr(&rt.rt_dst, any);
2124 make_in4_sockaddr(&rt.rt_gateway, router);
2125 make_in4_sockaddr(&rt.rt_genmask, any);
2126 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2127 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2128 if (error) {
2129 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2130 }
2131 return error;
2132 }
2133
2134 static int
2135 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2136 char **netdev_name)
2137 {
2138 static const char fn[] = "/proc/net/route";
2139 FILE *stream;
2140 char line[256];
2141 int ln;
2142
2143 *netdev_name = NULL;
2144 stream = fopen(fn, "r");
2145 if (stream == NULL) {
2146 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2147 return errno;
2148 }
2149
2150 ln = 0;
2151 while (fgets(line, sizeof line, stream)) {
2152 if (++ln >= 2) {
2153 char iface[17];
2154 ovs_be32 dest, gateway, mask;
2155 int refcnt, metric, mtu;
2156 unsigned int flags, use, window, irtt;
2157
2158 if (sscanf(line,
2159 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2160 " %d %u %u\n",
2161 iface, &dest, &gateway, &flags, &refcnt,
2162 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2163
2164 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2165 fn, ln, line);
2166 continue;
2167 }
2168 if (!(flags & RTF_UP)) {
2169 /* Skip routes that aren't up. */
2170 continue;
2171 }
2172
2173 /* The output of 'dest', 'mask', and 'gateway' were given in
2174 * network byte order, so we don't need need any endian
2175 * conversions here. */
2176 if ((dest & mask) == (host->s_addr & mask)) {
2177 if (!gateway) {
2178 /* The host is directly reachable. */
2179 next_hop->s_addr = 0;
2180 } else {
2181 /* To reach the host, we must go through a gateway. */
2182 next_hop->s_addr = gateway;
2183 }
2184 *netdev_name = xstrdup(iface);
2185 fclose(stream);
2186 return 0;
2187 }
2188 }
2189 }
2190
2191 fclose(stream);
2192 return ENXIO;
2193 }
2194
2195 static int
2196 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2197 {
2198 struct ethtool_drvinfo drvinfo;
2199 int error;
2200
2201 memset(&drvinfo, 0, sizeof drvinfo);
2202 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2203 (struct ethtool_cmd *)&drvinfo,
2204 ETHTOOL_GDRVINFO,
2205 "ETHTOOL_GDRVINFO");
2206 if (!error) {
2207 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2208 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2209 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2210 }
2211
2212 return error;
2213 }
2214
2215 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2216 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2217 * returns 0. Otherwise, it returns a positive errno value; in particular,
2218 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2219 static int
2220 netdev_linux_arp_lookup(const struct netdev *netdev,
2221 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2222 {
2223 struct arpreq r;
2224 struct sockaddr_in sin;
2225 int retval;
2226
2227 memset(&r, 0, sizeof r);
2228 memset(&sin, 0, sizeof sin);
2229 sin.sin_family = AF_INET;
2230 sin.sin_addr.s_addr = ip;
2231 sin.sin_port = 0;
2232 memcpy(&r.arp_pa, &sin, sizeof sin);
2233 r.arp_ha.sa_family = ARPHRD_ETHER;
2234 r.arp_flags = 0;
2235 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2236 COVERAGE_INC(netdev_arp_lookup);
2237 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2238 if (!retval) {
2239 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2240 } else if (retval != ENXIO) {
2241 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2242 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
2243 }
2244 return retval;
2245 }
2246
2247 static int
2248 nd_to_iff_flags(enum netdev_flags nd)
2249 {
2250 int iff = 0;
2251 if (nd & NETDEV_UP) {
2252 iff |= IFF_UP;
2253 }
2254 if (nd & NETDEV_PROMISC) {
2255 iff |= IFF_PROMISC;
2256 }
2257 return iff;
2258 }
2259
2260 static int
2261 iff_to_nd_flags(int iff)
2262 {
2263 enum netdev_flags nd = 0;
2264 if (iff & IFF_UP) {
2265 nd |= NETDEV_UP;
2266 }
2267 if (iff & IFF_PROMISC) {
2268 nd |= NETDEV_PROMISC;
2269 }
2270 return nd;
2271 }
2272
2273 static int
2274 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2275 enum netdev_flags on, enum netdev_flags *old_flagsp)
2276 {
2277 int old_flags, new_flags;
2278 int error;
2279
2280 error = get_flags(netdev, &old_flags);
2281 if (!error) {
2282 *old_flagsp = iff_to_nd_flags(old_flags);
2283 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2284 if (new_flags != old_flags) {
2285 error = set_flags(netdev, new_flags);
2286 }
2287 }
2288 return error;
2289 }
2290
2291 static unsigned int
2292 netdev_linux_change_seq(const struct netdev *netdev)
2293 {
2294 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
2295 }
2296
/* Expands to a brace-enclosed initializer whose first entry is NAME and whose
 * remaining entries are the netdev_linux_* functions listed below, with
 * CREATE, GET_STATS and SET_STATS substituted into their slots.
 *
 * It is used right after this definition to initialize the "system", "tap"
 * and "internal" struct netdev_class instances, which differ only in those
 * four arguments.  The order of the entries is positional, so it has to match
 * the member order of the structure being initialized. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS)  \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
                                                                \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    netdev_linux_listen,                                        \
    netdev_linux_recv,                                          \
    netdev_linux_recv_wait,                                     \
    netdev_linux_drain,                                         \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    SET_STATS,                                                  \
                                                                \
    netdev_linux_get_features,                                  \
    netdev_linux_set_advertisements,                            \
    netdev_linux_get_vlan_vid,                                  \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    netdev_linux_get_status,                                    \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_change_seq                                     \
}
2360
/* The "system" netdev class: devices are created with netdev_linux_create(),
 * statistics are read with netdev_linux_get_stats(), and setting statistics
 * is not supported. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_create,
        netdev_linux_get_stats,
        NULL);                  /* set_stats */
2367
/* The "tap" netdev class: devices are created with netdev_linux_create_tap(),
 * statistics are read with netdev_pseudo_get_stats(), and setting statistics
 * is not supported. */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_create_tap,
        netdev_pseudo_get_stats,
        NULL);                  /* set_stats */
2374
/* The "internal" netdev class: devices are created with netdev_linux_create(),
 * statistics are read with netdev_pseudo_get_stats(), and statistics are set
 * with netdev_vport_set_stats().  It is the only class in this file that
 * provides a set_stats function. */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_create,
        netdev_pseudo_get_stats,
        netdev_vport_set_stats);
2381 \f
2382 /* HTB traffic control class. */
2383
/* Number of queues advertised for HTB in tc_ops_htb, and the largest class
 * minor number that is accepted as a queue when parsing class handles (queue
 * N uses class minor N + 1). */
#define HTB_N_QUEUES 0xf000

/* Per-device HTB qdisc state. */
struct htb {
    struct tc tc;               /* Embedded generic tc state; htb_get__()
                                 * recovers the struct htb from a pointer to
                                 * this member via CONTAINER_OF. */
    unsigned int max_rate;      /* In bytes/s. */
};
2390
/* Parameters of one HTB class (one queue). */
struct htb_class {
    struct tc_queue tc_queue;   /* Embedded generic queue; htb_class_cast__()
                                 * recovers the struct htb_class from it. */
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
2398
2399 static struct htb *
2400 htb_get__(const struct netdev *netdev)
2401 {
2402 struct netdev_dev_linux *netdev_dev =
2403 netdev_dev_linux_cast(netdev_get_dev(netdev));
2404 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2405 }
2406
2407 static void
2408 htb_install__(struct netdev *netdev, uint64_t max_rate)
2409 {
2410 struct netdev_dev_linux *netdev_dev =
2411 netdev_dev_linux_cast(netdev_get_dev(netdev));
2412 struct htb *htb;
2413
2414 htb = xmalloc(sizeof *htb);
2415 tc_init(&htb->tc, &tc_ops_htb);
2416 htb->max_rate = max_rate;
2417
2418 netdev_dev->tc = &htb->tc;
2419 }
2420
2421 /* Create an HTB qdisc.
2422 *
2423 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2424 static int
2425 htb_setup_qdisc__(struct netdev *netdev)
2426 {
2427 size_t opt_offset;
2428 struct tc_htb_glob opt;
2429 struct ofpbuf request;
2430 struct tcmsg *tcmsg;
2431
2432 tc_del_qdisc(netdev);
2433
2434 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2435 NLM_F_EXCL | NLM_F_CREATE, &request);
2436 if (!tcmsg) {
2437 return ENODEV;
2438 }
2439 tcmsg->tcm_handle = tc_make_handle(1, 0);
2440 tcmsg->tcm_parent = TC_H_ROOT;
2441
2442 nl_msg_put_string(&request, TCA_KIND, "htb");
2443
2444 memset(&opt, 0, sizeof opt);
2445 opt.rate2quantum = 10;
2446 opt.version = 3;
2447 opt.defcls = 1;
2448
2449 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2450 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2451 nl_msg_end_nested(&request, opt_offset);
2452
2453 return tc_transact(&request, NULL);
2454 }
2455
2456 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2457 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2458 static int
2459 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2460 unsigned int parent, struct htb_class *class)
2461 {
2462 size_t opt_offset;
2463 struct tc_htb_opt opt;
2464 struct ofpbuf request;
2465 struct tcmsg *tcmsg;
2466 int error;
2467 int mtu;
2468
2469 error = netdev_get_mtu(netdev, &mtu);
2470 if (error) {
2471 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2472 netdev_get_name(netdev));
2473 return error;
2474 }
2475
2476 memset(&opt, 0, sizeof opt);
2477 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2478 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2479 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2480 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2481 opt.prio = class->priority;
2482
2483 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2484 if (!tcmsg) {
2485 return ENODEV;
2486 }
2487 tcmsg->tcm_handle = handle;
2488 tcmsg->tcm_parent = parent;
2489
2490 nl_msg_put_string(&request, TCA_KIND, "htb");
2491 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2492 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2493 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2494 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2495 nl_msg_end_nested(&request, opt_offset);
2496
2497 error = tc_transact(&request, NULL);
2498 if (error) {
2499 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2500 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2501 netdev_get_name(netdev),
2502 tc_get_major(handle), tc_get_minor(handle),
2503 tc_get_major(parent), tc_get_minor(parent),
2504 class->min_rate, class->max_rate,
2505 class->burst, class->priority, strerror(error));
2506 }
2507 return error;
2508 }
2509
2510 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2511 * description of them into 'details'. The description complies with the
2512 * specification given in the vswitch database documentation for linux-htb
2513 * queue details. */
2514 static int
2515 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2516 {
2517 static const struct nl_policy tca_htb_policy[] = {
2518 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2519 .min_len = sizeof(struct tc_htb_opt) },
2520 };
2521
2522 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2523 const struct tc_htb_opt *htb;
2524
2525 if (!nl_parse_nested(nl_options, tca_htb_policy,
2526 attrs, ARRAY_SIZE(tca_htb_policy))) {
2527 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2528 return EPROTO;
2529 }
2530
2531 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2532 class->min_rate = htb->rate.rate;
2533 class->max_rate = htb->ceil.rate;
2534 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2535 class->priority = htb->prio;
2536 return 0;
2537 }
2538
2539 static int
2540 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2541 struct htb_class *options,
2542 struct netdev_queue_stats *stats)
2543 {
2544 struct nlattr *nl_options;
2545 unsigned int handle;
2546 int error;
2547
2548 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2549 if (!error && queue_id) {
2550 unsigned int major = tc_get_major(handle);
2551 unsigned int minor = tc_get_minor(handle);
2552 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2553 *queue_id = minor - 1;
2554 } else {
2555 error = EPROTO;
2556 }
2557 }
2558 if (!error && options) {
2559 error = htb_parse_tca_options__(nl_options, options);
2560 }
2561 return error;
2562 }
2563
2564 static void
2565 htb_parse_qdisc_details__(struct netdev *netdev,
2566 const struct shash *details, struct htb_class *hc)
2567 {
2568 const char *max_rate_s;
2569
2570 max_rate_s = shash_find_data(details, "max-rate");
2571 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2572 if (!hc->max_rate) {
2573 uint32_t current;
2574
2575 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2576 hc->max_rate = netdev_features_to_bps(current) / 8;
2577 }
2578 hc->min_rate = hc->max_rate;
2579 hc->burst = 0;
2580 hc->priority = 0;
2581 }
2582
2583 static int
2584 htb_parse_class_details__(struct netdev *netdev,
2585 const struct shash *details, struct htb_class *hc)
2586 {
2587 const struct htb *htb = htb_get__(netdev);
2588 const char *min_rate_s = shash_find_data(details, "min-rate");
2589 const char *max_rate_s = shash_find_data(details, "max-rate");
2590 const char *burst_s = shash_find_data(details, "burst");
2591 const char *priority_s = shash_find_data(details, "priority");
2592 int mtu, error;
2593
2594 error = netdev_get_mtu(netdev, &mtu);
2595 if (error) {
2596 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2597 netdev_get_name(netdev));
2598 return error;
2599 }
2600
2601 /* HTB requires at least an mtu sized min-rate to send any traffic even
2602 * on uncongested links. */
2603 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2604 hc->min_rate = MAX(hc->min_rate, mtu);
2605 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2606
2607 /* max-rate */
2608 hc->max_rate = (max_rate_s
2609 ? strtoull(max_rate_s, NULL, 10) / 8
2610 : htb->max_rate);
2611 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2612 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2613
2614 /* burst
2615 *
2616 * According to hints in the documentation that I've read, it is important
2617 * that 'burst' be at least as big as the largest frame that might be
2618 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2619 * but having it a bit too small is a problem. Since netdev_get_mtu()
2620 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2621 * the MTU. We actually add 64, instead of 14, as a guard against
2622 * additional headers get tacked on somewhere that we're not aware of. */
2623 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2624 hc->burst = MAX(hc->burst, mtu + 64);
2625
2626 /* priority */
2627 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2628
2629 return 0;
2630 }
2631
/* Queries class 'handle' with parent 'parent' on 'netdev' and parses the
 * reply into 'options' and 'stats', either of which is passed through to
 * htb_parse_tcmsg__() as given.
 *
 * Returns 0 on success, otherwise the error from tc_query_class() or from
 * parsing the reply.  The reply buffer is freed before returning. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2647
2648 static int
2649 htb_tc_install(struct netdev *netdev, const struct shash *details)
2650 {
2651 int error;
2652
2653 error = htb_setup_qdisc__(netdev);
2654 if (!error) {
2655 struct htb_class hc;
2656
2657 htb_parse_qdisc_details__(netdev, details, &hc);
2658 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2659 tc_make_handle(1, 0), &hc);
2660 if (!error) {
2661 htb_install__(netdev, hc.max_rate);
2662 }
2663 }
2664 return error;
2665 }
2666
2667 static struct htb_class *
2668 htb_class_cast__(const struct tc_queue *queue)
2669 {
2670 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2671 }
2672
2673 static void
2674 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2675 const struct htb_class *hc)
2676 {
2677 struct htb *htb = htb_get__(netdev);
2678 size_t hash = hash_int(queue_id, 0);
2679 struct tc_queue *queue;
2680 struct htb_class *hcp;
2681
2682 queue = tc_find_queue__(netdev, queue_id, hash);
2683 if (queue) {
2684 hcp = htb_class_cast__(queue);
2685 } else {
2686 hcp = xmalloc(sizeof *hcp);
2687 queue = &hcp->tc_queue;
2688 queue->queue_id = queue_id;
2689 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2690 }
2691
2692 hcp->min_rate = hc->min_rate;
2693 hcp->max_rate = hc->max_rate;
2694 hcp->burst = hc->burst;
2695 hcp->priority = hc->priority;
2696 }
2697
2698 static int
2699 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2700 {
2701 struct ofpbuf msg;
2702 struct nl_dump dump;
2703 struct htb_class hc;
2704
2705 /* Get qdisc options. */
2706 hc.max_rate = 0;
2707 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2708 htb_install__(netdev, hc.max_rate);
2709
2710 /* Get queues. */
2711 if (!start_queue_dump(netdev, &dump)) {
2712 return ENODEV;
2713 }
2714 while (nl_dump_next(&dump, &msg)) {
2715 unsigned int queue_id;
2716
2717 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2718 htb_update_queue__(netdev, queue_id, &hc);
2719 }
2720 }
2721 nl_dump_done(&dump);
2722
2723 return 0;
2724 }
2725
2726 static void
2727 htb_tc_destroy(struct tc *tc)
2728 {
2729 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2730 struct htb_class *hc, *next;
2731
2732 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2733 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2734 free(hc);
2735 }
2736 tc_destroy(tc);
2737 free(htb);
2738 }
2739
2740 static int
2741 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2742 {
2743 const struct htb *htb = htb_get__(netdev);
2744 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2745 return 0;
2746 }
2747
2748 static int
2749 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2750 {
2751 struct htb_class hc;
2752 int error;
2753
2754 htb_parse_qdisc_details__(netdev, details, &hc);
2755 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2756 tc_make_handle(1, 0), &hc);
2757 if (!error) {
2758 htb_get__(netdev)->max_rate = hc.max_rate;
2759 }
2760 return error;
2761 }
2762
2763 static int
2764 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2765 const struct tc_queue *queue, struct shash *details)
2766 {
2767 const struct htb_class *hc = htb_class_cast__(queue);
2768
2769 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2770 if (hc->min_rate != hc->max_rate) {
2771 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2772 }
2773 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2774 if (hc->priority) {
2775 shash_add(details, "priority", xasprintf("%u", hc->priority));
2776 }
2777 return 0;
2778 }
2779
2780 static int
2781 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2782 const struct shash *details)
2783 {
2784 struct htb_class hc;
2785 int error;
2786
2787 error = htb_parse_class_details__(netdev, details, &hc);
2788 if (error) {
2789 return error;
2790 }
2791
2792 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2793 tc_make_handle(1, 0xfffe), &hc);
2794 if (error) {
2795 return error;
2796 }
2797
2798 htb_update_queue__(netdev, queue_id, &hc);
2799 return 0;
2800 }
2801
2802 static int
2803 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2804 {
2805 struct htb_class *hc = htb_class_cast__(queue);
2806 struct htb *htb = htb_get__(netdev);
2807 int error;
2808
2809 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2810 if (!error) {
2811 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2812 free(hc);
2813 }
2814 return error;
2815 }
2816
2817 static int
2818 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2819 struct netdev_queue_stats *stats)
2820 {
2821 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2822 tc_make_handle(1, 0xfffe), NULL, stats);
2823 }
2824
2825 static int
2826 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2827 const struct ofpbuf *nlmsg,
2828 netdev_dump_queue_stats_cb *cb, void *aux)
2829 {
2830 struct netdev_queue_stats stats;
2831 unsigned int handle, major, minor;
2832 int error;
2833
2834 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2835 if (error) {
2836 return error;
2837 }
2838
2839 major = tc_get_major(handle);
2840 minor = tc_get_minor(handle);
2841 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2842 (*cb)(minor - 1, &stats, aux);
2843 }
2844 return 0;
2845 }
2846
/* Operations table for the HTB qdisc.  It ties the Linux qdisc name "htb" and
 * the OVS name "linux-htb" to the htb_* functions in this section;
 * htb_install__() passes it to tc_init() for every struct htb it creates.
 * The entries are positional. */
static const struct tc_ops tc_ops_htb = {
    "htb",                      /* linux_name */
    "linux-htb",                /* ovs_name */
    HTB_N_QUEUES,               /* n_queues */
    htb_tc_install,
    htb_tc_load,
    htb_tc_destroy,
    htb_qdisc_get,
    htb_qdisc_set,
    htb_class_get,
    htb_class_set,
    htb_class_delete,
    htb_class_get_stats,
    htb_class_dump_stats
};
2862 \f
2863 /* "linux-hfsc" traffic control class. */
2864
/* Largest class minor number that is accepted as an HFSC queue when parsing
 * class handles (queue N uses class minor N + 1). */
#define HFSC_N_QUEUES 0xf000

/* Per-device HFSC qdisc state. */
struct hfsc {
    struct tc tc;               /* Embedded generic tc state; hfsc_get__()
                                 * recovers the struct hfsc from a pointer to
                                 * this member via CONTAINER_OF. */
    uint32_t max_rate;          /* Presumably bytes/s: the "max-rate" detail
                                 * is divided by 8 on input and multiplied by
                                 * 8 on output. */
};
2871
/* Parameters of one HFSC class (one queue).  Both rates are handled like
 * 'max_rate' in struct hfsc: divided by 8 when read from the details and
 * multiplied by 8 when reported. */
struct hfsc_class {
    struct tc_queue tc_queue;   /* Embedded generic queue; hfsc_class_cast__()
                                 * recovers the struct hfsc_class from it. */
    uint32_t min_rate;          /* Sent as 'm2' of the RSC and FSC curves. */
    uint32_t max_rate;          /* Sent as 'm2' of the USC curve. */
};
2877
2878 static struct hfsc *
2879 hfsc_get__(const struct netdev *netdev)
2880 {
2881 struct netdev_dev_linux *netdev_dev;
2882 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2883 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2884 }
2885
2886 static struct hfsc_class *
2887 hfsc_class_cast__(const struct tc_queue *queue)
2888 {
2889 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2890 }
2891
2892 static void
2893 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2894 {
2895 struct netdev_dev_linux * netdev_dev;
2896 struct hfsc *hfsc;
2897
2898 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2899 hfsc = xmalloc(sizeof *hfsc);
2900 tc_init(&hfsc->tc, &tc_ops_hfsc);
2901 hfsc->max_rate = max_rate;
2902 netdev_dev->tc = &hfsc->tc;
2903 }
2904
2905 static void
2906 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2907 const struct hfsc_class *hc)
2908 {
2909 size_t hash;
2910 struct hfsc *hfsc;
2911 struct hfsc_class *hcp;
2912 struct tc_queue *queue;
2913
2914 hfsc = hfsc_get__(netdev);
2915 hash = hash_int(queue_id, 0);
2916
2917 queue = tc_find_queue__(netdev, queue_id, hash);
2918 if (queue) {
2919 hcp = hfsc_class_cast__(queue);
2920 } else {
2921 hcp = xmalloc(sizeof *hcp);
2922 queue = &hcp->tc_queue;
2923 queue->queue_id = queue_id;
2924 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2925 }
2926
2927 hcp->min_rate = hc->min_rate;
2928 hcp->max_rate = hc->max_rate;
2929 }
2930
2931 static int
2932 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2933 {
2934 const struct tc_service_curve *rsc, *fsc, *usc;
2935 static const struct nl_policy tca_hfsc_policy[] = {
2936 [TCA_HFSC_RSC] = {
2937 .type = NL_A_UNSPEC,
2938 .optional = false,
2939 .min_len = sizeof(struct tc_service_curve),
2940 },
2941 [TCA_HFSC_FSC] = {
2942 .type = NL_A_UNSPEC,
2943 .optional = false,
2944 .min_len = sizeof(struct tc_service_curve),
2945 },
2946 [TCA_HFSC_USC] = {
2947 .type = NL_A_UNSPEC,
2948 .optional = false,
2949 .min_len = sizeof(struct tc_service_curve),
2950 },
2951 };
2952 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2953
2954 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2955 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2956 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2957 return EPROTO;
2958 }
2959
2960 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2961 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2962 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2963
2964 if (rsc->m1 != 0 || rsc->d != 0 ||
2965 fsc->m1 != 0 || fsc->d != 0 ||
2966 usc->m1 != 0 || usc->d != 0) {
2967 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2968 "Non-linear service curves are not supported.");
2969 return EPROTO;
2970 }
2971
2972 if (rsc->m2 != fsc->m2) {
2973 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2974 "Real-time service curves are not supported ");
2975 return EPROTO;
2976 }
2977
2978 if (rsc->m2 > usc->m2) {
2979 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2980 "Min-rate service curve is greater than "
2981 "the max-rate service curve.");
2982 return EPROTO;
2983 }
2984
2985 class->min_rate = fsc->m2;
2986 class->max_rate = usc->m2;
2987 return 0;
2988 }
2989
2990 static int
2991 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2992 struct hfsc_class *options,
2993 struct netdev_queue_stats *stats)
2994 {
2995 int error;
2996 unsigned int handle;
2997 struct nlattr *nl_options;
2998
2999 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3000 if (error) {
3001 return error;
3002 }
3003
3004 if (queue_id) {
3005 unsigned int major, minor;
3006
3007 major = tc_get_major(handle);
3008 minor = tc_get_minor(handle);
3009 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3010 *queue_id = minor - 1;
3011 } else {
3012 return EPROTO;
3013 }
3014 }
3015
3016 if (options) {
3017 error = hfsc_parse_tca_options__(nl_options, options);
3018 }
3019
3020 return error;
3021 }
3022
/* Queries class 'handle' with parent 'parent' on 'netdev' and parses the
 * reply into 'options' and 'stats', either of which is passed through to
 * hfsc_parse_tcmsg__() as given.
 *
 * Returns 0 on success, otherwise the error from tc_query_class() or from
 * parsing the reply.  The reply buffer is freed before returning. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3040
3041 static void
3042 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3043 struct hfsc_class *class)
3044 {
3045 uint32_t max_rate;
3046 const char *max_rate_s;
3047
3048 max_rate_s = shash_find_data(details, "max-rate");
3049 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3050
3051 if (!max_rate) {
3052 uint32_t current;
3053
3054 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3055 max_rate = netdev_features_to_bps(current) / 8;
3056 }
3057
3058 class->min_rate = max_rate;
3059 class->max_rate = max_rate;
3060 }
3061
3062 static int
3063 hfsc_parse_class_details__(struct netdev *netdev,
3064 const struct shash *details,
3065 struct hfsc_class * class)
3066 {
3067 const struct hfsc *hfsc;
3068 uint32_t min_rate, max_rate;
3069 const char *min_rate_s, *max_rate_s;
3070
3071 hfsc = hfsc_get__(netdev);
3072 min_rate_s = shash_find_data(details, "min-rate");
3073 max_rate_s = shash_find_data(details, "max-rate");
3074
3075 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3076 min_rate = MAX(min_rate, 1);
3077 min_rate = MIN(min_rate, hfsc->max_rate);
3078
3079 max_rate = (max_rate_s
3080 ? strtoull(max_rate_s, NULL, 10) / 8
3081 : hfsc->max_rate);
3082 max_rate = MAX(max_rate, min_rate);
3083 max_rate = MIN(max_rate, hfsc->max_rate);
3084
3085 class->min_rate = min_rate;
3086 class->max_rate = max_rate;
3087
3088 return 0;
3089 }
3090
3091 /* Create an HFSC qdisc.
3092 *
3093 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3094 static int
3095 hfsc_setup_qdisc__(struct netdev * netdev)
3096 {
3097 struct tcmsg *tcmsg;
3098 struct ofpbuf request;
3099 struct tc_hfsc_qopt opt;
3100
3101 tc_del_qdisc(netdev);
3102
3103 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3104 NLM_F_EXCL | NLM_F_CREATE, &request);
3105
3106 if (!tcmsg) {
3107 return ENODEV;
3108 }
3109
3110 tcmsg->tcm_handle = tc_make_handle(1, 0);
3111 tcmsg->tcm_parent = TC_H_ROOT;
3112
3113 memset(&opt, 0, sizeof opt);
3114 opt.defcls = 1;
3115
3116 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3117 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3118
3119 return tc_transact(&request, NULL);
3120 }
3121
3122 /* Create an HFSC class.
3123 *
3124 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3125 * sc rate <min_rate> ul rate <max_rate>" */
3126 static int
3127 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3128 unsigned int parent, struct hfsc_class *class)
3129 {
3130 int error;
3131 size_t opt_offset;
3132 struct tcmsg *tcmsg;
3133 struct ofpbuf request;
3134 struct tc_service_curve min, max;
3135
3136 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3137
3138 if (!tcmsg) {
3139 return ENODEV;
3140 }
3141
3142 tcmsg->tcm_handle = handle;
3143 tcmsg->tcm_parent = parent;
3144
3145 min.m1 = 0;
3146 min.d = 0;
3147 min.m2 = class->min_rate;
3148
3149 max.m1 = 0;
3150 max.d = 0;
3151 max.m2 = class->max_rate;
3152
3153 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3154 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3155 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3156 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3157 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3158 nl_msg_end_nested(&request, opt_offset);
3159
3160 error = tc_transact(&request, NULL);
3161 if (error) {
3162 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3163 "min-rate %ubps, max-rate %ubps (%s)",
3164 netdev_get_name(netdev),
3165 tc_get_major(handle), tc_get_minor(handle),
3166 tc_get_major(parent), tc_get_minor(parent),
3167 class->min_rate, class->max_rate, strerror(error));
3168 }
3169
3170 return error;
3171 }
3172
3173 static int
3174 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3175 {
3176 int error;
3177 struct hfsc_class class;
3178
3179 error = hfsc_setup_qdisc__(netdev);
3180
3181 if (error) {
3182 return error;
3183 }
3184
3185 hfsc_parse_qdisc_details__(netdev, details, &class);
3186 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3187 tc_make_handle(1, 0), &class);
3188
3189 if (error) {
3190 return error;
3191 }
3192
3193 hfsc_install__(netdev, class.max_rate);
3194 return 0;
3195 }
3196
3197 static int
3198 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3199 {
3200 struct ofpbuf msg;
3201 struct nl_dump dump;
3202 struct hfsc_class hc;
3203
3204 hc.max_rate = 0;
3205 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3206 hfsc_install__(netdev, hc.max_rate);
3207
3208 if (!start_queue_dump(netdev, &dump)) {
3209 return ENODEV;
3210 }
3211
3212 while (nl_dump_next(&dump, &msg)) {
3213 unsigned int queue_id;
3214
3215 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3216 hfsc_update_queue__(netdev, queue_id, &hc);
3217 }
3218 }
3219
3220 nl_dump_done(&dump);
3221 return 0;
3222 }
3223
3224 static void
3225 hfsc_tc_destroy(struct tc *tc)
3226 {
3227 struct hfsc *hfsc;
3228 struct hfsc_class *hc, *next;
3229
3230 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3231
3232 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3233 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3234 free(hc);
3235 }
3236
3237 tc_destroy(tc);
3238 free(hfsc);
3239 }
3240
3241 static int
3242 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3243 {
3244 const struct hfsc *hfsc;
3245 hfsc = hfsc_get__(netdev);
3246 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3247 return 0;
3248 }
3249
3250 static int
3251 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3252 {
3253 int error;
3254 struct hfsc_class class;
3255
3256 hfsc_parse_qdisc_details__(netdev, details, &class);
3257 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3258 tc_make_handle(1, 0), &class);
3259
3260 if (!error) {
3261 hfsc_get__(netdev)->max_rate = class.max_rate;
3262 }
3263
3264 return error;
3265 }
3266
3267 static int
3268 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3269 const struct tc_queue *queue, struct shash *details)
3270 {
3271 const struct hfsc_class *hc;
3272
3273 hc = hfsc_class_cast__(queue);
3274 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3275 if (hc->min_rate != hc->max_rate) {
3276 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3277 }
3278 return 0;
3279 }
3280
3281 static int
3282 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3283 const struct shash *details)
3284 {
3285 int error;
3286 struct hfsc_class class;
3287
3288 error = hfsc_parse_class_details__(netdev, details, &class);
3289 if (error) {
3290 return error;
3291 }
3292
3293 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3294 tc_make_handle(1, 0xfffe), &class);
3295 if (error) {
3296 return error;
3297 }
3298
3299 hfsc_update_queue__(netdev, queue_id, &class);
3300 return 0;
3301 }
3302
3303 static int
3304 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3305 {
3306 int error;
3307 struct hfsc *hfsc;
3308 struct hfsc_class *hc;
3309
3310 hc = hfsc_class_cast__(queue);
3311 hfsc = hfsc_get__(netdev);
3312
3313 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3314 if (!error) {
3315 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3316 free(hc);
3317 }
3318 return error;
3319 }
3320
3321 static int
3322 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3323 struct netdev_queue_stats *stats)
3324 {
3325 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3326 tc_make_handle(1, 0xfffe), NULL, stats);
3327 }
3328
3329 static int
3330 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3331 const struct ofpbuf *nlmsg,
3332 netdev_dump_queue_stats_cb *cb, void *aux)
3333 {
3334 struct netdev_queue_stats stats;
3335 unsigned int handle, major, minor;
3336 int error;
3337
3338 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3339 if (error) {
3340 return error;
3341 }
3342
3343 major = tc_get_major(handle);
3344 minor = tc_get_minor(handle);
3345 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3346 (*cb)(minor - 1, &stats, aux);
3347 }
3348 return 0;
3349 }
3350
3351 static const struct tc_ops tc_ops_hfsc = {
3352 "hfsc", /* linux_name */
3353 "linux-hfsc", /* ovs_name */
3354 HFSC_N_QUEUES, /* n_queues */
3355 hfsc_tc_install, /* tc_install */
3356 hfsc_tc_load, /* tc_load */
3357 hfsc_tc_destroy, /* tc_destroy */
3358 hfsc_qdisc_get, /* qdisc_get */
3359 hfsc_qdisc_set, /* qdisc_set */
3360 hfsc_class_get, /* class_get */
3361 hfsc_class_set, /* class_set */
3362 hfsc_class_delete, /* class_delete */
3363 hfsc_class_get_stats, /* class_get_stats */
3364 hfsc_class_dump_stats /* class_dump_stats */
3365 };
3366 \f
3367 /* "linux-default" traffic control class.
3368 *
3369 * This class represents the default, unnamed Linux qdisc. It corresponds to
3370 * the "" (empty string) QoS type in the OVS database. */
3371
3372 static void
3373 default_install__(struct netdev *netdev)
3374 {
3375 struct netdev_dev_linux *netdev_dev =
3376 netdev_dev_linux_cast(netdev_get_dev(netdev));
3377 static struct tc *tc;
3378
3379 if (!tc) {
3380 tc = xmalloc(sizeof *tc);
3381 tc_init(tc, &tc_ops_default);
3382 }
3383 netdev_dev->tc = tc;
3384 }
3385
3386 static int
3387 default_tc_install(struct netdev *netdev,
3388 const struct shash *details OVS_UNUSED)
3389 {
3390 default_install__(netdev);
3391 return 0;
3392 }
3393
3394 static int
3395 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3396 {
3397 default_install__(netdev);
3398 return 0;
3399 }
3400
3401 static const struct tc_ops tc_ops_default = {
3402 NULL, /* linux_name */
3403 "", /* ovs_name */
3404 0, /* n_queues */
3405 default_tc_install,
3406 default_tc_load,
3407 NULL, /* tc_destroy */
3408 NULL, /* qdisc_get */
3409 NULL, /* qdisc_set */
3410 NULL, /* class_get */
3411 NULL, /* class_set */
3412 NULL, /* class_delete */
3413 NULL, /* class_get_stats */
3414 NULL /* class_dump_stats */
3415 };
3416 \f
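/* "linux-other" traffic control class.
 *
 * This class stands in for any qdisc that tc_query_qdisc() cannot identify:
 * one whose kind is not recognized, whose description cannot be parsed, or
 * whose query failed.  It has no queues and cannot be installed or
 * configured. */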
3420
3421 static int
3422 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3423 {
3424 struct netdev_dev_linux *netdev_dev =
3425 netdev_dev_linux_cast(netdev_get_dev(netdev));
3426 static struct tc *tc;
3427
3428 if (!tc) {
3429 tc = xmalloc(sizeof *tc);
3430 tc_init(tc, &tc_ops_other);
3431 }
3432 netdev_dev->tc = tc;
3433 return 0;
3434 }
3435
3436 static const struct tc_ops tc_ops_other = {
3437 NULL, /* linux_name */
3438 "linux-other", /* ovs_name */
3439 0, /* n_queues */
3440 NULL, /* tc_install */
3441 other_tc_load,
3442 NULL, /* tc_destroy */
3443 NULL, /* qdisc_get */
3444 NULL, /* qdisc_set */
3445 NULL, /* class_get */
3446 NULL, /* class_set */
3447 NULL, /* class_delete */
3448 NULL, /* class_get_stats */
3449 NULL /* class_dump_stats */
3450 };
3451 \f
3452 /* Traffic control. */
3453
3454 /* Number of kernel "tc" ticks per second. */
3455 static double ticks_per_s;
3456
/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any rate will
 *      come out as 0 in the division.  The small nonzero results produced by
 *      really high rates won't have any real effect anyhow.)
 *
 * A value of 0 means that /proc/net/psched has not been read yet; the helper
 * functions that need these values call read_psched() in that case.
 */
3474 static unsigned int buffer_hz;
3475
/* Combines 'major' and 'minor' into the tc handle 'major':'minor', with
 * 'major' placed in the upper 16 bits as TC_H_MAKE() expects. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
3482
/* Extracts the major number from tc 'handle', shifted down so that handle
 * 1:0 yields 1. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3489
/* Extracts the minor number from tc 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3496
3497 static struct tcmsg *
3498 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3499 struct ofpbuf *request)
3500 {
3501 struct tcmsg *tcmsg;
3502 int ifindex;
3503 int error;
3504
3505 error = get_ifindex(netdev, &ifindex);
3506 if (error) {
3507 return NULL;
3508 }
3509
3510 ofpbuf_init(request, 512);
3511 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3512 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3513 tcmsg->tcm_family = AF_UNSPEC;
3514 tcmsg->tcm_ifindex = ifindex;
3515 /* Caller should fill in tcmsg->tcm_handle. */
3516 /* Caller should fill in tcmsg->tcm_parent. */
3517
3518 return tcmsg;
3519 }
3520
3521 static int
3522 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3523 {
3524 int error = nl_sock_transact(rtnl_sock, request, replyp);
3525 ofpbuf_uninit(request);
3526 return error;
3527 }
3528
3529 static void
3530 read_psched(void)
3531 {
3532 /* The values in psched are not individually very meaningful, but they are
3533 * important. The tables below show some values seen in the wild.
3534 *
3535 * Some notes:
3536 *
3537 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3538 * (Before that, there are hints that it was 1000000000.)
3539 *
3540 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3541 * above.
3542 *
3543 * /proc/net/psched
3544 * -----------------------------------
3545 * [1] 000c8000 000f4240 000f4240 00000064
3546 * [2] 000003e8 00000400 000f4240 3b9aca00
3547 * [3] 000003e8 00000400 000f4240 3b9aca00
3548 * [4] 000003e8 00000400 000f4240 00000064
3549 * [5] 000003e8 00000040 000f4240 3b9aca00
3550 * [6] 000003e8 00000040 000f4240 000000f9
3551 *
3552 * a b c d ticks_per_s buffer_hz
3553 * ------- --------- ---------- ------------- ----------- -------------
3554 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3555 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3556 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3557 * [4] 1,000 1,024 1,000,000 100 976,562 100
3558 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3559 * [6] 1,000 64 1,000,000 249 15,625,000 249
3560 *
3561 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3562 * [2] 2.6.26-1-686-bigmem from Debian lenny
3563 * [3] 2.6.26-2-sparc64 from Debian lenny
3564 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3565 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3566 * [6] 2.6.34 from kernel.org on KVM
3567 */
3568 static const char fn[] = "/proc/net/psched";
3569 unsigned int a, b, c, d;
3570 FILE *stream;
3571
3572 ticks_per_s = 1.0;
3573 buffer_hz = 100;
3574
3575 stream = fopen(fn, "r");
3576 if (!stream) {
3577 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3578 return;
3579 }
3580
3581 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3582 VLOG_WARN("%s: read failed", fn);
3583 fclose(stream);
3584 return;
3585 }
3586 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3587 fclose(stream);
3588
3589 if (!a || !c) {
3590 VLOG_WARN("%s: invalid scheduler parameters", fn);
3591 return;
3592 }
3593
3594 ticks_per_s = (double) a * c / b;
3595 if (c == 1000000) {
3596 buffer_hz = d;
3597 } else {
3598 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3599 fn, a, b, c, d);
3600 }
3601 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3602 }
3603
3604 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3605 * rate of 'rate' bytes per second. */
3606 static unsigned int
3607 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3608 {
3609 if (!buffer_hz) {
3610 read_psched();
3611 }
3612 return (rate * ticks) / ticks_per_s;
3613 }
3614
3615 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3616 * rate of 'rate' bytes per second. */
3617 static unsigned int
3618 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3619 {
3620 if (!buffer_hz) {
3621 read_psched();
3622 }
3623 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3624 }
3625
3626 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3627 * a transmission rate of 'rate' bytes per second. */
3628 static unsigned int
3629 tc_buffer_per_jiffy(unsigned int rate)
3630 {
3631 if (!buffer_hz) {
3632 read_psched();
3633 }
3634 return rate / buffer_hz;
3635 }
3636
3637 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3638 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3639 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3640 * stores NULL into it if it is absent.
3641 *
3642 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3643 * 'msg'.
3644 *
3645 * Returns 0 if successful, otherwise a positive errno value. */
3646 static int
3647 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3648 struct nlattr **options)
3649 {
3650 static const struct nl_policy tca_policy[] = {
3651 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3652 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3653 };
3654 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3655
3656 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3657 tca_policy, ta, ARRAY_SIZE(ta))) {
3658 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3659 goto error;
3660 }
3661
3662 if (kind) {
3663 *kind = nl_attr_get_string(ta[TCA_KIND]);
3664 }
3665
3666 if (options) {
3667 *options = ta[TCA_OPTIONS];
3668 }
3669
3670 return 0;
3671
3672 error:
3673 if (kind) {
3674 *kind = NULL;
3675 }
3676 if (options) {
3677 *options = NULL;
3678 }
3679 return EPROTO;
3680 }
3681
3682 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3683 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3684 * into '*options', and its queue statistics into '*stats'. Any of the output
3685 * arguments may be null.
3686 *
3687 * Returns 0 if successful, otherwise a positive errno value. */
3688 static int
3689 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3690 struct nlattr **options, struct netdev_queue_stats *stats)
3691 {
3692 static const struct nl_policy tca_policy[] = {
3693 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3694 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3695 };
3696 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3697
3698 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3699 tca_policy, ta, ARRAY_SIZE(ta))) {
3700 VLOG_WARN_RL(&rl, "failed to parse class message");
3701 goto error;
3702 }
3703
3704 if (handlep) {
3705 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3706 *handlep = tc->tcm_handle;
3707 }
3708
3709 if (options) {
3710 *options = ta[TCA_OPTIONS];
3711 }
3712
3713 if (stats) {
3714 const struct gnet_stats_queue *gsq;
3715 struct gnet_stats_basic gsb;
3716
3717 static const struct nl_policy stats_policy[] = {
3718 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3719 .min_len = sizeof gsb },
3720 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3721 .min_len = sizeof *gsq },
3722 };
3723 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3724
3725 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3726 sa, ARRAY_SIZE(sa))) {
3727 VLOG_WARN_RL(&rl, "failed to parse class stats");
3728 goto error;
3729 }
3730
3731 /* Alignment issues screw up the length of struct gnet_stats_basic on
3732 * some arch/bitsize combinations. Newer versions of Linux have a
3733 * struct gnet_stats_basic_packed, but we can't depend on that. The
3734 * easiest thing to do is just to make a copy. */
3735 memset(&gsb, 0, sizeof gsb);
3736 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3737 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3738 stats->tx_bytes = gsb.bytes;
3739 stats->tx_packets = gsb.packets;
3740
3741 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3742 stats->tx_errors = gsq->drops;
3743 }
3744
3745 return 0;
3746
3747 error:
3748 if (options) {
3749 *options = NULL;
3750 }
3751 if (stats) {
3752 memset(stats, 0, sizeof *stats);
3753 }
3754 return EPROTO;
3755 }
3756
3757 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3758 * on 'netdev'. */
3759 static int
3760 tc_query_class(const struct netdev *netdev,
3761 unsigned int handle, unsigned int parent,
3762 struct ofpbuf **replyp)
3763 {
3764 struct ofpbuf request;
3765 struct tcmsg *tcmsg;
3766 int error;
3767
3768 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3769 if (!tcmsg) {
3770 return ENODEV;
3771 }
3772 tcmsg->tcm_handle = handle;
3773 tcmsg->tcm_parent = parent;
3774
3775 error = tc_transact(&request, replyp);
3776 if (error) {
3777 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3778 netdev_get_name(netdev),
3779 tc_get_major(handle), tc_get_minor(handle),
3780 tc_get_major(parent), tc_get_minor(parent),
3781 strerror(error));
3782 }
3783 return error;
3784 }
3785
3786 /* Equivalent to "tc class del dev <name> handle <handle>". */
3787 static int
3788 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3789 {
3790 struct ofpbuf request;
3791 struct tcmsg *tcmsg;
3792 int error;
3793
3794 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3795 if (!tcmsg) {
3796 return ENODEV;
3797 }
3798 tcmsg->tcm_handle = handle;
3799 tcmsg->tcm_parent = 0;
3800
3801 error = tc_transact(&request, NULL);
3802 if (error) {
3803 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3804 netdev_get_name(netdev),
3805 tc_get_major(handle), tc_get_minor(handle),
3806 strerror(error));
3807 }
3808 return error;
3809 }
3810
3811 /* Equivalent to "tc qdisc del dev <name> root". */
3812 static int
3813 tc_del_qdisc(struct netdev *netdev)
3814 {
3815 struct netdev_dev_linux *netdev_dev =
3816 netdev_dev_linux_cast(netdev_get_dev(netdev));
3817 struct ofpbuf request;
3818 struct tcmsg *tcmsg;
3819 int error;
3820
3821 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3822 if (!tcmsg) {
3823 return ENODEV;
3824 }
3825 tcmsg->tcm_handle = tc_make_handle(1, 0);
3826 tcmsg->tcm_parent = TC_H_ROOT;
3827
3828 error = tc_transact(&request, NULL);
3829 if (error == EINVAL) {
3830 /* EINVAL probably means that the default qdisc was in use, in which
3831 * case we've accomplished our purpose. */
3832 error = 0;
3833 }
3834 if (!error && netdev_dev->tc) {
3835 if (netdev_dev->tc->ops->tc_destroy) {
3836 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3837 }
3838 netdev_dev->tc = NULL;
3839 }
3840 return error;
3841 }
3842
3843 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3844 * kernel to determine what they are. Returns 0 if successful, otherwise a
3845 * positive errno value. */
3846 static int
3847 tc_query_qdisc(const struct netdev *netdev)
3848 {
3849 struct netdev_dev_linux *netdev_dev =
3850 netdev_dev_linux_cast(netdev_get_dev(netdev));
3851 struct ofpbuf request, *qdisc;
3852 const struct tc_ops *ops;
3853 struct tcmsg *tcmsg;
3854 int load_error;
3855 int error;
3856
3857 if (netdev_dev->tc) {
3858 return 0;
3859 }
3860
3861 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3862 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3863 * 2.6.35 without that fix backported to it.
3864 *
3865 * To avoid the OOPS, we must not make a request that would attempt to dump
3866 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3867 * few others. There are a few ways that I can see to do this, but most of
3868 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3869 * technique chosen here is to assume that any non-default qdisc that we
3870 * create will have a class with handle 1:0. The built-in qdiscs only have
3871 * a class with handle 0:0.
3872 *
3873 * We could check for Linux 2.6.35+ and use a more straightforward method
3874 * there. */
3875 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3876 if (!tcmsg) {
3877 return ENODEV;
3878 }
3879 tcmsg->tcm_handle = tc_make_handle(1, 0);
3880 tcmsg->tcm_parent = 0;
3881
3882 /* Figure out what tc class to instantiate. */
3883 error = tc_transact(&request, &qdisc);
3884 if (!error) {
3885 const char *kind;
3886
3887 error = tc_parse_qdisc(qdisc, &kind, NULL);
3888 if (error) {
3889 ops = &tc_ops_other;
3890 } else {
3891 ops = tc_lookup_linux_name(kind);
3892 if (!ops) {
3893 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3894 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3895
3896 ops = &tc_ops_other;
3897 }
3898 }
3899 } else if (error == ENOENT) {
3900 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3901 * other entity that doesn't have a handle 1:0. We will assume
3902 * that it's the system default qdisc. */
3903 ops = &tc_ops_default;
3904 error = 0;
3905 } else {
3906 /* Who knows? Maybe the device got deleted. */
3907 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3908 netdev_get_name(netdev), strerror(error));
3909 ops = &tc_ops_other;
3910 }
3911
3912 /* Instantiate it. */
3913 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3914 assert((load_error == 0) == (netdev_dev->tc != NULL));
3915 ofpbuf_delete(qdisc);
3916
3917 return error ? error : load_error;
3918 }
3919
3920 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3921 approximate the time to transmit packets of various lengths. For an MTU of
3922 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3923 represents two possible packet lengths; for a MTU of 513 through 1024, four
3924 possible lengths; and so on.
3925
3926 Returns, for the specified 'mtu', the number of bits that packet lengths
3927 need to be shifted right to fit within such a 256-entry table. */
3928 static int
3929 tc_calc_cell_log(unsigned int mtu)
3930 {
3931 int cell_log;
3932
3933 if (!mtu) {
3934 mtu = ETH_PAYLOAD_MAX;
3935 }
3936 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3937
3938 for (cell_log = 0; mtu >= 256; cell_log++) {
3939 mtu >>= 1;
3940 }
3941
3942 return cell_log;
3943 }
3944
3945 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3946 * of 'mtu'. */
3947 static void
3948 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3949 {
3950 memset(rate, 0, sizeof *rate);
3951 rate->cell_log = tc_calc_cell_log(mtu);
3952 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3953 /* rate->cell_align = 0; */ /* distro headers. */
3954 rate->mpu = ETH_TOTAL_MIN;
3955 rate->rate = Bps;
3956 }
3957
3958 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3959 * attribute of the specified "type".
3960 *
3961 * See tc_calc_cell_log() above for a description of "rtab"s. */
3962 static void
3963 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3964 {
3965 uint32_t *rtab;
3966 unsigned int i;
3967
3968 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3969 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3970 unsigned packet_size = (i + 1) << rate->cell_log;
3971 if (packet_size < rate->mpu) {
3972 packet_size = rate->mpu;
3973 }
3974 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3975 }
3976 }
3977
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > min_burst ? burst_bytes : min_burst;

    return tc_bytes_to_ticks(Bps, burst);
}
3988 \f
3989 /* Copies 'src' into 'dst', performing format conversion in the process. */
3990 static void
3991 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
3992 const struct rtnl_link_stats *src)
3993 {
3994 dst->rx_packets = src->rx_packets;
3995 dst->tx_packets = src->tx_packets;
3996 dst->rx_bytes = src->rx_bytes;
3997 dst->tx_bytes = src->tx_bytes;
3998 dst->rx_errors = src->rx_errors;
3999 dst->tx_errors = src->tx_errors;
4000 dst->rx_dropped = src->rx_dropped;
4001 dst->tx_dropped = src->tx_dropped;
4002 dst->multicast = src->multicast;
4003 dst->collisions = src->collisions;
4004 dst->rx_length_errors = src->rx_length_errors;
4005 dst->rx_over_errors = src->rx_over_errors;
4006 dst->rx_crc_errors = src->rx_crc_errors;
4007 dst->rx_frame_errors = src->rx_frame_errors;
4008 dst->rx_fifo_errors = src->rx_fifo_errors;
4009 dst->rx_missed_errors = src->rx_missed_errors;
4010 dst->tx_aborted_errors = src->tx_aborted_errors;
4011 dst->tx_carrier_errors = src->tx_carrier_errors;
4012 dst->tx_fifo_errors = src->tx_fifo_errors;
4013 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4014 dst->tx_window_errors = src->tx_window_errors;
4015 }
4016
4017 \f
4018 /* Utility functions. */
4019
4020 static int
4021 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4022 {
4023 /* Policy for RTNLGRP_LINK messages.
4024 *
4025 * There are *many* more fields in these messages, but currently we only
4026 * care about these fields. */
4027 static const struct nl_policy rtnlgrp_link_policy[] = {
4028 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4029 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4030 .min_len = sizeof(struct rtnl_link_stats) },
4031 };
4032
4033 struct ofpbuf request;
4034 struct ofpbuf *reply;
4035 struct ifinfomsg *ifi;
4036 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4037 int error;
4038
4039 ofpbuf_init(&request, 0);
4040 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4041 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4042 ifi->ifi_family = PF_UNSPEC;
4043 ifi->ifi_index = ifindex;
4044 error = nl_sock_transact(rtnl_sock, &request, &reply);
4045 ofpbuf_uninit(&request);
4046 if (error) {
4047 return error;
4048 }
4049
4050 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4051 rtnlgrp_link_policy,
4052 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4053 ofpbuf_delete(reply);
4054 return EPROTO;
4055 }
4056
4057 if (!attrs[IFLA_STATS]) {
4058 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4059 ofpbuf_delete(reply);
4060 return EPROTO;
4061 }
4062
4063 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4064
4065 ofpbuf_delete(reply);
4066
4067 return 0;
4068 }
4069
4070 static int
4071 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4072 {
4073 static const char fn[] = "/proc/net/dev";
4074 char line[1024];
4075 FILE *stream;
4076 int ln;
4077
4078 stream = fopen(fn, "r");
4079 if (!stream) {
4080 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4081 return errno;
4082 }
4083
4084 ln = 0;
4085 while (fgets(line, sizeof line, stream)) {
4086 if (++ln >= 3) {
4087 char devname[16];
4088 #define X64 "%"SCNu64
4089 if (sscanf(line,
4090 " %15[^:]:"
4091 X64 X64 X64 X64 X64 X64 X64 "%*u"
4092 X64 X64 X64 X64 X64 X64 X64 "%*u",
4093 devname,
4094 &stats->rx_bytes,
4095 &stats->rx_packets,
4096 &stats->rx_errors,
4097 &stats->rx_dropped,
4098 &stats->rx_fifo_errors,
4099 &stats->rx_frame_errors,
4100 &stats->multicast,
4101 &stats->tx_bytes,
4102 &stats->tx_packets,
4103 &stats->tx_errors,
4104 &stats->tx_dropped,
4105 &stats->tx_fifo_errors,
4106 &stats->collisions,
4107 &stats->tx_carrier_errors) != 15) {
4108 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4109 } else if (!strcmp(devname, netdev_name)) {
4110 stats->rx_length_errors = UINT64_MAX;
4111 stats->rx_over_errors = UINT64_MAX;
4112 stats->rx_crc_errors = UINT64_MAX;
4113 stats->rx_missed_errors = UINT64_MAX;
4114 stats->tx_aborted_errors = UINT64_MAX;
4115 stats->tx_heartbeat_errors = UINT64_MAX;
4116 stats->tx_window_errors = UINT64_MAX;
4117 fclose(stream);
4118 return 0;
4119 }
4120 }
4121 }
4122 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4123 fclose(stream);
4124 return ENODEV;
4125 }
4126
4127 static int
4128 get_carrier_via_sysfs(const char *name, bool *carrier)
4129 {
4130 char line[8];
4131 int retval;
4132
4133 int error = 0;
4134 char *fn = NULL;
4135 int fd = -1;
4136
4137 *carrier = false;
4138
4139 fn = xasprintf("/sys/class/net/%s/carrier", name);
4140 fd = open(fn, O_RDONLY);
4141 if (fd < 0) {
4142 error = errno;
4143 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
4144 goto exit;
4145 }
4146
4147 retval = read(fd, line, sizeof line);
4148 if (retval < 0) {
4149 error = errno;
4150 if (error == EINVAL) {
4151 /* This is the normal return value when we try to check carrier if
4152 * the network device is not up. */
4153 } else {
4154 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
4155 }
4156 goto exit;
4157 } else if (retval == 0) {
4158 error = EPROTO;
4159 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
4160 goto exit;
4161 }
4162
4163 if (line[0] != '0' && line[0] != '1') {
4164 error = EPROTO;
4165 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)", fn, line[0]);
4166 goto exit;
4167 }
4168 *carrier = line[0] != '0';
4169 error = 0;
4170
4171 exit:
4172 if (fd >= 0) {
4173 close(fd);
4174 }
4175 free(fn);
4176 return error;
4177 }
4178
4179 static int
4180 get_flags(const struct netdev *netdev, int *flags)
4181 {
4182 struct ifreq ifr;
4183 int error;
4184
4185 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4186 "SIOCGIFFLAGS");
4187 *flags = ifr.ifr_flags;
4188 return error;
4189 }
4190
4191 static int
4192 set_flags(struct netdev *netdev, int flags)
4193 {
4194 struct ifreq ifr;
4195
4196 ifr.ifr_flags = flags;
4197 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4198 "SIOCSIFFLAGS");
4199 }
4200
4201 static int
4202 do_get_ifindex(const char *netdev_name)
4203 {
4204 struct ifreq ifr;
4205
4206 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4207 COVERAGE_INC(netdev_get_ifindex);
4208 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4209 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4210 netdev_name, strerror(errno));
4211 return -errno;
4212 }
4213 return ifr.ifr_ifindex;
4214 }
4215
4216 static int
4217 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4218 {
4219 struct netdev_dev_linux *netdev_dev =
4220 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4221 *ifindexp = 0;
4222 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4223 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4224 if (ifindex < 0) {
4225 return -ifindex;
4226 }
4227 netdev_dev->cache_valid |= VALID_IFINDEX;
4228 netdev_dev->ifindex = ifindex;
4229 }
4230 *ifindexp = netdev_dev->ifindex;
4231 return 0;
4232 }
4233
4234 static int
4235 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4236 {
4237 struct ifreq ifr;
4238 int hwaddr_family;
4239
4240 memset(&ifr, 0, sizeof ifr);
4241 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4242 COVERAGE_INC(netdev_get_hwaddr);
4243 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4244 /* ENODEV probably means that a vif disappeared asynchronously and
4245 * hasn't been removed from the database yet, so reduce the log level
4246 * to INFO for that case. */
4247 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4248 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4249 netdev_name, strerror(errno));
4250 return errno;
4251 }
4252 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4253 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4254 VLOG_WARN("%s device has unknown hardware address family %d",
4255 netdev_name, hwaddr_family);
4256 }
4257 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4258 return 0;
4259 }
4260
4261 static int
4262 set_etheraddr(const char *netdev_name, int hwaddr_family,
4263 const uint8_t mac[ETH_ADDR_LEN])
4264 {
4265 struct ifreq ifr;
4266
4267 memset(&ifr, 0, sizeof ifr);
4268 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4269 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4270 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4271 COVERAGE_INC(netdev_set_hwaddr);
4272 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4273 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4274 netdev_name, strerror(errno));
4275 return errno;
4276 }
4277 return 0;
4278 }
4279
4280 static int
4281 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4282 int cmd, const char *cmd_name)
4283 {
4284 struct ifreq ifr;
4285
4286 memset(&ifr, 0, sizeof ifr);
4287 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4288 ifr.ifr_data = (caddr_t) ecmd;
4289
4290 ecmd->cmd = cmd;
4291 COVERAGE_INC(netdev_ethtool);
4292 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4293 return 0;
4294 } else {
4295 if (errno != EOPNOTSUPP) {
4296 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4297 "failed: %s", cmd_name, name, strerror(errno));
4298 } else {
4299 /* The device doesn't support this operation. That's pretty
4300 * common, so there's no point in logging anything. */
4301 }
4302 return errno;
4303 }
4304 }
4305
4306 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4307 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4308 int
4309 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4310 const char *flag_name, bool enable)
4311 {
4312 const char *netdev_name = netdev_get_name(netdev);
4313 struct ethtool_value evalue;
4314 uint32_t new_flags;
4315 int error;
4316
4317 memset(&evalue, 0, sizeof evalue);
4318 error = netdev_linux_do_ethtool(netdev_name,
4319 (struct ethtool_cmd *)&evalue,
4320 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4321 if (error) {
4322 return error;
4323 }
4324
4325 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4326 error = netdev_linux_do_ethtool(netdev_name,
4327 (struct ethtool_cmd *)&evalue,
4328 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4329 if (error) {
4330 return error;
4331 }
4332
4333 memset(&evalue, 0, sizeof evalue);
4334 error = netdev_linux_do_ethtool(netdev_name,
4335 (struct ethtool_cmd *)&evalue,
4336 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4337 if (error) {
4338 return error;
4339 }
4340
4341 if (new_flags != evalue.data) {
4342 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4343 "device %s failed", enable ? "enable" : "disable",
4344 flag_name, netdev_name);
4345 return EOPNOTSUPP;
4346 }
4347
4348 return 0;
4349 }
4350
4351 static int
4352 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4353 const char *cmd_name)
4354 {
4355 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4356 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4357 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4358 strerror(errno));
4359 return errno;
4360 }
4361 return 0;
4362 }
4363
4364 static int
4365 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4366 int cmd, const char *cmd_name)
4367 {
4368 struct ifreq ifr;
4369 int error;
4370
4371 ifr.ifr_addr.sa_family = AF_INET;
4372 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4373 if (!error) {
4374 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4375 *ip = sin->sin_addr;
4376 }
4377 return error;
4378 }
4379
/* Returns an AF_PACKET raw socket or a negative errno value.
 *
 * The socket is created on the first call only; the outcome of that first
 * attempt, whether a descriptor or a negated errno, is remembered and returned
 * by every later call.  A successfully created socket is passed to
 * set_nonblocking(). */
static int
af_packet_sock(void)
{
    /* INT_MIN marks "creation not attempted yet". */
    static int cached = INT_MIN;

    if (cached != INT_MIN) {
        return cached;
    }

    cached = socket(AF_PACKET, SOCK_RAW, 0);
    if (cached < 0) {
        cached = -errno;
        VLOG_ERR("failed to create packet socket: %s", strerror(errno));
    } else {
        set_nonblocking(cached);
    }

    return cached;
}