]> git.proxmox.com Git - ovs.git/blob - lib/netdev-linux.c
Avoid shadowing local variable names.
[ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010 Nicira Networks.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include <assert.h>
19 #include <errno.h>
20 #include <fcntl.h>
21 #include <arpa/inet.h>
22 #include <inttypes.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
25 #include <linux/ip.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/pkt_sched.h>
29 #include <linux/rtnetlink.h>
30 #include <linux/sockios.h>
31 #include <linux/version.h>
32 #include <sys/types.h>
33 #include <sys/ioctl.h>
34 #include <sys/socket.h>
35 #include <netpacket/packet.h>
36 #include <net/ethernet.h>
37 #include <net/if.h>
38 #include <linux/if_tunnel.h>
39 #include <net/if_arp.h>
40 #include <net/if_packet.h>
41 #include <net/route.h>
42 #include <netinet/in.h>
43 #include <poll.h>
44 #include <stdlib.h>
45 #include <string.h>
46 #include <unistd.h>
47
48 #include "coverage.h"
49 #include "dynamic-string.h"
50 #include "fatal-signal.h"
51 #include "netdev-provider.h"
52 #include "netdev-vport.h"
53 #include "netlink.h"
54 #include "ofpbuf.h"
55 #include "openflow/openflow.h"
56 #include "packets.h"
57 #include "poll-loop.h"
58 #include "port-array.h"
59 #include "rtnetlink.h"
60 #include "socket-util.h"
61 #include "shash.h"
62 #include "svec.h"
63 #include "vlog.h"
64
65 VLOG_DEFINE_THIS_MODULE(netdev_linux)
66 \f
/* ADVERTISED_Pause and ADVERTISED_Asym_Pause first appeared in Linux 2.6.14,
 * so provide fallback definitions when building against older headers. */
#if !defined(ADVERTISED_Pause)
#define ADVERTISED_Pause                (1 << 13)
#endif
#if !defined(ADVERTISED_Asym_Pause)
#define ADVERTISED_Asym_Pause           (1 << 14)
#endif

/* TC_RTAB_SIZE first appeared in Linux 2.6.25, so provide a fallback
 * definition when building against older headers. */
#if !defined(TC_RTAB_SIZE)
#define TC_RTAB_SIZE 1024
#endif
81
82 static struct rtnetlink_notifier netdev_linux_cache_notifier;
83 static int cache_notifier_refcount;
84
/* Bits for 'cache_valid' in struct netdev_dev_linux.  A cached member is only
 * meaningful while its corresponding bit is set. */
enum {
    VALID_IFINDEX          = 0x001,
    VALID_ETHERADDR        = 0x002,
    VALID_IN4              = 0x004,
    VALID_IN6              = 0x008,
    VALID_MTU              = 0x010,
    VALID_CARRIER          = 0x020,
    VALID_IS_PSEUDO        = 0x040,  /* Covers both is_internal and is_tap. */
    VALID_POLICING         = 0x080,
    VALID_HAVE_VPORT_STATS = 0x100
};
96
/* State kept for a tap device. */
struct tap_state {
    int fd;         /* File descriptor from opening the tun/tap control
                     * device for this tap device. */
    bool opened;    /* True once the first opener has been handed 'fd'. */
};
101 \f
102 /* Traffic control. */
103
104 /* An instance of a traffic control class. Always associated with a particular
105 * network device. */
106 struct tc {
107 const struct tc_ops *ops;
108
109 /* Maps from queue ID to tc-specific data.
110 *
111 * The generic netdev TC layer uses this to the following extent: if an
112 * entry is nonnull, then the queue whose ID is the index is assumed to
113 * exist; if an entry is null, then that queue is assumed not to exist.
114 * Implementations must adhere to this scheme, although they may store
115 * whatever they like as data.
116 */
117 struct port_array queues;
118 };
119
120 /* A particular kind of traffic control. Each implementation generally maps to
121 * one particular Linux qdisc class.
122 *
123 * The functions below return 0 if successful or a positive errno value on
124 * failure, except where otherwise noted. All of them must be provided, except
125 * where otherwise noted. */
126 struct tc_ops {
127 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
128 * This is null for tc_ops_default and tc_ops_other, for which there are no
129 * appropriate values. */
130 const char *linux_name;
131
132 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
133 const char *ovs_name;
134
135 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
136 * queues. The queues are numbered 0 through n_queues - 1. */
137 unsigned int n_queues;
138
139 /* Called to install this TC class on 'netdev'. The implementation should
140 * make the Netlink calls required to set up 'netdev' with the right qdisc
141 * and configure it according to 'details'. The implementation may assume
142 * that the current qdisc is the default; that is, there is no need for it
143 * to delete the current qdisc before installing itself.
144 *
145 * The contents of 'details' should be documented as valid for 'ovs_name'
146 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
147 * (which is built as ovs-vswitchd.conf.db(8)).
148 *
149 * This function must return 0 if and only if it sets 'netdev->tc' to an
150 * initialized 'struct tc'.
151 *
152 * (This function is null for tc_ops_other, which cannot be installed. For
153 * other TC classes it should always be nonnull.) */
154 int (*tc_install)(struct netdev *netdev, const struct shash *details);
155
156 /* Called when the netdev code determines (through a Netlink query) that
157 * this TC class's qdisc is installed on 'netdev', but we didn't install
158 * it ourselves and so don't know any of the details.
159 *
160 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
161 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
162 * implementation should parse the other attributes of 'nlmsg' as
163 * necessary to determine its configuration. If necessary it should also
164 * use Netlink queries to determine the configuration of queues on
165 * 'netdev'.
166 *
167 * This function must return 0 if and only if it sets 'netdev->tc' to an
168 * initialized 'struct tc'. */
169 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
170
171 /* Destroys the data structures allocated by the implementation as part of
172 * 'tc'. (This includes destroying 'tc->queues' by calling
173 * tc_destroy(tc).
174 *
175 * The implementation should not need to perform any Netlink calls. If
176 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
177 * (But it may not be desirable.)
178 *
179 * This function may be null if 'tc' is trivial. */
180 void (*tc_destroy)(struct tc *tc);
181
182 /* Retrieves details of 'netdev->tc' configuration into 'details'.
183 *
184 * The implementation should not need to perform any Netlink calls, because
185 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
186 * cached the configuration.
187 *
188 * The contents of 'details' should be documented as valid for 'ovs_name'
189 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
190 * (which is built as ovs-vswitchd.conf.db(8)).
191 *
192 * This function may be null if 'tc' is not configurable.
193 */
194 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
195
196 /* Reconfigures 'netdev->tc' according to 'details', performing any
197 * required Netlink calls to complete the reconfiguration.
198 *
199 * The contents of 'details' should be documented as valid for 'ovs_name'
200 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
201 * (which is built as ovs-vswitchd.conf.db(8)).
202 *
203 * This function may be null if 'tc' is not configurable.
204 */
205 int (*qdisc_set)(struct netdev *, const struct shash *details);
206
207 /* Retrieves details of 'queue_id' on 'netdev->tc' into 'details'. The
208 * caller ensures that 'queues' has a nonnull value for index 'queue_id.
209 *
210 * The contents of 'details' should be documented as valid for 'ovs_name'
211 * in the "other_config" column in the "Queue" table in
212 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
213 *
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the queue configuration.
217 *
218 * This function may be null if 'tc' does not have queues ('n_queues' is
219 * 0). */
220 int (*class_get)(const struct netdev *netdev, unsigned int queue_id,
221 struct shash *details);
222
223 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
224 * 'details', perfoming any required Netlink calls to complete the
225 * reconfiguration. The caller ensures that 'queue_id' is less than
226 * 'n_queues'.
227 *
228 * The contents of 'details' should be documented as valid for 'ovs_name'
229 * in the "other_config" column in the "Queue" table in
230 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
231 *
232 * This function may be null if 'tc' does not have queues or its queues are
233 * not configurable. */
234 int (*class_set)(struct netdev *, unsigned int queue_id,
235 const struct shash *details);
236
237 /* Deletes 'queue_id' from 'netdev->tc'. The caller ensures that 'queues'
238 * has a nonnull value for index 'queue_id.
239 *
240 * This function may be null if 'tc' does not have queues or its queues
241 * cannot be deleted. */
242 int (*class_delete)(struct netdev *, unsigned int queue_id);
243
244 /* Obtains stats for 'queue' from 'netdev->tc'. The caller ensures that
245 * 'queues' has a nonnull value for index 'queue_id.
246 *
247 * On success, initializes '*stats'.
248 *
249 * This function may be null if 'tc' does not have queues or if it cannot
250 * report queue statistics. */
251 int (*class_get_stats)(const struct netdev *netdev, unsigned int queue_id,
252 struct netdev_queue_stats *stats);
253
254 /* Extracts queue stats from 'nlmsg', which is a response to a
255 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
256 *
257 * This function may be null if 'tc' does not have queues or if it cannot
258 * report queue statistics. */
259 int (*class_dump_stats)(const struct netdev *netdev,
260 const struct ofpbuf *nlmsg,
261 netdev_dump_queue_stats_cb *cb, void *aux);
262 };
263
264 static void
265 tc_init(struct tc *tc, const struct tc_ops *ops)
266 {
267 tc->ops = ops;
268 port_array_init(&tc->queues);
269 }
270
271 static void
272 tc_destroy(struct tc *tc)
273 {
274 port_array_destroy(&tc->queues);
275 }
276
277 static const struct tc_ops tc_ops_htb;
278 static const struct tc_ops tc_ops_default;
279 static const struct tc_ops tc_ops_other;
280
281 static const struct tc_ops *tcs[] = {
282 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
283 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
284 &tc_ops_other, /* Some other qdisc. */
285 NULL
286 };
287
/* Prototypes for the traffic control helper functions used in this file. */
static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
static unsigned int tc_get_major(unsigned int handle);
static unsigned int tc_get_minor(unsigned int handle);

static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
static unsigned int tc_buffer_per_jiffy(unsigned int rate);

static struct tcmsg *tc_make_request(const struct netdev *netdev, int type,
                                     unsigned int flags,
                                     struct ofpbuf *request);
static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);

static int tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
                          struct nlattr **options);
static int tc_parse_class(const struct ofpbuf *msg, unsigned int *queue_id,
                          struct nlattr **options,
                          struct netdev_queue_stats *stats);
static int tc_query_class(const struct netdev *netdev,
                          unsigned int handle, unsigned int parent,
                          struct ofpbuf **replyp);
static int tc_delete_class(const struct netdev *netdev, unsigned int handle);

static int tc_del_qdisc(struct netdev *netdev);
static int tc_query_qdisc(const struct netdev *netdev);

static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static void tc_put_rtab(struct ofpbuf *msg, uint16_t type,
                        const struct tc_ratespec *rate);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
318 \f
319 struct netdev_dev_linux {
320 struct netdev_dev netdev_dev;
321
322 struct shash_node *shash_node;
323 unsigned int cache_valid;
324
325 /* The following are figured out "on demand" only. They are only valid
326 * when the corresponding VALID_* bit in 'cache_valid' is set. */
327 int ifindex;
328 uint8_t etheraddr[ETH_ADDR_LEN];
329 struct in_addr address, netmask;
330 struct in6_addr in6;
331 int mtu;
332 int carrier;
333 bool is_internal; /* Is this an openvswitch internal device? */
334 bool is_tap; /* Is this a tuntap device? */
335 uint32_t kbits_rate; /* Policing data. */
336 uint32_t kbits_burst;
337 bool have_vport_stats;
338 struct tc *tc;
339
340 union {
341 struct tap_state tap;
342 } state;
343 };
344
345 struct netdev_linux {
346 struct netdev netdev;
347 int fd;
348 };
349
/* AF_INET datagram socket, created by netdev_linux_init() and used for ioctl
 * operations.  -1 until it has been created. */
static int af_inet_sock = -1;

/* Netlink routing socket, created by netdev_linux_init(), that is not
 * subscribed to any multicast groups. */
static struct nl_sock *rtnl_sock;
355
356 struct netdev_linux_notifier {
357 struct netdev_notifier notifier;
358 struct list node;
359 };
360
361 static struct shash netdev_linux_notifiers =
362 SHASH_INITIALIZER(&netdev_linux_notifiers);
363 static struct rtnetlink_notifier netdev_linux_poll_notifier;
364
365 /* This is set pretty low because we probably won't learn anything from the
366 * additional log messages. */
367 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
368
369 static int netdev_linux_init(void);
370
371 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
372 int cmd, const char *cmd_name);
373 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
374 const char *cmd_name);
375 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
376 int cmd, const char *cmd_name);
377 static int get_flags(const struct netdev *, int *flagsp);
378 static int set_flags(struct netdev *, int flags);
379 static int do_get_ifindex(const char *netdev_name);
380 static int get_ifindex(const struct netdev *, int *ifindexp);
381 static int do_set_addr(struct netdev *netdev,
382 int ioctl_nr, const char *ioctl_name,
383 struct in_addr addr);
384 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
385 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
386 const uint8_t[ETH_ADDR_LEN]);
387 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
388 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
389
390 static bool
391 is_netdev_linux_class(const struct netdev_class *netdev_class)
392 {
393 return netdev_class->init == netdev_linux_init;
394 }
395
396 static struct netdev_dev_linux *
397 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
398 {
399 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
400 assert(is_netdev_linux_class(netdev_class));
401
402 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
403 }
404
405 static struct netdev_linux *
406 netdev_linux_cast(const struct netdev *netdev)
407 {
408 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
409 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
410 assert(is_netdev_linux_class(netdev_class));
411
412 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
413 }
414 \f
415 static int
416 netdev_linux_init(void)
417 {
418 static int status = -1;
419 if (status < 0) {
420 /* Create AF_INET socket. */
421 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
422 status = af_inet_sock >= 0 ? 0 : errno;
423 if (status) {
424 VLOG_ERR("failed to create inet socket: %s", strerror(status));
425 }
426
427 /* Create rtnetlink socket. */
428 if (!status) {
429 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
430 if (status) {
431 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
432 strerror(status));
433 }
434 }
435 }
436 return status;
437 }
438
/* Periodic processing for this file's netdev classes: lets the rtnetlink
 * notifier do its work. */
static void
netdev_linux_run(void)
{
    rtnetlink_notifier_run();
}
444
/* Counterpart of netdev_linux_run(): asks the rtnetlink notifier to arrange
 * whatever wakeup it needs. */
static void
netdev_linux_wait(void)
{
    rtnetlink_notifier_wait();
}
450
451 static void
452 netdev_linux_cache_cb(const struct rtnetlink_change *change,
453 void *aux OVS_UNUSED)
454 {
455 struct netdev_dev_linux *dev;
456 if (change) {
457 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
458 if (base_dev) {
459 const struct netdev_class *netdev_class =
460 netdev_dev_get_class(base_dev);
461
462 if (is_netdev_linux_class(netdev_class)) {
463 dev = netdev_dev_linux_cast(base_dev);
464 dev->cache_valid = 0;
465 }
466 }
467 } else {
468 struct shash device_shash;
469 struct shash_node *node;
470
471 shash_init(&device_shash);
472 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
473 SHASH_FOR_EACH (node, &device_shash) {
474 dev = node->data;
475 dev->cache_valid = 0;
476 }
477 shash_destroy(&device_shash);
478 }
479 }
480
481 /* Creates the netdev device of 'type' with 'name'. */
482 static int
483 netdev_linux_create_system(const char *name, const char *type OVS_UNUSED,
484 const struct shash *args, struct netdev_dev **netdev_devp)
485 {
486 struct netdev_dev_linux *netdev_dev;
487 int error;
488
489 if (!shash_is_empty(args)) {
490 VLOG_WARN("%s: arguments for system devices should be empty", name);
491 }
492
493 if (!cache_notifier_refcount) {
494 error = rtnetlink_notifier_register(&netdev_linux_cache_notifier,
495 netdev_linux_cache_cb, NULL);
496 if (error) {
497 return error;
498 }
499 }
500 cache_notifier_refcount++;
501
502 netdev_dev = xzalloc(sizeof *netdev_dev);
503 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_linux_class);
504
505 *netdev_devp = &netdev_dev->netdev_dev;
506 return 0;
507 }
508
509 /* For most types of netdevs we open the device for each call of
510 * netdev_open(). However, this is not the case with tap devices,
511 * since it is only possible to open the device once. In this
512 * situation we share a single file descriptor, and consequently
513 * buffers, across all readers. Therefore once data is read it will
514 * be unavailable to other reads for tap devices. */
515 static int
516 netdev_linux_create_tap(const char *name, const char *type OVS_UNUSED,
517 const struct shash *args, struct netdev_dev **netdev_devp)
518 {
519 struct netdev_dev_linux *netdev_dev;
520 struct tap_state *state;
521 static const char tap_dev[] = "/dev/net/tun";
522 struct ifreq ifr;
523 int error;
524
525 if (!shash_is_empty(args)) {
526 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
527 }
528
529 netdev_dev = xzalloc(sizeof *netdev_dev);
530 state = &netdev_dev->state.tap;
531
532 /* Open tap device. */
533 state->fd = open(tap_dev, O_RDWR);
534 if (state->fd < 0) {
535 error = errno;
536 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
537 goto error;
538 }
539
540 /* Create tap device. */
541 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
542 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
543 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
544 VLOG_WARN("%s: creating tap device failed: %s", name,
545 strerror(errno));
546 error = errno;
547 goto error;
548 }
549
550 /* Make non-blocking. */
551 error = set_nonblocking(state->fd);
552 if (error) {
553 goto error;
554 }
555
556 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
557 *netdev_devp = &netdev_dev->netdev_dev;
558 return 0;
559
560 error:
561 free(netdev_dev);
562 return error;
563 }
564
565 static void
566 destroy_tap(struct netdev_dev_linux *netdev_dev)
567 {
568 struct tap_state *state = &netdev_dev->state.tap;
569
570 if (state->fd >= 0) {
571 close(state->fd);
572 }
573 }
574
575 /* Destroys the netdev device 'netdev_dev_'. */
576 static void
577 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
578 {
579 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
580 const char *type = netdev_dev_get_type(netdev_dev_);
581
582 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
583 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
584 }
585
586 if (!strcmp(type, "system")) {
587 cache_notifier_refcount--;
588
589 if (!cache_notifier_refcount) {
590 rtnetlink_notifier_unregister(&netdev_linux_cache_notifier);
591 }
592 } else if (!strcmp(type, "tap")) {
593 destroy_tap(netdev_dev);
594 }
595
596 free(netdev_dev);
597 }
598
599 static int
600 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
601 struct netdev **netdevp)
602 {
603 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
604 struct netdev_linux *netdev;
605 enum netdev_flags flags;
606 int error;
607
608 /* Allocate network device. */
609 netdev = xzalloc(sizeof *netdev);
610 netdev->fd = -1;
611 netdev_init(&netdev->netdev, netdev_dev_);
612
613 error = netdev_get_flags(&netdev->netdev, &flags);
614 if (error == ENODEV) {
615 goto error;
616 }
617
618 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
619 !netdev_dev->state.tap.opened) {
620
621 /* We assume that the first user of the tap device is the primary user
622 * and give them the tap FD. Subsequent users probably just expect
623 * this to be a system device so open it normally to avoid send/receive
624 * directions appearing to be reversed. */
625 netdev->fd = netdev_dev->state.tap.fd;
626 netdev_dev->state.tap.opened = true;
627 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
628 struct sockaddr_ll sll;
629 int protocol;
630 int ifindex;
631
632 /* Create file descriptor. */
633 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
634 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
635 : ethertype);
636 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
637 if (netdev->fd < 0) {
638 error = errno;
639 goto error;
640 }
641
642 /* Set non-blocking mode. */
643 error = set_nonblocking(netdev->fd);
644 if (error) {
645 goto error;
646 }
647
648 /* Get ethernet device index. */
649 error = get_ifindex(&netdev->netdev, &ifindex);
650 if (error) {
651 goto error;
652 }
653
654 /* Bind to specific ethernet device. */
655 memset(&sll, 0, sizeof sll);
656 sll.sll_family = AF_PACKET;
657 sll.sll_ifindex = ifindex;
658 if (bind(netdev->fd,
659 (struct sockaddr *) &sll, sizeof sll) < 0) {
660 error = errno;
661 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
662 strerror(error));
663 goto error;
664 }
665
666 /* Between the socket() and bind() calls above, the socket receives all
667 * packets of the requested type on all system interfaces. We do not
668 * want to receive that data, but there is no way to avoid it. So we
669 * must now drain out the receive queue. */
670 error = drain_rcvbuf(netdev->fd);
671 if (error) {
672 goto error;
673 }
674 }
675
676 *netdevp = &netdev->netdev;
677 return 0;
678
679 error:
680 netdev_uninit(&netdev->netdev, true);
681 return error;
682 }
683
684 /* Closes and destroys 'netdev'. */
685 static void
686 netdev_linux_close(struct netdev *netdev_)
687 {
688 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
689
690 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
691 close(netdev->fd);
692 }
693 free(netdev);
694 }
695
/* Adds the names of all known network devices to 'svec'.  Returns 0 if
 * successful, otherwise a positive errno value, in which case 'svec' is left
 * untouched. */
static int
netdev_linux_enumerate(struct svec *svec)
{
    struct if_nameindex *names;

    names = if_nameindex();
    if (names) {
        size_t i;

        for (i = 0; names[i].if_name != NULL; i++) {
            svec_add(svec, names[i].if_name);
        }
        if_freenameindex(names);
        return 0;
    } else {
        /* Save errno before logging, which may overwrite it. */
        int error = errno;

        VLOG_WARN("could not obtain list of network device names: %s",
                  strerror(error));
        return error;
    }
}
717
718 static int
719 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
720 {
721 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
722
723 if (netdev->fd < 0) {
724 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
725 return -EAGAIN;
726 }
727
728 for (;;) {
729 ssize_t retval = read(netdev->fd, data, size);
730 if (retval >= 0) {
731 return retval;
732 } else if (errno != EINTR) {
733 if (errno != EAGAIN) {
734 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
735 strerror(errno), netdev_get_name(netdev_));
736 }
737 return -errno;
738 }
739 }
740 }
741
742 /* Registers with the poll loop to wake up from the next call to poll_block()
743 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
744 static void
745 netdev_linux_recv_wait(struct netdev *netdev_)
746 {
747 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
748 if (netdev->fd >= 0) {
749 poll_fd_wait(netdev->fd, POLLIN);
750 }
751 }
752
753 /* Discards all packets waiting to be received from 'netdev'. */
754 static int
755 netdev_linux_drain(struct netdev *netdev_)
756 {
757 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
758 if (netdev->fd < 0) {
759 return 0;
760 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
761 struct ifreq ifr;
762 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
763 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
764 if (error) {
765 return error;
766 }
767 drain_fd(netdev->fd, ifr.ifr_qlen);
768 return 0;
769 } else {
770 return drain_rcvbuf(netdev->fd);
771 }
772 }
773
774 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
775 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
776 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
777 * the packet is too big or too small to transmit on the device.
778 *
779 * The caller retains ownership of 'buffer' in all cases.
780 *
781 * The kernel maintains a packet transmission queue, so the caller is not
782 * expected to do additional queuing of packets. */
783 static int
784 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
785 {
786 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
787
788 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
789 */
790 if (netdev->fd < 0) {
791 return EPIPE;
792 }
793
794 for (;;) {
795 ssize_t retval = write(netdev->fd, data, size);
796 if (retval < 0) {
797 /* The Linux AF_PACKET implementation never blocks waiting for room
798 * for packets, instead returning ENOBUFS. Translate this into
799 * EAGAIN for the caller. */
800 if (errno == ENOBUFS) {
801 return EAGAIN;
802 } else if (errno == EINTR) {
803 continue;
804 } else if (errno != EAGAIN) {
805 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
806 netdev_get_name(netdev_), strerror(errno));
807 }
808 return errno;
809 } else if (retval != size) {
810 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
811 "%zu) on %s", retval, size, netdev_get_name(netdev_));
812 return EMSGSIZE;
813 } else {
814 return 0;
815 }
816 }
817 }
818
819 /* Registers with the poll loop to wake up from the next call to poll_block()
820 * when the packet transmission queue has sufficient room to transmit a packet
821 * with netdev_send().
822 *
823 * The kernel maintains a packet transmission queue, so the client is not
824 * expected to do additional queuing of packets. Thus, this function is
825 * unlikely to ever be used. It is included for completeness. */
826 static void
827 netdev_linux_send_wait(struct netdev *netdev_)
828 {
829 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
830 if (netdev->fd < 0) {
831 /* Nothing to do. */
832 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
833 poll_fd_wait(netdev->fd, POLLOUT);
834 } else {
835 /* TAP device always accepts packets.*/
836 poll_immediate_wake();
837 }
838 }
839
840 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
841 * otherwise a positive errno value. */
842 static int
843 netdev_linux_set_etheraddr(struct netdev *netdev_,
844 const uint8_t mac[ETH_ADDR_LEN])
845 {
846 struct netdev_dev_linux *netdev_dev =
847 netdev_dev_linux_cast(netdev_get_dev(netdev_));
848 int error;
849
850 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
851 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
852 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
853 if (!error) {
854 netdev_dev->cache_valid |= VALID_ETHERADDR;
855 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
856 }
857 } else {
858 error = 0;
859 }
860 return error;
861 }
862
863 /* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
864 * free the returned buffer. */
865 static int
866 netdev_linux_get_etheraddr(const struct netdev *netdev_,
867 uint8_t mac[ETH_ADDR_LEN])
868 {
869 struct netdev_dev_linux *netdev_dev =
870 netdev_dev_linux_cast(netdev_get_dev(netdev_));
871 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
872 int error = get_etheraddr(netdev_get_name(netdev_),
873 netdev_dev->etheraddr);
874 if (error) {
875 return error;
876 }
877 netdev_dev->cache_valid |= VALID_ETHERADDR;
878 }
879 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
880 return 0;
881 }
882
883 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
884 * in bytes, not including the hardware header; thus, this is typically 1500
885 * bytes for Ethernet devices. */
886 static int
887 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
888 {
889 struct netdev_dev_linux *netdev_dev =
890 netdev_dev_linux_cast(netdev_get_dev(netdev_));
891 if (!(netdev_dev->cache_valid & VALID_MTU)) {
892 struct ifreq ifr;
893 int error;
894
895 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
896 SIOCGIFMTU, "SIOCGIFMTU");
897 if (error) {
898 return error;
899 }
900 netdev_dev->mtu = ifr.ifr_mtu;
901 netdev_dev->cache_valid |= VALID_MTU;
902 }
903 *mtup = netdev_dev->mtu;
904 return 0;
905 }
906
/* Returns the ifindex of 'netdev' as a positive number if successful,
 * otherwise a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
917
918 static int
919 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
920 {
921 struct netdev_dev_linux *netdev_dev =
922 netdev_dev_linux_cast(netdev_get_dev(netdev_));
923 int error = 0;
924 char *fn = NULL;
925 int fd = -1;
926
927 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
928 char line[8];
929 int retval;
930
931 fn = xasprintf("/sys/class/net/%s/carrier",
932 netdev_get_name(netdev_));
933 fd = open(fn, O_RDONLY);
934 if (fd < 0) {
935 error = errno;
936 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
937 goto exit;
938 }
939
940 retval = read(fd, line, sizeof line);
941 if (retval < 0) {
942 error = errno;
943 if (error == EINVAL) {
944 /* This is the normal return value when we try to check carrier
945 * if the network device is not up. */
946 } else {
947 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
948 }
949 goto exit;
950 } else if (retval == 0) {
951 error = EPROTO;
952 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
953 goto exit;
954 }
955
956 if (line[0] != '0' && line[0] != '1') {
957 error = EPROTO;
958 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
959 fn, line[0]);
960 goto exit;
961 }
962 netdev_dev->carrier = line[0] != '0';
963 netdev_dev->cache_valid |= VALID_CARRIER;
964 }
965 *carrier = netdev_dev->carrier;
966 error = 0;
967
968 exit:
969 if (fd >= 0) {
970 close(fd);
971 }
972 free(fn);
973 return error;
974 }
975
976 /* Check whether we can we use RTM_GETLINK to get network device statistics.
977 * In pre-2.6.19 kernels, this was only available if wireless extensions were
978 * enabled. */
979 static bool
980 check_for_working_netlink_stats(void)
981 {
982 /* Decide on the netdev_get_stats() implementation to use. Netlink is
983 * preferable, so if that works, we'll use it. */
984 int ifindex = do_get_ifindex("lo");
985 if (ifindex < 0) {
986 VLOG_WARN("failed to get ifindex for lo, "
987 "obtaining netdev stats from proc");
988 return false;
989 } else {
990 struct netdev_stats stats;
991 int error = get_stats_via_netlink(ifindex, &stats);
992 if (!error) {
993 VLOG_DBG("obtaining netdev stats via rtnetlink");
994 return true;
995 } else {
996 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
997 "via proc (you are probably running a pre-2.6.19 "
998 "kernel)", strerror(error));
999 return false;
1000 }
1001 }
1002 }
1003
1004 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1005 static void
1006 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1007 {
1008 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1009 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1010 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1011
1012 netdev_dev->is_tap = !strcmp(type, "tap");
1013 netdev_dev->is_internal = false;
1014 if (!netdev_dev->is_tap) {
1015 struct ethtool_drvinfo drvinfo;
1016 int error;
1017
1018 memset(&drvinfo, 0, sizeof drvinfo);
1019 error = netdev_linux_do_ethtool(name,
1020 (struct ethtool_cmd *)&drvinfo,
1021 ETHTOOL_GDRVINFO,
1022 "ETHTOOL_GDRVINFO");
1023
1024 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1025 netdev_dev->is_internal = true;
1026 }
1027 }
1028
1029 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1030 }
1031 }
1032
/* Exchanges the values of '*a' and '*b'.
 *
 * A temporary is used rather than the XOR trick so that the function is also
 * correct when 'a' and 'b' point to the same object (XOR swapping an object
 * with itself would zero it). */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;

    *a = *b;
    *b = tmp;
}
1040
1041 /* Retrieves current device stats for 'netdev'. */
1042 static int
1043 netdev_linux_get_stats(const struct netdev *netdev_,
1044 struct netdev_stats *stats)
1045 {
1046 struct netdev_dev_linux *netdev_dev =
1047 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1048 static int use_netlink_stats = -1;
1049 int error;
1050
1051 COVERAGE_INC(netdev_get_stats);
1052
1053 if (netdev_dev->have_vport_stats ||
1054 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1055
1056 error = netdev_vport_get_stats(netdev_, stats);
1057 netdev_dev->have_vport_stats = !error;
1058 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1059 }
1060
1061 if (!netdev_dev->have_vport_stats) {
1062 if (use_netlink_stats < 0) {
1063 use_netlink_stats = check_for_working_netlink_stats();
1064 }
1065 if (use_netlink_stats) {
1066 int ifindex;
1067
1068 error = get_ifindex(netdev_, &ifindex);
1069 if (!error) {
1070 error = get_stats_via_netlink(ifindex, stats);
1071 }
1072 } else {
1073 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1074 }
1075 }
1076
1077 /* If this port is an internal port then the transmit and receive stats
1078 * will appear to be swapped relative to the other ports since we are the
1079 * one sending the data, not a remote computer. For consistency, we swap
1080 * them back here. This does not apply if we are getting stats from the
1081 * vport layer because it always tracks stats from the perspective of the
1082 * switch. */
1083 netdev_linux_update_is_pseudo(netdev_dev);
1084 if (!error && !netdev_dev->have_vport_stats &&
1085 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1086 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1087 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1088 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1089 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1090 stats->rx_length_errors = 0;
1091 stats->rx_over_errors = 0;
1092 stats->rx_crc_errors = 0;
1093 stats->rx_frame_errors = 0;
1094 stats->rx_fifo_errors = 0;
1095 stats->rx_missed_errors = 0;
1096 stats->tx_aborted_errors = 0;
1097 stats->tx_carrier_errors = 0;
1098 stats->tx_fifo_errors = 0;
1099 stats->tx_heartbeat_errors = 0;
1100 stats->tx_window_errors = 0;
1101 }
1102
1103 return error;
1104 }
1105
1106 /* Stores the features supported by 'netdev' into each of '*current',
1107 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1108 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1109 * successful, otherwise a positive errno value. */
1110 static int
1111 netdev_linux_get_features(struct netdev *netdev,
1112 uint32_t *current, uint32_t *advertised,
1113 uint32_t *supported, uint32_t *peer)
1114 {
1115 struct ethtool_cmd ecmd;
1116 int error;
1117
1118 memset(&ecmd, 0, sizeof ecmd);
1119 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1120 ETHTOOL_GSET, "ETHTOOL_GSET");
1121 if (error) {
1122 return error;
1123 }
1124
1125 /* Supported features. */
1126 *supported = 0;
1127 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1128 *supported |= OFPPF_10MB_HD;
1129 }
1130 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1131 *supported |= OFPPF_10MB_FD;
1132 }
1133 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1134 *supported |= OFPPF_100MB_HD;
1135 }
1136 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1137 *supported |= OFPPF_100MB_FD;
1138 }
1139 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1140 *supported |= OFPPF_1GB_HD;
1141 }
1142 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1143 *supported |= OFPPF_1GB_FD;
1144 }
1145 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1146 *supported |= OFPPF_10GB_FD;
1147 }
1148 if (ecmd.supported & SUPPORTED_TP) {
1149 *supported |= OFPPF_COPPER;
1150 }
1151 if (ecmd.supported & SUPPORTED_FIBRE) {
1152 *supported |= OFPPF_FIBER;
1153 }
1154 if (ecmd.supported & SUPPORTED_Autoneg) {
1155 *supported |= OFPPF_AUTONEG;
1156 }
1157 if (ecmd.supported & SUPPORTED_Pause) {
1158 *supported |= OFPPF_PAUSE;
1159 }
1160 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1161 *supported |= OFPPF_PAUSE_ASYM;
1162 }
1163
1164 /* Advertised features. */
1165 *advertised = 0;
1166 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1167 *advertised |= OFPPF_10MB_HD;
1168 }
1169 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1170 *advertised |= OFPPF_10MB_FD;
1171 }
1172 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1173 *advertised |= OFPPF_100MB_HD;
1174 }
1175 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1176 *advertised |= OFPPF_100MB_FD;
1177 }
1178 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1179 *advertised |= OFPPF_1GB_HD;
1180 }
1181 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1182 *advertised |= OFPPF_1GB_FD;
1183 }
1184 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1185 *advertised |= OFPPF_10GB_FD;
1186 }
1187 if (ecmd.advertising & ADVERTISED_TP) {
1188 *advertised |= OFPPF_COPPER;
1189 }
1190 if (ecmd.advertising & ADVERTISED_FIBRE) {
1191 *advertised |= OFPPF_FIBER;
1192 }
1193 if (ecmd.advertising & ADVERTISED_Autoneg) {
1194 *advertised |= OFPPF_AUTONEG;
1195 }
1196 if (ecmd.advertising & ADVERTISED_Pause) {
1197 *advertised |= OFPPF_PAUSE;
1198 }
1199 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1200 *advertised |= OFPPF_PAUSE_ASYM;
1201 }
1202
1203 /* Current settings. */
1204 if (ecmd.speed == SPEED_10) {
1205 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1206 } else if (ecmd.speed == SPEED_100) {
1207 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1208 } else if (ecmd.speed == SPEED_1000) {
1209 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1210 } else if (ecmd.speed == SPEED_10000) {
1211 *current = OFPPF_10GB_FD;
1212 } else {
1213 *current = 0;
1214 }
1215
1216 if (ecmd.port == PORT_TP) {
1217 *current |= OFPPF_COPPER;
1218 } else if (ecmd.port == PORT_FIBRE) {
1219 *current |= OFPPF_FIBER;
1220 }
1221
1222 if (ecmd.autoneg) {
1223 *current |= OFPPF_AUTONEG;
1224 }
1225
1226 /* Peer advertisements. */
1227 *peer = 0; /* XXX */
1228
1229 return 0;
1230 }
1231
1232 /* Set the features advertised by 'netdev' to 'advertise'. */
1233 static int
1234 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1235 {
1236 struct ethtool_cmd ecmd;
1237 int error;
1238
1239 memset(&ecmd, 0, sizeof ecmd);
1240 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1241 ETHTOOL_GSET, "ETHTOOL_GSET");
1242 if (error) {
1243 return error;
1244 }
1245
1246 ecmd.advertising = 0;
1247 if (advertise & OFPPF_10MB_HD) {
1248 ecmd.advertising |= ADVERTISED_10baseT_Half;
1249 }
1250 if (advertise & OFPPF_10MB_FD) {
1251 ecmd.advertising |= ADVERTISED_10baseT_Full;
1252 }
1253 if (advertise & OFPPF_100MB_HD) {
1254 ecmd.advertising |= ADVERTISED_100baseT_Half;
1255 }
1256 if (advertise & OFPPF_100MB_FD) {
1257 ecmd.advertising |= ADVERTISED_100baseT_Full;
1258 }
1259 if (advertise & OFPPF_1GB_HD) {
1260 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1261 }
1262 if (advertise & OFPPF_1GB_FD) {
1263 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1264 }
1265 if (advertise & OFPPF_10GB_FD) {
1266 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1267 }
1268 if (advertise & OFPPF_COPPER) {
1269 ecmd.advertising |= ADVERTISED_TP;
1270 }
1271 if (advertise & OFPPF_FIBER) {
1272 ecmd.advertising |= ADVERTISED_FIBRE;
1273 }
1274 if (advertise & OFPPF_AUTONEG) {
1275 ecmd.advertising |= ADVERTISED_Autoneg;
1276 }
1277 if (advertise & OFPPF_PAUSE) {
1278 ecmd.advertising |= ADVERTISED_Pause;
1279 }
1280 if (advertise & OFPPF_PAUSE_ASYM) {
1281 ecmd.advertising |= ADVERTISED_Asym_Pause;
1282 }
1283 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1284 ETHTOOL_SSET, "ETHTOOL_SSET");
1285 }
1286
1287 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1288 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1289 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1290 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1291 * sets '*vlan_vid' to -1. */
1292 static int
1293 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1294 {
1295 const char *netdev_name = netdev_get_name(netdev);
1296 struct ds line = DS_EMPTY_INITIALIZER;
1297 FILE *stream = NULL;
1298 int error;
1299 char *fn;
1300
1301 COVERAGE_INC(netdev_get_vlan_vid);
1302 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1303 stream = fopen(fn, "r");
1304 if (!stream) {
1305 error = errno;
1306 goto done;
1307 }
1308
1309 if (ds_get_line(&line, stream)) {
1310 if (ferror(stream)) {
1311 error = errno;
1312 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1313 } else {
1314 error = EPROTO;
1315 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1316 }
1317 goto done;
1318 }
1319
1320 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1321 error = EPROTO;
1322 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1323 fn, ds_cstr(&line));
1324 goto done;
1325 }
1326
1327 error = 0;
1328
1329 done:
1330 free(fn);
1331 if (stream) {
1332 fclose(stream);
1333 }
1334 ds_destroy(&line);
1335 if (error) {
1336 *vlan_vid = -1;
1337 }
1338 return error;
1339 }
1340
/* Shell command templates used to configure ingress policing with tc(8).
 *
 * POLICE_ADD_CMD takes the device name.  POLICE_CONFIG_CMD takes the device
 * name followed by the rate (in kbit/s) and the burst size, both of which are
 * uint32_t values and therefore formatted as unsigned. */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %"PRIu32"kbit burst %"PRIu32"k mtu 65535 drop flowid :1"
1343
1344 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1345 * positive errno value.
1346 *
1347 * This function is equivalent to running
1348 * /sbin/tc qdisc del dev %s handle ffff: ingress
1349 * but it is much, much faster.
1350 */
1351 static int
1352 netdev_linux_remove_policing(struct netdev *netdev)
1353 {
1354 struct netdev_dev_linux *netdev_dev =
1355 netdev_dev_linux_cast(netdev_get_dev(netdev));
1356 const char *netdev_name = netdev_get_name(netdev);
1357
1358 struct ofpbuf request;
1359 struct tcmsg *tcmsg;
1360 int error;
1361
1362 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1363 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1364 tcmsg->tcm_parent = TC_H_INGRESS;
1365 nl_msg_put_string(&request, TCA_KIND, "ingress");
1366 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1367
1368 error = tc_transact(&request, NULL);
1369 if (error && error != ENOENT && error != EINVAL) {
1370 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1371 netdev_name, strerror(error));
1372 return error;
1373 }
1374
1375 netdev_dev->kbits_rate = 0;
1376 netdev_dev->kbits_burst = 0;
1377 netdev_dev->cache_valid |= VALID_POLICING;
1378 return 0;
1379 }
1380
1381 /* Attempts to set input rate limiting (policing) policy. */
1382 static int
1383 netdev_linux_set_policing(struct netdev *netdev,
1384 uint32_t kbits_rate, uint32_t kbits_burst)
1385 {
1386 struct netdev_dev_linux *netdev_dev =
1387 netdev_dev_linux_cast(netdev_get_dev(netdev));
1388 const char *netdev_name = netdev_get_name(netdev);
1389 char command[1024];
1390
1391 COVERAGE_INC(netdev_set_policing);
1392
1393 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1394 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1395 : kbits_burst); /* Stick with user-specified value. */
1396
1397 if (netdev_dev->cache_valid & VALID_POLICING
1398 && netdev_dev->kbits_rate == kbits_rate
1399 && netdev_dev->kbits_burst == kbits_burst) {
1400 /* Assume that settings haven't changed since we last set them. */
1401 return 0;
1402 }
1403
1404 netdev_linux_remove_policing(netdev);
1405 if (kbits_rate) {
1406 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1407 if (system(command) != 0) {
1408 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1409 return -1;
1410 }
1411
1412 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1413 kbits_rate, kbits_burst);
1414 if (system(command) != 0) {
1415 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1416 netdev_name);
1417 return -1;
1418 }
1419
1420 netdev_dev->kbits_rate = kbits_rate;
1421 netdev_dev->kbits_burst = kbits_burst;
1422 netdev_dev->cache_valid |= VALID_POLICING;
1423 }
1424
1425 return 0;
1426 }
1427
1428 static int
1429 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1430 struct svec *types)
1431 {
1432 const struct tc_ops **opsp;
1433
1434 for (opsp = tcs; *opsp != NULL; opsp++) {
1435 const struct tc_ops *ops = *opsp;
1436 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1437 svec_add(types, ops->ovs_name);
1438 }
1439 }
1440 return 0;
1441 }
1442
1443 static const struct tc_ops *
1444 tc_lookup_ovs_name(const char *name)
1445 {
1446 const struct tc_ops **opsp;
1447
1448 for (opsp = tcs; *opsp != NULL; opsp++) {
1449 const struct tc_ops *ops = *opsp;
1450 if (!strcmp(name, ops->ovs_name)) {
1451 return ops;
1452 }
1453 }
1454 return NULL;
1455 }
1456
1457 static const struct tc_ops *
1458 tc_lookup_linux_name(const char *name)
1459 {
1460 const struct tc_ops **opsp;
1461
1462 for (opsp = tcs; *opsp != NULL; opsp++) {
1463 const struct tc_ops *ops = *opsp;
1464 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1465 return ops;
1466 }
1467 }
1468 return NULL;
1469 }
1470
1471 static int
1472 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1473 const char *type,
1474 struct netdev_qos_capabilities *caps)
1475 {
1476 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1477 if (!ops) {
1478 return EOPNOTSUPP;
1479 }
1480 caps->n_queues = ops->n_queues;
1481 return 0;
1482 }
1483
1484 static int
1485 netdev_linux_get_qos(const struct netdev *netdev,
1486 const char **typep, struct shash *details)
1487 {
1488 struct netdev_dev_linux *netdev_dev =
1489 netdev_dev_linux_cast(netdev_get_dev(netdev));
1490 int error;
1491
1492 error = tc_query_qdisc(netdev);
1493 if (error) {
1494 return error;
1495 }
1496
1497 *typep = netdev_dev->tc->ops->ovs_name;
1498 return (netdev_dev->tc->ops->qdisc_get
1499 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1500 : 0);
1501 }
1502
1503 static int
1504 netdev_linux_set_qos(struct netdev *netdev,
1505 const char *type, const struct shash *details)
1506 {
1507 struct netdev_dev_linux *netdev_dev =
1508 netdev_dev_linux_cast(netdev_get_dev(netdev));
1509 const struct tc_ops *new_ops;
1510 int error;
1511
1512 new_ops = tc_lookup_ovs_name(type);
1513 if (!new_ops || !new_ops->tc_install) {
1514 return EOPNOTSUPP;
1515 }
1516
1517 error = tc_query_qdisc(netdev);
1518 if (error) {
1519 return error;
1520 }
1521
1522 if (new_ops == netdev_dev->tc->ops) {
1523 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1524 } else {
1525 /* Delete existing qdisc. */
1526 error = tc_del_qdisc(netdev);
1527 if (error) {
1528 return error;
1529 }
1530 assert(netdev_dev->tc == NULL);
1531
1532 /* Install new qdisc. */
1533 error = new_ops->tc_install(netdev, details);
1534 assert((error == 0) == (netdev_dev->tc != NULL));
1535
1536 return error;
1537 }
1538 }
1539
1540 static int
1541 netdev_linux_get_queue(const struct netdev *netdev,
1542 unsigned int queue_id, struct shash *details)
1543 {
1544 struct netdev_dev_linux *netdev_dev =
1545 netdev_dev_linux_cast(netdev_get_dev(netdev));
1546 int error;
1547
1548 error = tc_query_qdisc(netdev);
1549 if (error) {
1550 return error;
1551 } else if (queue_id > UINT16_MAX
1552 || !port_array_get(&netdev_dev->tc->queues, queue_id)) {
1553 return ENOENT;
1554 }
1555
1556 return netdev_dev->tc->ops->class_get(netdev, queue_id, details);
1557 }
1558
1559 static int
1560 netdev_linux_set_queue(struct netdev *netdev,
1561 unsigned int queue_id, const struct shash *details)
1562 {
1563 struct netdev_dev_linux *netdev_dev =
1564 netdev_dev_linux_cast(netdev_get_dev(netdev));
1565 int error;
1566
1567 error = tc_query_qdisc(netdev);
1568 if (error) {
1569 return error;
1570 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1571 || !netdev_dev->tc->ops->class_set) {
1572 return EINVAL;
1573 }
1574
1575 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1576 }
1577
1578 static int
1579 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1580 {
1581 struct netdev_dev_linux *netdev_dev =
1582 netdev_dev_linux_cast(netdev_get_dev(netdev));
1583 int error;
1584
1585 error = tc_query_qdisc(netdev);
1586 if (error) {
1587 return error;
1588 } else if (!netdev_dev->tc->ops->class_delete) {
1589 return EINVAL;
1590 } else if (queue_id > UINT16_MAX
1591 || !port_array_get(&netdev_dev->tc->queues, queue_id)) {
1592 return ENOENT;
1593 }
1594
1595 return netdev_dev->tc->ops->class_delete(netdev, queue_id);
1596 }
1597
1598 static int
1599 netdev_linux_get_queue_stats(const struct netdev *netdev,
1600 unsigned int queue_id,
1601 struct netdev_queue_stats *stats)
1602 {
1603 struct netdev_dev_linux *netdev_dev =
1604 netdev_dev_linux_cast(netdev_get_dev(netdev));
1605 int error;
1606
1607 error = tc_query_qdisc(netdev);
1608 if (error) {
1609 return error;
1610 } else if (queue_id > UINT16_MAX
1611 || !port_array_get(&netdev_dev->tc->queues, queue_id)) {
1612 return ENOENT;
1613 } else if (!netdev_dev->tc->ops->class_get_stats) {
1614 return EOPNOTSUPP;
1615 }
1616
1617 return netdev_dev->tc->ops->class_get_stats(netdev, queue_id, stats);
1618 }
1619
1620 static void
1621 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1622 {
1623 struct ofpbuf request;
1624 struct tcmsg *tcmsg;
1625
1626 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1627 tcmsg->tcm_parent = 0;
1628 nl_dump_start(dump, rtnl_sock, &request);
1629 ofpbuf_uninit(&request);
1630 }
1631
1632 static int
1633 netdev_linux_dump_queues(const struct netdev *netdev,
1634 netdev_dump_queues_cb *cb, void *aux)
1635 {
1636 struct netdev_dev_linux *netdev_dev =
1637 netdev_dev_linux_cast(netdev_get_dev(netdev));
1638 unsigned int queue_id;
1639 struct shash details;
1640 int last_error;
1641 void *queue;
1642 int error;
1643
1644 error = tc_query_qdisc(netdev);
1645 if (error) {
1646 return error;
1647 } else if (!netdev_dev->tc->ops->class_get) {
1648 return EOPNOTSUPP;
1649 }
1650
1651 last_error = 0;
1652 shash_init(&details);
1653 PORT_ARRAY_FOR_EACH (queue, &netdev_dev->tc->queues, queue_id) {
1654 shash_clear(&details);
1655
1656 error = netdev_dev->tc->ops->class_get(netdev, queue_id, &details);
1657 if (!error) {
1658 (*cb)(queue_id, &details, aux);
1659 } else {
1660 last_error = error;
1661 }
1662 }
1663 shash_destroy(&details);
1664
1665 return last_error;
1666 }
1667
1668 static int
1669 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1670 netdev_dump_queue_stats_cb *cb, void *aux)
1671 {
1672 struct netdev_dev_linux *netdev_dev =
1673 netdev_dev_linux_cast(netdev_get_dev(netdev));
1674 struct nl_dump dump;
1675 struct ofpbuf msg;
1676 int last_error;
1677 int error;
1678
1679 error = tc_query_qdisc(netdev);
1680 if (error) {
1681 return error;
1682 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1683 return EOPNOTSUPP;
1684 }
1685
1686 last_error = 0;
1687 start_queue_dump(netdev, &dump);
1688 while (nl_dump_next(&dump, &msg)) {
1689 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1690 if (error) {
1691 last_error = error;
1692 }
1693 }
1694
1695 error = nl_dump_done(&dump);
1696 return error ? error : last_error;
1697 }
1698
1699 static int
1700 netdev_linux_get_in4(const struct netdev *netdev_,
1701 struct in_addr *address, struct in_addr *netmask)
1702 {
1703 struct netdev_dev_linux *netdev_dev =
1704 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1705
1706 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1707 int error;
1708
1709 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1710 SIOCGIFADDR, "SIOCGIFADDR");
1711 if (error) {
1712 return error;
1713 }
1714
1715 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1716 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1717 if (error) {
1718 return error;
1719 }
1720
1721 netdev_dev->cache_valid |= VALID_IN4;
1722 }
1723 *address = netdev_dev->address;
1724 *netmask = netdev_dev->netmask;
1725 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1726 }
1727
1728 static int
1729 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1730 struct in_addr netmask)
1731 {
1732 struct netdev_dev_linux *netdev_dev =
1733 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1734 int error;
1735
1736 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1737 if (!error) {
1738 netdev_dev->cache_valid |= VALID_IN4;
1739 netdev_dev->address = address;
1740 netdev_dev->netmask = netmask;
1741 if (address.s_addr != INADDR_ANY) {
1742 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1743 "SIOCSIFNETMASK", netmask);
1744 }
1745 }
1746 return error;
1747 }
1748
/* Parses 'line', one line in the format of /proc/net/if_inet6: 16 address
 * bytes written as 32 hex digits, four further hex fields (which are
 * ignored), and an interface name of at most 16 characters.
 *
 * On success stores the address in '*in6' and the interface name in 'ifname'
 * and returns true.  Returns false if the line does not match; 'in6' and
 * 'ifname' may have been partially written in that case. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
#define X8 "%2"SCNx8
    uint8_t *octet = in6->s6_addr;
    int n_converted;

    /* 16 address bytes plus the interface name make 17 conversions; the
     * four suppressed fields in between are not counted by sscanf(). */
    n_converted = sscanf(line,
                         " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         octet + 0, octet + 1, octet + 2, octet + 3,
                         octet + 4, octet + 5, octet + 6, octet + 7,
                         octet + 8, octet + 9, octet + 10, octet + 11,
                         octet + 12, octet + 13, octet + 14, octet + 15,
                         ifname);
    return n_converted == 17;
}
1764
1765 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1766 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1767 static int
1768 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1769 {
1770 struct netdev_dev_linux *netdev_dev =
1771 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1772 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1773 FILE *file;
1774 char line[128];
1775
1776 netdev_dev->in6 = in6addr_any;
1777
1778 file = fopen("/proc/net/if_inet6", "r");
1779 if (file != NULL) {
1780 const char *name = netdev_get_name(netdev_);
1781 while (fgets(line, sizeof line, file)) {
1782 struct in6_addr in6_tmp;
1783 char ifname[16 + 1];
1784 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1785 && !strcmp(name, ifname))
1786 {
1787 netdev_dev->in6 = in6_tmp;
1788 break;
1789 }
1790 }
1791 fclose(file);
1792 }
1793 netdev_dev->cache_valid |= VALID_IN6;
1794 }
1795 *in6 = netdev_dev->in6;
1796 return 0;
1797 }
1798
/* Initializes '*sa' as an AF_INET socket address for 'addr' with port 0.
 * Every other byte of '*sa' is zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Build the IPv4 form in a properly typed temporary... */
    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    /* ...then copy it over the cleared generic structure. */
    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
1811
1812 static int
1813 do_set_addr(struct netdev *netdev,
1814 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1815 {
1816 struct ifreq ifr;
1817 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1818 make_in4_sockaddr(&ifr.ifr_addr, addr);
1819
1820 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1821 ioctl_name);
1822 }
1823
1824 /* Adds 'router' as a default IP gateway. */
1825 static int
1826 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1827 {
1828 struct in_addr any = { INADDR_ANY };
1829 struct rtentry rt;
1830 int error;
1831
1832 memset(&rt, 0, sizeof rt);
1833 make_in4_sockaddr(&rt.rt_dst, any);
1834 make_in4_sockaddr(&rt.rt_gateway, router);
1835 make_in4_sockaddr(&rt.rt_genmask, any);
1836 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1837 COVERAGE_INC(netdev_add_router);
1838 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1839 if (error) {
1840 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1841 }
1842 return error;
1843 }
1844
1845 static int
1846 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1847 char **netdev_name)
1848 {
1849 static const char fn[] = "/proc/net/route";
1850 FILE *stream;
1851 char line[256];
1852 int ln;
1853
1854 *netdev_name = NULL;
1855 stream = fopen(fn, "r");
1856 if (stream == NULL) {
1857 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1858 return errno;
1859 }
1860
1861 ln = 0;
1862 while (fgets(line, sizeof line, stream)) {
1863 if (++ln >= 2) {
1864 char iface[17];
1865 uint32_t dest, gateway, mask;
1866 int refcnt, metric, mtu;
1867 unsigned int flags, use, window, irtt;
1868
1869 if (sscanf(line,
1870 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1871 " %d %u %u\n",
1872 iface, &dest, &gateway, &flags, &refcnt,
1873 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1874
1875 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
1876 fn, ln, line);
1877 continue;
1878 }
1879 if (!(flags & RTF_UP)) {
1880 /* Skip routes that aren't up. */
1881 continue;
1882 }
1883
1884 /* The output of 'dest', 'mask', and 'gateway' were given in
1885 * network byte order, so we don't need need any endian
1886 * conversions here. */
1887 if ((dest & mask) == (host->s_addr & mask)) {
1888 if (!gateway) {
1889 /* The host is directly reachable. */
1890 next_hop->s_addr = 0;
1891 } else {
1892 /* To reach the host, we must go through a gateway. */
1893 next_hop->s_addr = gateway;
1894 }
1895 *netdev_name = xstrdup(iface);
1896 fclose(stream);
1897 return 0;
1898 }
1899 }
1900 }
1901
1902 fclose(stream);
1903 return ENXIO;
1904 }
1905
1906 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
1907 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
1908 * returns 0. Otherwise, it returns a positive errno value; in particular,
1909 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
1910 static int
1911 netdev_linux_arp_lookup(const struct netdev *netdev,
1912 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
1913 {
1914 struct arpreq r;
1915 struct sockaddr_in sin;
1916 int retval;
1917
1918 memset(&r, 0, sizeof r);
1919 sin.sin_family = AF_INET;
1920 sin.sin_addr.s_addr = ip;
1921 sin.sin_port = 0;
1922 memcpy(&r.arp_pa, &sin, sizeof sin);
1923 r.arp_ha.sa_family = ARPHRD_ETHER;
1924 r.arp_flags = 0;
1925 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
1926 COVERAGE_INC(netdev_arp_lookup);
1927 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
1928 if (!retval) {
1929 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
1930 } else if (retval != ENXIO) {
1931 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
1932 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
1933 }
1934 return retval;
1935 }
1936
1937 static int
1938 nd_to_iff_flags(enum netdev_flags nd)
1939 {
1940 int iff = 0;
1941 if (nd & NETDEV_UP) {
1942 iff |= IFF_UP;
1943 }
1944 if (nd & NETDEV_PROMISC) {
1945 iff |= IFF_PROMISC;
1946 }
1947 return iff;
1948 }
1949
1950 static int
1951 iff_to_nd_flags(int iff)
1952 {
1953 enum netdev_flags nd = 0;
1954 if (iff & IFF_UP) {
1955 nd |= NETDEV_UP;
1956 }
1957 if (iff & IFF_PROMISC) {
1958 nd |= NETDEV_PROMISC;
1959 }
1960 return nd;
1961 }
1962
1963 static int
1964 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
1965 enum netdev_flags on, enum netdev_flags *old_flagsp)
1966 {
1967 int old_flags, new_flags;
1968 int error;
1969
1970 error = get_flags(netdev, &old_flags);
1971 if (!error) {
1972 *old_flagsp = iff_to_nd_flags(old_flags);
1973 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
1974 if (new_flags != old_flags) {
1975 error = set_flags(netdev, new_flags);
1976 }
1977 }
1978 return error;
1979 }
1980
1981 static void
1982 poll_notify(struct list *list)
1983 {
1984 struct netdev_linux_notifier *notifier;
1985 LIST_FOR_EACH (notifier, struct netdev_linux_notifier, node, list) {
1986 struct netdev_notifier *n = &notifier->notifier;
1987 n->cb(n);
1988 }
1989 }
1990
1991 static void
1992 netdev_linux_poll_cb(const struct rtnetlink_change *change,
1993 void *aux OVS_UNUSED)
1994 {
1995 if (change) {
1996 struct list *list = shash_find_data(&netdev_linux_notifiers,
1997 change->ifname);
1998 if (list) {
1999 poll_notify(list);
2000 }
2001 } else {
2002 struct shash_node *node;
2003 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2004 poll_notify(node->data);
2005 }
2006 }
2007 }
2008
2009 static int
2010 netdev_linux_poll_add(struct netdev *netdev,
2011 void (*cb)(struct netdev_notifier *), void *aux,
2012 struct netdev_notifier **notifierp)
2013 {
2014 const char *netdev_name = netdev_get_name(netdev);
2015 struct netdev_linux_notifier *notifier;
2016 struct list *list;
2017
2018 if (shash_is_empty(&netdev_linux_notifiers)) {
2019 int error = rtnetlink_notifier_register(&netdev_linux_poll_notifier,
2020 netdev_linux_poll_cb, NULL);
2021 if (error) {
2022 return error;
2023 }
2024 }
2025
2026 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2027 if (!list) {
2028 list = xmalloc(sizeof *list);
2029 list_init(list);
2030 shash_add(&netdev_linux_notifiers, netdev_name, list);
2031 }
2032
2033 notifier = xmalloc(sizeof *notifier);
2034 netdev_notifier_init(&notifier->notifier, netdev, cb, aux);
2035 list_push_back(list, &notifier->node);
2036 *notifierp = &notifier->notifier;
2037 return 0;
2038 }
2039
2040 static void
2041 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2042 {
2043 struct netdev_linux_notifier *notifier =
2044 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2045 struct list *list;
2046
2047 /* Remove 'notifier' from its list. */
2048 list = list_remove(&notifier->node);
2049 if (list_is_empty(list)) {
2050 /* The list is now empty. Remove it from the hash and free it. */
2051 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2052 shash_delete(&netdev_linux_notifiers,
2053 shash_find(&netdev_linux_notifiers, netdev_name));
2054 free(list);
2055 }
2056 free(notifier);
2057
2058 /* If that was the last notifier, unregister. */
2059 if (shash_is_empty(&netdev_linux_notifiers)) {
2060 rtnetlink_notifier_unregister(&netdev_linux_poll_notifier);
2061 }
2062 }
2063
/* Provider function table for "system" network devices.
 *
 * The initializer is positional, so the order of the entries below must match
 * the member order of struct netdev_class, which is declared outside this
 * file (presumably in netdev-provider.h -- confirm).  A NULL entry leaves the
 * corresponding operation unimplemented for this class. */
const struct netdev_class netdev_linux_class = {
    "system",                   /* Presumably the class's type name. */

    netdev_linux_init,
    netdev_linux_run,
    netdev_linux_wait,

    netdev_linux_create_system,
    netdev_linux_destroy,
    NULL, /* reconfigure */

    netdev_linux_open,
    netdev_linux_close,

    netdev_linux_enumerate,

    netdev_linux_recv,
    netdev_linux_recv_wait,
    netdev_linux_drain,

    netdev_linux_send,
    netdev_linux_send_wait,

    netdev_linux_set_etheraddr,
    netdev_linux_get_etheraddr,
    netdev_linux_get_mtu,
    netdev_linux_get_ifindex,
    netdev_linux_get_carrier,
    netdev_linux_get_stats,
    netdev_vport_set_stats,

    netdev_linux_get_features,
    netdev_linux_set_advertisements,
    netdev_linux_get_vlan_vid,

    netdev_linux_set_policing,
    netdev_linux_get_qos_types,
    netdev_linux_get_qos_capabilities,
    netdev_linux_get_qos,
    netdev_linux_set_qos,
    netdev_linux_get_queue,
    netdev_linux_set_queue,
    netdev_linux_delete_queue,
    netdev_linux_get_queue_stats,
    netdev_linux_dump_queues,
    netdev_linux_dump_queue_stats,

    netdev_linux_get_in4,
    netdev_linux_set_in4,
    netdev_linux_get_in6,
    netdev_linux_add_router,
    netdev_linux_get_next_hop,
    netdev_linux_arp_lookup,

    netdev_linux_update_flags,

    netdev_linux_poll_add,
    netdev_linux_poll_remove,
};
2123
2124 const struct netdev_class netdev_tap_class = {
2125 "tap",
2126
2127 netdev_linux_init,
2128 netdev_linux_run,
2129 netdev_linux_wait,
2130
2131 netdev_linux_create_tap,
2132 netdev_linux_destroy,
2133 NULL, /* reconfigure */
2134
2135 netdev_linux_open,
2136 netdev_linux_close,
2137
2138 NULL, /* enumerate */
2139
2140 netdev_linux_recv,
2141 netdev_linux_recv_wait,
2142 netdev_linux_drain,
2143
2144 netdev_linux_send,
2145 netdev_linux_send_wait,
2146
2147 netdev_linux_set_etheraddr,
2148 netdev_linux_get_etheraddr,
2149 netdev_linux_get_mtu,
2150 netdev_linux_get_ifindex,
2151 netdev_linux_get_carrier,
2152 netdev_linux_get_stats,
2153 NULL, /* set_stats */
2154
2155 netdev_linux_get_features,
2156 netdev_linux_set_advertisements,
2157 netdev_linux_get_vlan_vid,
2158
2159 netdev_linux_set_policing,
2160 netdev_linux_get_qos_types,
2161 netdev_linux_get_qos_capabilities,
2162 netdev_linux_get_qos,
2163 netdev_linux_set_qos,
2164 netdev_linux_get_queue,
2165 netdev_linux_set_queue,
2166 netdev_linux_delete_queue,
2167 netdev_linux_get_queue_stats,
2168 netdev_linux_dump_queues,
2169 netdev_linux_dump_queue_stats,
2170
2171 netdev_linux_get_in4,
2172 netdev_linux_set_in4,
2173 netdev_linux_get_in6,
2174 netdev_linux_add_router,
2175 netdev_linux_get_next_hop,
2176 netdev_linux_arp_lookup,
2177
2178 netdev_linux_update_flags,
2179
2180 netdev_linux_poll_add,
2181 netdev_linux_poll_remove,
2182 };
2183 \f
2184 /* HTB traffic control class. */
2185
/* Number of queues advertised for HTB (the n_queues entry of tc_ops_htb).
 * It is also the largest class minor number that the HTB code accepts as a
 * queue: queue 'q' is represented by class 1:(q + 1). */
#define HTB_N_QUEUES 0xf000
2187
2188 struct htb {
2189 struct tc tc;
2190 unsigned int max_rate; /* In bytes/s. */
2191 };
2192
/* Parameters of a single HTB class. */
struct htb_class {
    unsigned int min_rate;      /* Rate, in bytes/s. */
    unsigned int max_rate;      /* Ceiling, in bytes/s. */
    unsigned int burst;         /* Burst size, in bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
2199
2200 static struct htb *
2201 htb_get__(const struct netdev *netdev)
2202 {
2203 struct netdev_dev_linux *netdev_dev =
2204 netdev_dev_linux_cast(netdev_get_dev(netdev));
2205 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2206 }
2207
2208 static struct htb *
2209 htb_install__(struct netdev *netdev, uint64_t max_rate)
2210 {
2211 struct netdev_dev_linux *netdev_dev =
2212 netdev_dev_linux_cast(netdev_get_dev(netdev));
2213 struct htb *htb;
2214
2215 htb = xmalloc(sizeof *htb);
2216 tc_init(&htb->tc, &tc_ops_htb);
2217 htb->max_rate = max_rate;
2218
2219 netdev_dev->tc = &htb->tc;
2220
2221 return htb;
2222 }
2223
2224 /* Create an HTB qdisc.
2225 *
2226 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default
2227 * 0". */
2228 static int
2229 htb_setup_qdisc__(struct netdev *netdev)
2230 {
2231 size_t opt_offset;
2232 struct tc_htb_glob opt;
2233 struct ofpbuf request;
2234 struct tcmsg *tcmsg;
2235
2236 tc_del_qdisc(netdev);
2237
2238 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2239 NLM_F_EXCL | NLM_F_CREATE, &request);
2240 tcmsg->tcm_handle = tc_make_handle(1, 0);
2241 tcmsg->tcm_parent = TC_H_ROOT;
2242
2243 nl_msg_put_string(&request, TCA_KIND, "htb");
2244
2245 memset(&opt, 0, sizeof opt);
2246 opt.rate2quantum = 10;
2247 opt.version = 3;
2248 opt.defcls = 0;
2249
2250 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2251 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2252 nl_msg_end_nested(&request, opt_offset);
2253
2254 return tc_transact(&request, NULL);
2255 }
2256
2257 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2258 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2259 static int
2260 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2261 unsigned int parent, struct htb_class *class)
2262 {
2263 size_t opt_offset;
2264 struct tc_htb_opt opt;
2265 struct ofpbuf request;
2266 struct tcmsg *tcmsg;
2267 int error;
2268 int mtu;
2269
2270 netdev_get_mtu(netdev, &mtu);
2271
2272 memset(&opt, 0, sizeof opt);
2273 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2274 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2275 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2276 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2277 opt.prio = class->priority;
2278
2279 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2280 tcmsg->tcm_handle = handle;
2281 tcmsg->tcm_parent = parent;
2282
2283 nl_msg_put_string(&request, TCA_KIND, "htb");
2284 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2285 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2286 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2287 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2288 nl_msg_end_nested(&request, opt_offset);
2289
2290 error = tc_transact(&request, NULL);
2291 if (error) {
2292 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2293 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2294 netdev_get_name(netdev),
2295 tc_get_major(handle), tc_get_minor(handle),
2296 tc_get_major(parent), tc_get_minor(parent),
2297 class->min_rate, class->max_rate,
2298 class->burst, class->priority, strerror(error));
2299 }
2300 return error;
2301 }
2302
2303 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2304 * description of them into 'details'. The description complies with the
2305 * specification given in the vswitch database documentation for linux-htb
2306 * queue details. */
2307 static int
2308 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2309 {
2310 static const struct nl_policy tca_htb_policy[] = {
2311 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2312 .min_len = sizeof(struct tc_htb_opt) },
2313 };
2314
2315 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2316 const struct tc_htb_opt *htb;
2317
2318 if (!nl_parse_nested(nl_options, tca_htb_policy,
2319 attrs, ARRAY_SIZE(tca_htb_policy))) {
2320 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2321 return EPROTO;
2322 }
2323
2324 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2325 class->min_rate = htb->rate.rate;
2326 class->max_rate = htb->ceil.rate;
2327 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2328 class->priority = htb->prio;
2329 return 0;
2330 }
2331
2332 static int
2333 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2334 struct htb_class *options,
2335 struct netdev_queue_stats *stats)
2336 {
2337 struct nlattr *nl_options;
2338 unsigned int handle;
2339 int error;
2340
2341 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2342 if (!error && queue_id) {
2343 unsigned int major = tc_get_major(handle);
2344 unsigned int minor = tc_get_minor(handle);
2345 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2346 *queue_id = minor - 1;
2347 } else {
2348 error = EPROTO;
2349 }
2350 }
2351 if (!error && options) {
2352 error = htb_parse_tca_options__(nl_options, options);
2353 }
2354 return error;
2355 }
2356
2357 static void
2358 htb_parse_qdisc_details__(struct netdev *netdev,
2359 const struct shash *details, struct htb_class *hc)
2360 {
2361 const char *max_rate_s;
2362
2363 max_rate_s = shash_find_data(details, "max-rate");
2364 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2365 if (!hc->max_rate) {
2366 uint32_t current;
2367
2368 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2369 hc->max_rate = netdev_features_to_bps(current) / 8;
2370 }
2371 hc->min_rate = hc->max_rate;
2372 hc->burst = 0;
2373 hc->priority = 0;
2374 }
2375
2376 static int
2377 htb_parse_class_details__(struct netdev *netdev,
2378 const struct shash *details, struct htb_class *hc)
2379 {
2380 const struct htb *htb = htb_get__(netdev);
2381 const char *min_rate_s = shash_find_data(details, "min-rate");
2382 const char *max_rate_s = shash_find_data(details, "max-rate");
2383 const char *burst_s = shash_find_data(details, "burst");
2384 const char *priority_s = shash_find_data(details, "priority");
2385 int mtu;
2386
2387 /* min-rate */
2388 if (!min_rate_s) {
2389 /* min-rate is required. */
2390 return EINVAL;
2391 }
2392 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2393 hc->min_rate = MAX(hc->min_rate, 0);
2394 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2395
2396 /* max-rate */
2397 hc->max_rate = (max_rate_s
2398 ? strtoull(max_rate_s, NULL, 10) / 8
2399 : htb->max_rate);
2400 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2401 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2402
2403 /* burst
2404 *
2405 * According to hints in the documentation that I've read, it is important
2406 * that 'burst' be at least as big as the largest frame that might be
2407 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2408 * but having it a bit too small is a problem. Since netdev_get_mtu()
2409 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2410 * the MTU. We actually add 64, instead of 14, as a guard against
2411 * additional headers get tacked on somewhere that we're not aware of. */
2412 netdev_get_mtu(netdev, &mtu);
2413 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2414 hc->burst = MAX(hc->burst, mtu + 64);
2415
2416 /* priority */
2417 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2418
2419 return 0;
2420 }
2421
/* Asks the kernel about class 'handle' (with parent 'parent') on 'netdev'
 * and parses the reply into 'options' and 'stats', either of which may be
 * null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2437
2438 static int
2439 htb_tc_install(struct netdev *netdev, const struct shash *details)
2440 {
2441 int error;
2442
2443 error = htb_setup_qdisc__(netdev);
2444 if (!error) {
2445 struct htb_class hc;
2446
2447 htb_parse_qdisc_details__(netdev, details, &hc);
2448 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2449 tc_make_handle(1, 0), &hc);
2450 if (!error) {
2451 htb_install__(netdev, hc.max_rate);
2452 }
2453 }
2454 return error;
2455 }
2456
2457 static void
2458 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2459 const struct htb_class *hc)
2460 {
2461 struct htb *htb = htb_get__(netdev);
2462 struct htb_class *hcp;
2463
2464 hcp = port_array_get(&htb->tc.queues, queue_id);
2465 if (!hcp) {
2466 hcp = xmalloc(sizeof *hcp);
2467 port_array_set(&htb->tc.queues, queue_id, hcp);
2468 }
2469 *hcp = *hc;
2470 }
2471
2472 static int
2473 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2474 {
2475 struct shash details = SHASH_INITIALIZER(&details);
2476 struct ofpbuf msg;
2477 struct nl_dump dump;
2478 struct htb_class hc;
2479 struct htb *htb;
2480
2481 /* Get qdisc options. */
2482 hc.max_rate = 0;
2483 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2484 htb = htb_install__(netdev, hc.max_rate);
2485
2486 /* Get queues. */
2487 start_queue_dump(netdev, &dump);
2488 shash_init(&details);
2489 while (nl_dump_next(&dump, &msg)) {
2490 unsigned int queue_id;
2491
2492 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2493 htb_update_queue__(netdev, queue_id, &hc);
2494 }
2495 }
2496 nl_dump_done(&dump);
2497
2498 return 0;
2499 }
2500
2501 static void
2502 htb_tc_destroy(struct tc *tc)
2503 {
2504 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2505 unsigned int queue_id;
2506 struct htb_class *hc;
2507
2508 PORT_ARRAY_FOR_EACH (hc, &htb->tc.queues, queue_id) {
2509 free(hc);
2510 }
2511 tc_destroy(tc);
2512 free(htb);
2513 }
2514
2515 static int
2516 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2517 {
2518 const struct htb *htb = htb_get__(netdev);
2519 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2520 return 0;
2521 }
2522
2523 static int
2524 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2525 {
2526 struct htb_class hc;
2527 int error;
2528
2529 htb_parse_qdisc_details__(netdev, details, &hc);
2530 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2531 tc_make_handle(1, 0), &hc);
2532 if (!error) {
2533 htb_get__(netdev)->max_rate = hc.max_rate;
2534 }
2535 return error;
2536 }
2537
2538 static int
2539 htb_class_get(const struct netdev *netdev, unsigned int queue_id,
2540 struct shash *details)
2541 {
2542 const struct htb *htb = htb_get__(netdev);
2543 const struct htb_class *hc;
2544
2545 hc = port_array_get(&htb->tc.queues, queue_id);
2546 assert(hc != NULL);
2547
2548 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2549 if (hc->min_rate != hc->max_rate) {
2550 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2551 }
2552 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2553 if (hc->priority) {
2554 shash_add(details, "priority", xasprintf("%u", hc->priority));
2555 }
2556 return 0;
2557 }
2558
2559 static int
2560 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2561 const struct shash *details)
2562 {
2563 struct htb_class hc;
2564 int error;
2565
2566 error = htb_parse_class_details__(netdev, details, &hc);
2567 if (error) {
2568 return error;
2569 }
2570
2571 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2572 tc_make_handle(1, 0xfffe), &hc);
2573 if (error) {
2574 return error;
2575 }
2576
2577 htb_update_queue__(netdev, queue_id, &hc);
2578 return 0;
2579 }
2580
2581 static int
2582 htb_class_delete(struct netdev *netdev, unsigned int queue_id)
2583 {
2584 struct htb *htb = htb_get__(netdev);
2585 struct htb_class *hc;
2586 int error;
2587
2588 hc = port_array_get(&htb->tc.queues, queue_id);
2589 assert(hc != NULL);
2590
2591 error = tc_delete_class(netdev, tc_make_handle(1, queue_id + 1));
2592 if (!error) {
2593 free(hc);
2594 port_array_delete(&htb->tc.queues, queue_id);
2595 }
2596 return error;
2597 }
2598
/* Retrieves the statistics of queue 'queue_id' of 'netdev' into 'stats' by
 * querying class 1:(queue_id + 1) under parent 1:fffe.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_class_get_stats(const struct netdev *netdev, unsigned int queue_id,
                    struct netdev_queue_stats *stats)
{
    unsigned int class_handle = tc_make_handle(1, queue_id + 1);
    unsigned int parent_handle = tc_make_handle(1, 0xfffe);

    return htb_query_class__(netdev, class_handle, parent_handle,
                             NULL, stats);
}
2606
2607 static int
2608 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2609 const struct ofpbuf *nlmsg,
2610 netdev_dump_queue_stats_cb *cb, void *aux)
2611 {
2612 struct netdev_queue_stats stats;
2613 unsigned int handle, major, minor;
2614 int error;
2615
2616 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2617 if (error) {
2618 return error;
2619 }
2620
2621 major = tc_get_major(handle);
2622 minor = tc_get_minor(handle);
2623 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2624 (*cb)(tc_get_minor(handle), &stats, aux);
2625 }
2626 return 0;
2627 }
2628
2629 static const struct tc_ops tc_ops_htb = {
2630 "htb", /* linux_name */
2631 "linux-htb", /* ovs_name */
2632 HTB_N_QUEUES, /* n_queues */
2633 htb_tc_install,
2634 htb_tc_load,
2635 htb_tc_destroy,
2636 htb_qdisc_get,
2637 htb_qdisc_set,
2638 htb_class_get,
2639 htb_class_set,
2640 htb_class_delete,
2641 htb_class_get_stats,
2642 htb_class_dump_stats
2643 };
2644 \f
2645 /* "linux-default" traffic control class.
2646 *
2647 * This class represents the default, unnamed Linux qdisc. It corresponds to
2648 * the "" (empty string) QoS type in the OVS database. */
2649
2650 static void
2651 default_install__(struct netdev *netdev)
2652 {
2653 struct netdev_dev_linux *netdev_dev =
2654 netdev_dev_linux_cast(netdev_get_dev(netdev));
2655 static struct tc *tc;
2656
2657 if (!tc) {
2658 tc = xmalloc(sizeof *tc);
2659 tc_init(tc, &tc_ops_default);
2660 }
2661 netdev_dev->tc = tc;
2662 }
2663
2664 static int
2665 default_tc_install(struct netdev *netdev,
2666 const struct shash *details OVS_UNUSED)
2667 {
2668 default_install__(netdev);
2669 return 0;
2670 }
2671
2672 static int
2673 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2674 {
2675 default_install__(netdev);
2676 return 0;
2677 }
2678
2679 static const struct tc_ops tc_ops_default = {
2680 NULL, /* linux_name */
2681 "", /* ovs_name */
2682 0, /* n_queues */
2683 default_tc_install,
2684 default_tc_load,
2685 NULL, /* tc_destroy */
2686 NULL, /* qdisc_get */
2687 NULL, /* qdisc_set */
2688 NULL, /* class_get */
2689 NULL, /* class_set */
2690 NULL, /* class_delete */
2691 NULL, /* class_get_stats */
2692 NULL /* class_dump_stats */
2693 };
2694 \f
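/* "linux-other" traffic control class.
 *
 * This class stands in for a qdisc that this code does not manage.
 * tc_query_qdisc() selects it when the kernel reports a qdisc whose kind is
 * not recognized or whose description cannot be parsed, and when querying
 * the qdisc fails with an error other than ENOENT. */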
2698
2699 static int
2700 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2701 {
2702 struct netdev_dev_linux *netdev_dev =
2703 netdev_dev_linux_cast(netdev_get_dev(netdev));
2704 static struct tc *tc;
2705
2706 if (!tc) {
2707 tc = xmalloc(sizeof *tc);
2708 tc_init(tc, &tc_ops_other);
2709 }
2710 netdev_dev->tc = tc;
2711 return 0;
2712 }
2713
2714 static const struct tc_ops tc_ops_other = {
2715 NULL, /* linux_name */
2716 "linux-other", /* ovs_name */
2717 0, /* n_queues */
2718 NULL, /* tc_install */
2719 other_tc_load,
2720 NULL, /* tc_destroy */
2721 NULL, /* qdisc_get */
2722 NULL, /* qdisc_set */
2723 NULL, /* class_get */
2724 NULL, /* class_set */
2725 NULL, /* class_delete */
2726 NULL, /* class_get_stats */
2727 NULL /* class_dump_stats */
2728 };
2729 \f
2730 /* Traffic control. */
2731
/* How many kernel "tc" ticks elapse per second.  Filled in by
 * read_psched(). */
static double ticks_per_s;

/* How many kernel "jiffies" elapse per second.  This is used for the purpose
 * of computing buffer sizes: generally kernel qdiscs need to be able to
 * buffer one jiffy's worth of data.  Filled in by read_psched(); a value of
 * 0 means that read_psched() has not provided a value yet.
 *
 * The value falls into one of two cases:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in
 *      the approximate range of 100 to 1024.  Then we really do need to make
 *      sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  Then the kernel has finely
 *      granular timers and there's no need to fudge additional room for
 *      buffers.  (No extra effort is needed to implement that: the large
 *      'buffer_hz' is used as a divisor, so practically any number will come
 *      out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
2753
/* Combines 'major' and 'minor' into the tc handle 'major':'minor', with
 * 'major' occupying the upper 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
2760
/* Extracts the major number (the upper 16 bits) of tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
2767
/* Extracts the minor number (the lower 16 bits) of tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
2774
2775 static struct tcmsg *
2776 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
2777 struct ofpbuf *request)
2778 {
2779 struct tcmsg *tcmsg;
2780 int ifindex;
2781 int error;
2782
2783 error = get_ifindex(netdev, &ifindex);
2784 if (error) {
2785 return NULL;
2786 }
2787
2788 ofpbuf_init(request, 512);
2789 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
2790 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
2791 tcmsg->tcm_family = AF_UNSPEC;
2792 tcmsg->tcm_ifindex = ifindex;
2793 /* Caller should fill in tcmsg->tcm_handle. */
2794 /* Caller should fill in tcmsg->tcm_parent. */
2795
2796 return tcmsg;
2797 }
2798
2799 static int
2800 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
2801 {
2802 int error = nl_sock_transact(rtnl_sock, request, replyp);
2803 ofpbuf_uninit(request);
2804 return error;
2805 }
2806
2807 static void
2808 read_psched(void)
2809 {
2810 /* The values in psched are not individually very meaningful, but they are
2811 * important. The tables below show some values seen in the wild.
2812 *
2813 * Some notes:
2814 *
2815 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
2816 * (Before that, there are hints that it was 1000000000.)
2817 *
2818 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
2819 * above.
2820 *
2821 * /proc/net/psched
2822 * -----------------------------------
2823 * [1] 000c8000 000f4240 000f4240 00000064
2824 * [2] 000003e8 00000400 000f4240 3b9aca00
2825 * [3] 000003e8 00000400 000f4240 3b9aca00
2826 * [4] 000003e8 00000400 000f4240 00000064
2827 * [5] 000003e8 00000040 000f4240 3b9aca00
2828 * [6] 000003e8 00000040 000f4240 000000f9
2829 *
2830 * a b c d ticks_per_s buffer_hz
2831 * ------- --------- ---------- ------------- ----------- -------------
2832 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
2833 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
2834 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
2835 * [4] 1,000 1,024 1,000,000 100 976,562 100
2836 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
2837 * [6] 1,000 64 1,000,000 249 15,625,000 249
2838 *
2839 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
2840 * [2] 2.6.26-1-686-bigmem from Debian lenny
2841 * [3] 2.6.26-2-sparc64 from Debian lenny
2842 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
2843 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
2844 * [6] 2.6.34 from kernel.org on KVM
2845 */
2846 static const char fn[] = "/proc/net/psched";
2847 unsigned int a, b, c, d;
2848 FILE *stream;
2849
2850 ticks_per_s = 1.0;
2851 buffer_hz = 100;
2852
2853 stream = fopen(fn, "r");
2854 if (!stream) {
2855 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
2856 return;
2857 }
2858
2859 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
2860 VLOG_WARN("%s: read failed", fn);
2861 fclose(stream);
2862 return;
2863 }
2864 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
2865 fclose(stream);
2866
2867 if (!a || !c) {
2868 VLOG_WARN("%s: invalid scheduler parameters", fn);
2869 return;
2870 }
2871
2872 ticks_per_s = (double) a * c / b;
2873 if (c == 1000000) {
2874 buffer_hz = d;
2875 } else {
2876 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
2877 fn, a, b, c, d);
2878 }
2879 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
2880 }
2881
2882 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
2883 * rate of 'rate' bytes per second. */
2884 static unsigned int
2885 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
2886 {
2887 if (!buffer_hz) {
2888 read_psched();
2889 }
2890 return (rate * ticks) / ticks_per_s;
2891 }
2892
2893 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
2894 * rate of 'rate' bytes per second. */
2895 static unsigned int
2896 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
2897 {
2898 if (!buffer_hz) {
2899 read_psched();
2900 }
2901 return ((unsigned long long int) ticks_per_s * size) / rate;
2902 }
2903
2904 /* Returns the number of bytes that need to be reserved for qdisc buffering at
2905 * a transmission rate of 'rate' bytes per second. */
2906 static unsigned int
2907 tc_buffer_per_jiffy(unsigned int rate)
2908 {
2909 if (!buffer_hz) {
2910 read_psched();
2911 }
2912 return rate / buffer_hz;
2913 }
2914
2915 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
2916 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
2917 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
2918 * stores NULL into it if it is absent.
2919 *
2920 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
2921 * 'msg'.
2922 *
2923 * Returns 0 if successful, otherwise a positive errno value. */
2924 static int
2925 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
2926 struct nlattr **options)
2927 {
2928 static const struct nl_policy tca_policy[] = {
2929 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
2930 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
2931 };
2932 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
2933
2934 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
2935 tca_policy, ta, ARRAY_SIZE(ta))) {
2936 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
2937 goto error;
2938 }
2939
2940 if (kind) {
2941 *kind = nl_attr_get_string(ta[TCA_KIND]);
2942 }
2943
2944 if (options) {
2945 *options = ta[TCA_OPTIONS];
2946 }
2947
2948 return 0;
2949
2950 error:
2951 if (kind) {
2952 *kind = NULL;
2953 }
2954 if (options) {
2955 *options = NULL;
2956 }
2957 return EPROTO;
2958 }
2959
2960 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
2961 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
2962 * into '*options', and its queue statistics into '*stats'. Any of the output
2963 * arguments may be null.
2964 *
2965 * Returns 0 if successful, otherwise a positive errno value. */
2966 static int
2967 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
2968 struct nlattr **options, struct netdev_queue_stats *stats)
2969 {
2970 static const struct nl_policy tca_policy[] = {
2971 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
2972 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
2973 };
2974 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
2975
2976 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
2977 tca_policy, ta, ARRAY_SIZE(ta))) {
2978 VLOG_WARN_RL(&rl, "failed to parse class message");
2979 goto error;
2980 }
2981
2982 if (handlep) {
2983 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
2984 *handlep = tc->tcm_handle;
2985 }
2986
2987 if (options) {
2988 *options = ta[TCA_OPTIONS];
2989 }
2990
2991 if (stats) {
2992 const struct gnet_stats_queue *gsq;
2993 struct gnet_stats_basic gsb;
2994
2995 static const struct nl_policy stats_policy[] = {
2996 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
2997 .min_len = sizeof gsb },
2998 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
2999 .min_len = sizeof *gsq },
3000 };
3001 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3002
3003 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3004 sa, ARRAY_SIZE(sa))) {
3005 VLOG_WARN_RL(&rl, "failed to parse class stats");
3006 goto error;
3007 }
3008
3009 /* Alignment issues screw up the length of struct gnet_stats_basic on
3010 * some arch/bitsize combinations. Newer versions of Linux have a
3011 * struct gnet_stats_basic_packed, but we can't depend on that. The
3012 * easiest thing to do is just to make a copy. */
3013 memset(&gsb, 0, sizeof gsb);
3014 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3015 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3016 stats->tx_bytes = gsb.bytes;
3017 stats->tx_packets = gsb.packets;
3018
3019 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3020 stats->tx_errors = gsq->drops;
3021 }
3022
3023 return 0;
3024
3025 error:
3026 if (options) {
3027 *options = NULL;
3028 }
3029 if (stats) {
3030 memset(stats, 0, sizeof *stats);
3031 }
3032 return EPROTO;
3033 }
3034
3035 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3036 * on 'netdev'. */
3037 static int
3038 tc_query_class(const struct netdev *netdev,
3039 unsigned int handle, unsigned int parent,
3040 struct ofpbuf **replyp)
3041 {
3042 struct ofpbuf request;
3043 struct tcmsg *tcmsg;
3044 int error;
3045
3046 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3047 tcmsg->tcm_handle = handle;
3048 tcmsg->tcm_parent = parent;
3049
3050 error = tc_transact(&request, replyp);
3051 if (error) {
3052 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3053 netdev_get_name(netdev),
3054 tc_get_major(handle), tc_get_minor(handle),
3055 tc_get_major(parent), tc_get_minor(parent),
3056 strerror(error));
3057 }
3058 return error;
3059 }
3060
3061 /* Equivalent to "tc class del dev <name> handle <handle>". */
3062 static int
3063 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3064 {
3065 struct ofpbuf request;
3066 struct tcmsg *tcmsg;
3067 int error;
3068
3069 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3070 tcmsg->tcm_handle = handle;
3071 tcmsg->tcm_parent = 0;
3072
3073 error = tc_transact(&request, NULL);
3074 if (error) {
3075 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3076 netdev_get_name(netdev),
3077 tc_get_major(handle), tc_get_minor(handle),
3078 strerror(error));
3079 }
3080 return error;
3081 }
3082
3083 /* Equivalent to "tc qdisc del dev <name> root". */
3084 static int
3085 tc_del_qdisc(struct netdev *netdev)
3086 {
3087 struct netdev_dev_linux *netdev_dev =
3088 netdev_dev_linux_cast(netdev_get_dev(netdev));
3089 struct ofpbuf request;
3090 struct tcmsg *tcmsg;
3091 int error;
3092
3093 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3094 tcmsg->tcm_handle = tc_make_handle(1, 0);
3095 tcmsg->tcm_parent = TC_H_ROOT;
3096
3097 error = tc_transact(&request, NULL);
3098 if (error == EINVAL) {
3099 /* EINVAL probably means that the default qdisc was in use, in which
3100 * case we've accomplished our purpose. */
3101 error = 0;
3102 }
3103 if (!error && netdev_dev->tc) {
3104 if (netdev_dev->tc->ops->tc_destroy) {
3105 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3106 }
3107 netdev_dev->tc = NULL;
3108 }
3109 return error;
3110 }
3111
3112 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3113 * kernel to determine what they are. Returns 0 if successful, otherwise a
3114 * positive errno value. */
3115 static int
3116 tc_query_qdisc(const struct netdev *netdev)
3117 {
3118 struct netdev_dev_linux *netdev_dev =
3119 netdev_dev_linux_cast(netdev_get_dev(netdev));
3120 struct ofpbuf request, *qdisc;
3121 const struct tc_ops *ops;
3122 struct tcmsg *tcmsg;
3123 int load_error;
3124 int error;
3125
3126 if (netdev_dev->tc) {
3127 return 0;
3128 }
3129
3130 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3131 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3132 * 2.6.35 without that fix backported to it.
3133 *
3134 * To avoid the OOPS, we must not make a request that would attempt to dump
3135 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3136 * few others. There are a few ways that I can see to do this, but most of
3137 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3138 * technique chosen here is to assume that any non-default qdisc that we
3139 * create will have a class with handle 1:0. The built-in qdiscs only have
3140 * a class with handle 0:0.
3141 *
3142 * We could check for Linux 2.6.35+ and use a more straightforward method
3143 * there. */
3144 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3145 tcmsg->tcm_handle = tc_make_handle(1, 0);
3146 tcmsg->tcm_parent = 0;
3147
3148 /* Figure out what tc class to instantiate. */
3149 error = tc_transact(&request, &qdisc);
3150 if (!error) {
3151 const char *kind;
3152
3153 error = tc_parse_qdisc(qdisc, &kind, NULL);
3154 if (error) {
3155 ops = &tc_ops_other;
3156 } else {
3157 ops = tc_lookup_linux_name(kind);
3158 if (!ops) {
3159 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3160 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3161
3162 ops = &tc_ops_other;
3163 }
3164 }
3165 } else if (error == ENOENT) {
3166 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3167 * other entity that doesn't have a handle 1:0. We will assume
3168 * that it's the system default qdisc. */
3169 ops = &tc_ops_default;
3170 error = 0;
3171 } else {
3172 /* Who knows? Maybe the device got deleted. */
3173 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3174 netdev_get_name(netdev), strerror(error));
3175 ops = &tc_ops_other;
3176 }
3177
3178 /* Instantiate it. */
3179 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3180 assert((load_error == 0) == (netdev_dev->tc != NULL));
3181 ofpbuf_delete(qdisc);
3182
3183 return error ? error : load_error;
3184 }
3185
3186 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3187 approximate the time to transmit packets of various lengths. For an MTU of
3188 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3189 represents two possible packet lengths; for a MTU of 513 through 1024, four
3190 possible lengths; and so on.
3191
3192 Returns, for the specified 'mtu', the number of bits that packet lengths
3193 need to be shifted right to fit within such a 256-entry table. */
3194 static int
3195 tc_calc_cell_log(unsigned int mtu)
3196 {
3197 int cell_log;
3198
3199 if (!mtu) {
3200 mtu = ETH_PAYLOAD_MAX;
3201 }
3202 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3203
3204 for (cell_log = 0; mtu >= 256; cell_log++) {
3205 mtu >>= 1;
3206 }
3207
3208 return cell_log;
3209 }
3210
3211 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3212 * of 'mtu'. */
3213 static void
3214 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3215 {
3216 memset(rate, 0, sizeof *rate);
3217 rate->cell_log = tc_calc_cell_log(mtu);
3218 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3219 /* rate->cell_align = 0; */ /* distro headers. */
3220 rate->mpu = ETH_TOTAL_MIN;
3221 rate->rate = Bps;
3222 }
3223
/* Adds a Netlink attribute of the given 'type' to 'msg' whose payload is the
 * "rtab" table for 'rate'.
 *
 * Each table slot holds the tc_bytes_to_ticks() value for one packet size at
 * 'rate->rate'.  Slot N corresponds to a packet size of (N + 1) shifted left
 * by 'rate->cell_log' bits, raised to 'rate->mpu' if it would be smaller.
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *table = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    unsigned int slot = 0;

    while (slot < TC_RTAB_SIZE / sizeof *table) {
        unsigned int n_bytes = (slot + 1) << rate->cell_log;

        /* Never account for less than the minimum packet unit. */
        n_bytes = n_bytes < rate->mpu ? rate->mpu : n_bytes;

        table[slot] = tc_bytes_to_ticks(rate->rate, n_bytes);
        slot++;
    }
}
3243
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than tc_buffer_per_jiffy('Bps')
 * plus 'mtu'.  The result is that burst size converted by
 * tc_bytes_to_ticks() at rate 'Bps'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int lower_bound = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t effective_burst = burst_bytes;

    if (!(effective_burst > lower_bound)) {
        effective_burst = lower_bound;
    }
    return tc_bytes_to_ticks(Bps, effective_burst);
}
3256
3257 \f
3258 /* Utility functions. */
3259
3260 static int
3261 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3262 {
3263 /* Policy for RTNLGRP_LINK messages.
3264 *
3265 * There are *many* more fields in these messages, but currently we only
3266 * care about these fields. */
3267 static const struct nl_policy rtnlgrp_link_policy[] = {
3268 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3269 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3270 .min_len = sizeof(struct rtnl_link_stats) },
3271 };
3272
3273 struct ofpbuf request;
3274 struct ofpbuf *reply;
3275 struct ifinfomsg *ifi;
3276 const struct rtnl_link_stats *rtnl_stats;
3277 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
3278 int error;
3279
3280 ofpbuf_init(&request, 0);
3281 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3282 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3283 ifi->ifi_family = PF_UNSPEC;
3284 ifi->ifi_index = ifindex;
3285 error = nl_sock_transact(rtnl_sock, &request, &reply);
3286 ofpbuf_uninit(&request);
3287 if (error) {
3288 return error;
3289 }
3290
3291 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3292 rtnlgrp_link_policy,
3293 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3294 ofpbuf_delete(reply);
3295 return EPROTO;
3296 }
3297
3298 if (!attrs[IFLA_STATS]) {
3299 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3300 ofpbuf_delete(reply);
3301 return EPROTO;
3302 }
3303
3304 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3305 stats->rx_packets = rtnl_stats->rx_packets;
3306 stats->tx_packets = rtnl_stats->tx_packets;
3307 stats->rx_bytes = rtnl_stats->rx_bytes;
3308 stats->tx_bytes = rtnl_stats->tx_bytes;
3309 stats->rx_errors = rtnl_stats->rx_errors;
3310 stats->tx_errors = rtnl_stats->tx_errors;
3311 stats->rx_dropped = rtnl_stats->rx_dropped;
3312 stats->tx_dropped = rtnl_stats->tx_dropped;
3313 stats->multicast = rtnl_stats->multicast;
3314 stats->collisions = rtnl_stats->collisions;
3315 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3316 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3317 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3318 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3319 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3320 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3321 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3322 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3323 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3324 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3325 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3326
3327 ofpbuf_delete(reply);
3328
3329 return 0;
3330 }
3331
3332 static int
3333 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3334 {
3335 static const char fn[] = "/proc/net/dev";
3336 char line[1024];
3337 FILE *stream;
3338 int ln;
3339
3340 stream = fopen(fn, "r");
3341 if (!stream) {
3342 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3343 return errno;
3344 }
3345
3346 ln = 0;
3347 while (fgets(line, sizeof line, stream)) {
3348 if (++ln >= 3) {
3349 char devname[16];
3350 #define X64 "%"SCNu64
3351 if (sscanf(line,
3352 " %15[^:]:"
3353 X64 X64 X64 X64 X64 X64 X64 "%*u"
3354 X64 X64 X64 X64 X64 X64 X64 "%*u",
3355 devname,
3356 &stats->rx_bytes,
3357 &stats->rx_packets,
3358 &stats->rx_errors,
3359 &stats->rx_dropped,
3360 &stats->rx_fifo_errors,
3361 &stats->rx_frame_errors,
3362 &stats->multicast,
3363 &stats->tx_bytes,
3364 &stats->tx_packets,
3365 &stats->tx_errors,
3366 &stats->tx_dropped,
3367 &stats->tx_fifo_errors,
3368 &stats->collisions,
3369 &stats->tx_carrier_errors) != 15) {
3370 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3371 } else if (!strcmp(devname, netdev_name)) {
3372 stats->rx_length_errors = UINT64_MAX;
3373 stats->rx_over_errors = UINT64_MAX;
3374 stats->rx_crc_errors = UINT64_MAX;
3375 stats->rx_missed_errors = UINT64_MAX;
3376 stats->tx_aborted_errors = UINT64_MAX;
3377 stats->tx_heartbeat_errors = UINT64_MAX;
3378 stats->tx_window_errors = UINT64_MAX;
3379 fclose(stream);
3380 return 0;
3381 }
3382 }
3383 }
3384 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
3385 fclose(stream);
3386 return ENODEV;
3387 }
3388
3389 static int
3390 get_flags(const struct netdev *netdev, int *flags)
3391 {
3392 struct ifreq ifr;
3393 int error;
3394
3395 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
3396 "SIOCGIFFLAGS");
3397 *flags = ifr.ifr_flags;
3398 return error;
3399 }
3400
3401 static int
3402 set_flags(struct netdev *netdev, int flags)
3403 {
3404 struct ifreq ifr;
3405
3406 ifr.ifr_flags = flags;
3407 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
3408 "SIOCSIFFLAGS");
3409 }
3410
3411 static int
3412 do_get_ifindex(const char *netdev_name)
3413 {
3414 struct ifreq ifr;
3415
3416 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3417 COVERAGE_INC(netdev_get_ifindex);
3418 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
3419 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
3420 netdev_name, strerror(errno));
3421 return -errno;
3422 }
3423 return ifr.ifr_ifindex;
3424 }
3425
3426 static int
3427 get_ifindex(const struct netdev *netdev_, int *ifindexp)
3428 {
3429 struct netdev_dev_linux *netdev_dev =
3430 netdev_dev_linux_cast(netdev_get_dev(netdev_));
3431 *ifindexp = 0;
3432 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
3433 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
3434 if (ifindex < 0) {
3435 return -ifindex;
3436 }
3437 netdev_dev->cache_valid |= VALID_IFINDEX;
3438 netdev_dev->ifindex = ifindex;
3439 }
3440 *ifindexp = netdev_dev->ifindex;
3441 return 0;
3442 }
3443
3444 static int
3445 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
3446 {
3447 struct ifreq ifr;
3448 int hwaddr_family;
3449
3450 memset(&ifr, 0, sizeof ifr);
3451 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3452 COVERAGE_INC(netdev_get_hwaddr);
3453 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
3454 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
3455 netdev_name, strerror(errno));
3456 return errno;
3457 }
3458 hwaddr_family = ifr.ifr_hwaddr.sa_family;
3459 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
3460 VLOG_WARN("%s device has unknown hardware address family %d",
3461 netdev_name, hwaddr_family);
3462 }
3463 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
3464 return 0;
3465 }
3466
3467 static int
3468 set_etheraddr(const char *netdev_name, int hwaddr_family,
3469 const uint8_t mac[ETH_ADDR_LEN])
3470 {
3471 struct ifreq ifr;
3472
3473 memset(&ifr, 0, sizeof ifr);
3474 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3475 ifr.ifr_hwaddr.sa_family = hwaddr_family;
3476 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
3477 COVERAGE_INC(netdev_set_hwaddr);
3478 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
3479 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
3480 netdev_name, strerror(errno));
3481 return errno;
3482 }
3483 return 0;
3484 }
3485
3486 static int
3487 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
3488 int cmd, const char *cmd_name)
3489 {
3490 struct ifreq ifr;
3491
3492 memset(&ifr, 0, sizeof ifr);
3493 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
3494 ifr.ifr_data = (caddr_t) ecmd;
3495
3496 ecmd->cmd = cmd;
3497 COVERAGE_INC(netdev_ethtool);
3498 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
3499 return 0;
3500 } else {
3501 if (errno != EOPNOTSUPP) {
3502 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
3503 "failed: %s", cmd_name, name, strerror(errno));
3504 } else {
3505 /* The device doesn't support this operation. That's pretty
3506 * common, so there's no point in logging anything. */
3507 }
3508 return errno;
3509 }
3510 }
3511
3512 static int
3513 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
3514 const char *cmd_name)
3515 {
3516 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
3517 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
3518 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
3519 strerror(errno));
3520 return errno;
3521 }
3522 return 0;
3523 }
3524
3525 static int
3526 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
3527 int cmd, const char *cmd_name)
3528 {
3529 struct ifreq ifr;
3530 int error;
3531
3532 ifr.ifr_addr.sa_family = AF_INET;
3533 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
3534 if (!error) {
3535 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
3536 *ip = sin->sin_addr;
3537 }
3538 return error;
3539 }