]> git.proxmox.com Git - mirror_ovs.git/blob - lib/netdev-linux.c
netdev-linux: Fix fd leak on error path.
[mirror_ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <arpa/inet.h>
24 #include <inttypes.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
41 #include <net/if.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
46 #include <poll.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <unistd.h>
50
51 #include "coverage.h"
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
55 #include "hash.h"
56 #include "hmap.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
61 #include "netlink.h"
62 #include "ofpbuf.h"
63 #include "openflow/openflow.h"
64 #include "packets.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
67 #include "shash.h"
68 #include "socket-util.h"
69 #include "sset.h"
70 #include "timer.h"
71 #include "unaligned.h"
72 #include "vlog.h"
73
/* Logging module used by the VLOG_*() calls in this file. */
VLOG_DEFINE_THIS_MODULE(netdev_linux);

/* Coverage counters defined by this file. */
COVERAGE_DEFINE(netdev_set_policing);
COVERAGE_DEFINE(netdev_arp_lookup);
COVERAGE_DEFINE(netdev_get_ifindex);
COVERAGE_DEFINE(netdev_get_hwaddr);
COVERAGE_DEFINE(netdev_set_hwaddr);
COVERAGE_DEFINE(netdev_get_ethtool);
COVERAGE_DEFINE(netdev_set_ethtool);
83
84 \f
/* Fallback definitions for building against old kernel headers. */

/* Pause advertisement bits, first available in Linux 2.6.14. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause                (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause           (1 << 14)
#endif

/* ethtool flag commands, first available in Linux 2.6.24. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS       0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS       0x00000026 /* Set flags bitmap(ethtool_value) */
#endif

/* First available in Linux 2.6.25. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif
108
/* rtnetlink link-change notifier shared by every netdev in this file.
 * cache_notifier_ref() creates it when the first reference is taken and
 * cache_notifier_unref() destroys it when the last one is dropped. */
static struct nln_notifier *netdev_linux_cache_notifier;

/* Number of references currently held on 'netdev_linux_cache_notifier'. */
static int cache_notifier_refcount = 0;
111
/* Bits for the 'cache_valid' member of struct netdev_linux.  Each bit says
 * that the corresponding cached data is currently usable. */
enum {
    VALID_IFINDEX          = 0x001,
    VALID_ETHERADDR        = 0x002,
    VALID_IN4              = 0x004,
    VALID_IN6              = 0x008,
    VALID_MTU              = 0x010,
    VALID_POLICING         = 0x020,
    VALID_VPORT_STAT_ERROR = 0x040,
    VALID_DRVINFO          = 0x080,
    VALID_FEATURES         = 0x100,
};
123
/* State kept only for tap devices. */
struct tap_state {
    int fd;                     /* Descriptor opened on "/dev/net/tun" by
                                 * netdev_linux_create_tap(). */
};
127 \f
128 /* Traffic control. */
129
130 /* An instance of a traffic control class. Always associated with a particular
131 * network device.
132 *
133 * Each TC implementation subclasses this with whatever additional data it
134 * needs. */
135 struct tc {
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
140 };
141
142 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
143
144 /* One traffic control queue.
145 *
146 * Each TC implementation subclasses this with whatever additional data it
147 * needs. */
148 struct tc_queue {
149 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
150 unsigned int queue_id; /* OpenFlow queue ID. */
151 long long int created; /* Time queue was created, in msecs. */
152 };
153
154 /* A particular kind of traffic control. Each implementation generally maps to
155 * one particular Linux qdisc class.
156 *
157 * The functions below return 0 if successful or a positive errno value on
158 * failure, except where otherwise noted. All of them must be provided, except
159 * where otherwise noted. */
160 struct tc_ops {
161 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
162 * This is null for tc_ops_default and tc_ops_other, for which there are no
163 * appropriate values. */
164 const char *linux_name;
165
166 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
167 const char *ovs_name;
168
169 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
170 * queues. The queues are numbered 0 through n_queues - 1. */
171 unsigned int n_queues;
172
173 /* Called to install this TC class on 'netdev'. The implementation should
174 * make the Netlink calls required to set up 'netdev' with the right qdisc
175 * and configure it according to 'details'. The implementation may assume
176 * that the current qdisc is the default; that is, there is no need for it
177 * to delete the current qdisc before installing itself.
178 *
179 * The contents of 'details' should be documented as valid for 'ovs_name'
180 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
181 * (which is built as ovs-vswitchd.conf.db(8)).
182 *
183 * This function must return 0 if and only if it sets 'netdev->tc' to an
184 * initialized 'struct tc'.
185 *
186 * (This function is null for tc_ops_other, which cannot be installed. For
187 * other TC classes it should always be nonnull.) */
188 int (*tc_install)(struct netdev *netdev, const struct smap *details);
189
190 /* Called when the netdev code determines (through a Netlink query) that
191 * this TC class's qdisc is installed on 'netdev', but we didn't install
192 * it ourselves and so don't know any of the details.
193 *
194 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
195 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
196 * implementation should parse the other attributes of 'nlmsg' as
197 * necessary to determine its configuration. If necessary it should also
198 * use Netlink queries to determine the configuration of queues on
199 * 'netdev'.
200 *
201 * This function must return 0 if and only if it sets 'netdev->tc' to an
202 * initialized 'struct tc'. */
203 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
204
205 /* Destroys the data structures allocated by the implementation as part of
206 * 'tc'. (This includes destroying 'tc->queues' by calling
207 * tc_destroy(tc).
208 *
209 * The implementation should not need to perform any Netlink calls. If
210 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
211 * (But it may not be desirable.)
212 *
213 * This function may be null if 'tc' is trivial. */
214 void (*tc_destroy)(struct tc *tc);
215
216 /* Retrieves details of 'netdev->tc' configuration into 'details'.
217 *
218 * The implementation should not need to perform any Netlink calls, because
219 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
220 * cached the configuration.
221 *
222 * The contents of 'details' should be documented as valid for 'ovs_name'
223 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
224 * (which is built as ovs-vswitchd.conf.db(8)).
225 *
226 * This function may be null if 'tc' is not configurable.
227 */
228 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
229
230 /* Reconfigures 'netdev->tc' according to 'details', performing any
231 * required Netlink calls to complete the reconfiguration.
232 *
233 * The contents of 'details' should be documented as valid for 'ovs_name'
234 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
235 * (which is built as ovs-vswitchd.conf.db(8)).
236 *
237 * This function may be null if 'tc' is not configurable.
238 */
239 int (*qdisc_set)(struct netdev *, const struct smap *details);
240
241 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
242 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
243 *
244 * The contents of 'details' should be documented as valid for 'ovs_name'
245 * in the "other_config" column in the "Queue" table in
246 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
247 *
248 * The implementation should not need to perform any Netlink calls, because
249 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
250 * cached the queue configuration.
251 *
252 * This function may be null if 'tc' does not have queues ('n_queues' is
253 * 0). */
254 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
255 struct smap *details);
256
257 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
258 * 'details', perfoming any required Netlink calls to complete the
259 * reconfiguration. The caller ensures that 'queue_id' is less than
260 * 'n_queues'.
261 *
262 * The contents of 'details' should be documented as valid for 'ovs_name'
263 * in the "other_config" column in the "Queue" table in
264 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
265 *
266 * This function may be null if 'tc' does not have queues or its queues are
267 * not configurable. */
268 int (*class_set)(struct netdev *, unsigned int queue_id,
269 const struct smap *details);
270
271 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
272 * tc_queue's within 'netdev->tc->queues'.
273 *
274 * This function may be null if 'tc' does not have queues or its queues
275 * cannot be deleted. */
276 int (*class_delete)(struct netdev *, struct tc_queue *queue);
277
278 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
279 * 'struct tc_queue's within 'netdev->tc->queues'.
280 *
281 * On success, initializes '*stats'.
282 *
283 * This function may be null if 'tc' does not have queues or if it cannot
284 * report queue statistics. */
285 int (*class_get_stats)(const struct netdev *netdev,
286 const struct tc_queue *queue,
287 struct netdev_queue_stats *stats);
288
289 /* Extracts queue stats from 'nlmsg', which is a response to a
290 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
291 *
292 * This function may be null if 'tc' does not have queues or if it cannot
293 * report queue statistics. */
294 int (*class_dump_stats)(const struct netdev *netdev,
295 const struct ofpbuf *nlmsg,
296 netdev_dump_queue_stats_cb *cb, void *aux);
297 };
298
299 static void
300 tc_init(struct tc *tc, const struct tc_ops *ops)
301 {
302 tc->ops = ops;
303 hmap_init(&tc->queues);
304 }
305
306 static void
307 tc_destroy(struct tc *tc)
308 {
309 hmap_destroy(&tc->queues);
310 }
311
312 static const struct tc_ops tc_ops_htb;
313 static const struct tc_ops tc_ops_hfsc;
314 static const struct tc_ops tc_ops_default;
315 static const struct tc_ops tc_ops_other;
316
317 static const struct tc_ops *const tcs[] = {
318 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
319 &tc_ops_hfsc, /* Hierarchical fair service curve. */
320 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
321 &tc_ops_other, /* Some other qdisc. */
322 NULL
323 };
324
/* Forward declarations of the traffic control helper functions. */
static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
static unsigned int tc_get_major(unsigned int handle);
static unsigned int tc_get_minor(unsigned int handle);

static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
static unsigned int tc_buffer_per_jiffy(unsigned int rate);

static struct tcmsg *tc_make_request(const struct netdev *netdev, int type,
                                     unsigned int flags, struct ofpbuf *);
static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
static int tc_add_policer(struct netdev *netdev,
                          int kbits_rate, int kbits_burst);

static int tc_parse_qdisc(const struct ofpbuf *,
                          const char **kind, struct nlattr **options);
static int tc_parse_class(const struct ofpbuf *,
                          unsigned int *queue_id, struct nlattr **options,
                          struct netdev_queue_stats *stats);
static int tc_query_class(const struct netdev *netdev,
                          unsigned int handle, unsigned int parent,
                          struct ofpbuf **replyp);
static int tc_delete_class(const struct netdev *netdev, unsigned int handle);

static int tc_del_qdisc(struct netdev *netdev);
static int tc_query_qdisc(const struct netdev *netdev);

static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static void tc_put_rtab(struct ofpbuf *, uint16_t type,
                        const struct tc_ratespec *rate);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
358 \f
359 struct netdev_linux {
360 struct netdev up;
361
362 struct shash_node *shash_node;
363 unsigned int cache_valid;
364 unsigned int change_seq;
365
366 bool miimon; /* Link status of last poll. */
367 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
368 struct timer miimon_timer;
369
370 /* The following are figured out "on demand" only. They are only valid
371 * when the corresponding VALID_* bit in 'cache_valid' is set. */
372 int ifindex;
373 uint8_t etheraddr[ETH_ADDR_LEN];
374 struct in_addr address, netmask;
375 struct in6_addr in6;
376 int mtu;
377 unsigned int ifi_flags;
378 long long int carrier_resets;
379 uint32_t kbits_rate; /* Policing data. */
380 uint32_t kbits_burst;
381 int vport_stats_error; /* Cached error code from vport_get_stats().
382 0 or an errno value. */
383 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
384 int ether_addr_error; /* Cached error code from set/get etheraddr. */
385 int netdev_policing_error; /* Cached error code from set policing. */
386 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
387 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
388
389 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
391 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
392 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
393
394 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
395 struct tc *tc;
396
397 union {
398 struct tap_state tap;
399 } state;
400 };
401
402 struct netdev_rx_linux {
403 struct netdev_rx up;
404 bool is_tap;
405 int fd;
406 };
407
408 static const struct netdev_rx_class netdev_rx_linux_class;
409
410 /* Sockets used for ioctl operations. */
411 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
412
413 /* This is set pretty low because we probably won't learn anything from the
414 * additional log messages. */
415 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
416
417 static int netdev_linux_init(void);
418
419 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
420 int cmd, const char *cmd_name);
421 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
422 const char *cmd_name);
423 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
424 int cmd, const char *cmd_name);
425 static int get_flags(const struct netdev *, unsigned int *flags);
426 static int set_flags(const char *, unsigned int flags);
427 static int do_get_ifindex(const char *netdev_name);
428 static int get_ifindex(const struct netdev *, int *ifindexp);
429 static int do_set_addr(struct netdev *netdev,
430 int ioctl_nr, const char *ioctl_name,
431 struct in_addr addr);
432 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
433 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
434 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
435 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
436 static int af_packet_sock(void);
437 static void netdev_linux_miimon_run(void);
438 static void netdev_linux_miimon_wait(void);
439
440 static bool
441 is_netdev_linux_class(const struct netdev_class *netdev_class)
442 {
443 return netdev_class->init == netdev_linux_init;
444 }
445
446 static bool
447 is_tap_netdev(const struct netdev *netdev)
448 {
449 return netdev_get_class(netdev) == &netdev_tap_class;
450 }
451
452 static struct netdev_linux *
453 netdev_linux_cast(const struct netdev *netdev)
454 {
455 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
456
457 return CONTAINER_OF(netdev, struct netdev_linux, up);
458 }
459
460 static struct netdev_rx_linux *
461 netdev_rx_linux_cast(const struct netdev_rx *rx)
462 {
463 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
464 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
465 }
466 \f
467 static int
468 netdev_linux_init(void)
469 {
470 static int status = -1;
471 if (status < 0) {
472 /* Create AF_INET socket. */
473 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
474 status = af_inet_sock >= 0 ? 0 : errno;
475 if (status) {
476 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
477 }
478 }
479 return status;
480 }
481
/* Periodic work for this file's netdevs: runs the rtnetlink link monitor,
 * then the miimon poller. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();

    netdev_linux_miimon_run();
}
488
/* Counterpart of netdev_linux_run(): calls the wait function of the
 * rtnetlink link monitor, then that of the miimon poller. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();

    netdev_linux_miimon_wait();
}
495
496 static void
497 netdev_linux_changed(struct netdev_linux *dev,
498 unsigned int ifi_flags, unsigned int mask)
499 {
500 dev->change_seq++;
501 if (!dev->change_seq) {
502 dev->change_seq++;
503 }
504
505 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
506 dev->carrier_resets++;
507 }
508 dev->ifi_flags = ifi_flags;
509
510 dev->cache_valid &= mask;
511 }
512
513 static void
514 netdev_linux_update(struct netdev_linux *dev,
515 const struct rtnetlink_link_change *change)
516 {
517 if (change->nlmsg_type == RTM_NEWLINK) {
518 /* Keep drv-info */
519 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
520
521 /* Update netdev from rtnl-change msg. */
522 if (change->mtu) {
523 dev->mtu = change->mtu;
524 dev->cache_valid |= VALID_MTU;
525 dev->netdev_mtu_error = 0;
526 }
527
528 if (!eth_addr_is_zero(change->addr)) {
529 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
530 dev->cache_valid |= VALID_ETHERADDR;
531 dev->ether_addr_error = 0;
532 }
533
534 dev->ifindex = change->ifi_index;
535 dev->cache_valid |= VALID_IFINDEX;
536 dev->get_ifindex_error = 0;
537
538 } else {
539 netdev_linux_changed(dev, change->ifi_flags, 0);
540 }
541 }
542
543 static void
544 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
545 void *aux OVS_UNUSED)
546 {
547 struct netdev_linux *dev;
548 if (change) {
549 struct netdev *base_dev = netdev_from_name(change->ifname);
550 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
551 netdev_linux_update(netdev_linux_cast(base_dev), change);
552 }
553 } else {
554 struct shash device_shash;
555 struct shash_node *node;
556
557 shash_init(&device_shash);
558 netdev_get_devices(&netdev_linux_class, &device_shash);
559 SHASH_FOR_EACH (node, &device_shash) {
560 unsigned int flags;
561
562 dev = node->data;
563
564 get_flags(&dev->up, &flags);
565 netdev_linux_changed(dev, flags, 0);
566 }
567 shash_destroy(&device_shash);
568 }
569 }
570
571 static int
572 cache_notifier_ref(void)
573 {
574 if (!cache_notifier_refcount) {
575 ovs_assert(!netdev_linux_cache_notifier);
576
577 netdev_linux_cache_notifier =
578 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
579
580 if (!netdev_linux_cache_notifier) {
581 return EINVAL;
582 }
583 }
584 cache_notifier_refcount++;
585
586 return 0;
587 }
588
589 static void
590 cache_notifier_unref(void)
591 {
592 ovs_assert(cache_notifier_refcount > 0);
593 if (!--cache_notifier_refcount) {
594 ovs_assert(netdev_linux_cache_notifier);
595 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
596 netdev_linux_cache_notifier = NULL;
597 }
598 }
599
600 /* Creates system and internal devices. */
601 static int
602 netdev_linux_create(const struct netdev_class *class, const char *name,
603 struct netdev **netdevp)
604 {
605 struct netdev_linux *netdev;
606 int error;
607
608 error = cache_notifier_ref();
609 if (error) {
610 return error;
611 }
612
613 netdev = xzalloc(sizeof *netdev);
614 netdev->change_seq = 1;
615 netdev_init(&netdev->up, name, class);
616 error = get_flags(&netdev->up, &netdev->ifi_flags);
617 if (error == ENODEV) {
618 if (class != &netdev_internal_class) {
619 /* The device does not exist, so don't allow it to be opened. */
620 netdev_uninit(&netdev->up, false);
621 cache_notifier_unref();
622 free(netdev);
623 return ENODEV;
624 } else {
625 /* "Internal" netdevs have to be created as netdev objects before
626 * they exist in the kernel, because creating them in the kernel
627 * happens by passing a netdev object to dpif_port_add().
628 * Therefore, ignore the error. */
629 }
630 }
631
632 *netdevp = &netdev->up;
633 return 0;
634 }
635
636 /* For most types of netdevs we open the device for each call of
637 * netdev_open(). However, this is not the case with tap devices,
638 * since it is only possible to open the device once. In this
639 * situation we share a single file descriptor, and consequently
640 * buffers, across all readers. Therefore once data is read it will
641 * be unavailable to other reads for tap devices. */
642 static int
643 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
644 const char *name, struct netdev **netdevp)
645 {
646 struct netdev_linux *netdev;
647 struct tap_state *state;
648 static const char tap_dev[] = "/dev/net/tun";
649 struct ifreq ifr;
650 int error;
651
652 netdev = xzalloc(sizeof *netdev);
653 state = &netdev->state.tap;
654
655 error = cache_notifier_ref();
656 if (error) {
657 goto error;
658 }
659
660 /* Open tap device. */
661 state->fd = open(tap_dev, O_RDWR);
662 if (state->fd < 0) {
663 error = errno;
664 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
665 goto error_unref_notifier;
666 }
667
668 /* Create tap device. */
669 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
670 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
671 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
672 VLOG_WARN("%s: creating tap device failed: %s", name,
673 ovs_strerror(errno));
674 error = errno;
675 goto error_close;
676 }
677
678 /* Make non-blocking. */
679 error = set_nonblocking(state->fd);
680 if (error) {
681 goto error_close;
682 }
683
684 netdev_init(&netdev->up, name, &netdev_tap_class);
685 *netdevp = &netdev->up;
686 return 0;
687
688 error_close:
689 close(state->fd);
690 error_unref_notifier:
691 cache_notifier_unref();
692 error:
693 free(netdev);
694 return error;
695 }
696
697 static void
698 destroy_tap(struct netdev_linux *netdev)
699 {
700 struct tap_state *state = &netdev->state.tap;
701
702 if (state->fd >= 0) {
703 close(state->fd);
704 }
705 }
706
707 /* Destroys the netdev device 'netdev_'. */
708 static void
709 netdev_linux_destroy(struct netdev *netdev_)
710 {
711 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
712
713 if (netdev->tc && netdev->tc->ops->tc_destroy) {
714 netdev->tc->ops->tc_destroy(netdev->tc);
715 }
716
717 if (netdev_get_class(netdev_) == &netdev_tap_class) {
718 destroy_tap(netdev);
719 }
720 free(netdev);
721
722 cache_notifier_unref();
723 }
724
725 static int
726 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
727 {
728 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
729 bool is_tap = is_tap_netdev(netdev_);
730 struct netdev_rx_linux *rx;
731 int error;
732 int fd;
733
734 if (is_tap) {
735 fd = netdev->state.tap.fd;
736 } else {
737 struct sockaddr_ll sll;
738 int ifindex;
739 /* Result of tcpdump -dd inbound */
740 static struct sock_filter filt[] = {
741 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
742 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
743 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
744 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
745 };
746 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
747
748 /* Create file descriptor. */
749 fd = socket(PF_PACKET, SOCK_RAW, 0);
750 if (fd < 0) {
751 error = errno;
752 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
753 goto error;
754 }
755
756 /* Set non-blocking mode. */
757 error = set_nonblocking(fd);
758 if (error) {
759 goto error;
760 }
761
762 /* Get ethernet device index. */
763 error = get_ifindex(&netdev->up, &ifindex);
764 if (error) {
765 goto error;
766 }
767
768 /* Bind to specific ethernet device. */
769 memset(&sll, 0, sizeof sll);
770 sll.sll_family = AF_PACKET;
771 sll.sll_ifindex = ifindex;
772 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
773 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
774 error = errno;
775 VLOG_ERR("%s: failed to bind raw socket (%s)",
776 netdev_get_name(netdev_), ovs_strerror(error));
777 goto error;
778 }
779
780 /* Filter for only inbound packets. */
781 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
782 sizeof fprog);
783 if (error) {
784 error = errno;
785 VLOG_ERR("%s: failed attach filter (%s)",
786 netdev_get_name(netdev_), ovs_strerror(error));
787 goto error;
788 }
789 }
790
791 rx = xmalloc(sizeof *rx);
792 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
793 rx->is_tap = is_tap;
794 rx->fd = fd;
795
796 *rxp = &rx->up;
797 return 0;
798
799 error:
800 if (fd >= 0) {
801 close(fd);
802 }
803 return error;
804 }
805
806 static void
807 netdev_rx_linux_destroy(struct netdev_rx *rx_)
808 {
809 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
810
811 if (!rx->is_tap) {
812 close(rx->fd);
813 }
814 free(rx);
815 }
816
817 static int
818 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
819 {
820 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
821 ssize_t retval;
822
823 do {
824 retval = (rx->is_tap
825 ? read(rx->fd, data, size)
826 : recv(rx->fd, data, size, MSG_TRUNC));
827 } while (retval < 0 && errno == EINTR);
828
829 if (retval >= 0) {
830 return retval > size ? -EMSGSIZE : retval;
831 } else {
832 if (errno != EAGAIN) {
833 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
834 ovs_strerror(errno), netdev_rx_get_name(rx_));
835 }
836 return -errno;
837 }
838 }
839
840 static void
841 netdev_rx_linux_wait(struct netdev_rx *rx_)
842 {
843 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
844 poll_fd_wait(rx->fd, POLLIN);
845 }
846
847 static int
848 netdev_rx_linux_drain(struct netdev_rx *rx_)
849 {
850 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
851 if (rx->is_tap) {
852 struct ifreq ifr;
853 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
854 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
855 if (error) {
856 return error;
857 }
858 drain_fd(rx->fd, ifr.ifr_qlen);
859 return 0;
860 } else {
861 return drain_rcvbuf(rx->fd);
862 }
863 }
864
865 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
866 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
867 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
868 * the packet is too big or too small to transmit on the device.
869 *
870 * The caller retains ownership of 'buffer' in all cases.
871 *
872 * The kernel maintains a packet transmission queue, so the caller is not
873 * expected to do additional queuing of packets. */
874 static int
875 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
876 {
877 for (;;) {
878 ssize_t retval;
879
880 if (!is_tap_netdev(netdev_)) {
881 /* Use our AF_PACKET socket to send to this device. */
882 struct sockaddr_ll sll;
883 struct msghdr msg;
884 struct iovec iov;
885 int ifindex;
886 int error;
887 int sock;
888
889 sock = af_packet_sock();
890 if (sock < 0) {
891 return -sock;
892 }
893
894 error = get_ifindex(netdev_, &ifindex);
895 if (error) {
896 return error;
897 }
898
899 /* We don't bother setting most fields in sockaddr_ll because the
900 * kernel ignores them for SOCK_RAW. */
901 memset(&sll, 0, sizeof sll);
902 sll.sll_family = AF_PACKET;
903 sll.sll_ifindex = ifindex;
904
905 iov.iov_base = CONST_CAST(void *, data);
906 iov.iov_len = size;
907
908 msg.msg_name = &sll;
909 msg.msg_namelen = sizeof sll;
910 msg.msg_iov = &iov;
911 msg.msg_iovlen = 1;
912 msg.msg_control = NULL;
913 msg.msg_controllen = 0;
914 msg.msg_flags = 0;
915
916 retval = sendmsg(sock, &msg, 0);
917 } else {
918 /* Use the tap fd to send to this device. This is essential for
919 * tap devices, because packets sent to a tap device with an
920 * AF_PACKET socket will loop back to be *received* again on the
921 * tap device. This doesn't occur on other interface types
922 * because we attach a socket filter to the rx socket. */
923 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
924
925 retval = write(netdev->state.tap.fd, data, size);
926 }
927
928 if (retval < 0) {
929 /* The Linux AF_PACKET implementation never blocks waiting for room
930 * for packets, instead returning ENOBUFS. Translate this into
931 * EAGAIN for the caller. */
932 if (errno == ENOBUFS) {
933 return EAGAIN;
934 } else if (errno == EINTR) {
935 continue;
936 } else if (errno != EAGAIN) {
937 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
938 netdev_get_name(netdev_), ovs_strerror(errno));
939 }
940 return errno;
941 } else if (retval != size) {
942 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
943 "%zu) on %s", retval, size, netdev_get_name(netdev_));
944 return EMSGSIZE;
945 } else {
946 return 0;
947 }
948 }
949 }
950
/* Arranges for the next call to poll_block() to wake up when the packet
 * transmission queue has sufficient room to transmit a packet with
 * netdev_send().
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  This function is therefore
 * unlikely ever to be used and exists for completeness.  Only tap devices
 * cause a wakeup; for other devices nothing is registered. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (!is_tap_netdev(netdev)) {
        return;
    }

    /* TAP device always accepts packets.*/
    poll_immediate_wake();
}
966
967 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
968 * otherwise a positive errno value. */
969 static int
970 netdev_linux_set_etheraddr(struct netdev *netdev_,
971 const uint8_t mac[ETH_ADDR_LEN])
972 {
973 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
974 struct netdev_saved_flags *sf = NULL;
975 int error;
976
977 if (netdev->cache_valid & VALID_ETHERADDR) {
978 if (netdev->ether_addr_error) {
979 return netdev->ether_addr_error;
980 }
981 if (eth_addr_equals(netdev->etheraddr, mac)) {
982 return 0;
983 }
984 netdev->cache_valid &= ~VALID_ETHERADDR;
985 }
986
987 /* Tap devices must be brought down before setting the address. */
988 if (is_tap_netdev(netdev_)) {
989 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
990 }
991 error = set_etheraddr(netdev_get_name(netdev_), mac);
992 if (!error || error == ENODEV) {
993 netdev->ether_addr_error = error;
994 netdev->cache_valid |= VALID_ETHERADDR;
995 if (!error) {
996 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
997 }
998 }
999
1000 netdev_restore_flags(sf);
1001
1002 return error;
1003 }
1004
1005 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1006 static int
1007 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1008 uint8_t mac[ETH_ADDR_LEN])
1009 {
1010 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1011
1012 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1013 int error = get_etheraddr(netdev_get_name(netdev_),
1014 netdev->etheraddr);
1015
1016 netdev->ether_addr_error = error;
1017 netdev->cache_valid |= VALID_ETHERADDR;
1018 }
1019
1020 if (!netdev->ether_addr_error) {
1021 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1022 }
1023
1024 return netdev->ether_addr_error;
1025 }
1026
1027 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1028 * in bytes, not including the hardware header; thus, this is typically 1500
1029 * bytes for Ethernet devices. */
1030 static int
1031 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1032 {
1033 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1034 if (!(netdev->cache_valid & VALID_MTU)) {
1035 struct ifreq ifr;
1036 int error;
1037
1038 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1039 SIOCGIFMTU, "SIOCGIFMTU");
1040
1041 netdev->netdev_mtu_error = error;
1042 netdev->mtu = ifr.ifr_mtu;
1043 netdev->cache_valid |= VALID_MTU;
1044 }
1045
1046 if (!netdev->netdev_mtu_error) {
1047 *mtup = netdev->mtu;
1048 }
1049 return netdev->netdev_mtu_error;
1050 }
1051
1052 /* Sets the maximum size of transmitted (MTU) for given device using linux
1053 * networking ioctl interface.
1054 */
1055 static int
1056 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1057 {
1058 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1059 struct ifreq ifr;
1060 int error;
1061
1062 if (netdev->cache_valid & VALID_MTU) {
1063 if (netdev->netdev_mtu_error) {
1064 return netdev->netdev_mtu_error;
1065 }
1066 if (netdev->mtu == mtu) {
1067 return 0;
1068 }
1069 netdev->cache_valid &= ~VALID_MTU;
1070 }
1071 ifr.ifr_mtu = mtu;
1072 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1073 SIOCSIFMTU, "SIOCSIFMTU");
1074 if (!error || error == ENODEV) {
1075 netdev->netdev_mtu_error = error;
1076 netdev->mtu = ifr.ifr_mtu;
1077 netdev->cache_valid |= VALID_MTU;
1078 }
1079 return error;
1080 }
1081
/* Looks up the ifindex of 'netdev'.  On success the ifindex is returned as a
 * positive number; on failure the return value is a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1092
1093 static int
1094 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1095 {
1096 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1097
1098 if (netdev->miimon_interval > 0) {
1099 *carrier = netdev->miimon;
1100 } else {
1101 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1102 }
1103
1104 return 0;
1105 }
1106
1107 static long long int
1108 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1109 {
1110 return netdev_linux_cast(netdev)->carrier_resets;
1111 }
1112
1113 static int
1114 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1115 struct mii_ioctl_data *data)
1116 {
1117 struct ifreq ifr;
1118 int error;
1119
1120 memset(&ifr, 0, sizeof ifr);
1121 memcpy(&ifr.ifr_data, data, sizeof *data);
1122 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1123 memcpy(data, &ifr.ifr_data, sizeof *data);
1124
1125 return error;
1126 }
1127
1128 static int
1129 netdev_linux_get_miimon(const char *name, bool *miimon)
1130 {
1131 struct mii_ioctl_data data;
1132 int error;
1133
1134 *miimon = false;
1135
1136 memset(&data, 0, sizeof data);
1137 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1138 if (!error) {
1139 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1140 data.reg_num = MII_BMSR;
1141 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1142 &data);
1143
1144 if (!error) {
1145 *miimon = !!(data.val_out & BMSR_LSTATUS);
1146 } else {
1147 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1148 }
1149 } else {
1150 struct ethtool_cmd ecmd;
1151
1152 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1153 name);
1154
1155 COVERAGE_INC(netdev_get_ethtool);
1156 memset(&ecmd, 0, sizeof ecmd);
1157 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1158 "ETHTOOL_GLINK");
1159 if (!error) {
1160 struct ethtool_value eval;
1161
1162 memcpy(&eval, &ecmd, sizeof eval);
1163 *miimon = !!eval.data;
1164 } else {
1165 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1166 }
1167 }
1168
1169 return error;
1170 }
1171
1172 static int
1173 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1174 long long int interval)
1175 {
1176 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1177
1178 interval = interval > 0 ? MAX(interval, 100) : 0;
1179 if (netdev->miimon_interval != interval) {
1180 netdev->miimon_interval = interval;
1181 timer_set_expired(&netdev->miimon_timer);
1182 }
1183
1184 return 0;
1185 }
1186
1187 static void
1188 netdev_linux_miimon_run(void)
1189 {
1190 struct shash device_shash;
1191 struct shash_node *node;
1192
1193 shash_init(&device_shash);
1194 netdev_get_devices(&netdev_linux_class, &device_shash);
1195 SHASH_FOR_EACH (node, &device_shash) {
1196 struct netdev_linux *dev = node->data;
1197 bool miimon;
1198
1199 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1200 continue;
1201 }
1202
1203 netdev_linux_get_miimon(dev->up.name, &miimon);
1204 if (miimon != dev->miimon) {
1205 dev->miimon = miimon;
1206 netdev_linux_changed(dev, dev->ifi_flags, 0);
1207 }
1208
1209 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1210 }
1211
1212 shash_destroy(&device_shash);
1213 }
1214
1215 static void
1216 netdev_linux_miimon_wait(void)
1217 {
1218 struct shash device_shash;
1219 struct shash_node *node;
1220
1221 shash_init(&device_shash);
1222 netdev_get_devices(&netdev_linux_class, &device_shash);
1223 SHASH_FOR_EACH (node, &device_shash) {
1224 struct netdev_linux *dev = node->data;
1225
1226 if (dev->miimon_interval > 0) {
1227 timer_wait(&dev->miimon_timer);
1228 }
1229 }
1230 shash_destroy(&device_shash);
1231 }
1232
1233 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1234 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1235 * enabled. */
1236 static bool
1237 check_for_working_netlink_stats(void)
1238 {
1239 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1240 * preferable, so if that works, we'll use it. */
1241 int ifindex = do_get_ifindex("lo");
1242 if (ifindex < 0) {
1243 VLOG_WARN("failed to get ifindex for lo, "
1244 "obtaining netdev stats from proc");
1245 return false;
1246 } else {
1247 struct netdev_stats stats;
1248 int error = get_stats_via_netlink(ifindex, &stats);
1249 if (!error) {
1250 VLOG_DBG("obtaining netdev stats via rtnetlink");
1251 return true;
1252 } else {
1253 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1254 "via proc (you are probably running a pre-2.6.19 "
1255 "kernel)", ovs_strerror(error));
1256 return false;
1257 }
1258 }
1259 }
1260
/* Exchanges the values stored in '*a' and '*b'.  Passing the same pointer
 * twice leaves the value unchanged. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t first = *a;
    const uint64_t second = *b;

    *a = second;
    *b = first;
}
1268
1269 /* Copies 'src' into 'dst', performing format conversion in the process.
1270 *
1271 * 'src' is allowed to be misaligned. */
1272 static void
1273 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1274 const struct ovs_vport_stats *src)
1275 {
1276 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1277 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1278 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1279 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1280 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1281 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1282 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1283 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1284 dst->multicast = 0;
1285 dst->collisions = 0;
1286 dst->rx_length_errors = 0;
1287 dst->rx_over_errors = 0;
1288 dst->rx_crc_errors = 0;
1289 dst->rx_frame_errors = 0;
1290 dst->rx_fifo_errors = 0;
1291 dst->rx_missed_errors = 0;
1292 dst->tx_aborted_errors = 0;
1293 dst->tx_carrier_errors = 0;
1294 dst->tx_fifo_errors = 0;
1295 dst->tx_heartbeat_errors = 0;
1296 dst->tx_window_errors = 0;
1297 }
1298
1299 static int
1300 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1301 {
1302 struct dpif_linux_vport reply;
1303 struct ofpbuf *buf;
1304 int error;
1305
1306 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1307 if (error) {
1308 return error;
1309 } else if (!reply.stats) {
1310 ofpbuf_delete(buf);
1311 return EOPNOTSUPP;
1312 }
1313
1314 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1315
1316 ofpbuf_delete(buf);
1317
1318 return 0;
1319 }
1320
1321 static void
1322 get_stats_via_vport(const struct netdev *netdev_,
1323 struct netdev_stats *stats)
1324 {
1325 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1326
1327 if (!netdev->vport_stats_error ||
1328 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1329 int error;
1330
1331 error = get_stats_via_vport__(netdev_, stats);
1332 if (error && error != ENOENT) {
1333 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1334 "(%s)",
1335 netdev_get_name(netdev_), ovs_strerror(error));
1336 }
1337 netdev->vport_stats_error = error;
1338 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1339 }
1340 }
1341
1342 static int
1343 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1344 struct netdev_stats *stats)
1345 {
1346 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1347 static int use_netlink_stats;
1348 int error;
1349
1350 if (ovsthread_once_start(&once)) {
1351 use_netlink_stats = check_for_working_netlink_stats();
1352 ovsthread_once_done(&once);
1353 }
1354
1355 if (use_netlink_stats) {
1356 int ifindex;
1357
1358 error = get_ifindex(netdev_, &ifindex);
1359 if (!error) {
1360 error = get_stats_via_netlink(ifindex, stats);
1361 }
1362 } else {
1363 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1364 }
1365
1366 if (error) {
1367 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1368 netdev_get_name(netdev_), error);
1369 }
1370 return error;
1371
1372 }
1373
1374 /* Retrieves current device stats for 'netdev-linux'. */
1375 static int
1376 netdev_linux_get_stats(const struct netdev *netdev_,
1377 struct netdev_stats *stats)
1378 {
1379 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1380 struct netdev_stats dev_stats;
1381 int error;
1382
1383 get_stats_via_vport(netdev_, stats);
1384
1385 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1386
1387 if (error) {
1388 if (netdev->vport_stats_error) {
1389 return error;
1390 } else {
1391 return 0;
1392 }
1393 }
1394
1395 if (netdev->vport_stats_error) {
1396 /* stats not available from OVS then use ioctl stats. */
1397 *stats = dev_stats;
1398 } else {
1399 stats->rx_errors += dev_stats.rx_errors;
1400 stats->tx_errors += dev_stats.tx_errors;
1401 stats->rx_dropped += dev_stats.rx_dropped;
1402 stats->tx_dropped += dev_stats.tx_dropped;
1403 stats->multicast += dev_stats.multicast;
1404 stats->collisions += dev_stats.collisions;
1405 stats->rx_length_errors += dev_stats.rx_length_errors;
1406 stats->rx_over_errors += dev_stats.rx_over_errors;
1407 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1408 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1409 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1410 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1411 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1412 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1413 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1414 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1415 stats->tx_window_errors += dev_stats.tx_window_errors;
1416 }
1417 return 0;
1418 }
1419
1420 /* Retrieves current device stats for 'netdev-tap' netdev or
1421 * netdev-internal. */
1422 static int
1423 netdev_tap_get_stats(const struct netdev *netdev_,
1424 struct netdev_stats *stats)
1425 {
1426 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1427 struct netdev_stats dev_stats;
1428 int error;
1429
1430 get_stats_via_vport(netdev_, stats);
1431
1432 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1433 if (error) {
1434 if (netdev->vport_stats_error) {
1435 return error;
1436 } else {
1437 return 0;
1438 }
1439 }
1440
1441 /* If this port is an internal port then the transmit and receive stats
1442 * will appear to be swapped relative to the other ports since we are the
1443 * one sending the data, not a remote computer. For consistency, we swap
1444 * them back here. This does not apply if we are getting stats from the
1445 * vport layer because it always tracks stats from the perspective of the
1446 * switch. */
1447 if (netdev->vport_stats_error) {
1448 *stats = dev_stats;
1449 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1450 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1451 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1452 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1453 stats->rx_length_errors = 0;
1454 stats->rx_over_errors = 0;
1455 stats->rx_crc_errors = 0;
1456 stats->rx_frame_errors = 0;
1457 stats->rx_fifo_errors = 0;
1458 stats->rx_missed_errors = 0;
1459 stats->tx_aborted_errors = 0;
1460 stats->tx_carrier_errors = 0;
1461 stats->tx_fifo_errors = 0;
1462 stats->tx_heartbeat_errors = 0;
1463 stats->tx_window_errors = 0;
1464 } else {
1465 stats->rx_dropped += dev_stats.tx_dropped;
1466 stats->tx_dropped += dev_stats.rx_dropped;
1467
1468 stats->rx_errors += dev_stats.tx_errors;
1469 stats->tx_errors += dev_stats.rx_errors;
1470
1471 stats->multicast += dev_stats.multicast;
1472 stats->collisions += dev_stats.collisions;
1473 }
1474 return 0;
1475 }
1476
1477 static int
1478 netdev_internal_get_stats(const struct netdev *netdev_,
1479 struct netdev_stats *stats)
1480 {
1481 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1482
1483 get_stats_via_vport(netdev_, stats);
1484 return netdev->vport_stats_error;
1485 }
1486
1487 static int
1488 netdev_internal_set_stats(struct netdev *netdev,
1489 const struct netdev_stats *stats)
1490 {
1491 struct ovs_vport_stats vport_stats;
1492 struct dpif_linux_vport vport;
1493 int err;
1494
1495 vport_stats.rx_packets = stats->rx_packets;
1496 vport_stats.tx_packets = stats->tx_packets;
1497 vport_stats.rx_bytes = stats->rx_bytes;
1498 vport_stats.tx_bytes = stats->tx_bytes;
1499 vport_stats.rx_errors = stats->rx_errors;
1500 vport_stats.tx_errors = stats->tx_errors;
1501 vport_stats.rx_dropped = stats->rx_dropped;
1502 vport_stats.tx_dropped = stats->tx_dropped;
1503
1504 dpif_linux_vport_init(&vport);
1505 vport.cmd = OVS_VPORT_CMD_SET;
1506 vport.name = netdev_get_name(netdev);
1507 vport.stats = &vport_stats;
1508
1509 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1510
1511 /* If the vport layer doesn't know about the device, that doesn't mean it
1512 * doesn't exist (after all were able to open it when netdev_open() was
1513 * called), it just means that it isn't attached and we'll be getting
1514 * stats a different way. */
1515 if (err == ENODEV) {
1516 err = EOPNOTSUPP;
1517 }
1518
1519 return err;
1520 }
1521
1522 static void
1523 netdev_linux_read_features(struct netdev_linux *netdev)
1524 {
1525 struct ethtool_cmd ecmd;
1526 uint32_t speed;
1527 int error;
1528
1529 if (netdev->cache_valid & VALID_FEATURES) {
1530 return;
1531 }
1532
1533 COVERAGE_INC(netdev_get_ethtool);
1534 memset(&ecmd, 0, sizeof ecmd);
1535 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1536 ETHTOOL_GSET, "ETHTOOL_GSET");
1537 if (error) {
1538 goto out;
1539 }
1540
1541 /* Supported features. */
1542 netdev->supported = 0;
1543 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1544 netdev->supported |= NETDEV_F_10MB_HD;
1545 }
1546 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1547 netdev->supported |= NETDEV_F_10MB_FD;
1548 }
1549 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1550 netdev->supported |= NETDEV_F_100MB_HD;
1551 }
1552 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1553 netdev->supported |= NETDEV_F_100MB_FD;
1554 }
1555 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1556 netdev->supported |= NETDEV_F_1GB_HD;
1557 }
1558 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1559 netdev->supported |= NETDEV_F_1GB_FD;
1560 }
1561 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1562 netdev->supported |= NETDEV_F_10GB_FD;
1563 }
1564 if (ecmd.supported & SUPPORTED_TP) {
1565 netdev->supported |= NETDEV_F_COPPER;
1566 }
1567 if (ecmd.supported & SUPPORTED_FIBRE) {
1568 netdev->supported |= NETDEV_F_FIBER;
1569 }
1570 if (ecmd.supported & SUPPORTED_Autoneg) {
1571 netdev->supported |= NETDEV_F_AUTONEG;
1572 }
1573 if (ecmd.supported & SUPPORTED_Pause) {
1574 netdev->supported |= NETDEV_F_PAUSE;
1575 }
1576 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1577 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1578 }
1579
1580 /* Advertised features. */
1581 netdev->advertised = 0;
1582 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1583 netdev->advertised |= NETDEV_F_10MB_HD;
1584 }
1585 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1586 netdev->advertised |= NETDEV_F_10MB_FD;
1587 }
1588 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1589 netdev->advertised |= NETDEV_F_100MB_HD;
1590 }
1591 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1592 netdev->advertised |= NETDEV_F_100MB_FD;
1593 }
1594 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1595 netdev->advertised |= NETDEV_F_1GB_HD;
1596 }
1597 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1598 netdev->advertised |= NETDEV_F_1GB_FD;
1599 }
1600 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1601 netdev->advertised |= NETDEV_F_10GB_FD;
1602 }
1603 if (ecmd.advertising & ADVERTISED_TP) {
1604 netdev->advertised |= NETDEV_F_COPPER;
1605 }
1606 if (ecmd.advertising & ADVERTISED_FIBRE) {
1607 netdev->advertised |= NETDEV_F_FIBER;
1608 }
1609 if (ecmd.advertising & ADVERTISED_Autoneg) {
1610 netdev->advertised |= NETDEV_F_AUTONEG;
1611 }
1612 if (ecmd.advertising & ADVERTISED_Pause) {
1613 netdev->advertised |= NETDEV_F_PAUSE;
1614 }
1615 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1616 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1617 }
1618
1619 /* Current settings. */
1620 speed = ecmd.speed;
1621 if (speed == SPEED_10) {
1622 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1623 } else if (speed == SPEED_100) {
1624 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1625 } else if (speed == SPEED_1000) {
1626 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1627 } else if (speed == SPEED_10000) {
1628 netdev->current = NETDEV_F_10GB_FD;
1629 } else if (speed == 40000) {
1630 netdev->current = NETDEV_F_40GB_FD;
1631 } else if (speed == 100000) {
1632 netdev->current = NETDEV_F_100GB_FD;
1633 } else if (speed == 1000000) {
1634 netdev->current = NETDEV_F_1TB_FD;
1635 } else {
1636 netdev->current = 0;
1637 }
1638
1639 if (ecmd.port == PORT_TP) {
1640 netdev->current |= NETDEV_F_COPPER;
1641 } else if (ecmd.port == PORT_FIBRE) {
1642 netdev->current |= NETDEV_F_FIBER;
1643 }
1644
1645 if (ecmd.autoneg) {
1646 netdev->current |= NETDEV_F_AUTONEG;
1647 }
1648
1649 /* Peer advertisements. */
1650 netdev->peer = 0; /* XXX */
1651
1652 out:
1653 netdev->cache_valid |= VALID_FEATURES;
1654 netdev->get_features_error = error;
1655 }
1656
1657 /* Stores the features supported by 'netdev' into each of '*current',
1658 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1659 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1660 * errno value. */
1661 static int
1662 netdev_linux_get_features(const struct netdev *netdev_,
1663 enum netdev_features *current,
1664 enum netdev_features *advertised,
1665 enum netdev_features *supported,
1666 enum netdev_features *peer)
1667 {
1668 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1669
1670 netdev_linux_read_features(netdev);
1671
1672 if (!netdev->get_features_error) {
1673 *current = netdev->current;
1674 *advertised = netdev->advertised;
1675 *supported = netdev->supported;
1676 *peer = netdev->peer;
1677 }
1678 return netdev->get_features_error;
1679 }
1680
1681 /* Set the features advertised by 'netdev' to 'advertise'. */
1682 static int
1683 netdev_linux_set_advertisements(struct netdev *netdev,
1684 enum netdev_features advertise)
1685 {
1686 struct ethtool_cmd ecmd;
1687 int error;
1688
1689 COVERAGE_INC(netdev_get_ethtool);
1690 memset(&ecmd, 0, sizeof ecmd);
1691 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1692 ETHTOOL_GSET, "ETHTOOL_GSET");
1693 if (error) {
1694 return error;
1695 }
1696
1697 ecmd.advertising = 0;
1698 if (advertise & NETDEV_F_10MB_HD) {
1699 ecmd.advertising |= ADVERTISED_10baseT_Half;
1700 }
1701 if (advertise & NETDEV_F_10MB_FD) {
1702 ecmd.advertising |= ADVERTISED_10baseT_Full;
1703 }
1704 if (advertise & NETDEV_F_100MB_HD) {
1705 ecmd.advertising |= ADVERTISED_100baseT_Half;
1706 }
1707 if (advertise & NETDEV_F_100MB_FD) {
1708 ecmd.advertising |= ADVERTISED_100baseT_Full;
1709 }
1710 if (advertise & NETDEV_F_1GB_HD) {
1711 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1712 }
1713 if (advertise & NETDEV_F_1GB_FD) {
1714 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1715 }
1716 if (advertise & NETDEV_F_10GB_FD) {
1717 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1718 }
1719 if (advertise & NETDEV_F_COPPER) {
1720 ecmd.advertising |= ADVERTISED_TP;
1721 }
1722 if (advertise & NETDEV_F_FIBER) {
1723 ecmd.advertising |= ADVERTISED_FIBRE;
1724 }
1725 if (advertise & NETDEV_F_AUTONEG) {
1726 ecmd.advertising |= ADVERTISED_Autoneg;
1727 }
1728 if (advertise & NETDEV_F_PAUSE) {
1729 ecmd.advertising |= ADVERTISED_Pause;
1730 }
1731 if (advertise & NETDEV_F_PAUSE_ASYM) {
1732 ecmd.advertising |= ADVERTISED_Asym_Pause;
1733 }
1734 COVERAGE_INC(netdev_set_ethtool);
1735 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1736 ETHTOOL_SSET, "ETHTOOL_SSET");
1737 }
1738
1739 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1740 * successful, otherwise a positive errno value. */
1741 static int
1742 netdev_linux_set_policing(struct netdev *netdev_,
1743 uint32_t kbits_rate, uint32_t kbits_burst)
1744 {
1745 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1746 const char *netdev_name = netdev_get_name(netdev_);
1747 int error;
1748
1749
1750 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1751 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1752 : kbits_burst); /* Stick with user-specified value. */
1753
1754 if (netdev->cache_valid & VALID_POLICING) {
1755 if (netdev->netdev_policing_error) {
1756 return netdev->netdev_policing_error;
1757 }
1758
1759 if (netdev->kbits_rate == kbits_rate &&
1760 netdev->kbits_burst == kbits_burst) {
1761 /* Assume that settings haven't changed since we last set them. */
1762 return 0;
1763 }
1764 netdev->cache_valid &= ~VALID_POLICING;
1765 }
1766
1767 COVERAGE_INC(netdev_set_policing);
1768 /* Remove any existing ingress qdisc. */
1769 error = tc_add_del_ingress_qdisc(netdev_, false);
1770 if (error) {
1771 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1772 netdev_name, ovs_strerror(error));
1773 goto out;
1774 }
1775
1776 if (kbits_rate) {
1777 error = tc_add_del_ingress_qdisc(netdev_, true);
1778 if (error) {
1779 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1780 netdev_name, ovs_strerror(error));
1781 goto out;
1782 }
1783
1784 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1785 if (error){
1786 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1787 netdev_name, ovs_strerror(error));
1788 goto out;
1789 }
1790 }
1791
1792 netdev->kbits_rate = kbits_rate;
1793 netdev->kbits_burst = kbits_burst;
1794
1795 out:
1796 if (!error || error == ENODEV) {
1797 netdev->netdev_policing_error = error;
1798 netdev->cache_valid |= VALID_POLICING;
1799 }
1800 return error;
1801 }
1802
1803 static int
1804 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1805 struct sset *types)
1806 {
1807 const struct tc_ops *const *opsp;
1808
1809 for (opsp = tcs; *opsp != NULL; opsp++) {
1810 const struct tc_ops *ops = *opsp;
1811 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1812 sset_add(types, ops->ovs_name);
1813 }
1814 }
1815 return 0;
1816 }
1817
1818 static const struct tc_ops *
1819 tc_lookup_ovs_name(const char *name)
1820 {
1821 const struct tc_ops *const *opsp;
1822
1823 for (opsp = tcs; *opsp != NULL; opsp++) {
1824 const struct tc_ops *ops = *opsp;
1825 if (!strcmp(name, ops->ovs_name)) {
1826 return ops;
1827 }
1828 }
1829 return NULL;
1830 }
1831
1832 static const struct tc_ops *
1833 tc_lookup_linux_name(const char *name)
1834 {
1835 const struct tc_ops *const *opsp;
1836
1837 for (opsp = tcs; *opsp != NULL; opsp++) {
1838 const struct tc_ops *ops = *opsp;
1839 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1840 return ops;
1841 }
1842 }
1843 return NULL;
1844 }
1845
1846 static struct tc_queue *
1847 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1848 size_t hash)
1849 {
1850 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1851 struct tc_queue *queue;
1852
1853 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1854 if (queue->queue_id == queue_id) {
1855 return queue;
1856 }
1857 }
1858 return NULL;
1859 }
1860
/* Returns the queue of 'netdev' whose ID is 'queue_id', or NULL if there is
 * none.  The hash bucket is derived from 'queue_id' with hash_int(). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    const size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1866
1867 static int
1868 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1869 const char *type,
1870 struct netdev_qos_capabilities *caps)
1871 {
1872 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1873 if (!ops) {
1874 return EOPNOTSUPP;
1875 }
1876 caps->n_queues = ops->n_queues;
1877 return 0;
1878 }
1879
1880 static int
1881 netdev_linux_get_qos(const struct netdev *netdev_,
1882 const char **typep, struct smap *details)
1883 {
1884 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1885 int error;
1886
1887 error = tc_query_qdisc(netdev_);
1888 if (error) {
1889 return error;
1890 }
1891
1892 *typep = netdev->tc->ops->ovs_name;
1893 return (netdev->tc->ops->qdisc_get
1894 ? netdev->tc->ops->qdisc_get(netdev_, details)
1895 : 0);
1896 }
1897
1898 static int
1899 netdev_linux_set_qos(struct netdev *netdev_,
1900 const char *type, const struct smap *details)
1901 {
1902 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1903 const struct tc_ops *new_ops;
1904 int error;
1905
1906 new_ops = tc_lookup_ovs_name(type);
1907 if (!new_ops || !new_ops->tc_install) {
1908 return EOPNOTSUPP;
1909 }
1910
1911 error = tc_query_qdisc(netdev_);
1912 if (error) {
1913 return error;
1914 }
1915
1916 if (new_ops == netdev->tc->ops) {
1917 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1918 } else {
1919 /* Delete existing qdisc. */
1920 error = tc_del_qdisc(netdev_);
1921 if (error) {
1922 return error;
1923 }
1924 ovs_assert(netdev->tc == NULL);
1925
1926 /* Install new qdisc. */
1927 error = new_ops->tc_install(netdev_, details);
1928 ovs_assert((error == 0) == (netdev->tc != NULL));
1929
1930 return error;
1931 }
1932 }
1933
1934 static int
1935 netdev_linux_get_queue(const struct netdev *netdev_,
1936 unsigned int queue_id, struct smap *details)
1937 {
1938 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1939 int error;
1940
1941 error = tc_query_qdisc(netdev_);
1942 if (error) {
1943 return error;
1944 } else {
1945 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1946 return (queue
1947 ? netdev->tc->ops->class_get(netdev_, queue, details)
1948 : ENOENT);
1949 }
1950 }
1951
1952 static int
1953 netdev_linux_set_queue(struct netdev *netdev_,
1954 unsigned int queue_id, const struct smap *details)
1955 {
1956 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1957 int error;
1958
1959 error = tc_query_qdisc(netdev_);
1960 if (error) {
1961 return error;
1962 } else if (queue_id >= netdev->tc->ops->n_queues
1963 || !netdev->tc->ops->class_set) {
1964 return EINVAL;
1965 }
1966
1967 return netdev->tc->ops->class_set(netdev_, queue_id, details);
1968 }
1969
1970 static int
1971 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1972 {
1973 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1974 int error;
1975
1976 error = tc_query_qdisc(netdev_);
1977 if (error) {
1978 return error;
1979 } else if (!netdev->tc->ops->class_delete) {
1980 return EINVAL;
1981 } else {
1982 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1983 return (queue
1984 ? netdev->tc->ops->class_delete(netdev_, queue)
1985 : ENOENT);
1986 }
1987 }
1988
1989 static int
1990 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1991 unsigned int queue_id,
1992 struct netdev_queue_stats *stats)
1993 {
1994 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1995 int error;
1996
1997 error = tc_query_qdisc(netdev_);
1998 if (error) {
1999 return error;
2000 } else if (!netdev->tc->ops->class_get_stats) {
2001 return EOPNOTSUPP;
2002 } else {
2003 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2004 if (!queue) {
2005 return ENOENT;
2006 }
2007 stats->created = queue->created;
2008 return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
2009 }
2010 }
2011
2012 static bool
2013 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2014 {
2015 struct ofpbuf request;
2016 struct tcmsg *tcmsg;
2017
2018 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2019 if (!tcmsg) {
2020 return false;
2021 }
2022 tcmsg->tcm_parent = 0;
2023 nl_dump_start(dump, NETLINK_ROUTE, &request);
2024 ofpbuf_uninit(&request);
2025 return true;
2026 }
2027
2028 static int
2029 netdev_linux_dump_queues(const struct netdev *netdev_,
2030 netdev_dump_queues_cb *cb, void *aux)
2031 {
2032 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2033 struct tc_queue *queue, *next_queue;
2034 struct smap details;
2035 int last_error;
2036 int error;
2037
2038 error = tc_query_qdisc(netdev_);
2039 if (error) {
2040 return error;
2041 } else if (!netdev->tc->ops->class_get) {
2042 return EOPNOTSUPP;
2043 }
2044
2045 last_error = 0;
2046 smap_init(&details);
2047 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2048 &netdev->tc->queues) {
2049 smap_clear(&details);
2050
2051 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2052 if (!error) {
2053 (*cb)(queue->queue_id, &details, aux);
2054 } else {
2055 last_error = error;
2056 }
2057 }
2058 smap_destroy(&details);
2059
2060 return last_error;
2061 }
2062
2063 static int
2064 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2065 netdev_dump_queue_stats_cb *cb, void *aux)
2066 {
2067 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2068 struct nl_dump dump;
2069 struct ofpbuf msg;
2070 int last_error;
2071 int error;
2072
2073 error = tc_query_qdisc(netdev_);
2074 if (error) {
2075 return error;
2076 } else if (!netdev->tc->ops->class_dump_stats) {
2077 return EOPNOTSUPP;
2078 }
2079
2080 last_error = 0;
2081 if (!start_queue_dump(netdev_, &dump)) {
2082 return ENODEV;
2083 }
2084 while (nl_dump_next(&dump, &msg)) {
2085 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2086 if (error) {
2087 last_error = error;
2088 }
2089 }
2090
2091 error = nl_dump_done(&dump);
2092 return error ? error : last_error;
2093 }
2094
2095 static int
2096 netdev_linux_get_in4(const struct netdev *netdev_,
2097 struct in_addr *address, struct in_addr *netmask)
2098 {
2099 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2100
2101 if (!(netdev->cache_valid & VALID_IN4)) {
2102 int error;
2103
2104 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2105 SIOCGIFADDR, "SIOCGIFADDR");
2106 if (error) {
2107 return error;
2108 }
2109
2110 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2111 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2112 if (error) {
2113 return error;
2114 }
2115
2116 netdev->cache_valid |= VALID_IN4;
2117 }
2118 *address = netdev->address;
2119 *netmask = netdev->netmask;
2120 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2121 }
2122
2123 static int
2124 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2125 struct in_addr netmask)
2126 {
2127 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2128 int error;
2129
2130 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2131 if (!error) {
2132 netdev->cache_valid |= VALID_IN4;
2133 netdev->address = address;
2134 netdev->netmask = netmask;
2135 if (address.s_addr != INADDR_ANY) {
2136 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2137 "SIOCSIFNETMASK", netmask);
2138 }
2139 }
2140 return error;
2141 }
2142
/* Parses 'line' as: optional white space, 32 hexadecimal digits (a 16-byte
 * IPv6 address), four hexadecimal fields that are ignored, and an interface
 * name of at most 16 characters.
 *
 * On success stores the address in '*in6' and the null-terminated name in
 * 'ifname' and returns true.  Returns false if any field fails to parse, in
 * which case '*in6' and 'ifname' may have been partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    int n_fields;

    /* One address byte: exactly two hex digits.  The macro is local to this
     * function and is undefined again below so that it does not leak into
     * the rest of the file. */
#define X8 "%2" SCNx8
    n_fields = sscanf(line,
                      " " X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &s6[0], &s6[1], &s6[2], &s6[3],
                      &s6[4], &s6[5], &s6[6], &s6[7],
                      &s6[8], &s6[9], &s6[10], &s6[11],
                      &s6[12], &s6[13], &s6[14], &s6[15],
                      ifname);
#undef X8

    /* 16 address bytes plus the interface name; "%*x" fields do not count. */
    return n_fields == 17;
}
2158
2159 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2160 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2161 static int
2162 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2163 {
2164 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2165 if (!(netdev->cache_valid & VALID_IN6)) {
2166 FILE *file;
2167 char line[128];
2168
2169 netdev->in6 = in6addr_any;
2170
2171 file = fopen("/proc/net/if_inet6", "r");
2172 if (file != NULL) {
2173 const char *name = netdev_get_name(netdev_);
2174 while (fgets(line, sizeof line, file)) {
2175 struct in6_addr in6_tmp;
2176 char ifname[16 + 1];
2177 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2178 && !strcmp(name, ifname))
2179 {
2180 netdev->in6 = in6_tmp;
2181 break;
2182 }
2183 }
2184 fclose(file);
2185 }
2186 netdev->cache_valid |= VALID_IN6;
2187 }
2188 *in6 = netdev->in6;
2189 return 0;
2190 }
2191
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  All
 * bytes of '*sa' are zeroed first, then sizeof(struct sockaddr_in) bytes are
 * copied over them. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2204
2205 static int
2206 do_set_addr(struct netdev *netdev,
2207 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2208 {
2209 struct ifreq ifr;
2210 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2211 make_in4_sockaddr(&ifr.ifr_addr, addr);
2212
2213 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2214 ioctl_name);
2215 }
2216
2217 /* Adds 'router' as a default IP gateway. */
2218 static int
2219 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2220 {
2221 struct in_addr any = { INADDR_ANY };
2222 struct rtentry rt;
2223 int error;
2224
2225 memset(&rt, 0, sizeof rt);
2226 make_in4_sockaddr(&rt.rt_dst, any);
2227 make_in4_sockaddr(&rt.rt_gateway, router);
2228 make_in4_sockaddr(&rt.rt_genmask, any);
2229 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2230 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2231 if (error) {
2232 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2233 }
2234 return error;
2235 }
2236
2237 static int
2238 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2239 char **netdev_name)
2240 {
2241 static const char fn[] = "/proc/net/route";
2242 FILE *stream;
2243 char line[256];
2244 int ln;
2245
2246 *netdev_name = NULL;
2247 stream = fopen(fn, "r");
2248 if (stream == NULL) {
2249 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2250 return errno;
2251 }
2252
2253 ln = 0;
2254 while (fgets(line, sizeof line, stream)) {
2255 if (++ln >= 2) {
2256 char iface[17];
2257 ovs_be32 dest, gateway, mask;
2258 int refcnt, metric, mtu;
2259 unsigned int flags, use, window, irtt;
2260
2261 if (sscanf(line,
2262 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2263 " %d %u %u\n",
2264 iface, &dest, &gateway, &flags, &refcnt,
2265 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2266
2267 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2268 fn, ln, line);
2269 continue;
2270 }
2271 if (!(flags & RTF_UP)) {
2272 /* Skip routes that aren't up. */
2273 continue;
2274 }
2275
2276 /* The output of 'dest', 'mask', and 'gateway' were given in
2277 * network byte order, so we don't need need any endian
2278 * conversions here. */
2279 if ((dest & mask) == (host->s_addr & mask)) {
2280 if (!gateway) {
2281 /* The host is directly reachable. */
2282 next_hop->s_addr = 0;
2283 } else {
2284 /* To reach the host, we must go through a gateway. */
2285 next_hop->s_addr = gateway;
2286 }
2287 *netdev_name = xstrdup(iface);
2288 fclose(stream);
2289 return 0;
2290 }
2291 }
2292 }
2293
2294 fclose(stream);
2295 return ENXIO;
2296 }
2297
2298 static int
2299 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2300 {
2301 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2302 int error = 0;
2303
2304 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2305 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2306
2307 COVERAGE_INC(netdev_get_ethtool);
2308 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2309 error = netdev_linux_do_ethtool(netdev->up.name,
2310 cmd,
2311 ETHTOOL_GDRVINFO,
2312 "ETHTOOL_GDRVINFO");
2313 if (!error) {
2314 netdev->cache_valid |= VALID_DRVINFO;
2315 }
2316 }
2317
2318 if (!error) {
2319 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2320 smap_add(smap, "driver_version", netdev->drvinfo.version);
2321 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2322 }
2323 return error;
2324 }
2325
2326 static int
2327 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2328 struct smap *smap)
2329 {
2330 smap_add(smap, "driver_name", "openvswitch");
2331 return 0;
2332 }
2333
2334 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2335 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2336 * returns 0. Otherwise, it returns a positive errno value; in particular,
2337 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2338 static int
2339 netdev_linux_arp_lookup(const struct netdev *netdev,
2340 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2341 {
2342 struct arpreq r;
2343 struct sockaddr_in sin;
2344 int retval;
2345
2346 memset(&r, 0, sizeof r);
2347 memset(&sin, 0, sizeof sin);
2348 sin.sin_family = AF_INET;
2349 sin.sin_addr.s_addr = ip;
2350 sin.sin_port = 0;
2351 memcpy(&r.arp_pa, &sin, sizeof sin);
2352 r.arp_ha.sa_family = ARPHRD_ETHER;
2353 r.arp_flags = 0;
2354 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2355 COVERAGE_INC(netdev_arp_lookup);
2356 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2357 if (!retval) {
2358 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2359 } else if (retval != ENXIO) {
2360 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2361 netdev_get_name(netdev), IP_ARGS(ip),
2362 ovs_strerror(retval));
2363 }
2364 return retval;
2365 }
2366
2367 static int
2368 nd_to_iff_flags(enum netdev_flags nd)
2369 {
2370 int iff = 0;
2371 if (nd & NETDEV_UP) {
2372 iff |= IFF_UP;
2373 }
2374 if (nd & NETDEV_PROMISC) {
2375 iff |= IFF_PROMISC;
2376 }
2377 return iff;
2378 }
2379
2380 static int
2381 iff_to_nd_flags(int iff)
2382 {
2383 enum netdev_flags nd = 0;
2384 if (iff & IFF_UP) {
2385 nd |= NETDEV_UP;
2386 }
2387 if (iff & IFF_PROMISC) {
2388 nd |= NETDEV_PROMISC;
2389 }
2390 return nd;
2391 }
2392
2393 static int
2394 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2395 enum netdev_flags on, enum netdev_flags *old_flagsp)
2396 {
2397 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2398 int old_flags, new_flags;
2399 int error = 0;
2400
2401 old_flags = netdev->ifi_flags;
2402 *old_flagsp = iff_to_nd_flags(old_flags);
2403 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2404 if (new_flags != old_flags) {
2405 error = set_flags(netdev_get_name(netdev_), new_flags);
2406 get_flags(netdev_, &netdev->ifi_flags);
2407 }
2408 return error;
2409 }
2410
2411 static unsigned int
2412 netdev_linux_change_seq(const struct netdev *netdev)
2413 {
2414 return netdev_linux_cast(netdev)->change_seq;
2415 }
2416
/* Expands to a positional, brace-enclosed initializer used below to define
 * the 'struct netdev_class' instances of this file.
 *
 * The macro arguments fill the slots that differ between the classes:
 * NAME (first member), CREATE, GET_STATS, SET_STATS, GET_FEATURES and
 * GET_STATUS.  Every other slot is either NULL or one of the shared
 * netdev_linux_*() functions.  Because the initializer is positional, the
 * entry order must match the member order of the structure it initializes. */
2417 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2418 GET_FEATURES, GET_STATUS) \
2419 { \
2420 NAME, \
2421 \
2422 netdev_linux_init, \
2423 netdev_linux_run, \
2424 netdev_linux_wait, \
2425 \
2426 CREATE, \
2427 netdev_linux_destroy, \
2428 NULL, /* get_config */ \
2429 NULL, /* set_config */ \
2430 NULL, /* get_tunnel_config */ \
2431 \
2432 netdev_linux_rx_open, \
2433 \
2434 netdev_linux_send, \
2435 netdev_linux_send_wait, \
2436 \
2437 netdev_linux_set_etheraddr, \
2438 netdev_linux_get_etheraddr, \
2439 netdev_linux_get_mtu, \
2440 netdev_linux_set_mtu, \
2441 netdev_linux_get_ifindex, \
2442 netdev_linux_get_carrier, \
2443 netdev_linux_get_carrier_resets, \
2444 netdev_linux_set_miimon_interval, \
2445 GET_STATS, \
2446 SET_STATS, \
2447 \
2448 GET_FEATURES, \
2449 netdev_linux_set_advertisements, \
2450 \
2451 netdev_linux_set_policing, \
2452 netdev_linux_get_qos_types, \
2453 netdev_linux_get_qos_capabilities, \
2454 netdev_linux_get_qos, \
2455 netdev_linux_set_qos, \
2456 netdev_linux_get_queue, \
2457 netdev_linux_set_queue, \
2458 netdev_linux_delete_queue, \
2459 netdev_linux_get_queue_stats, \
2460 netdev_linux_dump_queues, \
2461 netdev_linux_dump_queue_stats, \
2462 \
2463 netdev_linux_get_in4, \
2464 netdev_linux_set_in4, \
2465 netdev_linux_get_in6, \
2466 netdev_linux_add_router, \
2467 netdev_linux_get_next_hop, \
2468 GET_STATUS, \
2469 netdev_linux_arp_lookup, \
2470 \
2471 netdev_linux_update_flags, \
2472 \
2473 netdev_linux_change_seq \
2474 }
2475
/* The "system" netdev class: created by netdev_linux_create(), with
 * statistics from netdev_linux_get_stats() and no set_stats operation. */
2476 const struct netdev_class netdev_linux_class =
2477 NETDEV_LINUX_CLASS(
2478 "system",
2479 netdev_linux_create,
2480 netdev_linux_get_stats,
2481 NULL, /* set_stats */
2482 netdev_linux_get_features,
2483 netdev_linux_get_status);
2484
/* The "tap" netdev class: created by netdev_linux_create_tap(), with
 * statistics from netdev_tap_get_stats() and no set_stats operation.  It
 * shares the features and status callbacks of the "system" class. */
2485 const struct netdev_class netdev_tap_class =
2486 NETDEV_LINUX_CLASS(
2487 "tap",
2488 netdev_linux_create_tap,
2489 netdev_tap_get_stats,
2490 NULL, /* set_stats */
2491 netdev_linux_get_features,
2492 netdev_linux_get_status);
2493
/* The "internal" netdev class: created by netdev_linux_create(), with both
 * get_stats and set_stats operations, no get_features operation, and a
 * status callback that reports the "openvswitch" driver name. */
2494 const struct netdev_class netdev_internal_class =
2495 NETDEV_LINUX_CLASS(
2496 "internal",
2497 netdev_linux_create,
2498 netdev_internal_get_stats,
2499 netdev_internal_set_stats,
2500 NULL, /* get_features */
2501 netdev_internal_get_status);
2502
/* Receive-side operations for Linux netdevs.  The initializer is positional;
 * judging by the function names the slots are destroy, recv, wait and drain
 * (NOTE(review): confirm against the declaration of struct
 * netdev_rx_class). */
2503 static const struct netdev_rx_class netdev_rx_linux_class = {
2504 netdev_rx_linux_destroy,
2505 netdev_rx_linux_recv,
2506 netdev_rx_linux_wait,
2507 netdev_rx_linux_drain,
2508 };
2509 \f
2510 /* HTB traffic control class. */
2511
/* Number of queues advertised for HTB (the 'n_queues' entry of tc_ops_htb).
 * Class handles 1:1 through 1:HTB_N_QUEUES are treated as queues, with
 * queue ID = minor number - 1. */
2512 #define HTB_N_QUEUES 0xf000
2513
/* Per-netdev state of an HTB qdisc.  Embeds the generic 'struct tc'; the
 * enclosing structure is recovered from a 'struct tc *' with CONTAINER_OF
 * (see htb_get__()). */
2514 struct htb {
2515 struct tc tc;
2516 unsigned int max_rate; /* In bytes/s. */
2517 };
2518
/* Configuration of one HTB class (queue).  Embeds the generic
 * 'struct tc_queue'; the enclosing structure is recovered from a
 * 'struct tc_queue *' with CONTAINER_OF (see htb_class_cast__()). */
2519 struct htb_class {
2520 struct tc_queue tc_queue;
2521 unsigned int min_rate; /* In bytes/s. */
2522 unsigned int max_rate; /* In bytes/s. */
2523 unsigned int burst; /* In bytes. */
2524 unsigned int priority; /* Lower values are higher priorities. */
2525 };
2526
2527 static struct htb *
2528 htb_get__(const struct netdev *netdev_)
2529 {
2530 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2531 return CONTAINER_OF(netdev->tc, struct htb, tc);
2532 }
2533
2534 static void
2535 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2536 {
2537 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2538 struct htb *htb;
2539
2540 htb = xmalloc(sizeof *htb);
2541 tc_init(&htb->tc, &tc_ops_htb);
2542 htb->max_rate = max_rate;
2543
2544 netdev->tc = &htb->tc;
2545 }
2546
2547 /* Create an HTB qdisc.
2548 *
2549 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2550 static int
2551 htb_setup_qdisc__(struct netdev *netdev)
2552 {
2553 size_t opt_offset;
2554 struct tc_htb_glob opt;
2555 struct ofpbuf request;
2556 struct tcmsg *tcmsg;
2557
2558 tc_del_qdisc(netdev);
2559
2560 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2561 NLM_F_EXCL | NLM_F_CREATE, &request);
2562 if (!tcmsg) {
2563 return ENODEV;
2564 }
2565 tcmsg->tcm_handle = tc_make_handle(1, 0);
2566 tcmsg->tcm_parent = TC_H_ROOT;
2567
2568 nl_msg_put_string(&request, TCA_KIND, "htb");
2569
2570 memset(&opt, 0, sizeof opt);
2571 opt.rate2quantum = 10;
2572 opt.version = 3;
2573 opt.defcls = 1;
2574
2575 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2576 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2577 nl_msg_end_nested(&request, opt_offset);
2578
2579 return tc_transact(&request, NULL);
2580 }
2581
2582 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2583 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2584 static int
2585 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2586 unsigned int parent, struct htb_class *class)
2587 {
2588 size_t opt_offset;
2589 struct tc_htb_opt opt;
2590 struct ofpbuf request;
2591 struct tcmsg *tcmsg;
2592 int error;
2593 int mtu;
2594
2595 error = netdev_get_mtu(netdev, &mtu);
2596 if (error) {
2597 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2598 netdev_get_name(netdev));
2599 return error;
2600 }
2601
2602 memset(&opt, 0, sizeof opt);
2603 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2604 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2605 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2606 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2607 opt.prio = class->priority;
2608
2609 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2610 if (!tcmsg) {
2611 return ENODEV;
2612 }
2613 tcmsg->tcm_handle = handle;
2614 tcmsg->tcm_parent = parent;
2615
2616 nl_msg_put_string(&request, TCA_KIND, "htb");
2617 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2618 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2619 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2620 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2621 nl_msg_end_nested(&request, opt_offset);
2622
2623 error = tc_transact(&request, NULL);
2624 if (error) {
2625 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2626 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2627 netdev_get_name(netdev),
2628 tc_get_major(handle), tc_get_minor(handle),
2629 tc_get_major(parent), tc_get_minor(parent),
2630 class->min_rate, class->max_rate,
2631 class->burst, class->priority, ovs_strerror(error));
2632 }
2633 return error;
2634 }
2635
2636 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2637 * description of them into 'details'. The description complies with the
2638 * specification given in the vswitch database documentation for linux-htb
2639 * queue details. */
2640 static int
2641 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2642 {
2643 static const struct nl_policy tca_htb_policy[] = {
2644 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2645 .min_len = sizeof(struct tc_htb_opt) },
2646 };
2647
2648 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2649 const struct tc_htb_opt *htb;
2650
2651 if (!nl_parse_nested(nl_options, tca_htb_policy,
2652 attrs, ARRAY_SIZE(tca_htb_policy))) {
2653 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2654 return EPROTO;
2655 }
2656
2657 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2658 class->min_rate = htb->rate.rate;
2659 class->max_rate = htb->ceil.rate;
2660 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2661 class->priority = htb->prio;
2662 return 0;
2663 }
2664
2665 static int
2666 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2667 struct htb_class *options,
2668 struct netdev_queue_stats *stats)
2669 {
2670 struct nlattr *nl_options;
2671 unsigned int handle;
2672 int error;
2673
2674 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2675 if (!error && queue_id) {
2676 unsigned int major = tc_get_major(handle);
2677 unsigned int minor = tc_get_minor(handle);
2678 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2679 *queue_id = minor - 1;
2680 } else {
2681 error = EPROTO;
2682 }
2683 }
2684 if (!error && options) {
2685 error = htb_parse_tca_options__(nl_options, options);
2686 }
2687 return error;
2688 }
2689
2690 static void
2691 htb_parse_qdisc_details__(struct netdev *netdev,
2692 const struct smap *details, struct htb_class *hc)
2693 {
2694 const char *max_rate_s;
2695
2696 max_rate_s = smap_get(details, "max-rate");
2697 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2698 if (!hc->max_rate) {
2699 enum netdev_features current;
2700
2701 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2702 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2703 }
2704 hc->min_rate = hc->max_rate;
2705 hc->burst = 0;
2706 hc->priority = 0;
2707 }
2708
2709 static int
2710 htb_parse_class_details__(struct netdev *netdev,
2711 const struct smap *details, struct htb_class *hc)
2712 {
2713 const struct htb *htb = htb_get__(netdev);
2714 const char *min_rate_s = smap_get(details, "min-rate");
2715 const char *max_rate_s = smap_get(details, "max-rate");
2716 const char *burst_s = smap_get(details, "burst");
2717 const char *priority_s = smap_get(details, "priority");
2718 int mtu, error;
2719
2720 error = netdev_get_mtu(netdev, &mtu);
2721 if (error) {
2722 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2723 netdev_get_name(netdev));
2724 return error;
2725 }
2726
2727 /* HTB requires at least an mtu sized min-rate to send any traffic even
2728 * on uncongested links. */
2729 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2730 hc->min_rate = MAX(hc->min_rate, mtu);
2731 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2732
2733 /* max-rate */
2734 hc->max_rate = (max_rate_s
2735 ? strtoull(max_rate_s, NULL, 10) / 8
2736 : htb->max_rate);
2737 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2738 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2739
2740 /* burst
2741 *
2742 * According to hints in the documentation that I've read, it is important
2743 * that 'burst' be at least as big as the largest frame that might be
2744 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2745 * but having it a bit too small is a problem. Since netdev_get_mtu()
2746 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2747 * the MTU. We actually add 64, instead of 14, as a guard against
2748 * additional headers get tacked on somewhere that we're not aware of. */
2749 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2750 hc->burst = MAX(hc->burst, mtu + 64);
2751
2752 /* priority */
2753 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2754
2755 return 0;
2756 }
2757
/* Queries the kernel for class 'handle' (with parent 'parent') on 'netdev'
 * and parses the reply into 'options' and 'stats', each of which is passed
 * through to htb_parse_tcmsg__() and may be null there.
 *
 * Returns 0 if successful, otherwise the error from the query or from
 * parsing the reply.  The reply buffer is always released. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2773
2774 static int
2775 htb_tc_install(struct netdev *netdev, const struct smap *details)
2776 {
2777 int error;
2778
2779 error = htb_setup_qdisc__(netdev);
2780 if (!error) {
2781 struct htb_class hc;
2782
2783 htb_parse_qdisc_details__(netdev, details, &hc);
2784 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2785 tc_make_handle(1, 0), &hc);
2786 if (!error) {
2787 htb_install__(netdev, hc.max_rate);
2788 }
2789 }
2790 return error;
2791 }
2792
2793 static struct htb_class *
2794 htb_class_cast__(const struct tc_queue *queue)
2795 {
2796 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2797 }
2798
2799 static void
2800 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2801 const struct htb_class *hc)
2802 {
2803 struct htb *htb = htb_get__(netdev);
2804 size_t hash = hash_int(queue_id, 0);
2805 struct tc_queue *queue;
2806 struct htb_class *hcp;
2807
2808 queue = tc_find_queue__(netdev, queue_id, hash);
2809 if (queue) {
2810 hcp = htb_class_cast__(queue);
2811 } else {
2812 hcp = xmalloc(sizeof *hcp);
2813 queue = &hcp->tc_queue;
2814 queue->queue_id = queue_id;
2815 queue->created = time_msec();
2816 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2817 }
2818
2819 hcp->min_rate = hc->min_rate;
2820 hcp->max_rate = hc->max_rate;
2821 hcp->burst = hc->burst;
2822 hcp->priority = hc->priority;
2823 }
2824
2825 static int
2826 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2827 {
2828 struct ofpbuf msg;
2829 struct nl_dump dump;
2830 struct htb_class hc;
2831
2832 /* Get qdisc options. */
2833 hc.max_rate = 0;
2834 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2835 htb_install__(netdev, hc.max_rate);
2836
2837 /* Get queues. */
2838 if (!start_queue_dump(netdev, &dump)) {
2839 return ENODEV;
2840 }
2841 while (nl_dump_next(&dump, &msg)) {
2842 unsigned int queue_id;
2843
2844 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2845 htb_update_queue__(netdev, queue_id, &hc);
2846 }
2847 }
2848 nl_dump_done(&dump);
2849
2850 return 0;
2851 }
2852
2853 static void
2854 htb_tc_destroy(struct tc *tc)
2855 {
2856 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2857 struct htb_class *hc, *next;
2858
2859 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2860 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2861 free(hc);
2862 }
2863 tc_destroy(tc);
2864 free(htb);
2865 }
2866
2867 static int
2868 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2869 {
2870 const struct htb *htb = htb_get__(netdev);
2871 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
2872 return 0;
2873 }
2874
2875 static int
2876 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2877 {
2878 struct htb_class hc;
2879 int error;
2880
2881 htb_parse_qdisc_details__(netdev, details, &hc);
2882 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2883 tc_make_handle(1, 0), &hc);
2884 if (!error) {
2885 htb_get__(netdev)->max_rate = hc.max_rate;
2886 }
2887 return error;
2888 }
2889
2890 static int
2891 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2892 const struct tc_queue *queue, struct smap *details)
2893 {
2894 const struct htb_class *hc = htb_class_cast__(queue);
2895
2896 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2897 if (hc->min_rate != hc->max_rate) {
2898 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2899 }
2900 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2901 if (hc->priority) {
2902 smap_add_format(details, "priority", "%u", hc->priority);
2903 }
2904 return 0;
2905 }
2906
2907 static int
2908 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2909 const struct smap *details)
2910 {
2911 struct htb_class hc;
2912 int error;
2913
2914 error = htb_parse_class_details__(netdev, details, &hc);
2915 if (error) {
2916 return error;
2917 }
2918
2919 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2920 tc_make_handle(1, 0xfffe), &hc);
2921 if (error) {
2922 return error;
2923 }
2924
2925 htb_update_queue__(netdev, queue_id, &hc);
2926 return 0;
2927 }
2928
2929 static int
2930 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2931 {
2932 struct htb_class *hc = htb_class_cast__(queue);
2933 struct htb *htb = htb_get__(netdev);
2934 int error;
2935
2936 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2937 if (!error) {
2938 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2939 free(hc);
2940 }
2941 return error;
2942 }
2943
2944 static int
2945 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2946 struct netdev_queue_stats *stats)
2947 {
2948 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2949 tc_make_handle(1, 0xfffe), NULL, stats);
2950 }
2951
2952 static int
2953 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2954 const struct ofpbuf *nlmsg,
2955 netdev_dump_queue_stats_cb *cb, void *aux)
2956 {
2957 struct netdev_queue_stats stats;
2958 unsigned int handle, major, minor;
2959 int error;
2960
2961 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2962 if (error) {
2963 return error;
2964 }
2965
2966 major = tc_get_major(handle);
2967 minor = tc_get_minor(handle);
2968 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2969 (*cb)(minor - 1, &stats, aux);
2970 }
2971 return 0;
2972 }
2973
/* Operations table for the HTB qdisc (Linux name "htb", OVS name
 * "linux-htb").  The initializer is positional, so the entries must stay in
 * the member order of 'struct tc_ops'. */
2974 static const struct tc_ops tc_ops_htb = {
2975 "htb", /* linux_name */
2976 "linux-htb", /* ovs_name */
2977 HTB_N_QUEUES, /* n_queues */
2978 htb_tc_install,
2979 htb_tc_load,
2980 htb_tc_destroy,
2981 htb_qdisc_get,
2982 htb_qdisc_set,
2983 htb_class_get,
2984 htb_class_set,
2985 htb_class_delete,
2986 htb_class_get_stats,
2987 htb_class_dump_stats
2988 };
2989 \f
2990 /* "linux-hfsc" traffic control class. */
2991
/* Upper bound on HFSC class minor numbers that are treated as queues: class
 * handles 1:1 through 1:HFSC_N_QUEUES map to queue ID = minor number - 1. */
2992 #define HFSC_N_QUEUES 0xf000
2993
/* Per-netdev state of an HFSC qdisc.  Embeds the generic 'struct tc'; the
 * enclosing structure is recovered from a 'struct tc *' with CONTAINER_OF
 * (see hfsc_get__()). */
2994 struct hfsc {
2995 struct tc tc;
2996 uint32_t max_rate; /* Presumably bytes/s, as for HTB -- TODO confirm. */
2997 };
2998
/* Configuration of one HFSC class (queue).  Embeds the generic
 * 'struct tc_queue'; the enclosing structure is recovered with CONTAINER_OF
 * (see hfsc_class_cast__()).  The rates are what is sent to and read from
 * the kernel as the 'm2' member of the HFSC service curves. */
2999 struct hfsc_class {
3000 struct tc_queue tc_queue;
3001 uint32_t min_rate;
3002 uint32_t max_rate;
3003 };
3004
3005 static struct hfsc *
3006 hfsc_get__(const struct netdev *netdev_)
3007 {
3008 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3009 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3010 }
3011
3012 static struct hfsc_class *
3013 hfsc_class_cast__(const struct tc_queue *queue)
3014 {
3015 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3016 }
3017
3018 static void
3019 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3020 {
3021 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3022 struct hfsc *hfsc;
3023
3024 hfsc = xmalloc(sizeof *hfsc);
3025 tc_init(&hfsc->tc, &tc_ops_hfsc);
3026 hfsc->max_rate = max_rate;
3027 netdev->tc = &hfsc->tc;
3028 }
3029
3030 static void
3031 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3032 const struct hfsc_class *hc)
3033 {
3034 size_t hash;
3035 struct hfsc *hfsc;
3036 struct hfsc_class *hcp;
3037 struct tc_queue *queue;
3038
3039 hfsc = hfsc_get__(netdev);
3040 hash = hash_int(queue_id, 0);
3041
3042 queue = tc_find_queue__(netdev, queue_id, hash);
3043 if (queue) {
3044 hcp = hfsc_class_cast__(queue);
3045 } else {
3046 hcp = xmalloc(sizeof *hcp);
3047 queue = &hcp->tc_queue;
3048 queue->queue_id = queue_id;
3049 queue->created = time_msec();
3050 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3051 }
3052
3053 hcp->min_rate = hc->min_rate;
3054 hcp->max_rate = hc->max_rate;
3055 }
3056
3057 static int
3058 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3059 {
3060 const struct tc_service_curve *rsc, *fsc, *usc;
3061 static const struct nl_policy tca_hfsc_policy[] = {
3062 [TCA_HFSC_RSC] = {
3063 .type = NL_A_UNSPEC,
3064 .optional = false,
3065 .min_len = sizeof(struct tc_service_curve),
3066 },
3067 [TCA_HFSC_FSC] = {
3068 .type = NL_A_UNSPEC,
3069 .optional = false,
3070 .min_len = sizeof(struct tc_service_curve),
3071 },
3072 [TCA_HFSC_USC] = {
3073 .type = NL_A_UNSPEC,
3074 .optional = false,
3075 .min_len = sizeof(struct tc_service_curve),
3076 },
3077 };
3078 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3079
3080 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3081 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3082 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3083 return EPROTO;
3084 }
3085
3086 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3087 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3088 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3089
3090 if (rsc->m1 != 0 || rsc->d != 0 ||
3091 fsc->m1 != 0 || fsc->d != 0 ||
3092 usc->m1 != 0 || usc->d != 0) {
3093 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3094 "Non-linear service curves are not supported.");
3095 return EPROTO;
3096 }
3097
3098 if (rsc->m2 != fsc->m2) {
3099 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3100 "Real-time service curves are not supported ");
3101 return EPROTO;
3102 }
3103
3104 if (rsc->m2 > usc->m2) {
3105 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3106 "Min-rate service curve is greater than "
3107 "the max-rate service curve.");
3108 return EPROTO;
3109 }
3110
3111 class->min_rate = fsc->m2;
3112 class->max_rate = usc->m2;
3113 return 0;
3114 }
3115
3116 static int
3117 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3118 struct hfsc_class *options,
3119 struct netdev_queue_stats *stats)
3120 {
3121 int error;
3122 unsigned int handle;
3123 struct nlattr *nl_options;
3124
3125 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3126 if (error) {
3127 return error;
3128 }
3129
3130 if (queue_id) {
3131 unsigned int major, minor;
3132
3133 major = tc_get_major(handle);
3134 minor = tc_get_minor(handle);
3135 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3136 *queue_id = minor - 1;
3137 } else {
3138 return EPROTO;
3139 }
3140 }
3141
3142 if (options) {
3143 error = hfsc_parse_tca_options__(nl_options, options);
3144 }
3145
3146 return error;
3147 }
3148
/* Queries the kernel for class 'handle' (with parent 'parent') on 'netdev'
 * and parses the reply into 'options' and 'stats', each of which is passed
 * through to hfsc_parse_tcmsg__() and may be null there.
 *
 * Returns 0 if successful, otherwise the error from the query or from
 * parsing the reply.  The reply buffer is always released. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3166
3167 static void
3168 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3169 struct hfsc_class *class)
3170 {
3171 uint32_t max_rate;
3172 const char *max_rate_s;
3173
3174 max_rate_s = smap_get(details, "max-rate");
3175 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3176
3177 if (!max_rate) {
3178 enum netdev_features current;
3179
3180 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3181 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3182 }
3183
3184 class->min_rate = max_rate;
3185 class->max_rate = max_rate;
3186 }
3187
3188 static int
3189 hfsc_parse_class_details__(struct netdev *netdev,
3190 const struct smap *details,
3191 struct hfsc_class * class)
3192 {
3193 const struct hfsc *hfsc;
3194 uint32_t min_rate, max_rate;
3195 const char *min_rate_s, *max_rate_s;
3196
3197 hfsc = hfsc_get__(netdev);
3198 min_rate_s = smap_get(details, "min-rate");
3199 max_rate_s = smap_get(details, "max-rate");
3200
3201 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3202 min_rate = MAX(min_rate, 1);
3203 min_rate = MIN(min_rate, hfsc->max_rate);
3204
3205 max_rate = (max_rate_s
3206 ? strtoull(max_rate_s, NULL, 10) / 8
3207 : hfsc->max_rate);
3208 max_rate = MAX(max_rate, min_rate);
3209 max_rate = MIN(max_rate, hfsc->max_rate);
3210
3211 class->min_rate = min_rate;
3212 class->max_rate = max_rate;
3213
3214 return 0;
3215 }
3216
3217 /* Create an HFSC qdisc.
3218 *
3219 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3220 static int
3221 hfsc_setup_qdisc__(struct netdev * netdev)
3222 {
3223 struct tcmsg *tcmsg;
3224 struct ofpbuf request;
3225 struct tc_hfsc_qopt opt;
3226
3227 tc_del_qdisc(netdev);
3228
3229 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3230 NLM_F_EXCL | NLM_F_CREATE, &request);
3231
3232 if (!tcmsg) {
3233 return ENODEV;
3234 }
3235
3236 tcmsg->tcm_handle = tc_make_handle(1, 0);
3237 tcmsg->tcm_parent = TC_H_ROOT;
3238
3239 memset(&opt, 0, sizeof opt);
3240 opt.defcls = 1;
3241
3242 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3243 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3244
3245 return tc_transact(&request, NULL);
3246 }
3247
3248 /* Create an HFSC class.
3249 *
3250 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3251 * sc rate <min_rate> ul rate <max_rate>" */
3252 static int
3253 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3254 unsigned int parent, struct hfsc_class *class)
3255 {
3256 int error;
3257 size_t opt_offset;
3258 struct tcmsg *tcmsg;
3259 struct ofpbuf request;
3260 struct tc_service_curve min, max;
3261
3262 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3263
3264 if (!tcmsg) {
3265 return ENODEV;
3266 }
3267
3268 tcmsg->tcm_handle = handle;
3269 tcmsg->tcm_parent = parent;
3270
3271 min.m1 = 0;
3272 min.d = 0;
3273 min.m2 = class->min_rate;
3274
3275 max.m1 = 0;
3276 max.d = 0;
3277 max.m2 = class->max_rate;
3278
3279 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3280 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3281 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3282 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3283 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3284 nl_msg_end_nested(&request, opt_offset);
3285
3286 error = tc_transact(&request, NULL);
3287 if (error) {
3288 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3289 "min-rate %ubps, max-rate %ubps (%s)",
3290 netdev_get_name(netdev),
3291 tc_get_major(handle), tc_get_minor(handle),
3292 tc_get_major(parent), tc_get_minor(parent),
3293 class->min_rate, class->max_rate, ovs_strerror(error));
3294 }
3295
3296 return error;
3297 }
3298
3299 static int
3300 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3301 {
3302 int error;
3303 struct hfsc_class class;
3304
3305 error = hfsc_setup_qdisc__(netdev);
3306
3307 if (error) {
3308 return error;
3309 }
3310
3311 hfsc_parse_qdisc_details__(netdev, details, &class);
3312 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3313 tc_make_handle(1, 0), &class);
3314
3315 if (error) {
3316 return error;
3317 }
3318
3319 hfsc_install__(netdev, class.max_rate);
3320 return 0;
3321 }
3322
3323 static int
3324 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3325 {
3326 struct ofpbuf msg;
3327 struct nl_dump dump;
3328 struct hfsc_class hc;
3329
3330 hc.max_rate = 0;
3331 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3332 hfsc_install__(netdev, hc.max_rate);
3333
3334 if (!start_queue_dump(netdev, &dump)) {
3335 return ENODEV;
3336 }
3337
3338 while (nl_dump_next(&dump, &msg)) {
3339 unsigned int queue_id;
3340
3341 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3342 hfsc_update_queue__(netdev, queue_id, &hc);
3343 }
3344 }
3345
3346 nl_dump_done(&dump);
3347 return 0;
3348 }
3349
3350 static void
3351 hfsc_tc_destroy(struct tc *tc)
3352 {
3353 struct hfsc *hfsc;
3354 struct hfsc_class *hc, *next;
3355
3356 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3357
3358 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3359 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3360 free(hc);
3361 }
3362
3363 tc_destroy(tc);
3364 free(hfsc);
3365 }
3366
3367 static int
3368 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3369 {
3370 const struct hfsc *hfsc;
3371 hfsc = hfsc_get__(netdev);
3372 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3373 return 0;
3374 }
3375
3376 static int
3377 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3378 {
3379 int error;
3380 struct hfsc_class class;
3381
3382 hfsc_parse_qdisc_details__(netdev, details, &class);
3383 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3384 tc_make_handle(1, 0), &class);
3385
3386 if (!error) {
3387 hfsc_get__(netdev)->max_rate = class.max_rate;
3388 }
3389
3390 return error;
3391 }
3392
3393 static int
3394 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3395 const struct tc_queue *queue, struct smap *details)
3396 {
3397 const struct hfsc_class *hc;
3398
3399 hc = hfsc_class_cast__(queue);
3400 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3401 if (hc->min_rate != hc->max_rate) {
3402 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3403 }
3404 return 0;
3405 }
3406
3407 static int
3408 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3409 const struct smap *details)
3410 {
3411 int error;
3412 struct hfsc_class class;
3413
3414 error = hfsc_parse_class_details__(netdev, details, &class);
3415 if (error) {
3416 return error;
3417 }
3418
3419 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3420 tc_make_handle(1, 0xfffe), &class);
3421 if (error) {
3422 return error;
3423 }
3424
3425 hfsc_update_queue__(netdev, queue_id, &class);
3426 return 0;
3427 }
3428
3429 static int
3430 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3431 {
3432 int error;
3433 struct hfsc *hfsc;
3434 struct hfsc_class *hc;
3435
3436 hc = hfsc_class_cast__(queue);
3437 hfsc = hfsc_get__(netdev);
3438
3439 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3440 if (!error) {
3441 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3442 free(hc);
3443 }
3444 return error;
3445 }
3446
3447 static int
3448 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3449 struct netdev_queue_stats *stats)
3450 {
3451 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3452 tc_make_handle(1, 0xfffe), NULL, stats);
3453 }
3454
3455 static int
3456 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3457 const struct ofpbuf *nlmsg,
3458 netdev_dump_queue_stats_cb *cb, void *aux)
3459 {
3460 struct netdev_queue_stats stats;
3461 unsigned int handle, major, minor;
3462 int error;
3463
3464 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3465 if (error) {
3466 return error;
3467 }
3468
3469 major = tc_get_major(handle);
3470 minor = tc_get_minor(handle);
3471 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3472 (*cb)(minor - 1, &stats, aux);
3473 }
3474 return 0;
3475 }
3476
3477 static const struct tc_ops tc_ops_hfsc = {
3478 "hfsc", /* linux_name */
3479 "linux-hfsc", /* ovs_name */
3480 HFSC_N_QUEUES, /* n_queues */
3481 hfsc_tc_install, /* tc_install */
3482 hfsc_tc_load, /* tc_load */
3483 hfsc_tc_destroy, /* tc_destroy */
3484 hfsc_qdisc_get, /* qdisc_get */
3485 hfsc_qdisc_set, /* qdisc_set */
3486 hfsc_class_get, /* class_get */
3487 hfsc_class_set, /* class_set */
3488 hfsc_class_delete, /* class_delete */
3489 hfsc_class_get_stats, /* class_get_stats */
3490 hfsc_class_dump_stats /* class_dump_stats */
3491 };
3492 \f
3493 /* "linux-default" traffic control class.
3494 *
3495 * This class represents the default, unnamed Linux qdisc. It corresponds to
3496 * the "" (empty string) QoS type in the OVS database. */
3497
3498 static void
3499 default_install__(struct netdev *netdev_)
3500 {
3501 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3502 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3503
3504 /* Nothing but a tc class implementation is allowed to write to a tc. This
3505 * class never does that, so we can legitimately use a const tc object. */
3506 netdev->tc = CONST_CAST(struct tc *, &tc);
3507 }
3508
3509 static int
3510 default_tc_install(struct netdev *netdev,
3511 const struct smap *details OVS_UNUSED)
3512 {
3513 default_install__(netdev);
3514 return 0;
3515 }
3516
3517 static int
3518 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3519 {
3520 default_install__(netdev);
3521 return 0;
3522 }
3523
3524 static const struct tc_ops tc_ops_default = {
3525 NULL, /* linux_name */
3526 "", /* ovs_name */
3527 0, /* n_queues */
3528 default_tc_install,
3529 default_tc_load,
3530 NULL, /* tc_destroy */
3531 NULL, /* qdisc_get */
3532 NULL, /* qdisc_set */
3533 NULL, /* class_get */
3534 NULL, /* class_set */
3535 NULL, /* class_delete */
3536 NULL, /* class_get_stats */
3537 NULL /* class_dump_stats */
3538 };
3539 \f
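/* "linux-other" traffic control class.
 *
 * This class stands in for any qdisc that none of the other tc classes in
 * this file handles: tc_query_qdisc() selects it when the kernel reports a
 * qdisc kind that is not recognized, or when querying or parsing the qdisc
 * fails.  It has no queues and can only be loaded, not installed or
 * configured. */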
3543
3544 static int
3545 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3546 {
3547 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3548 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3549
3550 /* Nothing but a tc class implementation is allowed to write to a tc. This
3551 * class never does that, so we can legitimately use a const tc object. */
3552 netdev->tc = CONST_CAST(struct tc *, &tc);
3553 return 0;
3554 }
3555
3556 static const struct tc_ops tc_ops_other = {
3557 NULL, /* linux_name */
3558 "linux-other", /* ovs_name */
3559 0, /* n_queues */
3560 NULL, /* tc_install */
3561 other_tc_load,
3562 NULL, /* tc_destroy */
3563 NULL, /* qdisc_get */
3564 NULL, /* qdisc_set */
3565 NULL, /* class_get */
3566 NULL, /* class_set */
3567 NULL, /* class_delete */
3568 NULL, /* class_get_stats */
3569 NULL /* class_dump_stats */
3570 };
3571 \f
3572 /* Traffic control. */
3573
/* Kernel "tc" ticks per second.  Set by read_psched(). */
static double ticks_per_s;

/* Kernel "jiffies" per second, used only for sizing buffers: as a rule a
 * kernel qdisc must be able to buffer one jiffy's worth of data.  Set by
 * read_psched().
 *
 * The value falls into one of two categories:
 *
 *    - A small number, roughly 100 to 1024, which is the kernel's real timer
 *      tick rate.  In this case the qdisc really does need room for that much
 *      data.
 *
 *    - An absurdly large number, which means that the kernel has finely
 *      granular timers and no extra buffer room has to be fudged in.  Nothing
 *      special is needed to implement this: 'buffer_hz' is only ever used as
 *      a divisor, so practically any dividend yields 0.  (With really high
 *      dividends the quotient is a small integer, which has no real effect
 *      anyhow.)
 */
static unsigned int buffer_hz;
3595
/* Combines 'major' and 'minor' into the tc handle 'major':'minor', with
 * 'major' in the upper 16 bits and 'minor' in the lower 16 bits.  Bits of
 * either argument beyond the low 16 are discarded. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    const unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
3602
/* Extracts the major number, that is, the upper 16 bits, of tc 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    const unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3609
/* Extracts the minor number, that is, the lower 16 bits, of tc 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    const unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3616
3617 static struct tcmsg *
3618 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3619 struct ofpbuf *request)
3620 {
3621 struct tcmsg *tcmsg;
3622 int ifindex;
3623 int error;
3624
3625 error = get_ifindex(netdev, &ifindex);
3626 if (error) {
3627 return NULL;
3628 }
3629
3630 ofpbuf_init(request, 512);
3631 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3632 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3633 tcmsg->tcm_family = AF_UNSPEC;
3634 tcmsg->tcm_ifindex = ifindex;
3635 /* Caller should fill in tcmsg->tcm_handle. */
3636 /* Caller should fill in tcmsg->tcm_parent. */
3637
3638 return tcmsg;
3639 }
3640
3641 static int
3642 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3643 {
3644 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3645 ofpbuf_uninit(request);
3646 return error;
3647 }
3648
3649 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3650 * policing configuration.
3651 *
3652 * This function is equivalent to running the following when 'add' is true:
3653 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3654 *
3655 * This function is equivalent to running the following when 'add' is false:
3656 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3657 *
3658 * The configuration and stats may be seen with the following command:
3659 * /sbin/tc -s qdisc show dev <devname>
3660 *
3661 * Returns 0 if successful, otherwise a positive errno value.
3662 */
3663 static int
3664 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3665 {
3666 struct ofpbuf request;
3667 struct tcmsg *tcmsg;
3668 int error;
3669 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3670 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3671
3672 tcmsg = tc_make_request(netdev, type, flags, &request);
3673 if (!tcmsg) {
3674 return ENODEV;
3675 }
3676 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3677 tcmsg->tcm_parent = TC_H_INGRESS;
3678 nl_msg_put_string(&request, TCA_KIND, "ingress");
3679 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3680
3681 error = tc_transact(&request, NULL);
3682 if (error) {
3683 /* If we're deleting the qdisc, don't worry about some of the
3684 * error conditions. */
3685 if (!add && (error == ENOENT || error == EINVAL)) {
3686 return 0;
3687 }
3688 return error;
3689 }
3690
3691 return 0;
3692 }
3693
3694 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3695 * of 'kbits_burst'.
3696 *
3697 * This function is equivalent to running:
3698 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3699 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3700 * mtu 65535 drop
3701 *
3702 * The configuration and stats may be seen with the following command:
3703 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3704 *
3705 * Returns 0 if successful, otherwise a positive errno value.
3706 */
3707 static int
3708 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3709 {
3710 struct tc_police tc_police;
3711 struct ofpbuf request;
3712 struct tcmsg *tcmsg;
3713 size_t basic_offset;
3714 size_t police_offset;
3715 int error;
3716 int mtu = 65535;
3717
3718 memset(&tc_police, 0, sizeof tc_police);
3719 tc_police.action = TC_POLICE_SHOT;
3720 tc_police.mtu = mtu;
3721 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3722 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3723 kbits_burst * 1024);
3724
3725 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3726 NLM_F_EXCL | NLM_F_CREATE, &request);
3727 if (!tcmsg) {
3728 return ENODEV;
3729 }
3730 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3731 tcmsg->tcm_info = tc_make_handle(49,
3732 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3733
3734 nl_msg_put_string(&request, TCA_KIND, "basic");
3735 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3736 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3737 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3738 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3739 nl_msg_end_nested(&request, police_offset);
3740 nl_msg_end_nested(&request, basic_offset);
3741
3742 error = tc_transact(&request, NULL);
3743 if (error) {
3744 return error;
3745 }
3746
3747 return 0;
3748 }
3749
3750 static void
3751 read_psched(void)
3752 {
3753 /* The values in psched are not individually very meaningful, but they are
3754 * important. The tables below show some values seen in the wild.
3755 *
3756 * Some notes:
3757 *
3758 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3759 * (Before that, there are hints that it was 1000000000.)
3760 *
3761 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3762 * above.
3763 *
3764 * /proc/net/psched
3765 * -----------------------------------
3766 * [1] 000c8000 000f4240 000f4240 00000064
3767 * [2] 000003e8 00000400 000f4240 3b9aca00
3768 * [3] 000003e8 00000400 000f4240 3b9aca00
3769 * [4] 000003e8 00000400 000f4240 00000064
3770 * [5] 000003e8 00000040 000f4240 3b9aca00
3771 * [6] 000003e8 00000040 000f4240 000000f9
3772 *
3773 * a b c d ticks_per_s buffer_hz
3774 * ------- --------- ---------- ------------- ----------- -------------
3775 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3776 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3777 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3778 * [4] 1,000 1,024 1,000,000 100 976,562 100
3779 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3780 * [6] 1,000 64 1,000,000 249 15,625,000 249
3781 *
3782 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3783 * [2] 2.6.26-1-686-bigmem from Debian lenny
3784 * [3] 2.6.26-2-sparc64 from Debian lenny
3785 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3786 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3787 * [6] 2.6.34 from kernel.org on KVM
3788 */
3789 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3790 static const char fn[] = "/proc/net/psched";
3791 unsigned int a, b, c, d;
3792 FILE *stream;
3793
3794 if (!ovsthread_once_start(&once)) {
3795 return;
3796 }
3797
3798 ticks_per_s = 1.0;
3799 buffer_hz = 100;
3800
3801 stream = fopen(fn, "r");
3802 if (!stream) {
3803 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3804 goto exit;
3805 }
3806
3807 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3808 VLOG_WARN("%s: read failed", fn);
3809 fclose(stream);
3810 goto exit;
3811 }
3812 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3813 fclose(stream);
3814
3815 if (!a || !c) {
3816 VLOG_WARN("%s: invalid scheduler parameters", fn);
3817 goto exit;
3818 }
3819
3820 ticks_per_s = (double) a * c / b;
3821 if (c == 1000000) {
3822 buffer_hz = d;
3823 } else {
3824 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3825 fn, a, b, c, d);
3826 }
3827 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3828
3829 exit:
3830 ovsthread_once_done(&once);
3831 }
3832
3833 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3834 * rate of 'rate' bytes per second. */
3835 static unsigned int
3836 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3837 {
3838 read_psched();
3839 return (rate * ticks) / ticks_per_s;
3840 }
3841
3842 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3843 * rate of 'rate' bytes per second. */
3844 static unsigned int
3845 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3846 {
3847 read_psched();
3848 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3849 }
3850
3851 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3852 * a transmission rate of 'rate' bytes per second. */
3853 static unsigned int
3854 tc_buffer_per_jiffy(unsigned int rate)
3855 {
3856 read_psched();
3857 return rate / buffer_hz;
3858 }
3859
3860 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3861 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3862 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3863 * stores NULL into it if it is absent.
3864 *
3865 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3866 * 'msg'.
3867 *
3868 * Returns 0 if successful, otherwise a positive errno value. */
3869 static int
3870 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3871 struct nlattr **options)
3872 {
3873 static const struct nl_policy tca_policy[] = {
3874 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3875 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3876 };
3877 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3878
3879 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3880 tca_policy, ta, ARRAY_SIZE(ta))) {
3881 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3882 goto error;
3883 }
3884
3885 if (kind) {
3886 *kind = nl_attr_get_string(ta[TCA_KIND]);
3887 }
3888
3889 if (options) {
3890 *options = ta[TCA_OPTIONS];
3891 }
3892
3893 return 0;
3894
3895 error:
3896 if (kind) {
3897 *kind = NULL;
3898 }
3899 if (options) {
3900 *options = NULL;
3901 }
3902 return EPROTO;
3903 }
3904
3905 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3906 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3907 * into '*options', and its queue statistics into '*stats'. Any of the output
3908 * arguments may be null.
3909 *
3910 * Returns 0 if successful, otherwise a positive errno value. */
3911 static int
3912 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3913 struct nlattr **options, struct netdev_queue_stats *stats)
3914 {
3915 static const struct nl_policy tca_policy[] = {
3916 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3917 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3918 };
3919 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3920
3921 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3922 tca_policy, ta, ARRAY_SIZE(ta))) {
3923 VLOG_WARN_RL(&rl, "failed to parse class message");
3924 goto error;
3925 }
3926
3927 if (handlep) {
3928 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3929 *handlep = tc->tcm_handle;
3930 }
3931
3932 if (options) {
3933 *options = ta[TCA_OPTIONS];
3934 }
3935
3936 if (stats) {
3937 const struct gnet_stats_queue *gsq;
3938 struct gnet_stats_basic gsb;
3939
3940 static const struct nl_policy stats_policy[] = {
3941 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3942 .min_len = sizeof gsb },
3943 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3944 .min_len = sizeof *gsq },
3945 };
3946 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3947
3948 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3949 sa, ARRAY_SIZE(sa))) {
3950 VLOG_WARN_RL(&rl, "failed to parse class stats");
3951 goto error;
3952 }
3953
3954 /* Alignment issues screw up the length of struct gnet_stats_basic on
3955 * some arch/bitsize combinations. Newer versions of Linux have a
3956 * struct gnet_stats_basic_packed, but we can't depend on that. The
3957 * easiest thing to do is just to make a copy. */
3958 memset(&gsb, 0, sizeof gsb);
3959 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3960 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3961 stats->tx_bytes = gsb.bytes;
3962 stats->tx_packets = gsb.packets;
3963
3964 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3965 stats->tx_errors = gsq->drops;
3966 }
3967
3968 return 0;
3969
3970 error:
3971 if (options) {
3972 *options = NULL;
3973 }
3974 if (stats) {
3975 memset(stats, 0, sizeof *stats);
3976 }
3977 return EPROTO;
3978 }
3979
3980 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3981 * on 'netdev'. */
3982 static int
3983 tc_query_class(const struct netdev *netdev,
3984 unsigned int handle, unsigned int parent,
3985 struct ofpbuf **replyp)
3986 {
3987 struct ofpbuf request;
3988 struct tcmsg *tcmsg;
3989 int error;
3990
3991 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3992 if (!tcmsg) {
3993 return ENODEV;
3994 }
3995 tcmsg->tcm_handle = handle;
3996 tcmsg->tcm_parent = parent;
3997
3998 error = tc_transact(&request, replyp);
3999 if (error) {
4000 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4001 netdev_get_name(netdev),
4002 tc_get_major(handle), tc_get_minor(handle),
4003 tc_get_major(parent), tc_get_minor(parent),
4004 ovs_strerror(error));
4005 }
4006 return error;
4007 }
4008
4009 /* Equivalent to "tc class del dev <name> handle <handle>". */
4010 static int
4011 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4012 {
4013 struct ofpbuf request;
4014 struct tcmsg *tcmsg;
4015 int error;
4016
4017 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4018 if (!tcmsg) {
4019 return ENODEV;
4020 }
4021 tcmsg->tcm_handle = handle;
4022 tcmsg->tcm_parent = 0;
4023
4024 error = tc_transact(&request, NULL);
4025 if (error) {
4026 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4027 netdev_get_name(netdev),
4028 tc_get_major(handle), tc_get_minor(handle),
4029 ovs_strerror(error));
4030 }
4031 return error;
4032 }
4033
4034 /* Equivalent to "tc qdisc del dev <name> root". */
4035 static int
4036 tc_del_qdisc(struct netdev *netdev_)
4037 {
4038 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4039 struct ofpbuf request;
4040 struct tcmsg *tcmsg;
4041 int error;
4042
4043 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4044 if (!tcmsg) {
4045 return ENODEV;
4046 }
4047 tcmsg->tcm_handle = tc_make_handle(1, 0);
4048 tcmsg->tcm_parent = TC_H_ROOT;
4049
4050 error = tc_transact(&request, NULL);
4051 if (error == EINVAL) {
4052 /* EINVAL probably means that the default qdisc was in use, in which
4053 * case we've accomplished our purpose. */
4054 error = 0;
4055 }
4056 if (!error && netdev->tc) {
4057 if (netdev->tc->ops->tc_destroy) {
4058 netdev->tc->ops->tc_destroy(netdev->tc);
4059 }
4060 netdev->tc = NULL;
4061 }
4062 return error;
4063 }
4064
4065 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4066 * kernel to determine what they are. Returns 0 if successful, otherwise a
4067 * positive errno value. */
4068 static int
4069 tc_query_qdisc(const struct netdev *netdev_)
4070 {
4071 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4072 struct ofpbuf request, *qdisc;
4073 const struct tc_ops *ops;
4074 struct tcmsg *tcmsg;
4075 int load_error;
4076 int error;
4077
4078 if (netdev->tc) {
4079 return 0;
4080 }
4081
4082 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4083 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4084 * 2.6.35 without that fix backported to it.
4085 *
4086 * To avoid the OOPS, we must not make a request that would attempt to dump
4087 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4088 * few others. There are a few ways that I can see to do this, but most of
4089 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4090 * technique chosen here is to assume that any non-default qdisc that we
4091 * create will have a class with handle 1:0. The built-in qdiscs only have
4092 * a class with handle 0:0.
4093 *
4094 * We could check for Linux 2.6.35+ and use a more straightforward method
4095 * there. */
4096 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4097 if (!tcmsg) {
4098 return ENODEV;
4099 }
4100 tcmsg->tcm_handle = tc_make_handle(1, 0);
4101 tcmsg->tcm_parent = 0;
4102
4103 /* Figure out what tc class to instantiate. */
4104 error = tc_transact(&request, &qdisc);
4105 if (!error) {
4106 const char *kind;
4107
4108 error = tc_parse_qdisc(qdisc, &kind, NULL);
4109 if (error) {
4110 ops = &tc_ops_other;
4111 } else {
4112 ops = tc_lookup_linux_name(kind);
4113 if (!ops) {
4114 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4115 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4116
4117 ops = &tc_ops_other;
4118 }
4119 }
4120 } else if (error == ENOENT) {
4121 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4122 * other entity that doesn't have a handle 1:0. We will assume
4123 * that it's the system default qdisc. */
4124 ops = &tc_ops_default;
4125 error = 0;
4126 } else {
4127 /* Who knows? Maybe the device got deleted. */
4128 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4129 netdev_get_name(netdev_), ovs_strerror(error));
4130 ops = &tc_ops_other;
4131 }
4132
4133 /* Instantiate it. */
4134 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4135 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4136 ofpbuf_delete(qdisc);
4137
4138 return error ? error : load_error;
4139 }
4140
4141 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4142 approximate the time to transmit packets of various lengths. For an MTU of
4143 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4144 represents two possible packet lengths; for a MTU of 513 through 1024, four
4145 possible lengths; and so on.
4146
4147 Returns, for the specified 'mtu', the number of bits that packet lengths
4148 need to be shifted right to fit within such a 256-entry table. */
4149 static int
4150 tc_calc_cell_log(unsigned int mtu)
4151 {
4152 int cell_log;
4153
4154 if (!mtu) {
4155 mtu = ETH_PAYLOAD_MAX;
4156 }
4157 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4158
4159 for (cell_log = 0; mtu >= 256; cell_log++) {
4160 mtu >>= 1;
4161 }
4162
4163 return cell_log;
4164 }
4165
4166 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4167 * of 'mtu'. */
4168 static void
4169 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4170 {
4171 memset(rate, 0, sizeof *rate);
4172 rate->cell_log = tc_calc_cell_log(mtu);
4173 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4174 /* rate->cell_align = 0; */ /* distro headers. */
4175 rate->mpu = ETH_TOTAL_MIN;
4176 rate->rate = Bps;
4177 }
4178
4179 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4180 * attribute of the specified "type".
4181 *
4182 * See tc_calc_cell_log() above for a description of "rtab"s. */
4183 static void
4184 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4185 {
4186 uint32_t *rtab;
4187 unsigned int i;
4188
4189 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4190 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4191 unsigned packet_size = (i + 1) << rate->cell_log;
4192 if (packet_size < rate->mpu) {
4193 packet_size = rate->mpu;
4194 }
4195 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4196 }
4197 }
4198
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (Pass 0 for 'burst_bytes' if no burst size
 * was requested.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus 'mtu'; the result is that burst expressed in ticks. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes;

    if (burst < floor_bytes) {
        burst = floor_bytes;
    }

    return tc_bytes_to_ticks(Bps, burst);
}
4209 \f
4210 /* Linux-only functions declared in netdev-linux.h */
4211
4212 /* Returns a fd for an AF_INET socket or a negative errno value. */
4213 int
4214 netdev_linux_get_af_inet_sock(void)
4215 {
4216 int error = netdev_linux_init();
4217 return error ? -error : af_inet_sock;
4218 }
4219
4220 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4221 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4222 int
4223 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4224 const char *flag_name, bool enable)
4225 {
4226 const char *netdev_name = netdev_get_name(netdev);
4227 struct ethtool_value evalue;
4228 uint32_t new_flags;
4229 int error;
4230
4231 COVERAGE_INC(netdev_get_ethtool);
4232 memset(&evalue, 0, sizeof evalue);
4233 error = netdev_linux_do_ethtool(netdev_name,
4234 (struct ethtool_cmd *)&evalue,
4235 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4236 if (error) {
4237 return error;
4238 }
4239
4240 COVERAGE_INC(netdev_set_ethtool);
4241 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4242 error = netdev_linux_do_ethtool(netdev_name,
4243 (struct ethtool_cmd *)&evalue,
4244 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4245 if (error) {
4246 return error;
4247 }
4248
4249 COVERAGE_INC(netdev_get_ethtool);
4250 memset(&evalue, 0, sizeof evalue);
4251 error = netdev_linux_do_ethtool(netdev_name,
4252 (struct ethtool_cmd *)&evalue,
4253 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4254 if (error) {
4255 return error;
4256 }
4257
4258 if (new_flags != evalue.data) {
4259 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4260 "device %s failed", enable ? "enable" : "disable",
4261 flag_name, netdev_name);
4262 return EOPNOTSUPP;
4263 }
4264
4265 return 0;
4266 }
4267 \f
4268 /* Utility functions. */
4269
4270 /* Copies 'src' into 'dst', performing format conversion in the process. */
4271 static void
4272 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4273 const struct rtnl_link_stats *src)
4274 {
4275 dst->rx_packets = src->rx_packets;
4276 dst->tx_packets = src->tx_packets;
4277 dst->rx_bytes = src->rx_bytes;
4278 dst->tx_bytes = src->tx_bytes;
4279 dst->rx_errors = src->rx_errors;
4280 dst->tx_errors = src->tx_errors;
4281 dst->rx_dropped = src->rx_dropped;
4282 dst->tx_dropped = src->tx_dropped;
4283 dst->multicast = src->multicast;
4284 dst->collisions = src->collisions;
4285 dst->rx_length_errors = src->rx_length_errors;
4286 dst->rx_over_errors = src->rx_over_errors;
4287 dst->rx_crc_errors = src->rx_crc_errors;
4288 dst->rx_frame_errors = src->rx_frame_errors;
4289 dst->rx_fifo_errors = src->rx_fifo_errors;
4290 dst->rx_missed_errors = src->rx_missed_errors;
4291 dst->tx_aborted_errors = src->tx_aborted_errors;
4292 dst->tx_carrier_errors = src->tx_carrier_errors;
4293 dst->tx_fifo_errors = src->tx_fifo_errors;
4294 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4295 dst->tx_window_errors = src->tx_window_errors;
4296 }
4297
4298 static int
4299 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4300 {
4301 /* Policy for RTNLGRP_LINK messages.
4302 *
4303 * There are *many* more fields in these messages, but currently we only
4304 * care about these fields. */
4305 static const struct nl_policy rtnlgrp_link_policy[] = {
4306 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4307 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4308 .min_len = sizeof(struct rtnl_link_stats) },
4309 };
4310
4311 struct ofpbuf request;
4312 struct ofpbuf *reply;
4313 struct ifinfomsg *ifi;
4314 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4315 int error;
4316
4317 ofpbuf_init(&request, 0);
4318 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4319 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4320 ifi->ifi_family = PF_UNSPEC;
4321 ifi->ifi_index = ifindex;
4322 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4323 ofpbuf_uninit(&request);
4324 if (error) {
4325 return error;
4326 }
4327
4328 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4329 rtnlgrp_link_policy,
4330 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4331 ofpbuf_delete(reply);
4332 return EPROTO;
4333 }
4334
4335 if (!attrs[IFLA_STATS]) {
4336 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4337 ofpbuf_delete(reply);
4338 return EPROTO;
4339 }
4340
4341 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4342
4343 ofpbuf_delete(reply);
4344
4345 return 0;
4346 }
4347
4348 static int
4349 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4350 {
4351 static const char fn[] = "/proc/net/dev";
4352 char line[1024];
4353 FILE *stream;
4354 int ln;
4355
4356 stream = fopen(fn, "r");
4357 if (!stream) {
4358 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4359 return errno;
4360 }
4361
4362 ln = 0;
4363 while (fgets(line, sizeof line, stream)) {
4364 if (++ln >= 3) {
4365 char devname[16];
4366 #define X64 "%"SCNu64
4367 if (sscanf(line,
4368 " %15[^:]:"
4369 X64 X64 X64 X64 X64 X64 X64 "%*u"
4370 X64 X64 X64 X64 X64 X64 X64 "%*u",
4371 devname,
4372 &stats->rx_bytes,
4373 &stats->rx_packets,
4374 &stats->rx_errors,
4375 &stats->rx_dropped,
4376 &stats->rx_fifo_errors,
4377 &stats->rx_frame_errors,
4378 &stats->multicast,
4379 &stats->tx_bytes,
4380 &stats->tx_packets,
4381 &stats->tx_errors,
4382 &stats->tx_dropped,
4383 &stats->tx_fifo_errors,
4384 &stats->collisions,
4385 &stats->tx_carrier_errors) != 15) {
4386 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4387 } else if (!strcmp(devname, netdev_name)) {
4388 stats->rx_length_errors = UINT64_MAX;
4389 stats->rx_over_errors = UINT64_MAX;
4390 stats->rx_crc_errors = UINT64_MAX;
4391 stats->rx_missed_errors = UINT64_MAX;
4392 stats->tx_aborted_errors = UINT64_MAX;
4393 stats->tx_heartbeat_errors = UINT64_MAX;
4394 stats->tx_window_errors = UINT64_MAX;
4395 fclose(stream);
4396 return 0;
4397 }
4398 }
4399 }
4400 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4401 fclose(stream);
4402 return ENODEV;
4403 }
4404
4405 static int
4406 get_flags(const struct netdev *dev, unsigned int *flags)
4407 {
4408 struct ifreq ifr;
4409 int error;
4410
4411 *flags = 0;
4412 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4413 "SIOCGIFFLAGS");
4414 if (!error) {
4415 *flags = ifr.ifr_flags;
4416 }
4417 return error;
4418 }
4419
4420 static int
4421 set_flags(const char *name, unsigned int flags)
4422 {
4423 struct ifreq ifr;
4424
4425 ifr.ifr_flags = flags;
4426 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4427 }
4428
4429 static int
4430 do_get_ifindex(const char *netdev_name)
4431 {
4432 struct ifreq ifr;
4433
4434 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4435 COVERAGE_INC(netdev_get_ifindex);
4436 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4437 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4438 netdev_name, ovs_strerror(errno));
4439 return -errno;
4440 }
4441 return ifr.ifr_ifindex;
4442 }
4443
4444 static int
4445 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4446 {
4447 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4448
4449 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4450 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4451
4452 if (ifindex < 0) {
4453 netdev->get_ifindex_error = -ifindex;
4454 netdev->ifindex = 0;
4455 } else {
4456 netdev->get_ifindex_error = 0;
4457 netdev->ifindex = ifindex;
4458 }
4459 netdev->cache_valid |= VALID_IFINDEX;
4460 }
4461
4462 *ifindexp = netdev->ifindex;
4463 return netdev->get_ifindex_error;
4464 }
4465
4466 static int
4467 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4468 {
4469 struct ifreq ifr;
4470 int hwaddr_family;
4471
4472 memset(&ifr, 0, sizeof ifr);
4473 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4474 COVERAGE_INC(netdev_get_hwaddr);
4475 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4476 /* ENODEV probably means that a vif disappeared asynchronously and
4477 * hasn't been removed from the database yet, so reduce the log level
4478 * to INFO for that case. */
4479 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4480 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4481 netdev_name, ovs_strerror(errno));
4482 return errno;
4483 }
4484 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4485 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4486 VLOG_WARN("%s device has unknown hardware address family %d",
4487 netdev_name, hwaddr_family);
4488 }
4489 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4490 return 0;
4491 }
4492
4493 static int
4494 set_etheraddr(const char *netdev_name,
4495 const uint8_t mac[ETH_ADDR_LEN])
4496 {
4497 struct ifreq ifr;
4498
4499 memset(&ifr, 0, sizeof ifr);
4500 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4501 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4502 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4503 COVERAGE_INC(netdev_set_hwaddr);
4504 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4505 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4506 netdev_name, ovs_strerror(errno));
4507 return errno;
4508 }
4509 return 0;
4510 }
4511
4512 static int
4513 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4514 int cmd, const char *cmd_name)
4515 {
4516 struct ifreq ifr;
4517
4518 memset(&ifr, 0, sizeof ifr);
4519 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4520 ifr.ifr_data = (caddr_t) ecmd;
4521
4522 ecmd->cmd = cmd;
4523 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4524 return 0;
4525 } else {
4526 if (errno != EOPNOTSUPP) {
4527 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4528 "failed: %s", cmd_name, name, ovs_strerror(errno));
4529 } else {
4530 /* The device doesn't support this operation. That's pretty
4531 * common, so there's no point in logging anything. */
4532 }
4533 return errno;
4534 }
4535 }
4536
4537 static int
4538 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4539 const char *cmd_name)
4540 {
4541 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4542 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4543 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4544 ovs_strerror(errno));
4545 return errno;
4546 }
4547 return 0;
4548 }
4549
4550 static int
4551 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4552 int cmd, const char *cmd_name)
4553 {
4554 struct ifreq ifr;
4555 int error;
4556
4557 ifr.ifr_addr.sa_family = AF_INET;
4558 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4559 if (!error) {
4560 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4561 &ifr.ifr_addr);
4562 *ip = sin->sin_addr;
4563 }
4564 return error;
4565 }
4566
4567 /* Returns an AF_PACKET raw socket or a negative errno value. */
4568 static int
4569 af_packet_sock(void)
4570 {
4571 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4572 static int sock;
4573
4574 if (ovsthread_once_start(&once)) {
4575 sock = socket(AF_PACKET, SOCK_RAW, 0);
4576 if (sock >= 0) {
4577 int error = set_nonblocking(sock);
4578 if (error) {
4579 close(sock);
4580 sock = -error;
4581 }
4582 } else {
4583 sock = -errno;
4584 VLOG_ERR("failed to create packet socket: %s",
4585 ovs_strerror(errno));
4586 }
4587 ovsthread_once_done(&once);
4588 }
4589
4590 return sock;
4591 }