]> git.proxmox.com Git - ovs.git/blob - lib/netdev-linux.c
netdev: Minor formatting improvements.
[ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <arpa/inet.h>
24 #include <inttypes.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
41 #include <net/if.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
46 #include <poll.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <unistd.h>
50
51 #include "coverage.h"
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
55 #include "hash.h"
56 #include "hmap.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
61 #include "netlink.h"
62 #include "ofpbuf.h"
63 #include "openflow/openflow.h"
64 #include "packets.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
67 #include "shash.h"
68 #include "socket-util.h"
69 #include "sset.h"
70 #include "timer.h"
71 #include "unaligned.h"
72 #include "vlog.h"
73
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
83
84 \f
/* Fallback definitions for building against old kernel headers. */

/* Pause advertisement bits, first present in Linux 2.6.14 headers. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause                (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause           (1 << 14)
#endif

/* ethtool flag commands, first present in Linux 2.6.24 headers. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS       0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS       0x00000026 /* Set flags bitmap(ethtool_value) */
#endif

/* Rate table size, first present in Linux 2.6.25 headers. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif
108
/* Link-change notifier shared by every netdev in this file, together with the
 * number of netdevs currently holding a reference to it.  The notifier exists
 * exactly while the count is nonzero; see cache_notifier_ref() and
 * cache_notifier_unref(). */
static int cache_notifier_refcount;
static struct nln_notifier *netdev_linux_cache_notifier = NULL;
111
/* Bits for 'cache_valid' in struct netdev_linux.  A set bit means that the
 * corresponding cached value (or cached error code) is up to date. */
enum {
    VALID_IFINDEX          = 1 << 0,
    VALID_ETHERADDR        = 1 << 1,
    VALID_IN4              = 1 << 2,
    VALID_IN6              = 1 << 3,
    VALID_MTU              = 1 << 4,
    VALID_POLICING         = 1 << 5,
    VALID_VPORT_STAT_ERROR = 1 << 6,
    VALID_DRVINFO          = 1 << 7,
    VALID_FEATURES         = 1 << 8,
};
123
/* State kept only for tap devices. */
struct tap_state {
    int fd;                     /* File descriptor for the open tap device. */
};
127 \f
128 /* Traffic control. */
129
130 /* An instance of a traffic control class. Always associated with a particular
131 * network device.
132 *
133 * Each TC implementation subclasses this with whatever additional data it
134 * needs. */
135 struct tc {
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
140 };
141
142 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
143
144 /* One traffic control queue.
145 *
146 * Each TC implementation subclasses this with whatever additional data it
147 * needs. */
148 struct tc_queue {
149 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
150 unsigned int queue_id; /* OpenFlow queue ID. */
151 long long int created; /* Time queue was created, in msecs. */
152 };
153
154 /* A particular kind of traffic control. Each implementation generally maps to
155 * one particular Linux qdisc class.
156 *
157 * The functions below return 0 if successful or a positive errno value on
158 * failure, except where otherwise noted. All of them must be provided, except
159 * where otherwise noted. */
160 struct tc_ops {
161 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
162 * This is null for tc_ops_default and tc_ops_other, for which there are no
163 * appropriate values. */
164 const char *linux_name;
165
166 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
167 const char *ovs_name;
168
169 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
170 * queues. The queues are numbered 0 through n_queues - 1. */
171 unsigned int n_queues;
172
173 /* Called to install this TC class on 'netdev'. The implementation should
174 * make the Netlink calls required to set up 'netdev' with the right qdisc
175 * and configure it according to 'details'. The implementation may assume
176 * that the current qdisc is the default; that is, there is no need for it
177 * to delete the current qdisc before installing itself.
178 *
179 * The contents of 'details' should be documented as valid for 'ovs_name'
180 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
181 * (which is built as ovs-vswitchd.conf.db(8)).
182 *
183 * This function must return 0 if and only if it sets 'netdev->tc' to an
184 * initialized 'struct tc'.
185 *
186 * (This function is null for tc_ops_other, which cannot be installed. For
187 * other TC classes it should always be nonnull.) */
188 int (*tc_install)(struct netdev *netdev, const struct smap *details);
189
190 /* Called when the netdev code determines (through a Netlink query) that
191 * this TC class's qdisc is installed on 'netdev', but we didn't install
192 * it ourselves and so don't know any of the details.
193 *
194 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
195 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
196 * implementation should parse the other attributes of 'nlmsg' as
197 * necessary to determine its configuration. If necessary it should also
198 * use Netlink queries to determine the configuration of queues on
199 * 'netdev'.
200 *
201 * This function must return 0 if and only if it sets 'netdev->tc' to an
202 * initialized 'struct tc'. */
203 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
204
205 /* Destroys the data structures allocated by the implementation as part of
206 * 'tc'. (This includes destroying 'tc->queues' by calling
207 * tc_destroy(tc).
208 *
209 * The implementation should not need to perform any Netlink calls. If
210 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
211 * (But it may not be desirable.)
212 *
213 * This function may be null if 'tc' is trivial. */
214 void (*tc_destroy)(struct tc *tc);
215
216 /* Retrieves details of 'netdev->tc' configuration into 'details'.
217 *
218 * The implementation should not need to perform any Netlink calls, because
219 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
220 * cached the configuration.
221 *
222 * The contents of 'details' should be documented as valid for 'ovs_name'
223 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
224 * (which is built as ovs-vswitchd.conf.db(8)).
225 *
226 * This function may be null if 'tc' is not configurable.
227 */
228 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
229
230 /* Reconfigures 'netdev->tc' according to 'details', performing any
231 * required Netlink calls to complete the reconfiguration.
232 *
233 * The contents of 'details' should be documented as valid for 'ovs_name'
234 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
235 * (which is built as ovs-vswitchd.conf.db(8)).
236 *
237 * This function may be null if 'tc' is not configurable.
238 */
239 int (*qdisc_set)(struct netdev *, const struct smap *details);
240
241 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
242 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
243 *
244 * The contents of 'details' should be documented as valid for 'ovs_name'
245 * in the "other_config" column in the "Queue" table in
246 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
247 *
248 * The implementation should not need to perform any Netlink calls, because
249 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
250 * cached the queue configuration.
251 *
252 * This function may be null if 'tc' does not have queues ('n_queues' is
253 * 0). */
254 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
255 struct smap *details);
256
257 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
258 * 'details', perfoming any required Netlink calls to complete the
259 * reconfiguration. The caller ensures that 'queue_id' is less than
260 * 'n_queues'.
261 *
262 * The contents of 'details' should be documented as valid for 'ovs_name'
263 * in the "other_config" column in the "Queue" table in
264 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
265 *
266 * This function may be null if 'tc' does not have queues or its queues are
267 * not configurable. */
268 int (*class_set)(struct netdev *, unsigned int queue_id,
269 const struct smap *details);
270
271 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
272 * tc_queue's within 'netdev->tc->queues'.
273 *
274 * This function may be null if 'tc' does not have queues or its queues
275 * cannot be deleted. */
276 int (*class_delete)(struct netdev *, struct tc_queue *queue);
277
278 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
279 * 'struct tc_queue's within 'netdev->tc->queues'.
280 *
281 * On success, initializes '*stats'.
282 *
283 * This function may be null if 'tc' does not have queues or if it cannot
284 * report queue statistics. */
285 int (*class_get_stats)(const struct netdev *netdev,
286 const struct tc_queue *queue,
287 struct netdev_queue_stats *stats);
288
289 /* Extracts queue stats from 'nlmsg', which is a response to a
290 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
291 *
292 * This function may be null if 'tc' does not have queues or if it cannot
293 * report queue statistics. */
294 int (*class_dump_stats)(const struct netdev *netdev,
295 const struct ofpbuf *nlmsg,
296 netdev_dump_queue_stats_cb *cb, void *aux);
297 };
298
299 static void
300 tc_init(struct tc *tc, const struct tc_ops *ops)
301 {
302 tc->ops = ops;
303 hmap_init(&tc->queues);
304 }
305
306 static void
307 tc_destroy(struct tc *tc)
308 {
309 hmap_destroy(&tc->queues);
310 }
311
312 static const struct tc_ops tc_ops_htb;
313 static const struct tc_ops tc_ops_hfsc;
314 static const struct tc_ops tc_ops_default;
315 static const struct tc_ops tc_ops_other;
316
317 static const struct tc_ops *const tcs[] = {
318 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
319 &tc_ops_hfsc, /* Hierarchical fair service curve. */
320 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
321 &tc_ops_other, /* Some other qdisc. */
322 NULL
323 };
324
/* Prototypes for the traffic control helpers used by this file. */

/* Handles. */
static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
static unsigned int tc_get_major(unsigned int handle);
static unsigned int tc_get_minor(unsigned int handle);

/* Tick and byte conversions. */
static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
static unsigned int tc_buffer_per_jiffy(unsigned int rate);

/* Requests, ingress qdisc, and policer. */
static struct tcmsg *tc_make_request(const struct netdev *, int type,
                                     unsigned int flags, struct ofpbuf *);
static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
static int tc_add_policer(struct netdev *netdev, int kbits_rate,
                          int kbits_burst);

/* Qdisc and class parsing, querying, and deletion. */
static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
                          struct nlattr **options);
static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
                          struct nlattr **options,
                          struct netdev_queue_stats *);
static int tc_query_class(const struct netdev *,
                          unsigned int handle, unsigned int parent,
                          struct ofpbuf **replyp);
static int tc_delete_class(const struct netdev *, unsigned int handle);

static int tc_del_qdisc(struct netdev *netdev);
static int tc_query_qdisc(const struct netdev *netdev);

/* Rate specifications and buffers. */
static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static void tc_put_rtab(struct ofpbuf *, uint16_t type,
                        const struct tc_ratespec *rate);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
358 \f
359 struct netdev_linux {
360 struct netdev up;
361
362 struct shash_node *shash_node;
363 unsigned int cache_valid;
364 unsigned int change_seq;
365
366 bool miimon; /* Link status of last poll. */
367 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
368 struct timer miimon_timer;
369
370 /* The following are figured out "on demand" only. They are only valid
371 * when the corresponding VALID_* bit in 'cache_valid' is set. */
372 int ifindex;
373 uint8_t etheraddr[ETH_ADDR_LEN];
374 struct in_addr address, netmask;
375 struct in6_addr in6;
376 int mtu;
377 unsigned int ifi_flags;
378 long long int carrier_resets;
379 uint32_t kbits_rate; /* Policing data. */
380 uint32_t kbits_burst;
381 int vport_stats_error; /* Cached error code from vport_get_stats().
382 0 or an errno value. */
383 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
384 int ether_addr_error; /* Cached error code from set/get etheraddr. */
385 int netdev_policing_error; /* Cached error code from set policing. */
386 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
387 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
388
389 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
391 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
392 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
393
394 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
395 struct tc *tc;
396
397 union {
398 struct tap_state tap;
399 } state;
400 };
401
402 struct netdev_rx_linux {
403 struct netdev_rx up;
404 bool is_tap;
405 int fd;
406 };
407
408 static const struct netdev_rx_class netdev_rx_linux_class;
409
410 /* Sockets used for ioctl operations. */
411 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
412
413 /* This is set pretty low because we probably won't learn anything from the
414 * additional log messages. */
415 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
416
417 static int netdev_linux_init(void);
418
419 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
420 int cmd, const char *cmd_name);
421 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
422 const char *cmd_name);
423 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
424 int cmd, const char *cmd_name);
425 static int get_flags(const struct netdev *, unsigned int *flags);
426 static int set_flags(const char *, unsigned int flags);
427 static int do_get_ifindex(const char *netdev_name);
428 static int get_ifindex(const struct netdev *, int *ifindexp);
429 static int do_set_addr(struct netdev *netdev,
430 int ioctl_nr, const char *ioctl_name,
431 struct in_addr addr);
432 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
433 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
434 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
435 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
436 static int af_packet_sock(void);
437 static void netdev_linux_miimon_run(void);
438 static void netdev_linux_miimon_wait(void);
439
440 static bool
441 is_netdev_linux_class(const struct netdev_class *netdev_class)
442 {
443 return netdev_class->init == netdev_linux_init;
444 }
445
446 static bool
447 is_tap_netdev(const struct netdev *netdev)
448 {
449 return netdev_get_class(netdev) == &netdev_tap_class;
450 }
451
452 static struct netdev_linux *
453 netdev_linux_cast(const struct netdev *netdev)
454 {
455 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
456
457 return CONTAINER_OF(netdev, struct netdev_linux, up);
458 }
459
460 static struct netdev_rx_linux *
461 netdev_rx_linux_cast(const struct netdev_rx *rx)
462 {
463 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
464 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
465 }
466 \f
467 static int
468 netdev_linux_init(void)
469 {
470 static int status = -1;
471 if (status < 0) {
472 /* Create AF_INET socket. */
473 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
474 status = af_inet_sock >= 0 ? 0 : errno;
475 if (status) {
476 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
477 }
478 }
479 return status;
480 }
481
/* Performs periodic work for this file's netdevs: runs the rtnetlink link
 * machinery, then the miimon poller. */
static void
netdev_linux_run(void)
{
    /* Link notifications first, then miimon. */
    rtnetlink_link_run();
    netdev_linux_miimon_run();
}
488
/* Counterpart of netdev_linux_run(): calls the wait function of the rtnetlink
 * link machinery, then that of the miimon poller. */
static void
netdev_linux_wait(void)
{
    /* Same order as netdev_linux_run(). */
    rtnetlink_link_wait();
    netdev_linux_miimon_wait();
}
495
496 static void
497 netdev_linux_changed(struct netdev_linux *dev,
498 unsigned int ifi_flags, unsigned int mask)
499 {
500 dev->change_seq++;
501 if (!dev->change_seq) {
502 dev->change_seq++;
503 }
504
505 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
506 dev->carrier_resets++;
507 }
508 dev->ifi_flags = ifi_flags;
509
510 dev->cache_valid &= mask;
511 }
512
513 static void
514 netdev_linux_update(struct netdev_linux *dev,
515 const struct rtnetlink_link_change *change)
516 {
517 if (change->nlmsg_type == RTM_NEWLINK) {
518 /* Keep drv-info */
519 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
520
521 /* Update netdev from rtnl-change msg. */
522 if (change->mtu) {
523 dev->mtu = change->mtu;
524 dev->cache_valid |= VALID_MTU;
525 dev->netdev_mtu_error = 0;
526 }
527
528 if (!eth_addr_is_zero(change->addr)) {
529 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
530 dev->cache_valid |= VALID_ETHERADDR;
531 dev->ether_addr_error = 0;
532 }
533
534 dev->ifindex = change->ifi_index;
535 dev->cache_valid |= VALID_IFINDEX;
536 dev->get_ifindex_error = 0;
537
538 } else {
539 netdev_linux_changed(dev, change->ifi_flags, 0);
540 }
541 }
542
543 static void
544 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
545 void *aux OVS_UNUSED)
546 {
547 struct netdev_linux *dev;
548 if (change) {
549 struct netdev *base_dev = netdev_from_name(change->ifname);
550 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
551 netdev_linux_update(netdev_linux_cast(base_dev), change);
552 }
553 } else {
554 struct shash device_shash;
555 struct shash_node *node;
556
557 shash_init(&device_shash);
558 netdev_get_devices(&netdev_linux_class, &device_shash);
559 SHASH_FOR_EACH (node, &device_shash) {
560 struct netdev *netdev = node->data;
561 unsigned int flags;
562
563 dev = netdev_linux_cast(netdev);
564
565 get_flags(&dev->up, &flags);
566 netdev_linux_changed(dev, flags, 0);
567 }
568 shash_destroy(&device_shash);
569 }
570 }
571
572 static int
573 cache_notifier_ref(void)
574 {
575 if (!cache_notifier_refcount) {
576 ovs_assert(!netdev_linux_cache_notifier);
577
578 netdev_linux_cache_notifier =
579 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
580
581 if (!netdev_linux_cache_notifier) {
582 return EINVAL;
583 }
584 }
585 cache_notifier_refcount++;
586
587 return 0;
588 }
589
590 static void
591 cache_notifier_unref(void)
592 {
593 ovs_assert(cache_notifier_refcount > 0);
594 if (!--cache_notifier_refcount) {
595 ovs_assert(netdev_linux_cache_notifier);
596 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
597 netdev_linux_cache_notifier = NULL;
598 }
599 }
600
601 /* Creates system and internal devices. */
602 static int
603 netdev_linux_create(const struct netdev_class *class, const char *name,
604 struct netdev **netdevp)
605 {
606 struct netdev_linux *netdev;
607 int error;
608
609 error = cache_notifier_ref();
610 if (error) {
611 return error;
612 }
613
614 netdev = xzalloc(sizeof *netdev);
615 netdev->change_seq = 1;
616 netdev_init(&netdev->up, name, class);
617 error = get_flags(&netdev->up, &netdev->ifi_flags);
618 if (error == ENODEV) {
619 if (class != &netdev_internal_class) {
620 /* The device does not exist, so don't allow it to be opened. */
621 netdev_uninit(&netdev->up, false);
622 cache_notifier_unref();
623 free(netdev);
624 return ENODEV;
625 } else {
626 /* "Internal" netdevs have to be created as netdev objects before
627 * they exist in the kernel, because creating them in the kernel
628 * happens by passing a netdev object to dpif_port_add().
629 * Therefore, ignore the error. */
630 }
631 }
632
633 *netdevp = &netdev->up;
634 return 0;
635 }
636
637 /* For most types of netdevs we open the device for each call of
638 * netdev_open(). However, this is not the case with tap devices,
639 * since it is only possible to open the device once. In this
640 * situation we share a single file descriptor, and consequently
641 * buffers, across all readers. Therefore once data is read it will
642 * be unavailable to other reads for tap devices. */
643 static int
644 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
645 const char *name, struct netdev **netdevp)
646 {
647 struct netdev_linux *netdev;
648 struct tap_state *state;
649 static const char tap_dev[] = "/dev/net/tun";
650 struct ifreq ifr;
651 int error;
652
653 netdev = xzalloc(sizeof *netdev);
654 netdev->change_seq = 1;
655 state = &netdev->state.tap;
656
657 error = cache_notifier_ref();
658 if (error) {
659 goto error;
660 }
661
662 /* Open tap device. */
663 state->fd = open(tap_dev, O_RDWR);
664 if (state->fd < 0) {
665 error = errno;
666 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
667 goto error_unref_notifier;
668 }
669
670 /* Create tap device. */
671 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
672 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
673 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
674 VLOG_WARN("%s: creating tap device failed: %s", name,
675 ovs_strerror(errno));
676 error = errno;
677 goto error_close;
678 }
679
680 /* Make non-blocking. */
681 error = set_nonblocking(state->fd);
682 if (error) {
683 goto error_close;
684 }
685
686 netdev_init(&netdev->up, name, &netdev_tap_class);
687 *netdevp = &netdev->up;
688 return 0;
689
690 error_close:
691 close(state->fd);
692 error_unref_notifier:
693 cache_notifier_unref();
694 error:
695 free(netdev);
696 return error;
697 }
698
699 static void
700 destroy_tap(struct netdev_linux *netdev)
701 {
702 struct tap_state *state = &netdev->state.tap;
703
704 if (state->fd >= 0) {
705 close(state->fd);
706 }
707 }
708
709 /* Destroys the netdev device 'netdev_'. */
710 static void
711 netdev_linux_destroy(struct netdev *netdev_)
712 {
713 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
714
715 if (netdev->tc && netdev->tc->ops->tc_destroy) {
716 netdev->tc->ops->tc_destroy(netdev->tc);
717 }
718
719 if (netdev_get_class(netdev_) == &netdev_tap_class) {
720 destroy_tap(netdev);
721 }
722 free(netdev);
723
724 cache_notifier_unref();
725 }
726
727 static int
728 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
729 {
730 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
731 bool is_tap = is_tap_netdev(netdev_);
732 struct netdev_rx_linux *rx;
733 int error;
734 int fd;
735
736 if (is_tap) {
737 fd = netdev->state.tap.fd;
738 } else {
739 struct sockaddr_ll sll;
740 int ifindex;
741 /* Result of tcpdump -dd inbound */
742 static struct sock_filter filt[] = {
743 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
744 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
745 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
746 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
747 };
748 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
749
750 /* Create file descriptor. */
751 fd = socket(PF_PACKET, SOCK_RAW, 0);
752 if (fd < 0) {
753 error = errno;
754 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
755 goto error;
756 }
757
758 /* Set non-blocking mode. */
759 error = set_nonblocking(fd);
760 if (error) {
761 goto error;
762 }
763
764 /* Get ethernet device index. */
765 error = get_ifindex(&netdev->up, &ifindex);
766 if (error) {
767 goto error;
768 }
769
770 /* Bind to specific ethernet device. */
771 memset(&sll, 0, sizeof sll);
772 sll.sll_family = AF_PACKET;
773 sll.sll_ifindex = ifindex;
774 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
775 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
776 error = errno;
777 VLOG_ERR("%s: failed to bind raw socket (%s)",
778 netdev_get_name(netdev_), ovs_strerror(error));
779 goto error;
780 }
781
782 /* Filter for only inbound packets. */
783 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
784 sizeof fprog);
785 if (error) {
786 error = errno;
787 VLOG_ERR("%s: failed attach filter (%s)",
788 netdev_get_name(netdev_), ovs_strerror(error));
789 goto error;
790 }
791 }
792
793 rx = xmalloc(sizeof *rx);
794 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
795 rx->is_tap = is_tap;
796 rx->fd = fd;
797
798 *rxp = &rx->up;
799 return 0;
800
801 error:
802 if (fd >= 0) {
803 close(fd);
804 }
805 return error;
806 }
807
808 static void
809 netdev_rx_linux_destroy(struct netdev_rx *rx_)
810 {
811 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
812
813 if (!rx->is_tap) {
814 close(rx->fd);
815 }
816 free(rx);
817 }
818
819 static int
820 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
821 {
822 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
823 ssize_t retval;
824
825 do {
826 retval = (rx->is_tap
827 ? read(rx->fd, data, size)
828 : recv(rx->fd, data, size, MSG_TRUNC));
829 } while (retval < 0 && errno == EINTR);
830
831 if (retval >= 0) {
832 return retval > size ? -EMSGSIZE : retval;
833 } else {
834 if (errno != EAGAIN) {
835 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
836 ovs_strerror(errno), netdev_rx_get_name(rx_));
837 }
838 return -errno;
839 }
840 }
841
842 static void
843 netdev_rx_linux_wait(struct netdev_rx *rx_)
844 {
845 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
846 poll_fd_wait(rx->fd, POLLIN);
847 }
848
849 static int
850 netdev_rx_linux_drain(struct netdev_rx *rx_)
851 {
852 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
853 if (rx->is_tap) {
854 struct ifreq ifr;
855 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
856 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
857 if (error) {
858 return error;
859 }
860 drain_fd(rx->fd, ifr.ifr_qlen);
861 return 0;
862 } else {
863 return drain_rcvbuf(rx->fd);
864 }
865 }
866
867 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
868 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
869 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
870 * the packet is too big or too small to transmit on the device.
871 *
872 * The caller retains ownership of 'buffer' in all cases.
873 *
874 * The kernel maintains a packet transmission queue, so the caller is not
875 * expected to do additional queuing of packets. */
876 static int
877 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
878 {
879 for (;;) {
880 ssize_t retval;
881
882 if (!is_tap_netdev(netdev_)) {
883 /* Use our AF_PACKET socket to send to this device. */
884 struct sockaddr_ll sll;
885 struct msghdr msg;
886 struct iovec iov;
887 int ifindex;
888 int error;
889 int sock;
890
891 sock = af_packet_sock();
892 if (sock < 0) {
893 return -sock;
894 }
895
896 error = get_ifindex(netdev_, &ifindex);
897 if (error) {
898 return error;
899 }
900
901 /* We don't bother setting most fields in sockaddr_ll because the
902 * kernel ignores them for SOCK_RAW. */
903 memset(&sll, 0, sizeof sll);
904 sll.sll_family = AF_PACKET;
905 sll.sll_ifindex = ifindex;
906
907 iov.iov_base = CONST_CAST(void *, data);
908 iov.iov_len = size;
909
910 msg.msg_name = &sll;
911 msg.msg_namelen = sizeof sll;
912 msg.msg_iov = &iov;
913 msg.msg_iovlen = 1;
914 msg.msg_control = NULL;
915 msg.msg_controllen = 0;
916 msg.msg_flags = 0;
917
918 retval = sendmsg(sock, &msg, 0);
919 } else {
920 /* Use the tap fd to send to this device. This is essential for
921 * tap devices, because packets sent to a tap device with an
922 * AF_PACKET socket will loop back to be *received* again on the
923 * tap device. This doesn't occur on other interface types
924 * because we attach a socket filter to the rx socket. */
925 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
926
927 retval = write(netdev->state.tap.fd, data, size);
928 }
929
930 if (retval < 0) {
931 /* The Linux AF_PACKET implementation never blocks waiting for room
932 * for packets, instead returning ENOBUFS. Translate this into
933 * EAGAIN for the caller. */
934 if (errno == ENOBUFS) {
935 return EAGAIN;
936 } else if (errno == EINTR) {
937 continue;
938 } else if (errno != EAGAIN) {
939 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
940 netdev_get_name(netdev_), ovs_strerror(errno));
941 }
942 return errno;
943 } else if (retval != size) {
944 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
945 "%zu) on %s", retval, size, netdev_get_name(netdev_));
946 return EMSGSIZE;
947 } else {
948 return 0;
949 }
950 }
951 }
952
/* Registers with the poll loop to wake up from the next call to poll_block()
 * when the packet transmission queue has sufficient room to transmit a packet
 * with netdev_send().
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  Thus, this function is
 * unlikely to ever be used.  It is included for completeness.
 *
 * Only tap devices cause a wakeup here; for every other device this function
 * does nothing. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (!is_tap_netdev(netdev)) {
        return;
    }

    /* TAP device always accepts packets.*/
    poll_immediate_wake();
}
968
969 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
970 * otherwise a positive errno value. */
971 static int
972 netdev_linux_set_etheraddr(struct netdev *netdev_,
973 const uint8_t mac[ETH_ADDR_LEN])
974 {
975 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
976 struct netdev_saved_flags *sf = NULL;
977 int error;
978
979 if (netdev->cache_valid & VALID_ETHERADDR) {
980 if (netdev->ether_addr_error) {
981 return netdev->ether_addr_error;
982 }
983 if (eth_addr_equals(netdev->etheraddr, mac)) {
984 return 0;
985 }
986 netdev->cache_valid &= ~VALID_ETHERADDR;
987 }
988
989 /* Tap devices must be brought down before setting the address. */
990 if (is_tap_netdev(netdev_)) {
991 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
992 }
993 error = set_etheraddr(netdev_get_name(netdev_), mac);
994 if (!error || error == ENODEV) {
995 netdev->ether_addr_error = error;
996 netdev->cache_valid |= VALID_ETHERADDR;
997 if (!error) {
998 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
999 }
1000 }
1001
1002 netdev_restore_flags(sf);
1003
1004 return error;
1005 }
1006
1007 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1008 static int
1009 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1010 uint8_t mac[ETH_ADDR_LEN])
1011 {
1012 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1013
1014 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1015 int error = get_etheraddr(netdev_get_name(netdev_),
1016 netdev->etheraddr);
1017
1018 netdev->ether_addr_error = error;
1019 netdev->cache_valid |= VALID_ETHERADDR;
1020 }
1021
1022 if (!netdev->ether_addr_error) {
1023 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1024 }
1025
1026 return netdev->ether_addr_error;
1027 }
1028
1029 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1030 * in bytes, not including the hardware header; thus, this is typically 1500
1031 * bytes for Ethernet devices. */
1032 static int
1033 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1034 {
1035 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1036 if (!(netdev->cache_valid & VALID_MTU)) {
1037 struct ifreq ifr;
1038 int error;
1039
1040 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1041 SIOCGIFMTU, "SIOCGIFMTU");
1042
1043 netdev->netdev_mtu_error = error;
1044 netdev->mtu = ifr.ifr_mtu;
1045 netdev->cache_valid |= VALID_MTU;
1046 }
1047
1048 if (!netdev->netdev_mtu_error) {
1049 *mtup = netdev->mtu;
1050 }
1051 return netdev->netdev_mtu_error;
1052 }
1053
1054 /* Sets the maximum size of transmitted (MTU) for given device using linux
1055 * networking ioctl interface.
1056 */
1057 static int
1058 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1059 {
1060 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1061 struct ifreq ifr;
1062 int error;
1063
1064 if (netdev->cache_valid & VALID_MTU) {
1065 if (netdev->netdev_mtu_error) {
1066 return netdev->netdev_mtu_error;
1067 }
1068 if (netdev->mtu == mtu) {
1069 return 0;
1070 }
1071 netdev->cache_valid &= ~VALID_MTU;
1072 }
1073 ifr.ifr_mtu = mtu;
1074 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1075 SIOCSIFMTU, "SIOCSIFMTU");
1076 if (!error || error == ENODEV) {
1077 netdev->netdev_mtu_error = error;
1078 netdev->mtu = ifr.ifr_mtu;
1079 netdev->cache_valid |= VALID_MTU;
1080 }
1081 return error;
1082 }
1083
/* Looks up the ifindex of 'netdev' with get_ifindex().
 *
 * Returns the ifindex, as a positive number, on success.  On failure,
 * returns the negation of the error that get_ifindex() reported, i.e. a
 * negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1094
1095 static int
1096 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1097 {
1098 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1099
1100 if (netdev->miimon_interval > 0) {
1101 *carrier = netdev->miimon;
1102 } else {
1103 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1104 }
1105
1106 return 0;
1107 }
1108
1109 static long long int
1110 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1111 {
1112 return netdev_linux_cast(netdev)->carrier_resets;
1113 }
1114
1115 static int
1116 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1117 struct mii_ioctl_data *data)
1118 {
1119 struct ifreq ifr;
1120 int error;
1121
1122 memset(&ifr, 0, sizeof ifr);
1123 memcpy(&ifr.ifr_data, data, sizeof *data);
1124 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1125 memcpy(data, &ifr.ifr_data, sizeof *data);
1126
1127 return error;
1128 }
1129
1130 static int
1131 netdev_linux_get_miimon(const char *name, bool *miimon)
1132 {
1133 struct mii_ioctl_data data;
1134 int error;
1135
1136 *miimon = false;
1137
1138 memset(&data, 0, sizeof data);
1139 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1140 if (!error) {
1141 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1142 data.reg_num = MII_BMSR;
1143 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1144 &data);
1145
1146 if (!error) {
1147 *miimon = !!(data.val_out & BMSR_LSTATUS);
1148 } else {
1149 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1150 }
1151 } else {
1152 struct ethtool_cmd ecmd;
1153
1154 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1155 name);
1156
1157 COVERAGE_INC(netdev_get_ethtool);
1158 memset(&ecmd, 0, sizeof ecmd);
1159 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1160 "ETHTOOL_GLINK");
1161 if (!error) {
1162 struct ethtool_value eval;
1163
1164 memcpy(&eval, &ecmd, sizeof eval);
1165 *miimon = !!eval.data;
1166 } else {
1167 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1168 }
1169 }
1170
1171 return error;
1172 }
1173
1174 static int
1175 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1176 long long int interval)
1177 {
1178 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1179
1180 interval = interval > 0 ? MAX(interval, 100) : 0;
1181 if (netdev->miimon_interval != interval) {
1182 netdev->miimon_interval = interval;
1183 timer_set_expired(&netdev->miimon_timer);
1184 }
1185
1186 return 0;
1187 }
1188
1189 static void
1190 netdev_linux_miimon_run(void)
1191 {
1192 struct shash device_shash;
1193 struct shash_node *node;
1194
1195 shash_init(&device_shash);
1196 netdev_get_devices(&netdev_linux_class, &device_shash);
1197 SHASH_FOR_EACH (node, &device_shash) {
1198 struct netdev *netdev = node->data;
1199 struct netdev_linux *dev = netdev_linux_cast(netdev);
1200 bool miimon;
1201
1202 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1203 continue;
1204 }
1205
1206 netdev_linux_get_miimon(dev->up.name, &miimon);
1207 if (miimon != dev->miimon) {
1208 dev->miimon = miimon;
1209 netdev_linux_changed(dev, dev->ifi_flags, 0);
1210 }
1211
1212 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1213 }
1214
1215 shash_destroy(&device_shash);
1216 }
1217
1218 static void
1219 netdev_linux_miimon_wait(void)
1220 {
1221 struct shash device_shash;
1222 struct shash_node *node;
1223
1224 shash_init(&device_shash);
1225 netdev_get_devices(&netdev_linux_class, &device_shash);
1226 SHASH_FOR_EACH (node, &device_shash) {
1227 struct netdev *netdev = node->data;
1228 struct netdev_linux *dev = netdev_linux_cast(netdev);
1229
1230 if (dev->miimon_interval > 0) {
1231 timer_wait(&dev->miimon_timer);
1232 }
1233 }
1234 shash_destroy(&device_shash);
1235 }
1236
1237 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1238 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1239 * enabled. */
1240 static bool
1241 check_for_working_netlink_stats(void)
1242 {
1243 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1244 * preferable, so if that works, we'll use it. */
1245 int ifindex = do_get_ifindex("lo");
1246 if (ifindex < 0) {
1247 VLOG_WARN("failed to get ifindex for lo, "
1248 "obtaining netdev stats from proc");
1249 return false;
1250 } else {
1251 struct netdev_stats stats;
1252 int error = get_stats_via_netlink(ifindex, &stats);
1253 if (!error) {
1254 VLOG_DBG("obtaining netdev stats via rtnetlink");
1255 return true;
1256 } else {
1257 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1258 "via proc (you are probably running a pre-2.6.19 "
1259 "kernel)", ovs_strerror(error));
1260 return false;
1261 }
1262 }
1263 }
1264
/* Exchanges the values of '*a' and '*b'.  'a' and 'b' may point to the same
 * object, in which case its value is unchanged. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t saved_b = *b;

    *b = *a;
    *a = saved_b;
}
1272
1273 /* Copies 'src' into 'dst', performing format conversion in the process.
1274 *
1275 * 'src' is allowed to be misaligned. */
1276 static void
1277 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1278 const struct ovs_vport_stats *src)
1279 {
1280 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1281 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1282 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1283 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1284 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1285 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1286 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1287 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1288 dst->multicast = 0;
1289 dst->collisions = 0;
1290 dst->rx_length_errors = 0;
1291 dst->rx_over_errors = 0;
1292 dst->rx_crc_errors = 0;
1293 dst->rx_frame_errors = 0;
1294 dst->rx_fifo_errors = 0;
1295 dst->rx_missed_errors = 0;
1296 dst->tx_aborted_errors = 0;
1297 dst->tx_carrier_errors = 0;
1298 dst->tx_fifo_errors = 0;
1299 dst->tx_heartbeat_errors = 0;
1300 dst->tx_window_errors = 0;
1301 }
1302
1303 static int
1304 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1305 {
1306 struct dpif_linux_vport reply;
1307 struct ofpbuf *buf;
1308 int error;
1309
1310 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1311 if (error) {
1312 return error;
1313 } else if (!reply.stats) {
1314 ofpbuf_delete(buf);
1315 return EOPNOTSUPP;
1316 }
1317
1318 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1319
1320 ofpbuf_delete(buf);
1321
1322 return 0;
1323 }
1324
1325 static void
1326 get_stats_via_vport(const struct netdev *netdev_,
1327 struct netdev_stats *stats)
1328 {
1329 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1330
1331 if (!netdev->vport_stats_error ||
1332 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1333 int error;
1334
1335 error = get_stats_via_vport__(netdev_, stats);
1336 if (error && error != ENOENT) {
1337 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1338 "(%s)",
1339 netdev_get_name(netdev_), ovs_strerror(error));
1340 }
1341 netdev->vport_stats_error = error;
1342 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1343 }
1344 }
1345
1346 static int
1347 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1348 struct netdev_stats *stats)
1349 {
1350 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1351 static int use_netlink_stats;
1352 int error;
1353
1354 if (ovsthread_once_start(&once)) {
1355 use_netlink_stats = check_for_working_netlink_stats();
1356 ovsthread_once_done(&once);
1357 }
1358
1359 if (use_netlink_stats) {
1360 int ifindex;
1361
1362 error = get_ifindex(netdev_, &ifindex);
1363 if (!error) {
1364 error = get_stats_via_netlink(ifindex, stats);
1365 }
1366 } else {
1367 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1368 }
1369
1370 if (error) {
1371 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1372 netdev_get_name(netdev_), error);
1373 }
1374 return error;
1375
1376 }
1377
1378 /* Retrieves current device stats for 'netdev-linux'. */
1379 static int
1380 netdev_linux_get_stats(const struct netdev *netdev_,
1381 struct netdev_stats *stats)
1382 {
1383 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1384 struct netdev_stats dev_stats;
1385 int error;
1386
1387 get_stats_via_vport(netdev_, stats);
1388
1389 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1390
1391 if (error) {
1392 if (netdev->vport_stats_error) {
1393 return error;
1394 } else {
1395 return 0;
1396 }
1397 }
1398
1399 if (netdev->vport_stats_error) {
1400 /* stats not available from OVS then use ioctl stats. */
1401 *stats = dev_stats;
1402 } else {
1403 stats->rx_errors += dev_stats.rx_errors;
1404 stats->tx_errors += dev_stats.tx_errors;
1405 stats->rx_dropped += dev_stats.rx_dropped;
1406 stats->tx_dropped += dev_stats.tx_dropped;
1407 stats->multicast += dev_stats.multicast;
1408 stats->collisions += dev_stats.collisions;
1409 stats->rx_length_errors += dev_stats.rx_length_errors;
1410 stats->rx_over_errors += dev_stats.rx_over_errors;
1411 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1412 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1413 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1414 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1415 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1416 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1417 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1418 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1419 stats->tx_window_errors += dev_stats.tx_window_errors;
1420 }
1421 return 0;
1422 }
1423
1424 /* Retrieves current device stats for 'netdev-tap' netdev or
1425 * netdev-internal. */
1426 static int
1427 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1428 {
1429 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1430 struct netdev_stats dev_stats;
1431 int error;
1432
1433 get_stats_via_vport(netdev_, stats);
1434
1435 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1436 if (error) {
1437 if (netdev->vport_stats_error) {
1438 return error;
1439 } else {
1440 return 0;
1441 }
1442 }
1443
1444 /* If this port is an internal port then the transmit and receive stats
1445 * will appear to be swapped relative to the other ports since we are the
1446 * one sending the data, not a remote computer. For consistency, we swap
1447 * them back here. This does not apply if we are getting stats from the
1448 * vport layer because it always tracks stats from the perspective of the
1449 * switch. */
1450 if (netdev->vport_stats_error) {
1451 *stats = dev_stats;
1452 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1453 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1454 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1455 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1456 stats->rx_length_errors = 0;
1457 stats->rx_over_errors = 0;
1458 stats->rx_crc_errors = 0;
1459 stats->rx_frame_errors = 0;
1460 stats->rx_fifo_errors = 0;
1461 stats->rx_missed_errors = 0;
1462 stats->tx_aborted_errors = 0;
1463 stats->tx_carrier_errors = 0;
1464 stats->tx_fifo_errors = 0;
1465 stats->tx_heartbeat_errors = 0;
1466 stats->tx_window_errors = 0;
1467 } else {
1468 stats->rx_dropped += dev_stats.tx_dropped;
1469 stats->tx_dropped += dev_stats.rx_dropped;
1470
1471 stats->rx_errors += dev_stats.tx_errors;
1472 stats->tx_errors += dev_stats.rx_errors;
1473
1474 stats->multicast += dev_stats.multicast;
1475 stats->collisions += dev_stats.collisions;
1476 }
1477 return 0;
1478 }
1479
1480 static int
1481 netdev_internal_get_stats(const struct netdev *netdev_,
1482 struct netdev_stats *stats)
1483 {
1484 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1485
1486 get_stats_via_vport(netdev_, stats);
1487 return netdev->vport_stats_error;
1488 }
1489
1490 static int
1491 netdev_internal_set_stats(struct netdev *netdev,
1492 const struct netdev_stats *stats)
1493 {
1494 struct ovs_vport_stats vport_stats;
1495 struct dpif_linux_vport vport;
1496 int err;
1497
1498 vport_stats.rx_packets = stats->rx_packets;
1499 vport_stats.tx_packets = stats->tx_packets;
1500 vport_stats.rx_bytes = stats->rx_bytes;
1501 vport_stats.tx_bytes = stats->tx_bytes;
1502 vport_stats.rx_errors = stats->rx_errors;
1503 vport_stats.tx_errors = stats->tx_errors;
1504 vport_stats.rx_dropped = stats->rx_dropped;
1505 vport_stats.tx_dropped = stats->tx_dropped;
1506
1507 dpif_linux_vport_init(&vport);
1508 vport.cmd = OVS_VPORT_CMD_SET;
1509 vport.name = netdev_get_name(netdev);
1510 vport.stats = &vport_stats;
1511
1512 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1513
1514 /* If the vport layer doesn't know about the device, that doesn't mean it
1515 * doesn't exist (after all were able to open it when netdev_open() was
1516 * called), it just means that it isn't attached and we'll be getting
1517 * stats a different way. */
1518 if (err == ENODEV) {
1519 err = EOPNOTSUPP;
1520 }
1521
1522 return err;
1523 }
1524
1525 static void
1526 netdev_linux_read_features(struct netdev_linux *netdev)
1527 {
1528 struct ethtool_cmd ecmd;
1529 uint32_t speed;
1530 int error;
1531
1532 if (netdev->cache_valid & VALID_FEATURES) {
1533 return;
1534 }
1535
1536 COVERAGE_INC(netdev_get_ethtool);
1537 memset(&ecmd, 0, sizeof ecmd);
1538 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1539 ETHTOOL_GSET, "ETHTOOL_GSET");
1540 if (error) {
1541 goto out;
1542 }
1543
1544 /* Supported features. */
1545 netdev->supported = 0;
1546 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1547 netdev->supported |= NETDEV_F_10MB_HD;
1548 }
1549 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1550 netdev->supported |= NETDEV_F_10MB_FD;
1551 }
1552 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1553 netdev->supported |= NETDEV_F_100MB_HD;
1554 }
1555 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1556 netdev->supported |= NETDEV_F_100MB_FD;
1557 }
1558 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1559 netdev->supported |= NETDEV_F_1GB_HD;
1560 }
1561 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1562 netdev->supported |= NETDEV_F_1GB_FD;
1563 }
1564 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1565 netdev->supported |= NETDEV_F_10GB_FD;
1566 }
1567 if (ecmd.supported & SUPPORTED_TP) {
1568 netdev->supported |= NETDEV_F_COPPER;
1569 }
1570 if (ecmd.supported & SUPPORTED_FIBRE) {
1571 netdev->supported |= NETDEV_F_FIBER;
1572 }
1573 if (ecmd.supported & SUPPORTED_Autoneg) {
1574 netdev->supported |= NETDEV_F_AUTONEG;
1575 }
1576 if (ecmd.supported & SUPPORTED_Pause) {
1577 netdev->supported |= NETDEV_F_PAUSE;
1578 }
1579 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1580 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1581 }
1582
1583 /* Advertised features. */
1584 netdev->advertised = 0;
1585 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1586 netdev->advertised |= NETDEV_F_10MB_HD;
1587 }
1588 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1589 netdev->advertised |= NETDEV_F_10MB_FD;
1590 }
1591 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1592 netdev->advertised |= NETDEV_F_100MB_HD;
1593 }
1594 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1595 netdev->advertised |= NETDEV_F_100MB_FD;
1596 }
1597 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1598 netdev->advertised |= NETDEV_F_1GB_HD;
1599 }
1600 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1601 netdev->advertised |= NETDEV_F_1GB_FD;
1602 }
1603 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1604 netdev->advertised |= NETDEV_F_10GB_FD;
1605 }
1606 if (ecmd.advertising & ADVERTISED_TP) {
1607 netdev->advertised |= NETDEV_F_COPPER;
1608 }
1609 if (ecmd.advertising & ADVERTISED_FIBRE) {
1610 netdev->advertised |= NETDEV_F_FIBER;
1611 }
1612 if (ecmd.advertising & ADVERTISED_Autoneg) {
1613 netdev->advertised |= NETDEV_F_AUTONEG;
1614 }
1615 if (ecmd.advertising & ADVERTISED_Pause) {
1616 netdev->advertised |= NETDEV_F_PAUSE;
1617 }
1618 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1619 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1620 }
1621
1622 /* Current settings. */
1623 speed = ecmd.speed;
1624 if (speed == SPEED_10) {
1625 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1626 } else if (speed == SPEED_100) {
1627 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1628 } else if (speed == SPEED_1000) {
1629 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1630 } else if (speed == SPEED_10000) {
1631 netdev->current = NETDEV_F_10GB_FD;
1632 } else if (speed == 40000) {
1633 netdev->current = NETDEV_F_40GB_FD;
1634 } else if (speed == 100000) {
1635 netdev->current = NETDEV_F_100GB_FD;
1636 } else if (speed == 1000000) {
1637 netdev->current = NETDEV_F_1TB_FD;
1638 } else {
1639 netdev->current = 0;
1640 }
1641
1642 if (ecmd.port == PORT_TP) {
1643 netdev->current |= NETDEV_F_COPPER;
1644 } else if (ecmd.port == PORT_FIBRE) {
1645 netdev->current |= NETDEV_F_FIBER;
1646 }
1647
1648 if (ecmd.autoneg) {
1649 netdev->current |= NETDEV_F_AUTONEG;
1650 }
1651
1652 /* Peer advertisements. */
1653 netdev->peer = 0; /* XXX */
1654
1655 out:
1656 netdev->cache_valid |= VALID_FEATURES;
1657 netdev->get_features_error = error;
1658 }
1659
1660 /* Stores the features supported by 'netdev' into each of '*current',
1661 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1662 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1663 * errno value. */
1664 static int
1665 netdev_linux_get_features(const struct netdev *netdev_,
1666 enum netdev_features *current,
1667 enum netdev_features *advertised,
1668 enum netdev_features *supported,
1669 enum netdev_features *peer)
1670 {
1671 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1672
1673 netdev_linux_read_features(netdev);
1674
1675 if (!netdev->get_features_error) {
1676 *current = netdev->current;
1677 *advertised = netdev->advertised;
1678 *supported = netdev->supported;
1679 *peer = netdev->peer;
1680 }
1681 return netdev->get_features_error;
1682 }
1683
1684 /* Set the features advertised by 'netdev' to 'advertise'. */
1685 static int
1686 netdev_linux_set_advertisements(struct netdev *netdev,
1687 enum netdev_features advertise)
1688 {
1689 struct ethtool_cmd ecmd;
1690 int error;
1691
1692 COVERAGE_INC(netdev_get_ethtool);
1693 memset(&ecmd, 0, sizeof ecmd);
1694 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1695 ETHTOOL_GSET, "ETHTOOL_GSET");
1696 if (error) {
1697 return error;
1698 }
1699
1700 ecmd.advertising = 0;
1701 if (advertise & NETDEV_F_10MB_HD) {
1702 ecmd.advertising |= ADVERTISED_10baseT_Half;
1703 }
1704 if (advertise & NETDEV_F_10MB_FD) {
1705 ecmd.advertising |= ADVERTISED_10baseT_Full;
1706 }
1707 if (advertise & NETDEV_F_100MB_HD) {
1708 ecmd.advertising |= ADVERTISED_100baseT_Half;
1709 }
1710 if (advertise & NETDEV_F_100MB_FD) {
1711 ecmd.advertising |= ADVERTISED_100baseT_Full;
1712 }
1713 if (advertise & NETDEV_F_1GB_HD) {
1714 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1715 }
1716 if (advertise & NETDEV_F_1GB_FD) {
1717 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1718 }
1719 if (advertise & NETDEV_F_10GB_FD) {
1720 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1721 }
1722 if (advertise & NETDEV_F_COPPER) {
1723 ecmd.advertising |= ADVERTISED_TP;
1724 }
1725 if (advertise & NETDEV_F_FIBER) {
1726 ecmd.advertising |= ADVERTISED_FIBRE;
1727 }
1728 if (advertise & NETDEV_F_AUTONEG) {
1729 ecmd.advertising |= ADVERTISED_Autoneg;
1730 }
1731 if (advertise & NETDEV_F_PAUSE) {
1732 ecmd.advertising |= ADVERTISED_Pause;
1733 }
1734 if (advertise & NETDEV_F_PAUSE_ASYM) {
1735 ecmd.advertising |= ADVERTISED_Asym_Pause;
1736 }
1737 COVERAGE_INC(netdev_set_ethtool);
1738 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1739 ETHTOOL_SSET, "ETHTOOL_SSET");
1740 }
1741
1742 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1743 * successful, otherwise a positive errno value. */
1744 static int
1745 netdev_linux_set_policing(struct netdev *netdev_,
1746 uint32_t kbits_rate, uint32_t kbits_burst)
1747 {
1748 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1749 const char *netdev_name = netdev_get_name(netdev_);
1750 int error;
1751
1752
1753 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1754 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1755 : kbits_burst); /* Stick with user-specified value. */
1756
1757 if (netdev->cache_valid & VALID_POLICING) {
1758 if (netdev->netdev_policing_error) {
1759 return netdev->netdev_policing_error;
1760 }
1761
1762 if (netdev->kbits_rate == kbits_rate &&
1763 netdev->kbits_burst == kbits_burst) {
1764 /* Assume that settings haven't changed since we last set them. */
1765 return 0;
1766 }
1767 netdev->cache_valid &= ~VALID_POLICING;
1768 }
1769
1770 COVERAGE_INC(netdev_set_policing);
1771 /* Remove any existing ingress qdisc. */
1772 error = tc_add_del_ingress_qdisc(netdev_, false);
1773 if (error) {
1774 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1775 netdev_name, ovs_strerror(error));
1776 goto out;
1777 }
1778
1779 if (kbits_rate) {
1780 error = tc_add_del_ingress_qdisc(netdev_, true);
1781 if (error) {
1782 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1783 netdev_name, ovs_strerror(error));
1784 goto out;
1785 }
1786
1787 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1788 if (error){
1789 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1790 netdev_name, ovs_strerror(error));
1791 goto out;
1792 }
1793 }
1794
1795 netdev->kbits_rate = kbits_rate;
1796 netdev->kbits_burst = kbits_burst;
1797
1798 out:
1799 if (!error || error == ENODEV) {
1800 netdev->netdev_policing_error = error;
1801 netdev->cache_valid |= VALID_POLICING;
1802 }
1803 return error;
1804 }
1805
1806 static int
1807 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1808 struct sset *types)
1809 {
1810 const struct tc_ops *const *opsp;
1811
1812 for (opsp = tcs; *opsp != NULL; opsp++) {
1813 const struct tc_ops *ops = *opsp;
1814 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1815 sset_add(types, ops->ovs_name);
1816 }
1817 }
1818 return 0;
1819 }
1820
1821 static const struct tc_ops *
1822 tc_lookup_ovs_name(const char *name)
1823 {
1824 const struct tc_ops *const *opsp;
1825
1826 for (opsp = tcs; *opsp != NULL; opsp++) {
1827 const struct tc_ops *ops = *opsp;
1828 if (!strcmp(name, ops->ovs_name)) {
1829 return ops;
1830 }
1831 }
1832 return NULL;
1833 }
1834
1835 static const struct tc_ops *
1836 tc_lookup_linux_name(const char *name)
1837 {
1838 const struct tc_ops *const *opsp;
1839
1840 for (opsp = tcs; *opsp != NULL; opsp++) {
1841 const struct tc_ops *ops = *opsp;
1842 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1843 return ops;
1844 }
1845 }
1846 return NULL;
1847 }
1848
1849 static struct tc_queue *
1850 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1851 size_t hash)
1852 {
1853 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1854 struct tc_queue *queue;
1855
1856 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1857 if (queue->queue_id == queue_id) {
1858 return queue;
1859 }
1860 }
1861 return NULL;
1862 }
1863
/* Returns the queue of 'netdev' whose ID is 'queue_id', or NULL if there is
 * none.  The map is searched under the hash hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    const size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1869
1870 static int
1871 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1872 const char *type,
1873 struct netdev_qos_capabilities *caps)
1874 {
1875 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1876 if (!ops) {
1877 return EOPNOTSUPP;
1878 }
1879 caps->n_queues = ops->n_queues;
1880 return 0;
1881 }
1882
1883 static int
1884 netdev_linux_get_qos(const struct netdev *netdev_,
1885 const char **typep, struct smap *details)
1886 {
1887 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1888 int error;
1889
1890 error = tc_query_qdisc(netdev_);
1891 if (error) {
1892 return error;
1893 }
1894
1895 *typep = netdev->tc->ops->ovs_name;
1896 return (netdev->tc->ops->qdisc_get
1897 ? netdev->tc->ops->qdisc_get(netdev_, details)
1898 : 0);
1899 }
1900
1901 static int
1902 netdev_linux_set_qos(struct netdev *netdev_,
1903 const char *type, const struct smap *details)
1904 {
1905 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1906 const struct tc_ops *new_ops;
1907 int error;
1908
1909 new_ops = tc_lookup_ovs_name(type);
1910 if (!new_ops || !new_ops->tc_install) {
1911 return EOPNOTSUPP;
1912 }
1913
1914 error = tc_query_qdisc(netdev_);
1915 if (error) {
1916 return error;
1917 }
1918
1919 if (new_ops == netdev->tc->ops) {
1920 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1921 } else {
1922 /* Delete existing qdisc. */
1923 error = tc_del_qdisc(netdev_);
1924 if (error) {
1925 return error;
1926 }
1927 ovs_assert(netdev->tc == NULL);
1928
1929 /* Install new qdisc. */
1930 error = new_ops->tc_install(netdev_, details);
1931 ovs_assert((error == 0) == (netdev->tc != NULL));
1932
1933 return error;
1934 }
1935 }
1936
1937 static int
1938 netdev_linux_get_queue(const struct netdev *netdev_,
1939 unsigned int queue_id, struct smap *details)
1940 {
1941 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1942 int error;
1943
1944 error = tc_query_qdisc(netdev_);
1945 if (error) {
1946 return error;
1947 } else {
1948 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1949 return (queue
1950 ? netdev->tc->ops->class_get(netdev_, queue, details)
1951 : ENOENT);
1952 }
1953 }
1954
1955 static int
1956 netdev_linux_set_queue(struct netdev *netdev_,
1957 unsigned int queue_id, const struct smap *details)
1958 {
1959 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1960 int error;
1961
1962 error = tc_query_qdisc(netdev_);
1963 if (error) {
1964 return error;
1965 } else if (queue_id >= netdev->tc->ops->n_queues
1966 || !netdev->tc->ops->class_set) {
1967 return EINVAL;
1968 }
1969
1970 return netdev->tc->ops->class_set(netdev_, queue_id, details);
1971 }
1972
1973 static int
1974 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1975 {
1976 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1977 int error;
1978
1979 error = tc_query_qdisc(netdev_);
1980 if (error) {
1981 return error;
1982 } else if (!netdev->tc->ops->class_delete) {
1983 return EINVAL;
1984 } else {
1985 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1986 return (queue
1987 ? netdev->tc->ops->class_delete(netdev_, queue)
1988 : ENOENT);
1989 }
1990 }
1991
1992 static int
1993 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1994 unsigned int queue_id,
1995 struct netdev_queue_stats *stats)
1996 {
1997 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1998 int error;
1999
2000 error = tc_query_qdisc(netdev_);
2001 if (error) {
2002 return error;
2003 } else if (!netdev->tc->ops->class_get_stats) {
2004 return EOPNOTSUPP;
2005 } else {
2006 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2007 if (!queue) {
2008 return ENOENT;
2009 }
2010 stats->created = queue->created;
2011 return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
2012 }
2013 }
2014
2015 static bool
2016 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2017 {
2018 struct ofpbuf request;
2019 struct tcmsg *tcmsg;
2020
2021 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2022 if (!tcmsg) {
2023 return false;
2024 }
2025 tcmsg->tcm_parent = 0;
2026 nl_dump_start(dump, NETLINK_ROUTE, &request);
2027 ofpbuf_uninit(&request);
2028 return true;
2029 }
2030
2031 static int
2032 netdev_linux_dump_queues(const struct netdev *netdev_,
2033 netdev_dump_queues_cb *cb, void *aux)
2034 {
2035 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2036 struct tc_queue *queue, *next_queue;
2037 struct smap details;
2038 int last_error;
2039 int error;
2040
2041 error = tc_query_qdisc(netdev_);
2042 if (error) {
2043 return error;
2044 } else if (!netdev->tc->ops->class_get) {
2045 return EOPNOTSUPP;
2046 }
2047
2048 last_error = 0;
2049 smap_init(&details);
2050 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2051 &netdev->tc->queues) {
2052 smap_clear(&details);
2053
2054 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2055 if (!error) {
2056 (*cb)(queue->queue_id, &details, aux);
2057 } else {
2058 last_error = error;
2059 }
2060 }
2061 smap_destroy(&details);
2062
2063 return last_error;
2064 }
2065
2066 static int
2067 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2068 netdev_dump_queue_stats_cb *cb, void *aux)
2069 {
2070 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2071 struct nl_dump dump;
2072 struct ofpbuf msg;
2073 int last_error;
2074 int error;
2075
2076 error = tc_query_qdisc(netdev_);
2077 if (error) {
2078 return error;
2079 } else if (!netdev->tc->ops->class_dump_stats) {
2080 return EOPNOTSUPP;
2081 }
2082
2083 last_error = 0;
2084 if (!start_queue_dump(netdev_, &dump)) {
2085 return ENODEV;
2086 }
2087 while (nl_dump_next(&dump, &msg)) {
2088 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2089 if (error) {
2090 last_error = error;
2091 }
2092 }
2093
2094 error = nl_dump_done(&dump);
2095 return error ? error : last_error;
2096 }
2097
2098 static int
2099 netdev_linux_get_in4(const struct netdev *netdev_,
2100 struct in_addr *address, struct in_addr *netmask)
2101 {
2102 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2103
2104 if (!(netdev->cache_valid & VALID_IN4)) {
2105 int error;
2106
2107 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2108 SIOCGIFADDR, "SIOCGIFADDR");
2109 if (error) {
2110 return error;
2111 }
2112
2113 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2114 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2115 if (error) {
2116 return error;
2117 }
2118
2119 netdev->cache_valid |= VALID_IN4;
2120 }
2121 *address = netdev->address;
2122 *netmask = netdev->netmask;
2123 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2124 }
2125
2126 static int
2127 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2128 struct in_addr netmask)
2129 {
2130 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2131 int error;
2132
2133 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2134 if (!error) {
2135 netdev->cache_valid |= VALID_IN4;
2136 netdev->address = address;
2137 netdev->netmask = netmask;
2138 if (address.s_addr != INADDR_ANY) {
2139 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2140 "SIOCSIFNETMASK", netmask);
2141 }
2142 }
2143 return error;
2144 }
2145
/* Parses 'line', which is expected to look like one line of
 * /proc/net/if_inet6: 32 hexadecimal digits giving the 16 address bytes,
 * four hexadecimal fields that are skipped, and an interface name of at most
 * 16 characters.
 *
 * On success stores the address bytes in 'in6' and the interface name in
 * 'ifname' and returns true.  Returns false if any of the 17 stored fields
 * could not be converted, in which case 'in6' and 'ifname' may have been
 * partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *b = in6->s6_addr;
    int n_converted;

#define X8 "%2"SCNx8
    n_converted = sscanf(line,
                         " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         b + 0, b + 1, b + 2, b + 3,
                         b + 4, b + 5, b + 6, b + 7,
                         b + 8, b + 9, b + 10, b + 11,
                         b + 12, b + 13, b + 14, b + 15,
                         ifname);

    /* 16 address bytes plus the interface name. */
    return n_converted == 17;
}
2161
2162 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2163 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2164 static int
2165 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2166 {
2167 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2168 if (!(netdev->cache_valid & VALID_IN6)) {
2169 FILE *file;
2170 char line[128];
2171
2172 netdev->in6 = in6addr_any;
2173
2174 file = fopen("/proc/net/if_inet6", "r");
2175 if (file != NULL) {
2176 const char *name = netdev_get_name(netdev_);
2177 while (fgets(line, sizeof line, file)) {
2178 struct in6_addr in6_tmp;
2179 char ifname[16 + 1];
2180 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2181 && !strcmp(name, ifname))
2182 {
2183 netdev->in6 = in6_tmp;
2184 break;
2185 }
2186 }
2187 fclose(file);
2188 }
2189 netdev->cache_valid |= VALID_IN6;
2190 }
2191 *in6 = netdev->in6;
2192 return 0;
2193 }
2194
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  All of
 * '*sa' is zeroed first, so no stale bytes remain in it. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2207
2208 static int
2209 do_set_addr(struct netdev *netdev,
2210 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2211 {
2212 struct ifreq ifr;
2213 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2214 make_in4_sockaddr(&ifr.ifr_addr, addr);
2215
2216 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2217 ioctl_name);
2218 }
2219
2220 /* Adds 'router' as a default IP gateway. */
2221 static int
2222 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2223 {
2224 struct in_addr any = { INADDR_ANY };
2225 struct rtentry rt;
2226 int error;
2227
2228 memset(&rt, 0, sizeof rt);
2229 make_in4_sockaddr(&rt.rt_dst, any);
2230 make_in4_sockaddr(&rt.rt_gateway, router);
2231 make_in4_sockaddr(&rt.rt_genmask, any);
2232 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2233 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2234 if (error) {
2235 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2236 }
2237 return error;
2238 }
2239
2240 static int
2241 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2242 char **netdev_name)
2243 {
2244 static const char fn[] = "/proc/net/route";
2245 FILE *stream;
2246 char line[256];
2247 int ln;
2248
2249 *netdev_name = NULL;
2250 stream = fopen(fn, "r");
2251 if (stream == NULL) {
2252 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2253 return errno;
2254 }
2255
2256 ln = 0;
2257 while (fgets(line, sizeof line, stream)) {
2258 if (++ln >= 2) {
2259 char iface[17];
2260 ovs_be32 dest, gateway, mask;
2261 int refcnt, metric, mtu;
2262 unsigned int flags, use, window, irtt;
2263
2264 if (sscanf(line,
2265 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2266 " %d %u %u\n",
2267 iface, &dest, &gateway, &flags, &refcnt,
2268 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2269
2270 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2271 fn, ln, line);
2272 continue;
2273 }
2274 if (!(flags & RTF_UP)) {
2275 /* Skip routes that aren't up. */
2276 continue;
2277 }
2278
2279 /* The output of 'dest', 'mask', and 'gateway' were given in
2280 * network byte order, so we don't need need any endian
2281 * conversions here. */
2282 if ((dest & mask) == (host->s_addr & mask)) {
2283 if (!gateway) {
2284 /* The host is directly reachable. */
2285 next_hop->s_addr = 0;
2286 } else {
2287 /* To reach the host, we must go through a gateway. */
2288 next_hop->s_addr = gateway;
2289 }
2290 *netdev_name = xstrdup(iface);
2291 fclose(stream);
2292 return 0;
2293 }
2294 }
2295 }
2296
2297 fclose(stream);
2298 return ENXIO;
2299 }
2300
2301 static int
2302 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2303 {
2304 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2305 int error = 0;
2306
2307 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2308 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2309
2310 COVERAGE_INC(netdev_get_ethtool);
2311 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2312 error = netdev_linux_do_ethtool(netdev->up.name,
2313 cmd,
2314 ETHTOOL_GDRVINFO,
2315 "ETHTOOL_GDRVINFO");
2316 if (!error) {
2317 netdev->cache_valid |= VALID_DRVINFO;
2318 }
2319 }
2320
2321 if (!error) {
2322 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2323 smap_add(smap, "driver_version", netdev->drvinfo.version);
2324 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2325 }
2326 return error;
2327 }
2328
2329 static int
2330 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2331 struct smap *smap)
2332 {
2333 smap_add(smap, "driver_name", "openvswitch");
2334 return 0;
2335 }
2336
2337 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2338 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2339 * returns 0. Otherwise, it returns a positive errno value; in particular,
2340 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2341 static int
2342 netdev_linux_arp_lookup(const struct netdev *netdev,
2343 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2344 {
2345 struct arpreq r;
2346 struct sockaddr_in sin;
2347 int retval;
2348
2349 memset(&r, 0, sizeof r);
2350 memset(&sin, 0, sizeof sin);
2351 sin.sin_family = AF_INET;
2352 sin.sin_addr.s_addr = ip;
2353 sin.sin_port = 0;
2354 memcpy(&r.arp_pa, &sin, sizeof sin);
2355 r.arp_ha.sa_family = ARPHRD_ETHER;
2356 r.arp_flags = 0;
2357 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2358 COVERAGE_INC(netdev_arp_lookup);
2359 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2360 if (!retval) {
2361 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2362 } else if (retval != ENXIO) {
2363 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2364 netdev_get_name(netdev), IP_ARGS(ip),
2365 ovs_strerror(retval));
2366 }
2367 return retval;
2368 }
2369
2370 static int
2371 nd_to_iff_flags(enum netdev_flags nd)
2372 {
2373 int iff = 0;
2374 if (nd & NETDEV_UP) {
2375 iff |= IFF_UP;
2376 }
2377 if (nd & NETDEV_PROMISC) {
2378 iff |= IFF_PROMISC;
2379 }
2380 return iff;
2381 }
2382
2383 static int
2384 iff_to_nd_flags(int iff)
2385 {
2386 enum netdev_flags nd = 0;
2387 if (iff & IFF_UP) {
2388 nd |= NETDEV_UP;
2389 }
2390 if (iff & IFF_PROMISC) {
2391 nd |= NETDEV_PROMISC;
2392 }
2393 return nd;
2394 }
2395
2396 static int
2397 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2398 enum netdev_flags on, enum netdev_flags *old_flagsp)
2399 {
2400 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2401 int old_flags, new_flags;
2402 int error = 0;
2403
2404 old_flags = netdev->ifi_flags;
2405 *old_flagsp = iff_to_nd_flags(old_flags);
2406 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2407 if (new_flags != old_flags) {
2408 error = set_flags(netdev_get_name(netdev_), new_flags);
2409 get_flags(netdev_, &netdev->ifi_flags);
2410 }
2411 return error;
2412 }
2413
2414 static unsigned int
2415 netdev_linux_change_seq(const struct netdev *netdev)
2416 {
2417 return netdev_linux_cast(netdev)->change_seq;
2418 }
2419
/* Expands to a positional initializer for a 'struct netdev_class' that is
 * shared by the Linux netdev classes defined in this file.
 *
 * The classes differ only in the six macro parameters: NAME (the class
 * name), CREATE (the creation function), GET_STATS and SET_STATS (the
 * statistics functions), GET_FEATURES, and GET_STATUS.  Every other slot is
 * filled with the same netdev_linux_*() function, or with NULL.
 *
 * The initializer is positional, so the order of the entries must match the
 * order of the members of 'struct netdev_class'. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS,  \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
                                                                \
    netdev_linux_rx_open,                                       \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    SET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_change_seq                                     \
}
2478
2479 const struct netdev_class netdev_linux_class =
2480 NETDEV_LINUX_CLASS(
2481 "system",
2482 netdev_linux_create,
2483 netdev_linux_get_stats,
2484 NULL, /* set_stats */
2485 netdev_linux_get_features,
2486 netdev_linux_get_status);
2487
2488 const struct netdev_class netdev_tap_class =
2489 NETDEV_LINUX_CLASS(
2490 "tap",
2491 netdev_linux_create_tap,
2492 netdev_tap_get_stats,
2493 NULL, /* set_stats */
2494 netdev_linux_get_features,
2495 netdev_linux_get_status);
2496
2497 const struct netdev_class netdev_internal_class =
2498 NETDEV_LINUX_CLASS(
2499 "internal",
2500 netdev_linux_create,
2501 netdev_internal_get_stats,
2502 netdev_internal_set_stats,
2503 NULL, /* get_features */
2504 netdev_internal_get_status);
2505
2506 static const struct netdev_rx_class netdev_rx_linux_class = {
2507 netdev_rx_linux_destroy,
2508 netdev_rx_linux_recv,
2509 netdev_rx_linux_wait,
2510 netdev_rx_linux_drain,
2511 };
2512 \f
/* HTB traffic control class. */

/* Number of queues supported by the HTB implementation.  Queue 'q' is
 * represented by the tc class with handle 1:<q + 1>, so valid class minor
 * numbers run from 1 to HTB_N_QUEUES inclusive. */
#define HTB_N_QUEUES 0xf000
2516
2517 struct htb {
2518 struct tc tc;
2519 unsigned int max_rate; /* In bytes/s. */
2520 };
2521
2522 struct htb_class {
2523 struct tc_queue tc_queue;
2524 unsigned int min_rate; /* In bytes/s. */
2525 unsigned int max_rate; /* In bytes/s. */
2526 unsigned int burst; /* In bytes. */
2527 unsigned int priority; /* Lower values are higher priorities. */
2528 };
2529
2530 static struct htb *
2531 htb_get__(const struct netdev *netdev_)
2532 {
2533 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2534 return CONTAINER_OF(netdev->tc, struct htb, tc);
2535 }
2536
2537 static void
2538 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2539 {
2540 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2541 struct htb *htb;
2542
2543 htb = xmalloc(sizeof *htb);
2544 tc_init(&htb->tc, &tc_ops_htb);
2545 htb->max_rate = max_rate;
2546
2547 netdev->tc = &htb->tc;
2548 }
2549
2550 /* Create an HTB qdisc.
2551 *
2552 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2553 static int
2554 htb_setup_qdisc__(struct netdev *netdev)
2555 {
2556 size_t opt_offset;
2557 struct tc_htb_glob opt;
2558 struct ofpbuf request;
2559 struct tcmsg *tcmsg;
2560
2561 tc_del_qdisc(netdev);
2562
2563 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2564 NLM_F_EXCL | NLM_F_CREATE, &request);
2565 if (!tcmsg) {
2566 return ENODEV;
2567 }
2568 tcmsg->tcm_handle = tc_make_handle(1, 0);
2569 tcmsg->tcm_parent = TC_H_ROOT;
2570
2571 nl_msg_put_string(&request, TCA_KIND, "htb");
2572
2573 memset(&opt, 0, sizeof opt);
2574 opt.rate2quantum = 10;
2575 opt.version = 3;
2576 opt.defcls = 1;
2577
2578 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2579 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2580 nl_msg_end_nested(&request, opt_offset);
2581
2582 return tc_transact(&request, NULL);
2583 }
2584
2585 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2586 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2587 static int
2588 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2589 unsigned int parent, struct htb_class *class)
2590 {
2591 size_t opt_offset;
2592 struct tc_htb_opt opt;
2593 struct ofpbuf request;
2594 struct tcmsg *tcmsg;
2595 int error;
2596 int mtu;
2597
2598 error = netdev_get_mtu(netdev, &mtu);
2599 if (error) {
2600 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2601 netdev_get_name(netdev));
2602 return error;
2603 }
2604
2605 memset(&opt, 0, sizeof opt);
2606 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2607 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2608 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2609 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2610 opt.prio = class->priority;
2611
2612 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2613 if (!tcmsg) {
2614 return ENODEV;
2615 }
2616 tcmsg->tcm_handle = handle;
2617 tcmsg->tcm_parent = parent;
2618
2619 nl_msg_put_string(&request, TCA_KIND, "htb");
2620 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2621 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2622 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2623 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2624 nl_msg_end_nested(&request, opt_offset);
2625
2626 error = tc_transact(&request, NULL);
2627 if (error) {
2628 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2629 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2630 netdev_get_name(netdev),
2631 tc_get_major(handle), tc_get_minor(handle),
2632 tc_get_major(parent), tc_get_minor(parent),
2633 class->min_rate, class->max_rate,
2634 class->burst, class->priority, ovs_strerror(error));
2635 }
2636 return error;
2637 }
2638
2639 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2640 * description of them into 'details'. The description complies with the
2641 * specification given in the vswitch database documentation for linux-htb
2642 * queue details. */
2643 static int
2644 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2645 {
2646 static const struct nl_policy tca_htb_policy[] = {
2647 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2648 .min_len = sizeof(struct tc_htb_opt) },
2649 };
2650
2651 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2652 const struct tc_htb_opt *htb;
2653
2654 if (!nl_parse_nested(nl_options, tca_htb_policy,
2655 attrs, ARRAY_SIZE(tca_htb_policy))) {
2656 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2657 return EPROTO;
2658 }
2659
2660 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2661 class->min_rate = htb->rate.rate;
2662 class->max_rate = htb->ceil.rate;
2663 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2664 class->priority = htb->prio;
2665 return 0;
2666 }
2667
2668 static int
2669 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2670 struct htb_class *options,
2671 struct netdev_queue_stats *stats)
2672 {
2673 struct nlattr *nl_options;
2674 unsigned int handle;
2675 int error;
2676
2677 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2678 if (!error && queue_id) {
2679 unsigned int major = tc_get_major(handle);
2680 unsigned int minor = tc_get_minor(handle);
2681 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2682 *queue_id = minor - 1;
2683 } else {
2684 error = EPROTO;
2685 }
2686 }
2687 if (!error && options) {
2688 error = htb_parse_tca_options__(nl_options, options);
2689 }
2690 return error;
2691 }
2692
2693 static void
2694 htb_parse_qdisc_details__(struct netdev *netdev,
2695 const struct smap *details, struct htb_class *hc)
2696 {
2697 const char *max_rate_s;
2698
2699 max_rate_s = smap_get(details, "max-rate");
2700 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2701 if (!hc->max_rate) {
2702 enum netdev_features current;
2703
2704 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2705 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2706 }
2707 hc->min_rate = hc->max_rate;
2708 hc->burst = 0;
2709 hc->priority = 0;
2710 }
2711
2712 static int
2713 htb_parse_class_details__(struct netdev *netdev,
2714 const struct smap *details, struct htb_class *hc)
2715 {
2716 const struct htb *htb = htb_get__(netdev);
2717 const char *min_rate_s = smap_get(details, "min-rate");
2718 const char *max_rate_s = smap_get(details, "max-rate");
2719 const char *burst_s = smap_get(details, "burst");
2720 const char *priority_s = smap_get(details, "priority");
2721 int mtu, error;
2722
2723 error = netdev_get_mtu(netdev, &mtu);
2724 if (error) {
2725 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2726 netdev_get_name(netdev));
2727 return error;
2728 }
2729
2730 /* HTB requires at least an mtu sized min-rate to send any traffic even
2731 * on uncongested links. */
2732 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2733 hc->min_rate = MAX(hc->min_rate, mtu);
2734 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2735
2736 /* max-rate */
2737 hc->max_rate = (max_rate_s
2738 ? strtoull(max_rate_s, NULL, 10) / 8
2739 : htb->max_rate);
2740 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2741 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2742
2743 /* burst
2744 *
2745 * According to hints in the documentation that I've read, it is important
2746 * that 'burst' be at least as big as the largest frame that might be
2747 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2748 * but having it a bit too small is a problem. Since netdev_get_mtu()
2749 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2750 * the MTU. We actually add 64, instead of 14, as a guard against
2751 * additional headers get tacked on somewhere that we're not aware of. */
2752 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2753 hc->burst = MAX(hc->burst, mtu + 64);
2754
2755 /* priority */
2756 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2757
2758 return 0;
2759 }
2760
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and
 * parses the reply as an HTB class into 'options' and 'stats', which are
 * passed to htb_parse_tcmsg__().
 *
 * Returns 0 on success, otherwise the error from tc_query_class() or from
 * htb_parse_tcmsg__(). */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2776
2777 static int
2778 htb_tc_install(struct netdev *netdev, const struct smap *details)
2779 {
2780 int error;
2781
2782 error = htb_setup_qdisc__(netdev);
2783 if (!error) {
2784 struct htb_class hc;
2785
2786 htb_parse_qdisc_details__(netdev, details, &hc);
2787 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2788 tc_make_handle(1, 0), &hc);
2789 if (!error) {
2790 htb_install__(netdev, hc.max_rate);
2791 }
2792 }
2793 return error;
2794 }
2795
2796 static struct htb_class *
2797 htb_class_cast__(const struct tc_queue *queue)
2798 {
2799 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2800 }
2801
2802 static void
2803 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2804 const struct htb_class *hc)
2805 {
2806 struct htb *htb = htb_get__(netdev);
2807 size_t hash = hash_int(queue_id, 0);
2808 struct tc_queue *queue;
2809 struct htb_class *hcp;
2810
2811 queue = tc_find_queue__(netdev, queue_id, hash);
2812 if (queue) {
2813 hcp = htb_class_cast__(queue);
2814 } else {
2815 hcp = xmalloc(sizeof *hcp);
2816 queue = &hcp->tc_queue;
2817 queue->queue_id = queue_id;
2818 queue->created = time_msec();
2819 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2820 }
2821
2822 hcp->min_rate = hc->min_rate;
2823 hcp->max_rate = hc->max_rate;
2824 hcp->burst = hc->burst;
2825 hcp->priority = hc->priority;
2826 }
2827
2828 static int
2829 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2830 {
2831 struct ofpbuf msg;
2832 struct nl_dump dump;
2833 struct htb_class hc;
2834
2835 /* Get qdisc options. */
2836 hc.max_rate = 0;
2837 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2838 htb_install__(netdev, hc.max_rate);
2839
2840 /* Get queues. */
2841 if (!start_queue_dump(netdev, &dump)) {
2842 return ENODEV;
2843 }
2844 while (nl_dump_next(&dump, &msg)) {
2845 unsigned int queue_id;
2846
2847 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2848 htb_update_queue__(netdev, queue_id, &hc);
2849 }
2850 }
2851 nl_dump_done(&dump);
2852
2853 return 0;
2854 }
2855
2856 static void
2857 htb_tc_destroy(struct tc *tc)
2858 {
2859 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2860 struct htb_class *hc, *next;
2861
2862 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2863 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2864 free(hc);
2865 }
2866 tc_destroy(tc);
2867 free(htb);
2868 }
2869
2870 static int
2871 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2872 {
2873 const struct htb *htb = htb_get__(netdev);
2874 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
2875 return 0;
2876 }
2877
2878 static int
2879 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2880 {
2881 struct htb_class hc;
2882 int error;
2883
2884 htb_parse_qdisc_details__(netdev, details, &hc);
2885 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2886 tc_make_handle(1, 0), &hc);
2887 if (!error) {
2888 htb_get__(netdev)->max_rate = hc.max_rate;
2889 }
2890 return error;
2891 }
2892
2893 static int
2894 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2895 const struct tc_queue *queue, struct smap *details)
2896 {
2897 const struct htb_class *hc = htb_class_cast__(queue);
2898
2899 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2900 if (hc->min_rate != hc->max_rate) {
2901 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2902 }
2903 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2904 if (hc->priority) {
2905 smap_add_format(details, "priority", "%u", hc->priority);
2906 }
2907 return 0;
2908 }
2909
2910 static int
2911 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2912 const struct smap *details)
2913 {
2914 struct htb_class hc;
2915 int error;
2916
2917 error = htb_parse_class_details__(netdev, details, &hc);
2918 if (error) {
2919 return error;
2920 }
2921
2922 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2923 tc_make_handle(1, 0xfffe), &hc);
2924 if (error) {
2925 return error;
2926 }
2927
2928 htb_update_queue__(netdev, queue_id, &hc);
2929 return 0;
2930 }
2931
2932 static int
2933 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2934 {
2935 struct htb_class *hc = htb_class_cast__(queue);
2936 struct htb *htb = htb_get__(netdev);
2937 int error;
2938
2939 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2940 if (!error) {
2941 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2942 free(hc);
2943 }
2944 return error;
2945 }
2946
2947 static int
2948 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2949 struct netdev_queue_stats *stats)
2950 {
2951 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2952 tc_make_handle(1, 0xfffe), NULL, stats);
2953 }
2954
2955 static int
2956 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2957 const struct ofpbuf *nlmsg,
2958 netdev_dump_queue_stats_cb *cb, void *aux)
2959 {
2960 struct netdev_queue_stats stats;
2961 unsigned int handle, major, minor;
2962 int error;
2963
2964 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2965 if (error) {
2966 return error;
2967 }
2968
2969 major = tc_get_major(handle);
2970 minor = tc_get_minor(handle);
2971 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2972 (*cb)(minor - 1, &stats, aux);
2973 }
2974 return 0;
2975 }
2976
2977 static const struct tc_ops tc_ops_htb = {
2978 "htb", /* linux_name */
2979 "linux-htb", /* ovs_name */
2980 HTB_N_QUEUES, /* n_queues */
2981 htb_tc_install,
2982 htb_tc_load,
2983 htb_tc_destroy,
2984 htb_qdisc_get,
2985 htb_qdisc_set,
2986 htb_class_get,
2987 htb_class_set,
2988 htb_class_delete,
2989 htb_class_get_stats,
2990 htb_class_dump_stats
2991 };
2992 \f
/* "linux-hfsc" traffic control class. */

/* Number of queues supported by the HFSC implementation.  Queue 'q' is
 * represented by the tc class with handle 1:<q + 1>, so valid class minor
 * numbers run from 1 to HFSC_N_QUEUES inclusive. */
#define HFSC_N_QUEUES 0xf000
2996
2997 struct hfsc {
2998 struct tc tc;
2999 uint32_t max_rate;
3000 };
3001
3002 struct hfsc_class {
3003 struct tc_queue tc_queue;
3004 uint32_t min_rate;
3005 uint32_t max_rate;
3006 };
3007
3008 static struct hfsc *
3009 hfsc_get__(const struct netdev *netdev_)
3010 {
3011 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3012 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3013 }
3014
3015 static struct hfsc_class *
3016 hfsc_class_cast__(const struct tc_queue *queue)
3017 {
3018 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3019 }
3020
3021 static void
3022 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3023 {
3024 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3025 struct hfsc *hfsc;
3026
3027 hfsc = xmalloc(sizeof *hfsc);
3028 tc_init(&hfsc->tc, &tc_ops_hfsc);
3029 hfsc->max_rate = max_rate;
3030 netdev->tc = &hfsc->tc;
3031 }
3032
3033 static void
3034 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3035 const struct hfsc_class *hc)
3036 {
3037 size_t hash;
3038 struct hfsc *hfsc;
3039 struct hfsc_class *hcp;
3040 struct tc_queue *queue;
3041
3042 hfsc = hfsc_get__(netdev);
3043 hash = hash_int(queue_id, 0);
3044
3045 queue = tc_find_queue__(netdev, queue_id, hash);
3046 if (queue) {
3047 hcp = hfsc_class_cast__(queue);
3048 } else {
3049 hcp = xmalloc(sizeof *hcp);
3050 queue = &hcp->tc_queue;
3051 queue->queue_id = queue_id;
3052 queue->created = time_msec();
3053 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3054 }
3055
3056 hcp->min_rate = hc->min_rate;
3057 hcp->max_rate = hc->max_rate;
3058 }
3059
3060 static int
3061 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3062 {
3063 const struct tc_service_curve *rsc, *fsc, *usc;
3064 static const struct nl_policy tca_hfsc_policy[] = {
3065 [TCA_HFSC_RSC] = {
3066 .type = NL_A_UNSPEC,
3067 .optional = false,
3068 .min_len = sizeof(struct tc_service_curve),
3069 },
3070 [TCA_HFSC_FSC] = {
3071 .type = NL_A_UNSPEC,
3072 .optional = false,
3073 .min_len = sizeof(struct tc_service_curve),
3074 },
3075 [TCA_HFSC_USC] = {
3076 .type = NL_A_UNSPEC,
3077 .optional = false,
3078 .min_len = sizeof(struct tc_service_curve),
3079 },
3080 };
3081 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3082
3083 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3084 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3085 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3086 return EPROTO;
3087 }
3088
3089 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3090 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3091 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3092
3093 if (rsc->m1 != 0 || rsc->d != 0 ||
3094 fsc->m1 != 0 || fsc->d != 0 ||
3095 usc->m1 != 0 || usc->d != 0) {
3096 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3097 "Non-linear service curves are not supported.");
3098 return EPROTO;
3099 }
3100
3101 if (rsc->m2 != fsc->m2) {
3102 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3103 "Real-time service curves are not supported ");
3104 return EPROTO;
3105 }
3106
3107 if (rsc->m2 > usc->m2) {
3108 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3109 "Min-rate service curve is greater than "
3110 "the max-rate service curve.");
3111 return EPROTO;
3112 }
3113
3114 class->min_rate = fsc->m2;
3115 class->max_rate = usc->m2;
3116 return 0;
3117 }
3118
3119 static int
3120 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3121 struct hfsc_class *options,
3122 struct netdev_queue_stats *stats)
3123 {
3124 int error;
3125 unsigned int handle;
3126 struct nlattr *nl_options;
3127
3128 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3129 if (error) {
3130 return error;
3131 }
3132
3133 if (queue_id) {
3134 unsigned int major, minor;
3135
3136 major = tc_get_major(handle);
3137 minor = tc_get_minor(handle);
3138 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3139 *queue_id = minor - 1;
3140 } else {
3141 return EPROTO;
3142 }
3143 }
3144
3145 if (options) {
3146 error = hfsc_parse_tca_options__(nl_options, options);
3147 }
3148
3149 return error;
3150 }
3151
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and
 * parses the reply as an HFSC class into 'options' and 'stats', which are
 * passed to hfsc_parse_tcmsg__().
 *
 * Returns 0 on success, otherwise the error from tc_query_class() or from
 * hfsc_parse_tcmsg__(). */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3169
3170 static void
3171 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3172 struct hfsc_class *class)
3173 {
3174 uint32_t max_rate;
3175 const char *max_rate_s;
3176
3177 max_rate_s = smap_get(details, "max-rate");
3178 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3179
3180 if (!max_rate) {
3181 enum netdev_features current;
3182
3183 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3184 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3185 }
3186
3187 class->min_rate = max_rate;
3188 class->max_rate = max_rate;
3189 }
3190
3191 static int
3192 hfsc_parse_class_details__(struct netdev *netdev,
3193 const struct smap *details,
3194 struct hfsc_class * class)
3195 {
3196 const struct hfsc *hfsc;
3197 uint32_t min_rate, max_rate;
3198 const char *min_rate_s, *max_rate_s;
3199
3200 hfsc = hfsc_get__(netdev);
3201 min_rate_s = smap_get(details, "min-rate");
3202 max_rate_s = smap_get(details, "max-rate");
3203
3204 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3205 min_rate = MAX(min_rate, 1);
3206 min_rate = MIN(min_rate, hfsc->max_rate);
3207
3208 max_rate = (max_rate_s
3209 ? strtoull(max_rate_s, NULL, 10) / 8
3210 : hfsc->max_rate);
3211 max_rate = MAX(max_rate, min_rate);
3212 max_rate = MIN(max_rate, hfsc->max_rate);
3213
3214 class->min_rate = min_rate;
3215 class->max_rate = max_rate;
3216
3217 return 0;
3218 }
3219
3220 /* Create an HFSC qdisc.
3221 *
3222 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3223 static int
3224 hfsc_setup_qdisc__(struct netdev * netdev)
3225 {
3226 struct tcmsg *tcmsg;
3227 struct ofpbuf request;
3228 struct tc_hfsc_qopt opt;
3229
3230 tc_del_qdisc(netdev);
3231
3232 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3233 NLM_F_EXCL | NLM_F_CREATE, &request);
3234
3235 if (!tcmsg) {
3236 return ENODEV;
3237 }
3238
3239 tcmsg->tcm_handle = tc_make_handle(1, 0);
3240 tcmsg->tcm_parent = TC_H_ROOT;
3241
3242 memset(&opt, 0, sizeof opt);
3243 opt.defcls = 1;
3244
3245 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3246 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3247
3248 return tc_transact(&request, NULL);
3249 }
3250
3251 /* Create an HFSC class.
3252 *
3253 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3254 * sc rate <min_rate> ul rate <max_rate>" */
3255 static int
3256 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3257 unsigned int parent, struct hfsc_class *class)
3258 {
3259 int error;
3260 size_t opt_offset;
3261 struct tcmsg *tcmsg;
3262 struct ofpbuf request;
3263 struct tc_service_curve min, max;
3264
3265 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3266
3267 if (!tcmsg) {
3268 return ENODEV;
3269 }
3270
3271 tcmsg->tcm_handle = handle;
3272 tcmsg->tcm_parent = parent;
3273
3274 min.m1 = 0;
3275 min.d = 0;
3276 min.m2 = class->min_rate;
3277
3278 max.m1 = 0;
3279 max.d = 0;
3280 max.m2 = class->max_rate;
3281
3282 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3283 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3284 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3285 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3286 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3287 nl_msg_end_nested(&request, opt_offset);
3288
3289 error = tc_transact(&request, NULL);
3290 if (error) {
3291 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3292 "min-rate %ubps, max-rate %ubps (%s)",
3293 netdev_get_name(netdev),
3294 tc_get_major(handle), tc_get_minor(handle),
3295 tc_get_major(parent), tc_get_minor(parent),
3296 class->min_rate, class->max_rate, ovs_strerror(error));
3297 }
3298
3299 return error;
3300 }
3301
3302 static int
3303 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3304 {
3305 int error;
3306 struct hfsc_class class;
3307
3308 error = hfsc_setup_qdisc__(netdev);
3309
3310 if (error) {
3311 return error;
3312 }
3313
3314 hfsc_parse_qdisc_details__(netdev, details, &class);
3315 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3316 tc_make_handle(1, 0), &class);
3317
3318 if (error) {
3319 return error;
3320 }
3321
3322 hfsc_install__(netdev, class.max_rate);
3323 return 0;
3324 }
3325
3326 static int
3327 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3328 {
3329 struct ofpbuf msg;
3330 struct nl_dump dump;
3331 struct hfsc_class hc;
3332
3333 hc.max_rate = 0;
3334 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3335 hfsc_install__(netdev, hc.max_rate);
3336
3337 if (!start_queue_dump(netdev, &dump)) {
3338 return ENODEV;
3339 }
3340
3341 while (nl_dump_next(&dump, &msg)) {
3342 unsigned int queue_id;
3343
3344 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3345 hfsc_update_queue__(netdev, queue_id, &hc);
3346 }
3347 }
3348
3349 nl_dump_done(&dump);
3350 return 0;
3351 }
3352
3353 static void
3354 hfsc_tc_destroy(struct tc *tc)
3355 {
3356 struct hfsc *hfsc;
3357 struct hfsc_class *hc, *next;
3358
3359 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3360
3361 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3362 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3363 free(hc);
3364 }
3365
3366 tc_destroy(tc);
3367 free(hfsc);
3368 }
3369
3370 static int
3371 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3372 {
3373 const struct hfsc *hfsc;
3374 hfsc = hfsc_get__(netdev);
3375 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3376 return 0;
3377 }
3378
3379 static int
3380 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3381 {
3382 int error;
3383 struct hfsc_class class;
3384
3385 hfsc_parse_qdisc_details__(netdev, details, &class);
3386 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3387 tc_make_handle(1, 0), &class);
3388
3389 if (!error) {
3390 hfsc_get__(netdev)->max_rate = class.max_rate;
3391 }
3392
3393 return error;
3394 }
3395
3396 static int
3397 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3398 const struct tc_queue *queue, struct smap *details)
3399 {
3400 const struct hfsc_class *hc;
3401
3402 hc = hfsc_class_cast__(queue);
3403 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3404 if (hc->min_rate != hc->max_rate) {
3405 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3406 }
3407 return 0;
3408 }
3409
3410 static int
3411 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3412 const struct smap *details)
3413 {
3414 int error;
3415 struct hfsc_class class;
3416
3417 error = hfsc_parse_class_details__(netdev, details, &class);
3418 if (error) {
3419 return error;
3420 }
3421
3422 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3423 tc_make_handle(1, 0xfffe), &class);
3424 if (error) {
3425 return error;
3426 }
3427
3428 hfsc_update_queue__(netdev, queue_id, &class);
3429 return 0;
3430 }
3431
3432 static int
3433 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3434 {
3435 int error;
3436 struct hfsc *hfsc;
3437 struct hfsc_class *hc;
3438
3439 hc = hfsc_class_cast__(queue);
3440 hfsc = hfsc_get__(netdev);
3441
3442 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3443 if (!error) {
3444 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3445 free(hc);
3446 }
3447 return error;
3448 }
3449
3450 static int
3451 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3452 struct netdev_queue_stats *stats)
3453 {
3454 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3455 tc_make_handle(1, 0xfffe), NULL, stats);
3456 }
3457
3458 static int
3459 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3460 const struct ofpbuf *nlmsg,
3461 netdev_dump_queue_stats_cb *cb, void *aux)
3462 {
3463 struct netdev_queue_stats stats;
3464 unsigned int handle, major, minor;
3465 int error;
3466
3467 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3468 if (error) {
3469 return error;
3470 }
3471
3472 major = tc_get_major(handle);
3473 minor = tc_get_minor(handle);
3474 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3475 (*cb)(minor - 1, &stats, aux);
3476 }
3477 return 0;
3478 }
3479
3480 static const struct tc_ops tc_ops_hfsc = {
3481 "hfsc", /* linux_name */
3482 "linux-hfsc", /* ovs_name */
3483 HFSC_N_QUEUES, /* n_queues */
3484 hfsc_tc_install, /* tc_install */
3485 hfsc_tc_load, /* tc_load */
3486 hfsc_tc_destroy, /* tc_destroy */
3487 hfsc_qdisc_get, /* qdisc_get */
3488 hfsc_qdisc_set, /* qdisc_set */
3489 hfsc_class_get, /* class_get */
3490 hfsc_class_set, /* class_set */
3491 hfsc_class_delete, /* class_delete */
3492 hfsc_class_get_stats, /* class_get_stats */
3493 hfsc_class_dump_stats /* class_dump_stats */
3494 };
3495 \f
3496 /* "linux-default" traffic control class.
3497 *
3498 * This class represents the default, unnamed Linux qdisc. It corresponds to
3499 * the "" (empty string) QoS type in the OVS database. */
3500
3501 static void
3502 default_install__(struct netdev *netdev_)
3503 {
3504 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3505 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3506
3507 /* Nothing but a tc class implementation is allowed to write to a tc. This
3508 * class never does that, so we can legitimately use a const tc object. */
3509 netdev->tc = CONST_CAST(struct tc *, &tc);
3510 }
3511
3512 static int
3513 default_tc_install(struct netdev *netdev,
3514 const struct smap *details OVS_UNUSED)
3515 {
3516 default_install__(netdev);
3517 return 0;
3518 }
3519
3520 static int
3521 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3522 {
3523 default_install__(netdev);
3524 return 0;
3525 }
3526
3527 static const struct tc_ops tc_ops_default = {
3528 NULL, /* linux_name */
3529 "", /* ovs_name */
3530 0, /* n_queues */
3531 default_tc_install,
3532 default_tc_load,
3533 NULL, /* tc_destroy */
3534 NULL, /* qdisc_get */
3535 NULL, /* qdisc_set */
3536 NULL, /* class_get */
3537 NULL, /* class_set */
3538 NULL, /* class_delete */
3539 NULL, /* class_get_stats */
3540 NULL /* class_dump_stats */
3541 };
3542 \f
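/* "linux-other" traffic control class.
 *
 * This class stands in for a qdisc that this module does not manage: one
 * whose kind is not recognized, or whose query or parse failed.  It can only
 * be loaded, never installed. */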
3546
3547 static int
3548 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3549 {
3550 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3551 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3552
3553 /* Nothing but a tc class implementation is allowed to write to a tc. This
3554 * class never does that, so we can legitimately use a const tc object. */
3555 netdev->tc = CONST_CAST(struct tc *, &tc);
3556 return 0;
3557 }
3558
3559 static const struct tc_ops tc_ops_other = {
3560 NULL, /* linux_name */
3561 "linux-other", /* ovs_name */
3562 0, /* n_queues */
3563 NULL, /* tc_install */
3564 other_tc_load,
3565 NULL, /* tc_destroy */
3566 NULL, /* qdisc_get */
3567 NULL, /* qdisc_set */
3568 NULL, /* class_get */
3569 NULL, /* class_set */
3570 NULL, /* class_delete */
3571 NULL, /* class_get_stats */
3572 NULL /* class_dump_stats */
3573 };
3574 \f
3575 /* Traffic control. */
3576
/* Number of kernel "tc" ticks per second.  Set by read_psched(). */
static double ticks_per_s;

/* Number of kernel "jiffies" per second, used when computing buffer sizes:
 * kernel qdiscs generally need to be able to buffer one jiffy's worth of
 * data.  Set by read_psched().
 *
 * The value falls into one of two cases:
 *
 *    - It is the kernel's real timer tick rate, a small number roughly in the
 *      range 100 to 1024.  Then the qdisc really must be able to buffer that
 *      much data.
 *
 *    - It is an absurdly large number, which means that the kernel has
 *      finely granular timers and no extra buffer room needs to be fudged.
 *      No special handling is required for this case: 'buffer_hz' is used as
 *      a divisor, so practically any dividend yields 0, and the small
 *      integer results produced by very large dividends have no real effect.
 */
static unsigned int buffer_hz;
3598
/* Builds the tc handle 'major':'minor', with 'major' placed in the upper 16
 * bits and 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
3605
/* Extracts the major number (the upper 16 bits) from tc 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3612
/* Extracts the minor number (the lower 16 bits) from tc 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3619
3620 static struct tcmsg *
3621 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3622 struct ofpbuf *request)
3623 {
3624 struct tcmsg *tcmsg;
3625 int ifindex;
3626 int error;
3627
3628 error = get_ifindex(netdev, &ifindex);
3629 if (error) {
3630 return NULL;
3631 }
3632
3633 ofpbuf_init(request, 512);
3634 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3635 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3636 tcmsg->tcm_family = AF_UNSPEC;
3637 tcmsg->tcm_ifindex = ifindex;
3638 /* Caller should fill in tcmsg->tcm_handle. */
3639 /* Caller should fill in tcmsg->tcm_parent. */
3640
3641 return tcmsg;
3642 }
3643
3644 static int
3645 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3646 {
3647 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3648 ofpbuf_uninit(request);
3649 return error;
3650 }
3651
3652 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3653 * policing configuration.
3654 *
3655 * This function is equivalent to running the following when 'add' is true:
3656 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3657 *
3658 * This function is equivalent to running the following when 'add' is false:
3659 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3660 *
3661 * The configuration and stats may be seen with the following command:
3662 * /sbin/tc -s qdisc show dev <devname>
3663 *
3664 * Returns 0 if successful, otherwise a positive errno value.
3665 */
3666 static int
3667 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3668 {
3669 struct ofpbuf request;
3670 struct tcmsg *tcmsg;
3671 int error;
3672 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3673 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3674
3675 tcmsg = tc_make_request(netdev, type, flags, &request);
3676 if (!tcmsg) {
3677 return ENODEV;
3678 }
3679 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3680 tcmsg->tcm_parent = TC_H_INGRESS;
3681 nl_msg_put_string(&request, TCA_KIND, "ingress");
3682 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3683
3684 error = tc_transact(&request, NULL);
3685 if (error) {
3686 /* If we're deleting the qdisc, don't worry about some of the
3687 * error conditions. */
3688 if (!add && (error == ENOENT || error == EINVAL)) {
3689 return 0;
3690 }
3691 return error;
3692 }
3693
3694 return 0;
3695 }
3696
3697 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3698 * of 'kbits_burst'.
3699 *
3700 * This function is equivalent to running:
3701 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3702 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3703 * mtu 65535 drop
3704 *
3705 * The configuration and stats may be seen with the following command:
3706 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3707 *
3708 * Returns 0 if successful, otherwise a positive errno value.
3709 */
3710 static int
3711 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3712 {
3713 struct tc_police tc_police;
3714 struct ofpbuf request;
3715 struct tcmsg *tcmsg;
3716 size_t basic_offset;
3717 size_t police_offset;
3718 int error;
3719 int mtu = 65535;
3720
3721 memset(&tc_police, 0, sizeof tc_police);
3722 tc_police.action = TC_POLICE_SHOT;
3723 tc_police.mtu = mtu;
3724 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3725 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3726 kbits_burst * 1024);
3727
3728 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3729 NLM_F_EXCL | NLM_F_CREATE, &request);
3730 if (!tcmsg) {
3731 return ENODEV;
3732 }
3733 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3734 tcmsg->tcm_info = tc_make_handle(49,
3735 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3736
3737 nl_msg_put_string(&request, TCA_KIND, "basic");
3738 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3739 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3740 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3741 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3742 nl_msg_end_nested(&request, police_offset);
3743 nl_msg_end_nested(&request, basic_offset);
3744
3745 error = tc_transact(&request, NULL);
3746 if (error) {
3747 return error;
3748 }
3749
3750 return 0;
3751 }
3752
3753 static void
3754 read_psched(void)
3755 {
3756 /* The values in psched are not individually very meaningful, but they are
3757 * important. The tables below show some values seen in the wild.
3758 *
3759 * Some notes:
3760 *
3761 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3762 * (Before that, there are hints that it was 1000000000.)
3763 *
3764 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3765 * above.
3766 *
3767 * /proc/net/psched
3768 * -----------------------------------
3769 * [1] 000c8000 000f4240 000f4240 00000064
3770 * [2] 000003e8 00000400 000f4240 3b9aca00
3771 * [3] 000003e8 00000400 000f4240 3b9aca00
3772 * [4] 000003e8 00000400 000f4240 00000064
3773 * [5] 000003e8 00000040 000f4240 3b9aca00
3774 * [6] 000003e8 00000040 000f4240 000000f9
3775 *
3776 * a b c d ticks_per_s buffer_hz
3777 * ------- --------- ---------- ------------- ----------- -------------
3778 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3779 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3780 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3781 * [4] 1,000 1,024 1,000,000 100 976,562 100
3782 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3783 * [6] 1,000 64 1,000,000 249 15,625,000 249
3784 *
3785 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3786 * [2] 2.6.26-1-686-bigmem from Debian lenny
3787 * [3] 2.6.26-2-sparc64 from Debian lenny
3788 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3789 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3790 * [6] 2.6.34 from kernel.org on KVM
3791 */
3792 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3793 static const char fn[] = "/proc/net/psched";
3794 unsigned int a, b, c, d;
3795 FILE *stream;
3796
3797 if (!ovsthread_once_start(&once)) {
3798 return;
3799 }
3800
3801 ticks_per_s = 1.0;
3802 buffer_hz = 100;
3803
3804 stream = fopen(fn, "r");
3805 if (!stream) {
3806 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3807 goto exit;
3808 }
3809
3810 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3811 VLOG_WARN("%s: read failed", fn);
3812 fclose(stream);
3813 goto exit;
3814 }
3815 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3816 fclose(stream);
3817
3818 if (!a || !c) {
3819 VLOG_WARN("%s: invalid scheduler parameters", fn);
3820 goto exit;
3821 }
3822
3823 ticks_per_s = (double) a * c / b;
3824 if (c == 1000000) {
3825 buffer_hz = d;
3826 } else {
3827 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3828 fn, a, b, c, d);
3829 }
3830 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3831
3832 exit:
3833 ovsthread_once_done(&once);
3834 }
3835
3836 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3837 * rate of 'rate' bytes per second. */
3838 static unsigned int
3839 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3840 {
3841 read_psched();
3842 return (rate * ticks) / ticks_per_s;
3843 }
3844
3845 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3846 * rate of 'rate' bytes per second. */
3847 static unsigned int
3848 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3849 {
3850 read_psched();
3851 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3852 }
3853
3854 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3855 * a transmission rate of 'rate' bytes per second. */
3856 static unsigned int
3857 tc_buffer_per_jiffy(unsigned int rate)
3858 {
3859 read_psched();
3860 return rate / buffer_hz;
3861 }
3862
3863 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3864 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3865 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3866 * stores NULL into it if it is absent.
3867 *
3868 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3869 * 'msg'.
3870 *
3871 * Returns 0 if successful, otherwise a positive errno value. */
3872 static int
3873 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3874 struct nlattr **options)
3875 {
3876 static const struct nl_policy tca_policy[] = {
3877 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3878 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3879 };
3880 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3881
3882 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3883 tca_policy, ta, ARRAY_SIZE(ta))) {
3884 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3885 goto error;
3886 }
3887
3888 if (kind) {
3889 *kind = nl_attr_get_string(ta[TCA_KIND]);
3890 }
3891
3892 if (options) {
3893 *options = ta[TCA_OPTIONS];
3894 }
3895
3896 return 0;
3897
3898 error:
3899 if (kind) {
3900 *kind = NULL;
3901 }
3902 if (options) {
3903 *options = NULL;
3904 }
3905 return EPROTO;
3906 }
3907
3908 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3909 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3910 * into '*options', and its queue statistics into '*stats'. Any of the output
3911 * arguments may be null.
3912 *
3913 * Returns 0 if successful, otherwise a positive errno value. */
3914 static int
3915 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3916 struct nlattr **options, struct netdev_queue_stats *stats)
3917 {
3918 static const struct nl_policy tca_policy[] = {
3919 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3920 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3921 };
3922 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3923
3924 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3925 tca_policy, ta, ARRAY_SIZE(ta))) {
3926 VLOG_WARN_RL(&rl, "failed to parse class message");
3927 goto error;
3928 }
3929
3930 if (handlep) {
3931 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3932 *handlep = tc->tcm_handle;
3933 }
3934
3935 if (options) {
3936 *options = ta[TCA_OPTIONS];
3937 }
3938
3939 if (stats) {
3940 const struct gnet_stats_queue *gsq;
3941 struct gnet_stats_basic gsb;
3942
3943 static const struct nl_policy stats_policy[] = {
3944 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3945 .min_len = sizeof gsb },
3946 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3947 .min_len = sizeof *gsq },
3948 };
3949 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3950
3951 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3952 sa, ARRAY_SIZE(sa))) {
3953 VLOG_WARN_RL(&rl, "failed to parse class stats");
3954 goto error;
3955 }
3956
3957 /* Alignment issues screw up the length of struct gnet_stats_basic on
3958 * some arch/bitsize combinations. Newer versions of Linux have a
3959 * struct gnet_stats_basic_packed, but we can't depend on that. The
3960 * easiest thing to do is just to make a copy. */
3961 memset(&gsb, 0, sizeof gsb);
3962 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3963 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3964 stats->tx_bytes = gsb.bytes;
3965 stats->tx_packets = gsb.packets;
3966
3967 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3968 stats->tx_errors = gsq->drops;
3969 }
3970
3971 return 0;
3972
3973 error:
3974 if (options) {
3975 *options = NULL;
3976 }
3977 if (stats) {
3978 memset(stats, 0, sizeof *stats);
3979 }
3980 return EPROTO;
3981 }
3982
3983 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3984 * on 'netdev'. */
3985 static int
3986 tc_query_class(const struct netdev *netdev,
3987 unsigned int handle, unsigned int parent,
3988 struct ofpbuf **replyp)
3989 {
3990 struct ofpbuf request;
3991 struct tcmsg *tcmsg;
3992 int error;
3993
3994 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3995 if (!tcmsg) {
3996 return ENODEV;
3997 }
3998 tcmsg->tcm_handle = handle;
3999 tcmsg->tcm_parent = parent;
4000
4001 error = tc_transact(&request, replyp);
4002 if (error) {
4003 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4004 netdev_get_name(netdev),
4005 tc_get_major(handle), tc_get_minor(handle),
4006 tc_get_major(parent), tc_get_minor(parent),
4007 ovs_strerror(error));
4008 }
4009 return error;
4010 }
4011
4012 /* Equivalent to "tc class del dev <name> handle <handle>". */
4013 static int
4014 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4015 {
4016 struct ofpbuf request;
4017 struct tcmsg *tcmsg;
4018 int error;
4019
4020 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4021 if (!tcmsg) {
4022 return ENODEV;
4023 }
4024 tcmsg->tcm_handle = handle;
4025 tcmsg->tcm_parent = 0;
4026
4027 error = tc_transact(&request, NULL);
4028 if (error) {
4029 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4030 netdev_get_name(netdev),
4031 tc_get_major(handle), tc_get_minor(handle),
4032 ovs_strerror(error));
4033 }
4034 return error;
4035 }
4036
4037 /* Equivalent to "tc qdisc del dev <name> root". */
4038 static int
4039 tc_del_qdisc(struct netdev *netdev_)
4040 {
4041 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4042 struct ofpbuf request;
4043 struct tcmsg *tcmsg;
4044 int error;
4045
4046 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4047 if (!tcmsg) {
4048 return ENODEV;
4049 }
4050 tcmsg->tcm_handle = tc_make_handle(1, 0);
4051 tcmsg->tcm_parent = TC_H_ROOT;
4052
4053 error = tc_transact(&request, NULL);
4054 if (error == EINVAL) {
4055 /* EINVAL probably means that the default qdisc was in use, in which
4056 * case we've accomplished our purpose. */
4057 error = 0;
4058 }
4059 if (!error && netdev->tc) {
4060 if (netdev->tc->ops->tc_destroy) {
4061 netdev->tc->ops->tc_destroy(netdev->tc);
4062 }
4063 netdev->tc = NULL;
4064 }
4065 return error;
4066 }
4067
4068 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4069 * kernel to determine what they are. Returns 0 if successful, otherwise a
4070 * positive errno value. */
4071 static int
4072 tc_query_qdisc(const struct netdev *netdev_)
4073 {
4074 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4075 struct ofpbuf request, *qdisc;
4076 const struct tc_ops *ops;
4077 struct tcmsg *tcmsg;
4078 int load_error;
4079 int error;
4080
4081 if (netdev->tc) {
4082 return 0;
4083 }
4084
4085 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4086 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4087 * 2.6.35 without that fix backported to it.
4088 *
4089 * To avoid the OOPS, we must not make a request that would attempt to dump
4090 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4091 * few others. There are a few ways that I can see to do this, but most of
4092 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4093 * technique chosen here is to assume that any non-default qdisc that we
4094 * create will have a class with handle 1:0. The built-in qdiscs only have
4095 * a class with handle 0:0.
4096 *
4097 * We could check for Linux 2.6.35+ and use a more straightforward method
4098 * there. */
4099 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4100 if (!tcmsg) {
4101 return ENODEV;
4102 }
4103 tcmsg->tcm_handle = tc_make_handle(1, 0);
4104 tcmsg->tcm_parent = 0;
4105
4106 /* Figure out what tc class to instantiate. */
4107 error = tc_transact(&request, &qdisc);
4108 if (!error) {
4109 const char *kind;
4110
4111 error = tc_parse_qdisc(qdisc, &kind, NULL);
4112 if (error) {
4113 ops = &tc_ops_other;
4114 } else {
4115 ops = tc_lookup_linux_name(kind);
4116 if (!ops) {
4117 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4118 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4119
4120 ops = &tc_ops_other;
4121 }
4122 }
4123 } else if (error == ENOENT) {
4124 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4125 * other entity that doesn't have a handle 1:0. We will assume
4126 * that it's the system default qdisc. */
4127 ops = &tc_ops_default;
4128 error = 0;
4129 } else {
4130 /* Who knows? Maybe the device got deleted. */
4131 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4132 netdev_get_name(netdev_), ovs_strerror(error));
4133 ops = &tc_ops_other;
4134 }
4135
4136 /* Instantiate it. */
4137 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4138 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4139 ofpbuf_delete(qdisc);
4140
4141 return error ? error : load_error;
4142 }
4143
4144 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4145 approximate the time to transmit packets of various lengths. For an MTU of
4146 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4147 represents two possible packet lengths; for a MTU of 513 through 1024, four
4148 possible lengths; and so on.
4149
4150 Returns, for the specified 'mtu', the number of bits that packet lengths
4151 need to be shifted right to fit within such a 256-entry table. */
4152 static int
4153 tc_calc_cell_log(unsigned int mtu)
4154 {
4155 int cell_log;
4156
4157 if (!mtu) {
4158 mtu = ETH_PAYLOAD_MAX;
4159 }
4160 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4161
4162 for (cell_log = 0; mtu >= 256; cell_log++) {
4163 mtu >>= 1;
4164 }
4165
4166 return cell_log;
4167 }
4168
4169 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4170 * of 'mtu'. */
4171 static void
4172 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4173 {
4174 memset(rate, 0, sizeof *rate);
4175 rate->cell_log = tc_calc_cell_log(mtu);
4176 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4177 /* rate->cell_align = 0; */ /* distro headers. */
4178 rate->mpu = ETH_TOTAL_MIN;
4179 rate->rate = Bps;
4180 }
4181
4182 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4183 * attribute of the specified "type".
4184 *
4185 * See tc_calc_cell_log() above for a description of "rtab"s. */
4186 static void
4187 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4188 {
4189 uint32_t *rtab;
4190 unsigned int i;
4191
4192 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4193 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4194 unsigned packet_size = (i + 1) << rate->cell_log;
4195 if (packet_size < rate->mpu) {
4196 packet_size = rate->mpu;
4197 }
4198 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4199 }
4200 }
4201
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;

    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, floor_bytes));
}
4212 \f
4213 /* Linux-only functions declared in netdev-linux.h */
4214
4215 /* Returns a fd for an AF_INET socket or a negative errno value. */
4216 int
4217 netdev_linux_get_af_inet_sock(void)
4218 {
4219 int error = netdev_linux_init();
4220 return error ? -error : af_inet_sock;
4221 }
4222
4223 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4224 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4225 int
4226 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4227 const char *flag_name, bool enable)
4228 {
4229 const char *netdev_name = netdev_get_name(netdev);
4230 struct ethtool_value evalue;
4231 uint32_t new_flags;
4232 int error;
4233
4234 COVERAGE_INC(netdev_get_ethtool);
4235 memset(&evalue, 0, sizeof evalue);
4236 error = netdev_linux_do_ethtool(netdev_name,
4237 (struct ethtool_cmd *)&evalue,
4238 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4239 if (error) {
4240 return error;
4241 }
4242
4243 COVERAGE_INC(netdev_set_ethtool);
4244 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4245 error = netdev_linux_do_ethtool(netdev_name,
4246 (struct ethtool_cmd *)&evalue,
4247 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4248 if (error) {
4249 return error;
4250 }
4251
4252 COVERAGE_INC(netdev_get_ethtool);
4253 memset(&evalue, 0, sizeof evalue);
4254 error = netdev_linux_do_ethtool(netdev_name,
4255 (struct ethtool_cmd *)&evalue,
4256 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4257 if (error) {
4258 return error;
4259 }
4260
4261 if (new_flags != evalue.data) {
4262 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4263 "device %s failed", enable ? "enable" : "disable",
4264 flag_name, netdev_name);
4265 return EOPNOTSUPP;
4266 }
4267
4268 return 0;
4269 }
4270 \f
4271 /* Utility functions. */
4272
4273 /* Copies 'src' into 'dst', performing format conversion in the process. */
4274 static void
4275 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4276 const struct rtnl_link_stats *src)
4277 {
4278 dst->rx_packets = src->rx_packets;
4279 dst->tx_packets = src->tx_packets;
4280 dst->rx_bytes = src->rx_bytes;
4281 dst->tx_bytes = src->tx_bytes;
4282 dst->rx_errors = src->rx_errors;
4283 dst->tx_errors = src->tx_errors;
4284 dst->rx_dropped = src->rx_dropped;
4285 dst->tx_dropped = src->tx_dropped;
4286 dst->multicast = src->multicast;
4287 dst->collisions = src->collisions;
4288 dst->rx_length_errors = src->rx_length_errors;
4289 dst->rx_over_errors = src->rx_over_errors;
4290 dst->rx_crc_errors = src->rx_crc_errors;
4291 dst->rx_frame_errors = src->rx_frame_errors;
4292 dst->rx_fifo_errors = src->rx_fifo_errors;
4293 dst->rx_missed_errors = src->rx_missed_errors;
4294 dst->tx_aborted_errors = src->tx_aborted_errors;
4295 dst->tx_carrier_errors = src->tx_carrier_errors;
4296 dst->tx_fifo_errors = src->tx_fifo_errors;
4297 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4298 dst->tx_window_errors = src->tx_window_errors;
4299 }
4300
4301 static int
4302 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4303 {
4304 /* Policy for RTNLGRP_LINK messages.
4305 *
4306 * There are *many* more fields in these messages, but currently we only
4307 * care about these fields. */
4308 static const struct nl_policy rtnlgrp_link_policy[] = {
4309 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4310 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4311 .min_len = sizeof(struct rtnl_link_stats) },
4312 };
4313
4314 struct ofpbuf request;
4315 struct ofpbuf *reply;
4316 struct ifinfomsg *ifi;
4317 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4318 int error;
4319
4320 ofpbuf_init(&request, 0);
4321 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4322 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4323 ifi->ifi_family = PF_UNSPEC;
4324 ifi->ifi_index = ifindex;
4325 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4326 ofpbuf_uninit(&request);
4327 if (error) {
4328 return error;
4329 }
4330
4331 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4332 rtnlgrp_link_policy,
4333 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4334 ofpbuf_delete(reply);
4335 return EPROTO;
4336 }
4337
4338 if (!attrs[IFLA_STATS]) {
4339 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4340 ofpbuf_delete(reply);
4341 return EPROTO;
4342 }
4343
4344 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4345
4346 ofpbuf_delete(reply);
4347
4348 return 0;
4349 }
4350
4351 static int
4352 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4353 {
4354 static const char fn[] = "/proc/net/dev";
4355 char line[1024];
4356 FILE *stream;
4357 int ln;
4358
4359 stream = fopen(fn, "r");
4360 if (!stream) {
4361 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4362 return errno;
4363 }
4364
4365 ln = 0;
4366 while (fgets(line, sizeof line, stream)) {
4367 if (++ln >= 3) {
4368 char devname[16];
4369 #define X64 "%"SCNu64
4370 if (sscanf(line,
4371 " %15[^:]:"
4372 X64 X64 X64 X64 X64 X64 X64 "%*u"
4373 X64 X64 X64 X64 X64 X64 X64 "%*u",
4374 devname,
4375 &stats->rx_bytes,
4376 &stats->rx_packets,
4377 &stats->rx_errors,
4378 &stats->rx_dropped,
4379 &stats->rx_fifo_errors,
4380 &stats->rx_frame_errors,
4381 &stats->multicast,
4382 &stats->tx_bytes,
4383 &stats->tx_packets,
4384 &stats->tx_errors,
4385 &stats->tx_dropped,
4386 &stats->tx_fifo_errors,
4387 &stats->collisions,
4388 &stats->tx_carrier_errors) != 15) {
4389 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4390 } else if (!strcmp(devname, netdev_name)) {
4391 stats->rx_length_errors = UINT64_MAX;
4392 stats->rx_over_errors = UINT64_MAX;
4393 stats->rx_crc_errors = UINT64_MAX;
4394 stats->rx_missed_errors = UINT64_MAX;
4395 stats->tx_aborted_errors = UINT64_MAX;
4396 stats->tx_heartbeat_errors = UINT64_MAX;
4397 stats->tx_window_errors = UINT64_MAX;
4398 fclose(stream);
4399 return 0;
4400 }
4401 }
4402 }
4403 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4404 fclose(stream);
4405 return ENODEV;
4406 }
4407
4408 static int
4409 get_flags(const struct netdev *dev, unsigned int *flags)
4410 {
4411 struct ifreq ifr;
4412 int error;
4413
4414 *flags = 0;
4415 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4416 "SIOCGIFFLAGS");
4417 if (!error) {
4418 *flags = ifr.ifr_flags;
4419 }
4420 return error;
4421 }
4422
4423 static int
4424 set_flags(const char *name, unsigned int flags)
4425 {
4426 struct ifreq ifr;
4427
4428 ifr.ifr_flags = flags;
4429 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4430 }
4431
4432 static int
4433 do_get_ifindex(const char *netdev_name)
4434 {
4435 struct ifreq ifr;
4436
4437 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4438 COVERAGE_INC(netdev_get_ifindex);
4439 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4440 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4441 netdev_name, ovs_strerror(errno));
4442 return -errno;
4443 }
4444 return ifr.ifr_ifindex;
4445 }
4446
4447 static int
4448 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4449 {
4450 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4451
4452 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4453 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4454
4455 if (ifindex < 0) {
4456 netdev->get_ifindex_error = -ifindex;
4457 netdev->ifindex = 0;
4458 } else {
4459 netdev->get_ifindex_error = 0;
4460 netdev->ifindex = ifindex;
4461 }
4462 netdev->cache_valid |= VALID_IFINDEX;
4463 }
4464
4465 *ifindexp = netdev->ifindex;
4466 return netdev->get_ifindex_error;
4467 }
4468
4469 static int
4470 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4471 {
4472 struct ifreq ifr;
4473 int hwaddr_family;
4474
4475 memset(&ifr, 0, sizeof ifr);
4476 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4477 COVERAGE_INC(netdev_get_hwaddr);
4478 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4479 /* ENODEV probably means that a vif disappeared asynchronously and
4480 * hasn't been removed from the database yet, so reduce the log level
4481 * to INFO for that case. */
4482 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4483 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4484 netdev_name, ovs_strerror(errno));
4485 return errno;
4486 }
4487 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4488 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4489 VLOG_WARN("%s device has unknown hardware address family %d",
4490 netdev_name, hwaddr_family);
4491 }
4492 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4493 return 0;
4494 }
4495
4496 static int
4497 set_etheraddr(const char *netdev_name,
4498 const uint8_t mac[ETH_ADDR_LEN])
4499 {
4500 struct ifreq ifr;
4501
4502 memset(&ifr, 0, sizeof ifr);
4503 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4504 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4505 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4506 COVERAGE_INC(netdev_set_hwaddr);
4507 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4508 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4509 netdev_name, ovs_strerror(errno));
4510 return errno;
4511 }
4512 return 0;
4513 }
4514
4515 static int
4516 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4517 int cmd, const char *cmd_name)
4518 {
4519 struct ifreq ifr;
4520
4521 memset(&ifr, 0, sizeof ifr);
4522 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4523 ifr.ifr_data = (caddr_t) ecmd;
4524
4525 ecmd->cmd = cmd;
4526 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4527 return 0;
4528 } else {
4529 if (errno != EOPNOTSUPP) {
4530 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4531 "failed: %s", cmd_name, name, ovs_strerror(errno));
4532 } else {
4533 /* The device doesn't support this operation. That's pretty
4534 * common, so there's no point in logging anything. */
4535 }
4536 return errno;
4537 }
4538 }
4539
4540 static int
4541 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4542 const char *cmd_name)
4543 {
4544 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4545 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4546 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4547 ovs_strerror(errno));
4548 return errno;
4549 }
4550 return 0;
4551 }
4552
4553 static int
4554 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4555 int cmd, const char *cmd_name)
4556 {
4557 struct ifreq ifr;
4558 int error;
4559
4560 ifr.ifr_addr.sa_family = AF_INET;
4561 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4562 if (!error) {
4563 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4564 &ifr.ifr_addr);
4565 *ip = sin->sin_addr;
4566 }
4567 return error;
4568 }
4569
4570 /* Returns an AF_PACKET raw socket or a negative errno value. */
4571 static int
4572 af_packet_sock(void)
4573 {
4574 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4575 static int sock;
4576
4577 if (ovsthread_once_start(&once)) {
4578 sock = socket(AF_PACKET, SOCK_RAW, 0);
4579 if (sock >= 0) {
4580 int error = set_nonblocking(sock);
4581 if (error) {
4582 close(sock);
4583 sock = -error;
4584 }
4585 } else {
4586 sock = -errno;
4587 VLOG_ERR("failed to create packet socket: %s",
4588 ovs_strerror(errno));
4589 }
4590 ovsthread_once_done(&once);
4591 }
4592
4593 return sock;
4594 }