]> git.proxmox.com Git - ovs.git/blob - lib/netdev-linux.c
29d8ad96bfe2965c348145422642ef26877f583e
[ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <arpa/inet.h>
24 #include <inttypes.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
41 #include <net/if.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
46 #include <poll.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <unistd.h>
50
51 #include "coverage.h"
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
55 #include "hash.h"
56 #include "hmap.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
61 #include "netlink.h"
62 #include "ofpbuf.h"
63 #include "openflow/openflow.h"
64 #include "packets.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
67 #include "shash.h"
68 #include "socket-util.h"
69 #include "sset.h"
70 #include "timer.h"
71 #include "unaligned.h"
72 #include "vlog.h"
73
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
83
84 \f
/* Fallback definitions for kernel constants that older kernel headers lack.
 * Each one is defined only if the system headers did not already provide
 * it. */

/* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause                (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause           (1 << 14)
#endif

/* These were introduced in Linux 2.6.24, so they might be missing if we
 * have old headers. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS       0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS       0x00000026 /* Set flags bitmap(ethtool_value) */
#endif

/* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif
108
/* Link-change notifier created by cache_notifier_ref() and destroyed by
 * cache_notifier_unref(); null while no reference is held. */
static struct nln_notifier *netdev_linux_cache_notifier = NULL;
/* Number of cache_notifier_ref() calls not yet balanced by
 * cache_notifier_unref(). */
static int cache_notifier_refcount;
111
/* Bits for 'cache_valid' in struct netdev_linux.  A set bit means that the
 * corresponding cached member(s) of the struct have been filled in. */
enum {
    VALID_IFINDEX           = 1 << 0,
    VALID_ETHERADDR         = 1 << 1,
    VALID_IN4               = 1 << 2,
    VALID_IN6               = 1 << 3,
    VALID_MTU               = 1 << 4,
    VALID_POLICING          = 1 << 5,
    VALID_VPORT_STAT_ERROR  = 1 << 6,
    VALID_DRVINFO           = 1 << 7,
    VALID_FEATURES          = 1 << 8,
};
123
/* State that only tap devices need. */
struct tap_state {
    int fd;                     /* Descriptor opened on /dev/net/tun by
                                 * netdev_linux_create_tap(). */
};
127 \f
128 /* Traffic control. */
129
/* An instance of a traffic control class.  Always associated with a particular
 * network device.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc {
    const struct tc_ops *ops;   /* Implementation that owns this instance. */
    struct hmap queues;         /* Contains "struct tc_queue"s.
                                 * Read by generic TC layer.
                                 * Written only by TC implementation. */
};

/* Static initializer for a 'struct tc' named TC that uses implementation OPS
 * and starts out with an empty 'queues' hmap. */
#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
143
/* One traffic control queue.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs.  Instances are linked into the owning 'struct tc' through
 * 'hmap_node'. */
struct tc_queue {
    struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
    unsigned int queue_id;      /* OpenFlow queue ID. */
    long long int created;      /* Time queue was created, in msecs. */
};
153
/* A particular kind of traffic control.  Each implementation generally maps to
 * one particular Linux qdisc class.
 *
 * The functions below return 0 if successful or a positive errno value on
 * failure, except where otherwise noted.  All of them must be provided, except
 * where otherwise noted. */
struct tc_ops {
    /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
     * This is null for tc_ops_default and tc_ops_other, for which there are no
     * appropriate values. */
    const char *linux_name;

    /* Name used in OVS database, e.g. "linux-htb".  Must be nonnull. */
    const char *ovs_name;

    /* Number of supported OpenFlow queues, 0 for qdiscs that have no
     * queues.  The queues are numbered 0 through n_queues - 1. */
    unsigned int n_queues;

    /* Called to install this TC class on 'netdev'.  The implementation should
     * make the Netlink calls required to set up 'netdev' with the right qdisc
     * and configure it according to 'details'.  The implementation may assume
     * that the current qdisc is the default; that is, there is no need for it
     * to delete the current qdisc before installing itself.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'.
     *
     * (This function is null for tc_ops_other, which cannot be installed.  For
     * other TC classes it should always be nonnull.) */
    int (*tc_install)(struct netdev *netdev, const struct smap *details);

    /* Called when the netdev code determines (through a Netlink query) that
     * this TC class's qdisc is installed on 'netdev', but we didn't install
     * it ourselves and so don't know any of the details.
     *
     * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
     * 'netdev'.  The TCA_KIND attribute of 'nlmsg' is 'linux_name'.  The
     * implementation should parse the other attributes of 'nlmsg' as
     * necessary to determine its configuration.  If necessary it should also
     * use Netlink queries to determine the configuration of queues on
     * 'netdev'.
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'. */
    int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);

    /* Destroys the data structures allocated by the implementation as part of
     * 'tc'.  (This includes destroying 'tc->queues' by calling
     * tc_destroy(tc).)
     *
     * The implementation should not need to perform any Netlink calls.  If
     * desirable, the caller is responsible for deconfiguring the kernel qdisc.
     * (But it may not be desirable.)
     *
     * This function may be null if 'tc' is trivial. */
    void (*tc_destroy)(struct tc *tc);

    /* Retrieves details of 'netdev->tc' configuration into 'details'.
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the configuration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_get)(const struct netdev *netdev, struct smap *details);

    /* Reconfigures 'netdev->tc' according to 'details', performing any
     * required Netlink calls to complete the reconfiguration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_set)(struct netdev *, const struct smap *details);

    /* Retrieves details of 'queue' on 'netdev->tc' into 'details'.  'queue' is
     * one of the 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the queue configuration.
     *
     * This function may be null if 'tc' does not have queues ('n_queues' is
     * 0). */
    int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
                     struct smap *details);

    /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
     * 'details', performing any required Netlink calls to complete the
     * reconfiguration.  The caller ensures that 'queue_id' is less than
     * 'n_queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' does not have queues or its queues are
     * not configurable. */
    int (*class_set)(struct netdev *, unsigned int queue_id,
                     const struct smap *details);

    /* Deletes 'queue' from 'netdev->tc'.  'queue' is one of the 'struct
     * tc_queue's within 'netdev->tc->queues'.
     *
     * This function may be null if 'tc' does not have queues or its queues
     * cannot be deleted. */
    int (*class_delete)(struct netdev *, struct tc_queue *queue);

    /* Obtains stats for 'queue' from 'netdev->tc'.  'queue' is one of the
     * 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * On success, initializes '*stats'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_get_stats)(const struct netdev *netdev,
                           const struct tc_queue *queue,
                           struct netdev_queue_stats *stats);

    /* Extracts queue stats from 'nlmsg', which is a response to a
     * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_dump_stats)(const struct netdev *netdev,
                            const struct ofpbuf *nlmsg,
                            netdev_dump_queue_stats_cb *cb, void *aux);
};
298
299 static void
300 tc_init(struct tc *tc, const struct tc_ops *ops)
301 {
302 tc->ops = ops;
303 hmap_init(&tc->queues);
304 }
305
306 static void
307 tc_destroy(struct tc *tc)
308 {
309 hmap_destroy(&tc->queues);
310 }
311
/* Tentative declarations of the traffic control implementations, so that
 * 'tcs' below can refer to them. */
static const struct tc_ops tc_ops_htb;
static const struct tc_ops tc_ops_hfsc;
static const struct tc_ops tc_ops_default;
static const struct tc_ops tc_ops_other;

/* Null-terminated table of every traffic control implementation declared
 * above. */
static const struct tc_ops *const tcs[] = {
    &tc_ops_htb,                /* Hierarchy token bucket (see tc-htb(8)). */
    &tc_ops_hfsc,               /* Hierarchical fair service curve. */
    &tc_ops_default,            /* Default qdisc (see tc-pfifo_fast(8)). */
    &tc_ops_other,              /* Some other qdisc. */
    NULL
};
324
/* Prototypes for the file-local traffic control helpers.  They are declared
 * here so that code earlier in the file can call them; being static, their
 * definitions are expected later in this file. */

/* Conversions between a traffic control handle and its major/minor parts. */
static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
static unsigned int tc_get_major(unsigned int handle);
static unsigned int tc_get_minor(unsigned int handle);

/* Rate and tick arithmetic. */
static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
static unsigned int tc_buffer_per_jiffy(unsigned int rate);

/* Building and sending requests; ingress qdisc and policer setup. */
static struct tcmsg *tc_make_request(const struct netdev *, int type,
                                     unsigned int flags, struct ofpbuf *);
static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
static int tc_add_policer(struct netdev *netdev, int kbits_rate,
                          int kbits_burst);

/* Parsing replies; querying and deleting classes. */
static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
                          struct nlattr **options);
static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
                          struct nlattr **options,
                          struct netdev_queue_stats *);
static int tc_query_class(const struct netdev *,
                          unsigned int handle, unsigned int parent,
                          struct ofpbuf **replyp);
static int tc_delete_class(const struct netdev *, unsigned int handle);

/* Whole-qdisc operations on a netdev. */
static int tc_del_qdisc(struct netdev *netdev);
static int tc_query_qdisc(const struct netdev *netdev);

/* Rate table and buffer computations. */
static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static void tc_put_rtab(struct ofpbuf *, uint16_t type,
                        const struct tc_ratespec *rate);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
358 \f
/* A Linux network device.  'up' is the generic netdev embedded as the first
 * member; netdev_linux_cast() recovers the containing structure from it. */
struct netdev_linux {
    struct netdev up;

    struct shash_node *shash_node;
    unsigned int cache_valid;   /* Bitmap of VALID_* bits. */
    unsigned int change_seq;    /* Bumped by netdev_linux_changed(), which
                                 * never leaves it at 0. */

    bool miimon;                    /* Link status of last poll. */
    long long int miimon_interval;  /* Miimon Poll rate. Disabled if <= 0. */
    struct timer miimon_timer;

    /* The following are figured out "on demand" only.  They are only valid
     * when the corresponding VALID_* bit in 'cache_valid' is set. */
    int ifindex;
    uint8_t etheraddr[ETH_ADDR_LEN];
    struct in_addr address, netmask;
    struct in6_addr in6;
    int mtu;
    unsigned int ifi_flags;     /* Flags last passed to
                                 * netdev_linux_changed(). */
    long long int carrier_resets; /* Number of IFF_RUNNING changes seen by
                                   * netdev_linux_changed(). */
    uint32_t kbits_rate;        /* Policing data. */
    uint32_t kbits_burst;
    int vport_stats_error;      /* Cached error code from vport_get_stats().
                                   0 or an errno value. */
    int netdev_mtu_error;       /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
    int ether_addr_error;       /* Cached error code from set/get etheraddr. */
    int netdev_policing_error;  /* Cached error code from set policing. */
    int get_features_error;     /* Cached error code from ETHTOOL_GSET. */
    int get_ifindex_error;      /* Cached error code from SIOCGIFINDEX. */

    enum netdev_features current;    /* Cached from ETHTOOL_GSET. */
    enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
    enum netdev_features supported;  /* Cached from ETHTOOL_GSET. */
    enum netdev_features peer;       /* Cached from ETHTOOL_GSET. */

    struct ethtool_drvinfo drvinfo;  /* Cached from ETHTOOL_GDRVINFO. */
    struct tc *tc;              /* Traffic control state, or null.  Destroyed
                                 * through 'tc->ops->tc_destroy' in
                                 * netdev_linux_destroy(). */

    /* Class-specific state.  Only tap devices use it. */
    union {
        struct tap_state tap;
    } state;
};
401
/* A receive handle created by netdev_linux_rx_open(). */
struct netdev_rx_linux {
    struct netdev_rx up;        /* Generic receive handle; see
                                 * netdev_rx_linux_cast(). */
    bool is_tap;                /* True if 'fd' is the tap device's own
                                 * descriptor, which this handle must not
                                 * close. */
    int fd;                     /* Descriptor that packets are read from. */
};
407
/* Receive class for 'struct netdev_rx_linux'; declared here so that
 * netdev_rx_linux_cast() and netdev_linux_rx_open() can refer to it. */
static const struct netdev_rx_class netdev_rx_linux_class;

/* Sockets used for ioctl operations. */
static int af_inet_sock = -1;   /* AF_INET, SOCK_DGRAM. */

/* This is set pretty low because we probably won't learn anything from the
 * additional log messages. */
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);

/* Prototypes for file-local helpers that are used before they are defined. */
static int netdev_linux_init(void);

static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
                                   int cmd, const char *cmd_name);
static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
                                 const char *cmd_name);
static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
                                 int cmd, const char *cmd_name);
static int get_flags(const struct netdev *, unsigned int *flags);
static int set_flags(const char *, unsigned int flags);
static int do_get_ifindex(const char *netdev_name);
static int get_ifindex(const struct netdev *, int *ifindexp);
static int do_set_addr(struct netdev *netdev,
                       int ioctl_nr, const char *ioctl_name,
                       struct in_addr addr);
static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
static int af_packet_sock(void);
static void netdev_linux_miimon_run(void);
static void netdev_linux_miimon_wait(void);
439
440 static bool
441 is_netdev_linux_class(const struct netdev_class *netdev_class)
442 {
443 return netdev_class->init == netdev_linux_init;
444 }
445
446 static bool
447 is_tap_netdev(const struct netdev *netdev)
448 {
449 return netdev_get_class(netdev) == &netdev_tap_class;
450 }
451
452 static struct netdev_linux *
453 netdev_linux_cast(const struct netdev *netdev)
454 {
455 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
456
457 return CONTAINER_OF(netdev, struct netdev_linux, up);
458 }
459
460 static struct netdev_rx_linux *
461 netdev_rx_linux_cast(const struct netdev_rx *rx)
462 {
463 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
464 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
465 }
466 \f
467 static int
468 netdev_linux_init(void)
469 {
470 static int status = -1;
471 if (status < 0) {
472 /* Create AF_INET socket. */
473 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
474 status = af_inet_sock >= 0 ? 0 : errno;
475 if (status) {
476 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
477 }
478 }
479 return status;
480 }
481
/* Periodic work for this netdev provider: runs the rtnetlink link machinery
 * first, then MII link monitoring. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();

    netdev_linux_miimon_run();
}
488
/* Counterpart of netdev_linux_run(): calls the wait function of the rtnetlink
 * link machinery and then that of MII link monitoring. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();

    netdev_linux_miimon_wait();
}
495
496 static void
497 netdev_linux_changed(struct netdev_linux *dev,
498 unsigned int ifi_flags, unsigned int mask)
499 {
500 dev->change_seq++;
501 if (!dev->change_seq) {
502 dev->change_seq++;
503 }
504
505 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
506 dev->carrier_resets++;
507 }
508 dev->ifi_flags = ifi_flags;
509
510 dev->cache_valid &= mask;
511 }
512
513 static void
514 netdev_linux_update(struct netdev_linux *dev,
515 const struct rtnetlink_link_change *change)
516 {
517 if (change->nlmsg_type == RTM_NEWLINK) {
518 /* Keep drv-info */
519 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
520
521 /* Update netdev from rtnl-change msg. */
522 if (change->mtu) {
523 dev->mtu = change->mtu;
524 dev->cache_valid |= VALID_MTU;
525 dev->netdev_mtu_error = 0;
526 }
527
528 if (!eth_addr_is_zero(change->addr)) {
529 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
530 dev->cache_valid |= VALID_ETHERADDR;
531 dev->ether_addr_error = 0;
532 }
533
534 dev->ifindex = change->ifi_index;
535 dev->cache_valid |= VALID_IFINDEX;
536 dev->get_ifindex_error = 0;
537
538 } else {
539 netdev_linux_changed(dev, change->ifi_flags, 0);
540 }
541 }
542
543 static void
544 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
545 void *aux OVS_UNUSED)
546 {
547 struct netdev_linux *dev;
548 if (change) {
549 struct netdev *base_dev = netdev_from_name(change->ifname);
550 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
551 netdev_linux_update(netdev_linux_cast(base_dev), change);
552 }
553 } else {
554 struct shash device_shash;
555 struct shash_node *node;
556
557 shash_init(&device_shash);
558 netdev_get_devices(&netdev_linux_class, &device_shash);
559 SHASH_FOR_EACH (node, &device_shash) {
560 struct netdev *netdev = node->data;
561 unsigned int flags;
562
563 dev = netdev_linux_cast(netdev);
564
565 get_flags(&dev->up, &flags);
566 netdev_linux_changed(dev, flags, 0);
567 }
568 shash_destroy(&device_shash);
569 }
570 }
571
572 static int
573 cache_notifier_ref(void)
574 {
575 if (!cache_notifier_refcount) {
576 ovs_assert(!netdev_linux_cache_notifier);
577
578 netdev_linux_cache_notifier =
579 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
580
581 if (!netdev_linux_cache_notifier) {
582 return EINVAL;
583 }
584 }
585 cache_notifier_refcount++;
586
587 return 0;
588 }
589
590 static void
591 cache_notifier_unref(void)
592 {
593 ovs_assert(cache_notifier_refcount > 0);
594 if (!--cache_notifier_refcount) {
595 ovs_assert(netdev_linux_cache_notifier);
596 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
597 netdev_linux_cache_notifier = NULL;
598 }
599 }
600
601 /* Creates system and internal devices. */
602 static int
603 netdev_linux_create(const struct netdev_class *class, const char *name,
604 struct netdev **netdevp)
605 {
606 struct netdev_linux *netdev;
607 int error;
608
609 error = cache_notifier_ref();
610 if (error) {
611 return error;
612 }
613
614 netdev = xzalloc(sizeof *netdev);
615 netdev->change_seq = 1;
616 netdev_init(&netdev->up, name, class);
617 error = get_flags(&netdev->up, &netdev->ifi_flags);
618 if (error == ENODEV) {
619 if (class != &netdev_internal_class) {
620 /* The device does not exist, so don't allow it to be opened. */
621 netdev_uninit(&netdev->up, false);
622 cache_notifier_unref();
623 free(netdev);
624 return ENODEV;
625 } else {
626 /* "Internal" netdevs have to be created as netdev objects before
627 * they exist in the kernel, because creating them in the kernel
628 * happens by passing a netdev object to dpif_port_add().
629 * Therefore, ignore the error. */
630 }
631 }
632
633 *netdevp = &netdev->up;
634 return 0;
635 }
636
637 /* For most types of netdevs we open the device for each call of
638 * netdev_open(). However, this is not the case with tap devices,
639 * since it is only possible to open the device once. In this
640 * situation we share a single file descriptor, and consequently
641 * buffers, across all readers. Therefore once data is read it will
642 * be unavailable to other reads for tap devices. */
643 static int
644 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
645 const char *name, struct netdev **netdevp)
646 {
647 struct netdev_linux *netdev;
648 struct tap_state *state;
649 static const char tap_dev[] = "/dev/net/tun";
650 struct ifreq ifr;
651 int error;
652
653 netdev = xzalloc(sizeof *netdev);
654 netdev->change_seq = 1;
655 state = &netdev->state.tap;
656
657 error = cache_notifier_ref();
658 if (error) {
659 goto error;
660 }
661
662 /* Open tap device. */
663 state->fd = open(tap_dev, O_RDWR);
664 if (state->fd < 0) {
665 error = errno;
666 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
667 goto error_unref_notifier;
668 }
669
670 /* Create tap device. */
671 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
672 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
673 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
674 VLOG_WARN("%s: creating tap device failed: %s", name,
675 ovs_strerror(errno));
676 error = errno;
677 goto error_close;
678 }
679
680 /* Make non-blocking. */
681 error = set_nonblocking(state->fd);
682 if (error) {
683 goto error_close;
684 }
685
686 netdev_init(&netdev->up, name, &netdev_tap_class);
687 *netdevp = &netdev->up;
688 return 0;
689
690 error_close:
691 close(state->fd);
692 error_unref_notifier:
693 cache_notifier_unref();
694 error:
695 free(netdev);
696 return error;
697 }
698
699 static void
700 destroy_tap(struct netdev_linux *netdev)
701 {
702 struct tap_state *state = &netdev->state.tap;
703
704 if (state->fd >= 0) {
705 close(state->fd);
706 }
707 }
708
709 /* Destroys the netdev device 'netdev_'. */
710 static void
711 netdev_linux_destroy(struct netdev *netdev_)
712 {
713 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
714
715 if (netdev->tc && netdev->tc->ops->tc_destroy) {
716 netdev->tc->ops->tc_destroy(netdev->tc);
717 }
718
719 if (netdev_get_class(netdev_) == &netdev_tap_class) {
720 destroy_tap(netdev);
721 }
722 free(netdev);
723
724 cache_notifier_unref();
725 }
726
727 static int
728 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
729 {
730 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
731 bool is_tap = is_tap_netdev(netdev_);
732 struct netdev_rx_linux *rx;
733 int error;
734 int fd;
735
736 if (is_tap) {
737 fd = netdev->state.tap.fd;
738 } else {
739 struct sockaddr_ll sll;
740 int ifindex;
741 /* Result of tcpdump -dd inbound */
742 static struct sock_filter filt[] = {
743 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
744 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
745 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
746 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
747 };
748 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
749
750 /* Create file descriptor. */
751 fd = socket(PF_PACKET, SOCK_RAW, 0);
752 if (fd < 0) {
753 error = errno;
754 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
755 goto error;
756 }
757
758 /* Set non-blocking mode. */
759 error = set_nonblocking(fd);
760 if (error) {
761 goto error;
762 }
763
764 /* Get ethernet device index. */
765 error = get_ifindex(&netdev->up, &ifindex);
766 if (error) {
767 goto error;
768 }
769
770 /* Bind to specific ethernet device. */
771 memset(&sll, 0, sizeof sll);
772 sll.sll_family = AF_PACKET;
773 sll.sll_ifindex = ifindex;
774 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
775 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
776 error = errno;
777 VLOG_ERR("%s: failed to bind raw socket (%s)",
778 netdev_get_name(netdev_), ovs_strerror(error));
779 goto error;
780 }
781
782 /* Filter for only inbound packets. */
783 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
784 sizeof fprog);
785 if (error) {
786 error = errno;
787 VLOG_ERR("%s: failed attach filter (%s)",
788 netdev_get_name(netdev_), ovs_strerror(error));
789 goto error;
790 }
791 }
792
793 rx = xmalloc(sizeof *rx);
794 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
795 rx->is_tap = is_tap;
796 rx->fd = fd;
797
798 *rxp = &rx->up;
799 return 0;
800
801 error:
802 if (fd >= 0) {
803 close(fd);
804 }
805 return error;
806 }
807
808 static void
809 netdev_rx_linux_destroy(struct netdev_rx *rx_)
810 {
811 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
812
813 if (!rx->is_tap) {
814 close(rx->fd);
815 }
816 free(rx);
817 }
818
819 static int
820 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
821 {
822 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
823 ssize_t retval;
824
825 do {
826 retval = (rx->is_tap
827 ? read(rx->fd, data, size)
828 : recv(rx->fd, data, size, MSG_TRUNC));
829 } while (retval < 0 && errno == EINTR);
830
831 if (retval >= 0) {
832 return retval > size ? -EMSGSIZE : retval;
833 } else {
834 if (errno != EAGAIN) {
835 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
836 ovs_strerror(errno), netdev_rx_get_name(rx_));
837 }
838 return -errno;
839 }
840 }
841
842 static void
843 netdev_rx_linux_wait(struct netdev_rx *rx_)
844 {
845 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
846 poll_fd_wait(rx->fd, POLLIN);
847 }
848
849 static int
850 netdev_rx_linux_drain(struct netdev_rx *rx_)
851 {
852 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
853 if (rx->is_tap) {
854 struct ifreq ifr;
855 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
856 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
857 if (error) {
858 return error;
859 }
860 drain_fd(rx->fd, ifr.ifr_qlen);
861 return 0;
862 } else {
863 return drain_rcvbuf(rx->fd);
864 }
865 }
866
867 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
868 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
869 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
870 * the packet is too big or too small to transmit on the device.
871 *
872 * The caller retains ownership of 'buffer' in all cases.
873 *
874 * The kernel maintains a packet transmission queue, so the caller is not
875 * expected to do additional queuing of packets. */
876 static int
877 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
878 {
879 for (;;) {
880 ssize_t retval;
881
882 if (!is_tap_netdev(netdev_)) {
883 /* Use our AF_PACKET socket to send to this device. */
884 struct sockaddr_ll sll;
885 struct msghdr msg;
886 struct iovec iov;
887 int ifindex;
888 int error;
889 int sock;
890
891 sock = af_packet_sock();
892 if (sock < 0) {
893 return -sock;
894 }
895
896 error = get_ifindex(netdev_, &ifindex);
897 if (error) {
898 return error;
899 }
900
901 /* We don't bother setting most fields in sockaddr_ll because the
902 * kernel ignores them for SOCK_RAW. */
903 memset(&sll, 0, sizeof sll);
904 sll.sll_family = AF_PACKET;
905 sll.sll_ifindex = ifindex;
906
907 iov.iov_base = CONST_CAST(void *, data);
908 iov.iov_len = size;
909
910 msg.msg_name = &sll;
911 msg.msg_namelen = sizeof sll;
912 msg.msg_iov = &iov;
913 msg.msg_iovlen = 1;
914 msg.msg_control = NULL;
915 msg.msg_controllen = 0;
916 msg.msg_flags = 0;
917
918 retval = sendmsg(sock, &msg, 0);
919 } else {
920 /* Use the tap fd to send to this device. This is essential for
921 * tap devices, because packets sent to a tap device with an
922 * AF_PACKET socket will loop back to be *received* again on the
923 * tap device. This doesn't occur on other interface types
924 * because we attach a socket filter to the rx socket. */
925 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
926
927 retval = write(netdev->state.tap.fd, data, size);
928 }
929
930 if (retval < 0) {
931 /* The Linux AF_PACKET implementation never blocks waiting for room
932 * for packets, instead returning ENOBUFS. Translate this into
933 * EAGAIN for the caller. */
934 if (errno == ENOBUFS) {
935 return EAGAIN;
936 } else if (errno == EINTR) {
937 continue;
938 } else if (errno != EAGAIN) {
939 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
940 netdev_get_name(netdev_), ovs_strerror(errno));
941 }
942 return errno;
943 } else if (retval != size) {
944 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
945 "%zu) on %s", retval, size, netdev_get_name(netdev_));
946 return EMSGSIZE;
947 } else {
948 return 0;
949 }
950 }
951 }
952
/* Registers with the poll loop to wake up from the next call to poll_block()
 * when the packet transmission queue has sufficient room to transmit a packet
 * with netdev_send().
 *
 * Only tap devices cause a wakeup, and for them it is immediate.  For every
 * other device this function does nothing.
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  Thus, this function is
 * unlikely to ever be used.  It is included for completeness. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (!is_tap_netdev(netdev)) {
        return;
    }

    /* TAP device always accepts packets.*/
    poll_immediate_wake();
}
968
969 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
970 * otherwise a positive errno value. */
971 static int
972 netdev_linux_set_etheraddr(struct netdev *netdev_,
973 const uint8_t mac[ETH_ADDR_LEN])
974 {
975 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
976 struct netdev_saved_flags *sf = NULL;
977 int error;
978
979 if (netdev->cache_valid & VALID_ETHERADDR) {
980 if (netdev->ether_addr_error) {
981 return netdev->ether_addr_error;
982 }
983 if (eth_addr_equals(netdev->etheraddr, mac)) {
984 return 0;
985 }
986 netdev->cache_valid &= ~VALID_ETHERADDR;
987 }
988
989 /* Tap devices must be brought down before setting the address. */
990 if (is_tap_netdev(netdev_)) {
991 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
992 }
993 error = set_etheraddr(netdev_get_name(netdev_), mac);
994 if (!error || error == ENODEV) {
995 netdev->ether_addr_error = error;
996 netdev->cache_valid |= VALID_ETHERADDR;
997 if (!error) {
998 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
999 }
1000 }
1001
1002 netdev_restore_flags(sf);
1003
1004 return error;
1005 }
1006
1007 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1008 static int
1009 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1010 uint8_t mac[ETH_ADDR_LEN])
1011 {
1012 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1013
1014 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1015 int error = get_etheraddr(netdev_get_name(netdev_),
1016 netdev->etheraddr);
1017
1018 netdev->ether_addr_error = error;
1019 netdev->cache_valid |= VALID_ETHERADDR;
1020 }
1021
1022 if (!netdev->ether_addr_error) {
1023 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1024 }
1025
1026 return netdev->ether_addr_error;
1027 }
1028
1029 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1030 * in bytes, not including the hardware header; thus, this is typically 1500
1031 * bytes for Ethernet devices. */
1032 static int
1033 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1034 {
1035 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1036 if (!(netdev->cache_valid & VALID_MTU)) {
1037 struct ifreq ifr;
1038 int error;
1039
1040 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1041 SIOCGIFMTU, "SIOCGIFMTU");
1042
1043 netdev->netdev_mtu_error = error;
1044 netdev->mtu = ifr.ifr_mtu;
1045 netdev->cache_valid |= VALID_MTU;
1046 }
1047
1048 if (!netdev->netdev_mtu_error) {
1049 *mtup = netdev->mtu;
1050 }
1051 return netdev->netdev_mtu_error;
1052 }
1053
1054 /* Sets the maximum size of transmitted (MTU) for given device using linux
1055 * networking ioctl interface.
1056 */
1057 static int
1058 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1059 {
1060 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1061 struct ifreq ifr;
1062 int error;
1063
1064 if (netdev->cache_valid & VALID_MTU) {
1065 if (netdev->netdev_mtu_error) {
1066 return netdev->netdev_mtu_error;
1067 }
1068 if (netdev->mtu == mtu) {
1069 return 0;
1070 }
1071 netdev->cache_valid &= ~VALID_MTU;
1072 }
1073 ifr.ifr_mtu = mtu;
1074 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1075 SIOCSIFMTU, "SIOCSIFMTU");
1076 if (!error || error == ENODEV) {
1077 netdev->netdev_mtu_error = error;
1078 netdev->mtu = ifr.ifr_mtu;
1079 netdev->cache_valid |= VALID_MTU;
1080 }
1081 return error;
1082 }
1083
/* Looks up the ifindex of 'netdev'.  On success the ifindex is returned as a
 * positive number; on failure the return value is a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1094
1095 static int
1096 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1097 {
1098 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1099
1100 if (netdev->miimon_interval > 0) {
1101 *carrier = netdev->miimon;
1102 } else {
1103 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1104 }
1105
1106 return 0;
1107 }
1108
1109 static long long int
1110 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1111 {
1112 return netdev_linux_cast(netdev)->carrier_resets;
1113 }
1114
1115 static int
1116 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1117 struct mii_ioctl_data *data)
1118 {
1119 struct ifreq ifr;
1120 int error;
1121
1122 memset(&ifr, 0, sizeof ifr);
1123 memcpy(&ifr.ifr_data, data, sizeof *data);
1124 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1125 memcpy(data, &ifr.ifr_data, sizeof *data);
1126
1127 return error;
1128 }
1129
1130 static int
1131 netdev_linux_get_miimon(const char *name, bool *miimon)
1132 {
1133 struct mii_ioctl_data data;
1134 int error;
1135
1136 *miimon = false;
1137
1138 memset(&data, 0, sizeof data);
1139 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1140 if (!error) {
1141 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1142 data.reg_num = MII_BMSR;
1143 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1144 &data);
1145
1146 if (!error) {
1147 *miimon = !!(data.val_out & BMSR_LSTATUS);
1148 } else {
1149 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1150 }
1151 } else {
1152 struct ethtool_cmd ecmd;
1153
1154 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1155 name);
1156
1157 COVERAGE_INC(netdev_get_ethtool);
1158 memset(&ecmd, 0, sizeof ecmd);
1159 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1160 "ETHTOOL_GLINK");
1161 if (!error) {
1162 struct ethtool_value eval;
1163
1164 memcpy(&eval, &ecmd, sizeof eval);
1165 *miimon = !!eval.data;
1166 } else {
1167 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1168 }
1169 }
1170
1171 return error;
1172 }
1173
1174 static int
1175 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1176 long long int interval)
1177 {
1178 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1179
1180 interval = interval > 0 ? MAX(interval, 100) : 0;
1181 if (netdev->miimon_interval != interval) {
1182 netdev->miimon_interval = interval;
1183 timer_set_expired(&netdev->miimon_timer);
1184 }
1185
1186 return 0;
1187 }
1188
1189 static void
1190 netdev_linux_miimon_run(void)
1191 {
1192 struct shash device_shash;
1193 struct shash_node *node;
1194
1195 shash_init(&device_shash);
1196 netdev_get_devices(&netdev_linux_class, &device_shash);
1197 SHASH_FOR_EACH (node, &device_shash) {
1198 struct netdev *netdev = node->data;
1199 struct netdev_linux *dev = netdev_linux_cast(netdev);
1200 bool miimon;
1201
1202 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1203 continue;
1204 }
1205
1206 netdev_linux_get_miimon(dev->up.name, &miimon);
1207 if (miimon != dev->miimon) {
1208 dev->miimon = miimon;
1209 netdev_linux_changed(dev, dev->ifi_flags, 0);
1210 }
1211
1212 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1213 }
1214
1215 shash_destroy(&device_shash);
1216 }
1217
1218 static void
1219 netdev_linux_miimon_wait(void)
1220 {
1221 struct shash device_shash;
1222 struct shash_node *node;
1223
1224 shash_init(&device_shash);
1225 netdev_get_devices(&netdev_linux_class, &device_shash);
1226 SHASH_FOR_EACH (node, &device_shash) {
1227 struct netdev *netdev = node->data;
1228 struct netdev_linux *dev = netdev_linux_cast(netdev);
1229
1230 if (dev->miimon_interval > 0) {
1231 timer_wait(&dev->miimon_timer);
1232 }
1233 }
1234 shash_destroy(&device_shash);
1235 }
1236
1237 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1238 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1239 * enabled. */
1240 static bool
1241 check_for_working_netlink_stats(void)
1242 {
1243 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1244 * preferable, so if that works, we'll use it. */
1245 int ifindex = do_get_ifindex("lo");
1246 if (ifindex < 0) {
1247 VLOG_WARN("failed to get ifindex for lo, "
1248 "obtaining netdev stats from proc");
1249 return false;
1250 } else {
1251 struct netdev_stats stats;
1252 int error = get_stats_via_netlink(ifindex, &stats);
1253 if (!error) {
1254 VLOG_DBG("obtaining netdev stats via rtnetlink");
1255 return true;
1256 } else {
1257 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1258 "via proc (you are probably running a pre-2.6.19 "
1259 "kernel)", ovs_strerror(error));
1260 return false;
1261 }
1262 }
1263 }
1264
/* Exchanges the values of '*a' and '*b'.  'a' and 'b' may point to the same
 * object, in which case its value is left unchanged. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_a = *a;
    const uint64_t old_b = *b;

    *a = old_b;
    *b = old_a;
}
1272
1273 /* Copies 'src' into 'dst', performing format conversion in the process.
1274 *
1275 * 'src' is allowed to be misaligned. */
1276 static void
1277 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1278 const struct ovs_vport_stats *src)
1279 {
1280 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1281 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1282 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1283 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1284 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1285 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1286 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1287 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1288 dst->multicast = 0;
1289 dst->collisions = 0;
1290 dst->rx_length_errors = 0;
1291 dst->rx_over_errors = 0;
1292 dst->rx_crc_errors = 0;
1293 dst->rx_frame_errors = 0;
1294 dst->rx_fifo_errors = 0;
1295 dst->rx_missed_errors = 0;
1296 dst->tx_aborted_errors = 0;
1297 dst->tx_carrier_errors = 0;
1298 dst->tx_fifo_errors = 0;
1299 dst->tx_heartbeat_errors = 0;
1300 dst->tx_window_errors = 0;
1301 }
1302
1303 static int
1304 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1305 {
1306 struct dpif_linux_vport reply;
1307 struct ofpbuf *buf;
1308 int error;
1309
1310 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1311 if (error) {
1312 return error;
1313 } else if (!reply.stats) {
1314 ofpbuf_delete(buf);
1315 return EOPNOTSUPP;
1316 }
1317
1318 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1319
1320 ofpbuf_delete(buf);
1321
1322 return 0;
1323 }
1324
1325 static void
1326 get_stats_via_vport(const struct netdev *netdev_,
1327 struct netdev_stats *stats)
1328 {
1329 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1330
1331 if (!netdev->vport_stats_error ||
1332 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1333 int error;
1334
1335 error = get_stats_via_vport__(netdev_, stats);
1336 if (error && error != ENOENT) {
1337 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1338 "(%s)",
1339 netdev_get_name(netdev_), ovs_strerror(error));
1340 }
1341 netdev->vport_stats_error = error;
1342 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1343 }
1344 }
1345
1346 static int
1347 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1348 struct netdev_stats *stats)
1349 {
1350 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1351 static int use_netlink_stats;
1352 int error;
1353
1354 if (ovsthread_once_start(&once)) {
1355 use_netlink_stats = check_for_working_netlink_stats();
1356 ovsthread_once_done(&once);
1357 }
1358
1359 if (use_netlink_stats) {
1360 int ifindex;
1361
1362 error = get_ifindex(netdev_, &ifindex);
1363 if (!error) {
1364 error = get_stats_via_netlink(ifindex, stats);
1365 }
1366 } else {
1367 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1368 }
1369
1370 if (error) {
1371 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1372 netdev_get_name(netdev_), error);
1373 }
1374 return error;
1375
1376 }
1377
1378 /* Retrieves current device stats for 'netdev-linux'. */
1379 static int
1380 netdev_linux_get_stats(const struct netdev *netdev_,
1381 struct netdev_stats *stats)
1382 {
1383 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1384 struct netdev_stats dev_stats;
1385 int error;
1386
1387 get_stats_via_vport(netdev_, stats);
1388
1389 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1390
1391 if (error) {
1392 if (netdev->vport_stats_error) {
1393 return error;
1394 } else {
1395 return 0;
1396 }
1397 }
1398
1399 if (netdev->vport_stats_error) {
1400 /* stats not available from OVS then use ioctl stats. */
1401 *stats = dev_stats;
1402 } else {
1403 stats->rx_errors += dev_stats.rx_errors;
1404 stats->tx_errors += dev_stats.tx_errors;
1405 stats->rx_dropped += dev_stats.rx_dropped;
1406 stats->tx_dropped += dev_stats.tx_dropped;
1407 stats->multicast += dev_stats.multicast;
1408 stats->collisions += dev_stats.collisions;
1409 stats->rx_length_errors += dev_stats.rx_length_errors;
1410 stats->rx_over_errors += dev_stats.rx_over_errors;
1411 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1412 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1413 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1414 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1415 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1416 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1417 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1418 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1419 stats->tx_window_errors += dev_stats.tx_window_errors;
1420 }
1421 return 0;
1422 }
1423
1424 /* Retrieves current device stats for 'netdev-tap' netdev or
1425 * netdev-internal. */
1426 static int
1427 netdev_tap_get_stats(const struct netdev *netdev_,
1428 struct netdev_stats *stats)
1429 {
1430 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1431 struct netdev_stats dev_stats;
1432 int error;
1433
1434 get_stats_via_vport(netdev_, stats);
1435
1436 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1437 if (error) {
1438 if (netdev->vport_stats_error) {
1439 return error;
1440 } else {
1441 return 0;
1442 }
1443 }
1444
1445 /* If this port is an internal port then the transmit and receive stats
1446 * will appear to be swapped relative to the other ports since we are the
1447 * one sending the data, not a remote computer. For consistency, we swap
1448 * them back here. This does not apply if we are getting stats from the
1449 * vport layer because it always tracks stats from the perspective of the
1450 * switch. */
1451 if (netdev->vport_stats_error) {
1452 *stats = dev_stats;
1453 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1454 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1455 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1456 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1457 stats->rx_length_errors = 0;
1458 stats->rx_over_errors = 0;
1459 stats->rx_crc_errors = 0;
1460 stats->rx_frame_errors = 0;
1461 stats->rx_fifo_errors = 0;
1462 stats->rx_missed_errors = 0;
1463 stats->tx_aborted_errors = 0;
1464 stats->tx_carrier_errors = 0;
1465 stats->tx_fifo_errors = 0;
1466 stats->tx_heartbeat_errors = 0;
1467 stats->tx_window_errors = 0;
1468 } else {
1469 stats->rx_dropped += dev_stats.tx_dropped;
1470 stats->tx_dropped += dev_stats.rx_dropped;
1471
1472 stats->rx_errors += dev_stats.tx_errors;
1473 stats->tx_errors += dev_stats.rx_errors;
1474
1475 stats->multicast += dev_stats.multicast;
1476 stats->collisions += dev_stats.collisions;
1477 }
1478 return 0;
1479 }
1480
1481 static int
1482 netdev_internal_get_stats(const struct netdev *netdev_,
1483 struct netdev_stats *stats)
1484 {
1485 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1486
1487 get_stats_via_vport(netdev_, stats);
1488 return netdev->vport_stats_error;
1489 }
1490
1491 static int
1492 netdev_internal_set_stats(struct netdev *netdev,
1493 const struct netdev_stats *stats)
1494 {
1495 struct ovs_vport_stats vport_stats;
1496 struct dpif_linux_vport vport;
1497 int err;
1498
1499 vport_stats.rx_packets = stats->rx_packets;
1500 vport_stats.tx_packets = stats->tx_packets;
1501 vport_stats.rx_bytes = stats->rx_bytes;
1502 vport_stats.tx_bytes = stats->tx_bytes;
1503 vport_stats.rx_errors = stats->rx_errors;
1504 vport_stats.tx_errors = stats->tx_errors;
1505 vport_stats.rx_dropped = stats->rx_dropped;
1506 vport_stats.tx_dropped = stats->tx_dropped;
1507
1508 dpif_linux_vport_init(&vport);
1509 vport.cmd = OVS_VPORT_CMD_SET;
1510 vport.name = netdev_get_name(netdev);
1511 vport.stats = &vport_stats;
1512
1513 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1514
1515 /* If the vport layer doesn't know about the device, that doesn't mean it
1516 * doesn't exist (after all were able to open it when netdev_open() was
1517 * called), it just means that it isn't attached and we'll be getting
1518 * stats a different way. */
1519 if (err == ENODEV) {
1520 err = EOPNOTSUPP;
1521 }
1522
1523 return err;
1524 }
1525
1526 static void
1527 netdev_linux_read_features(struct netdev_linux *netdev)
1528 {
1529 struct ethtool_cmd ecmd;
1530 uint32_t speed;
1531 int error;
1532
1533 if (netdev->cache_valid & VALID_FEATURES) {
1534 return;
1535 }
1536
1537 COVERAGE_INC(netdev_get_ethtool);
1538 memset(&ecmd, 0, sizeof ecmd);
1539 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1540 ETHTOOL_GSET, "ETHTOOL_GSET");
1541 if (error) {
1542 goto out;
1543 }
1544
1545 /* Supported features. */
1546 netdev->supported = 0;
1547 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1548 netdev->supported |= NETDEV_F_10MB_HD;
1549 }
1550 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1551 netdev->supported |= NETDEV_F_10MB_FD;
1552 }
1553 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1554 netdev->supported |= NETDEV_F_100MB_HD;
1555 }
1556 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1557 netdev->supported |= NETDEV_F_100MB_FD;
1558 }
1559 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1560 netdev->supported |= NETDEV_F_1GB_HD;
1561 }
1562 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1563 netdev->supported |= NETDEV_F_1GB_FD;
1564 }
1565 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1566 netdev->supported |= NETDEV_F_10GB_FD;
1567 }
1568 if (ecmd.supported & SUPPORTED_TP) {
1569 netdev->supported |= NETDEV_F_COPPER;
1570 }
1571 if (ecmd.supported & SUPPORTED_FIBRE) {
1572 netdev->supported |= NETDEV_F_FIBER;
1573 }
1574 if (ecmd.supported & SUPPORTED_Autoneg) {
1575 netdev->supported |= NETDEV_F_AUTONEG;
1576 }
1577 if (ecmd.supported & SUPPORTED_Pause) {
1578 netdev->supported |= NETDEV_F_PAUSE;
1579 }
1580 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1581 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1582 }
1583
1584 /* Advertised features. */
1585 netdev->advertised = 0;
1586 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1587 netdev->advertised |= NETDEV_F_10MB_HD;
1588 }
1589 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1590 netdev->advertised |= NETDEV_F_10MB_FD;
1591 }
1592 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1593 netdev->advertised |= NETDEV_F_100MB_HD;
1594 }
1595 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1596 netdev->advertised |= NETDEV_F_100MB_FD;
1597 }
1598 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1599 netdev->advertised |= NETDEV_F_1GB_HD;
1600 }
1601 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1602 netdev->advertised |= NETDEV_F_1GB_FD;
1603 }
1604 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1605 netdev->advertised |= NETDEV_F_10GB_FD;
1606 }
1607 if (ecmd.advertising & ADVERTISED_TP) {
1608 netdev->advertised |= NETDEV_F_COPPER;
1609 }
1610 if (ecmd.advertising & ADVERTISED_FIBRE) {
1611 netdev->advertised |= NETDEV_F_FIBER;
1612 }
1613 if (ecmd.advertising & ADVERTISED_Autoneg) {
1614 netdev->advertised |= NETDEV_F_AUTONEG;
1615 }
1616 if (ecmd.advertising & ADVERTISED_Pause) {
1617 netdev->advertised |= NETDEV_F_PAUSE;
1618 }
1619 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1620 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1621 }
1622
1623 /* Current settings. */
1624 speed = ecmd.speed;
1625 if (speed == SPEED_10) {
1626 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1627 } else if (speed == SPEED_100) {
1628 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1629 } else if (speed == SPEED_1000) {
1630 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1631 } else if (speed == SPEED_10000) {
1632 netdev->current = NETDEV_F_10GB_FD;
1633 } else if (speed == 40000) {
1634 netdev->current = NETDEV_F_40GB_FD;
1635 } else if (speed == 100000) {
1636 netdev->current = NETDEV_F_100GB_FD;
1637 } else if (speed == 1000000) {
1638 netdev->current = NETDEV_F_1TB_FD;
1639 } else {
1640 netdev->current = 0;
1641 }
1642
1643 if (ecmd.port == PORT_TP) {
1644 netdev->current |= NETDEV_F_COPPER;
1645 } else if (ecmd.port == PORT_FIBRE) {
1646 netdev->current |= NETDEV_F_FIBER;
1647 }
1648
1649 if (ecmd.autoneg) {
1650 netdev->current |= NETDEV_F_AUTONEG;
1651 }
1652
1653 /* Peer advertisements. */
1654 netdev->peer = 0; /* XXX */
1655
1656 out:
1657 netdev->cache_valid |= VALID_FEATURES;
1658 netdev->get_features_error = error;
1659 }
1660
1661 /* Stores the features supported by 'netdev' into each of '*current',
1662 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1663 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1664 * errno value. */
1665 static int
1666 netdev_linux_get_features(const struct netdev *netdev_,
1667 enum netdev_features *current,
1668 enum netdev_features *advertised,
1669 enum netdev_features *supported,
1670 enum netdev_features *peer)
1671 {
1672 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1673
1674 netdev_linux_read_features(netdev);
1675
1676 if (!netdev->get_features_error) {
1677 *current = netdev->current;
1678 *advertised = netdev->advertised;
1679 *supported = netdev->supported;
1680 *peer = netdev->peer;
1681 }
1682 return netdev->get_features_error;
1683 }
1684
1685 /* Set the features advertised by 'netdev' to 'advertise'. */
1686 static int
1687 netdev_linux_set_advertisements(struct netdev *netdev,
1688 enum netdev_features advertise)
1689 {
1690 struct ethtool_cmd ecmd;
1691 int error;
1692
1693 COVERAGE_INC(netdev_get_ethtool);
1694 memset(&ecmd, 0, sizeof ecmd);
1695 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1696 ETHTOOL_GSET, "ETHTOOL_GSET");
1697 if (error) {
1698 return error;
1699 }
1700
1701 ecmd.advertising = 0;
1702 if (advertise & NETDEV_F_10MB_HD) {
1703 ecmd.advertising |= ADVERTISED_10baseT_Half;
1704 }
1705 if (advertise & NETDEV_F_10MB_FD) {
1706 ecmd.advertising |= ADVERTISED_10baseT_Full;
1707 }
1708 if (advertise & NETDEV_F_100MB_HD) {
1709 ecmd.advertising |= ADVERTISED_100baseT_Half;
1710 }
1711 if (advertise & NETDEV_F_100MB_FD) {
1712 ecmd.advertising |= ADVERTISED_100baseT_Full;
1713 }
1714 if (advertise & NETDEV_F_1GB_HD) {
1715 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1716 }
1717 if (advertise & NETDEV_F_1GB_FD) {
1718 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1719 }
1720 if (advertise & NETDEV_F_10GB_FD) {
1721 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1722 }
1723 if (advertise & NETDEV_F_COPPER) {
1724 ecmd.advertising |= ADVERTISED_TP;
1725 }
1726 if (advertise & NETDEV_F_FIBER) {
1727 ecmd.advertising |= ADVERTISED_FIBRE;
1728 }
1729 if (advertise & NETDEV_F_AUTONEG) {
1730 ecmd.advertising |= ADVERTISED_Autoneg;
1731 }
1732 if (advertise & NETDEV_F_PAUSE) {
1733 ecmd.advertising |= ADVERTISED_Pause;
1734 }
1735 if (advertise & NETDEV_F_PAUSE_ASYM) {
1736 ecmd.advertising |= ADVERTISED_Asym_Pause;
1737 }
1738 COVERAGE_INC(netdev_set_ethtool);
1739 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1740 ETHTOOL_SSET, "ETHTOOL_SSET");
1741 }
1742
1743 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1744 * successful, otherwise a positive errno value. */
1745 static int
1746 netdev_linux_set_policing(struct netdev *netdev_,
1747 uint32_t kbits_rate, uint32_t kbits_burst)
1748 {
1749 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1750 const char *netdev_name = netdev_get_name(netdev_);
1751 int error;
1752
1753
1754 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1755 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1756 : kbits_burst); /* Stick with user-specified value. */
1757
1758 if (netdev->cache_valid & VALID_POLICING) {
1759 if (netdev->netdev_policing_error) {
1760 return netdev->netdev_policing_error;
1761 }
1762
1763 if (netdev->kbits_rate == kbits_rate &&
1764 netdev->kbits_burst == kbits_burst) {
1765 /* Assume that settings haven't changed since we last set them. */
1766 return 0;
1767 }
1768 netdev->cache_valid &= ~VALID_POLICING;
1769 }
1770
1771 COVERAGE_INC(netdev_set_policing);
1772 /* Remove any existing ingress qdisc. */
1773 error = tc_add_del_ingress_qdisc(netdev_, false);
1774 if (error) {
1775 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1776 netdev_name, ovs_strerror(error));
1777 goto out;
1778 }
1779
1780 if (kbits_rate) {
1781 error = tc_add_del_ingress_qdisc(netdev_, true);
1782 if (error) {
1783 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1784 netdev_name, ovs_strerror(error));
1785 goto out;
1786 }
1787
1788 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1789 if (error){
1790 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1791 netdev_name, ovs_strerror(error));
1792 goto out;
1793 }
1794 }
1795
1796 netdev->kbits_rate = kbits_rate;
1797 netdev->kbits_burst = kbits_burst;
1798
1799 out:
1800 if (!error || error == ENODEV) {
1801 netdev->netdev_policing_error = error;
1802 netdev->cache_valid |= VALID_POLICING;
1803 }
1804 return error;
1805 }
1806
1807 static int
1808 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1809 struct sset *types)
1810 {
1811 const struct tc_ops *const *opsp;
1812
1813 for (opsp = tcs; *opsp != NULL; opsp++) {
1814 const struct tc_ops *ops = *opsp;
1815 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1816 sset_add(types, ops->ovs_name);
1817 }
1818 }
1819 return 0;
1820 }
1821
1822 static const struct tc_ops *
1823 tc_lookup_ovs_name(const char *name)
1824 {
1825 const struct tc_ops *const *opsp;
1826
1827 for (opsp = tcs; *opsp != NULL; opsp++) {
1828 const struct tc_ops *ops = *opsp;
1829 if (!strcmp(name, ops->ovs_name)) {
1830 return ops;
1831 }
1832 }
1833 return NULL;
1834 }
1835
1836 static const struct tc_ops *
1837 tc_lookup_linux_name(const char *name)
1838 {
1839 const struct tc_ops *const *opsp;
1840
1841 for (opsp = tcs; *opsp != NULL; opsp++) {
1842 const struct tc_ops *ops = *opsp;
1843 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1844 return ops;
1845 }
1846 }
1847 return NULL;
1848 }
1849
1850 static struct tc_queue *
1851 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1852 size_t hash)
1853 {
1854 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1855 struct tc_queue *queue;
1856
1857 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1858 if (queue->queue_id == queue_id) {
1859 return queue;
1860 }
1861 }
1862 return NULL;
1863 }
1864
/* Returns the queue of 'netdev' with ID 'queue_id', or NULL if there is no
 * such queue.  The queue is looked up under the hash of its ID. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1870
1871 static int
1872 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1873 const char *type,
1874 struct netdev_qos_capabilities *caps)
1875 {
1876 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1877 if (!ops) {
1878 return EOPNOTSUPP;
1879 }
1880 caps->n_queues = ops->n_queues;
1881 return 0;
1882 }
1883
1884 static int
1885 netdev_linux_get_qos(const struct netdev *netdev_,
1886 const char **typep, struct smap *details)
1887 {
1888 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1889 int error;
1890
1891 error = tc_query_qdisc(netdev_);
1892 if (error) {
1893 return error;
1894 }
1895
1896 *typep = netdev->tc->ops->ovs_name;
1897 return (netdev->tc->ops->qdisc_get
1898 ? netdev->tc->ops->qdisc_get(netdev_, details)
1899 : 0);
1900 }
1901
1902 static int
1903 netdev_linux_set_qos(struct netdev *netdev_,
1904 const char *type, const struct smap *details)
1905 {
1906 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1907 const struct tc_ops *new_ops;
1908 int error;
1909
1910 new_ops = tc_lookup_ovs_name(type);
1911 if (!new_ops || !new_ops->tc_install) {
1912 return EOPNOTSUPP;
1913 }
1914
1915 error = tc_query_qdisc(netdev_);
1916 if (error) {
1917 return error;
1918 }
1919
1920 if (new_ops == netdev->tc->ops) {
1921 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1922 } else {
1923 /* Delete existing qdisc. */
1924 error = tc_del_qdisc(netdev_);
1925 if (error) {
1926 return error;
1927 }
1928 ovs_assert(netdev->tc == NULL);
1929
1930 /* Install new qdisc. */
1931 error = new_ops->tc_install(netdev_, details);
1932 ovs_assert((error == 0) == (netdev->tc != NULL));
1933
1934 return error;
1935 }
1936 }
1937
1938 static int
1939 netdev_linux_get_queue(const struct netdev *netdev_,
1940 unsigned int queue_id, struct smap *details)
1941 {
1942 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1943 int error;
1944
1945 error = tc_query_qdisc(netdev_);
1946 if (error) {
1947 return error;
1948 } else {
1949 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1950 return (queue
1951 ? netdev->tc->ops->class_get(netdev_, queue, details)
1952 : ENOENT);
1953 }
1954 }
1955
1956 static int
1957 netdev_linux_set_queue(struct netdev *netdev_,
1958 unsigned int queue_id, const struct smap *details)
1959 {
1960 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1961 int error;
1962
1963 error = tc_query_qdisc(netdev_);
1964 if (error) {
1965 return error;
1966 } else if (queue_id >= netdev->tc->ops->n_queues
1967 || !netdev->tc->ops->class_set) {
1968 return EINVAL;
1969 }
1970
1971 return netdev->tc->ops->class_set(netdev_, queue_id, details);
1972 }
1973
1974 static int
1975 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1976 {
1977 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1978 int error;
1979
1980 error = tc_query_qdisc(netdev_);
1981 if (error) {
1982 return error;
1983 } else if (!netdev->tc->ops->class_delete) {
1984 return EINVAL;
1985 } else {
1986 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1987 return (queue
1988 ? netdev->tc->ops->class_delete(netdev_, queue)
1989 : ENOENT);
1990 }
1991 }
1992
1993 static int
1994 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1995 unsigned int queue_id,
1996 struct netdev_queue_stats *stats)
1997 {
1998 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1999 int error;
2000
2001 error = tc_query_qdisc(netdev_);
2002 if (error) {
2003 return error;
2004 } else if (!netdev->tc->ops->class_get_stats) {
2005 return EOPNOTSUPP;
2006 } else {
2007 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2008 if (!queue) {
2009 return ENOENT;
2010 }
2011 stats->created = queue->created;
2012 return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
2013 }
2014 }
2015
2016 static bool
2017 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2018 {
2019 struct ofpbuf request;
2020 struct tcmsg *tcmsg;
2021
2022 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2023 if (!tcmsg) {
2024 return false;
2025 }
2026 tcmsg->tcm_parent = 0;
2027 nl_dump_start(dump, NETLINK_ROUTE, &request);
2028 ofpbuf_uninit(&request);
2029 return true;
2030 }
2031
2032 static int
2033 netdev_linux_dump_queues(const struct netdev *netdev_,
2034 netdev_dump_queues_cb *cb, void *aux)
2035 {
2036 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2037 struct tc_queue *queue, *next_queue;
2038 struct smap details;
2039 int last_error;
2040 int error;
2041
2042 error = tc_query_qdisc(netdev_);
2043 if (error) {
2044 return error;
2045 } else if (!netdev->tc->ops->class_get) {
2046 return EOPNOTSUPP;
2047 }
2048
2049 last_error = 0;
2050 smap_init(&details);
2051 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2052 &netdev->tc->queues) {
2053 smap_clear(&details);
2054
2055 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2056 if (!error) {
2057 (*cb)(queue->queue_id, &details, aux);
2058 } else {
2059 last_error = error;
2060 }
2061 }
2062 smap_destroy(&details);
2063
2064 return last_error;
2065 }
2066
2067 static int
2068 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2069 netdev_dump_queue_stats_cb *cb, void *aux)
2070 {
2071 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2072 struct nl_dump dump;
2073 struct ofpbuf msg;
2074 int last_error;
2075 int error;
2076
2077 error = tc_query_qdisc(netdev_);
2078 if (error) {
2079 return error;
2080 } else if (!netdev->tc->ops->class_dump_stats) {
2081 return EOPNOTSUPP;
2082 }
2083
2084 last_error = 0;
2085 if (!start_queue_dump(netdev_, &dump)) {
2086 return ENODEV;
2087 }
2088 while (nl_dump_next(&dump, &msg)) {
2089 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2090 if (error) {
2091 last_error = error;
2092 }
2093 }
2094
2095 error = nl_dump_done(&dump);
2096 return error ? error : last_error;
2097 }
2098
2099 static int
2100 netdev_linux_get_in4(const struct netdev *netdev_,
2101 struct in_addr *address, struct in_addr *netmask)
2102 {
2103 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2104
2105 if (!(netdev->cache_valid & VALID_IN4)) {
2106 int error;
2107
2108 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2109 SIOCGIFADDR, "SIOCGIFADDR");
2110 if (error) {
2111 return error;
2112 }
2113
2114 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2115 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2116 if (error) {
2117 return error;
2118 }
2119
2120 netdev->cache_valid |= VALID_IN4;
2121 }
2122 *address = netdev->address;
2123 *netmask = netdev->netmask;
2124 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2125 }
2126
2127 static int
2128 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2129 struct in_addr netmask)
2130 {
2131 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2132 int error;
2133
2134 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2135 if (!error) {
2136 netdev->cache_valid |= VALID_IN4;
2137 netdev->address = address;
2138 netdev->netmask = netmask;
2139 if (address.s_addr != INADDR_ANY) {
2140 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2141 "SIOCSIFNETMASK", netmask);
2142 }
2143 }
2144 return error;
2145 }
2146
/* Parses 'line', which is expected to consist of 32 hex digits of IPv6
 * address, four hex fields that are skipped, and an interface name of at most
 * 16 characters.  On a full match, stores the 16 address bytes in '*in6' and
 * the name in 'ifname' and returns true.  Otherwise returns false; in that
 * case '*in6' and 'ifname' may have been partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
#define X8 "%2"SCNx8
    uint8_t *octet = in6->s6_addr;
    int n_assigned;

    n_assigned = sscanf(line,
                        " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                        "%*x %*x %*x %*x %16s\n",
                        octet + 0, octet + 1, octet + 2, octet + 3,
                        octet + 4, octet + 5, octet + 6, octet + 7,
                        octet + 8, octet + 9, octet + 10, octet + 11,
                        octet + 12, octet + 13, octet + 14, octet + 15,
                        ifname);

    /* 16 address bytes plus the interface name. */
    return n_assigned == 17;
}
2162
2163 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2164 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2165 static int
2166 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2167 {
2168 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2169 if (!(netdev->cache_valid & VALID_IN6)) {
2170 FILE *file;
2171 char line[128];
2172
2173 netdev->in6 = in6addr_any;
2174
2175 file = fopen("/proc/net/if_inet6", "r");
2176 if (file != NULL) {
2177 const char *name = netdev_get_name(netdev_);
2178 while (fgets(line, sizeof line, file)) {
2179 struct in6_addr in6_tmp;
2180 char ifname[16 + 1];
2181 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2182 && !strcmp(name, ifname))
2183 {
2184 netdev->in6 = in6_tmp;
2185 break;
2186 }
2187 }
2188 fclose(file);
2189 }
2190 netdev->cache_valid |= VALID_IN6;
2191 }
2192 *in6 = netdev->in6;
2193 return 0;
2194 }
2195
/* Fills '*sa' with an AF_INET socket address holding 'addr' and port 0.
 * Every byte of '*sa' not covered by the IPv4 address structure is zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2208
2209 static int
2210 do_set_addr(struct netdev *netdev,
2211 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2212 {
2213 struct ifreq ifr;
2214 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2215 make_in4_sockaddr(&ifr.ifr_addr, addr);
2216
2217 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2218 ioctl_name);
2219 }
2220
2221 /* Adds 'router' as a default IP gateway. */
2222 static int
2223 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2224 {
2225 struct in_addr any = { INADDR_ANY };
2226 struct rtentry rt;
2227 int error;
2228
2229 memset(&rt, 0, sizeof rt);
2230 make_in4_sockaddr(&rt.rt_dst, any);
2231 make_in4_sockaddr(&rt.rt_gateway, router);
2232 make_in4_sockaddr(&rt.rt_genmask, any);
2233 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2234 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2235 if (error) {
2236 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2237 }
2238 return error;
2239 }
2240
2241 static int
2242 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2243 char **netdev_name)
2244 {
2245 static const char fn[] = "/proc/net/route";
2246 FILE *stream;
2247 char line[256];
2248 int ln;
2249
2250 *netdev_name = NULL;
2251 stream = fopen(fn, "r");
2252 if (stream == NULL) {
2253 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2254 return errno;
2255 }
2256
2257 ln = 0;
2258 while (fgets(line, sizeof line, stream)) {
2259 if (++ln >= 2) {
2260 char iface[17];
2261 ovs_be32 dest, gateway, mask;
2262 int refcnt, metric, mtu;
2263 unsigned int flags, use, window, irtt;
2264
2265 if (sscanf(line,
2266 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2267 " %d %u %u\n",
2268 iface, &dest, &gateway, &flags, &refcnt,
2269 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2270
2271 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2272 fn, ln, line);
2273 continue;
2274 }
2275 if (!(flags & RTF_UP)) {
2276 /* Skip routes that aren't up. */
2277 continue;
2278 }
2279
2280 /* The output of 'dest', 'mask', and 'gateway' were given in
2281 * network byte order, so we don't need need any endian
2282 * conversions here. */
2283 if ((dest & mask) == (host->s_addr & mask)) {
2284 if (!gateway) {
2285 /* The host is directly reachable. */
2286 next_hop->s_addr = 0;
2287 } else {
2288 /* To reach the host, we must go through a gateway. */
2289 next_hop->s_addr = gateway;
2290 }
2291 *netdev_name = xstrdup(iface);
2292 fclose(stream);
2293 return 0;
2294 }
2295 }
2296 }
2297
2298 fclose(stream);
2299 return ENXIO;
2300 }
2301
2302 static int
2303 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2304 {
2305 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2306 int error = 0;
2307
2308 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2309 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2310
2311 COVERAGE_INC(netdev_get_ethtool);
2312 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2313 error = netdev_linux_do_ethtool(netdev->up.name,
2314 cmd,
2315 ETHTOOL_GDRVINFO,
2316 "ETHTOOL_GDRVINFO");
2317 if (!error) {
2318 netdev->cache_valid |= VALID_DRVINFO;
2319 }
2320 }
2321
2322 if (!error) {
2323 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2324 smap_add(smap, "driver_version", netdev->drvinfo.version);
2325 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2326 }
2327 return error;
2328 }
2329
2330 static int
2331 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2332 struct smap *smap)
2333 {
2334 smap_add(smap, "driver_name", "openvswitch");
2335 return 0;
2336 }
2337
2338 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2339 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2340 * returns 0. Otherwise, it returns a positive errno value; in particular,
2341 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2342 static int
2343 netdev_linux_arp_lookup(const struct netdev *netdev,
2344 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2345 {
2346 struct arpreq r;
2347 struct sockaddr_in sin;
2348 int retval;
2349
2350 memset(&r, 0, sizeof r);
2351 memset(&sin, 0, sizeof sin);
2352 sin.sin_family = AF_INET;
2353 sin.sin_addr.s_addr = ip;
2354 sin.sin_port = 0;
2355 memcpy(&r.arp_pa, &sin, sizeof sin);
2356 r.arp_ha.sa_family = ARPHRD_ETHER;
2357 r.arp_flags = 0;
2358 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2359 COVERAGE_INC(netdev_arp_lookup);
2360 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2361 if (!retval) {
2362 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2363 } else if (retval != ENXIO) {
2364 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2365 netdev_get_name(netdev), IP_ARGS(ip),
2366 ovs_strerror(retval));
2367 }
2368 return retval;
2369 }
2370
2371 static int
2372 nd_to_iff_flags(enum netdev_flags nd)
2373 {
2374 int iff = 0;
2375 if (nd & NETDEV_UP) {
2376 iff |= IFF_UP;
2377 }
2378 if (nd & NETDEV_PROMISC) {
2379 iff |= IFF_PROMISC;
2380 }
2381 return iff;
2382 }
2383
2384 static int
2385 iff_to_nd_flags(int iff)
2386 {
2387 enum netdev_flags nd = 0;
2388 if (iff & IFF_UP) {
2389 nd |= NETDEV_UP;
2390 }
2391 if (iff & IFF_PROMISC) {
2392 nd |= NETDEV_PROMISC;
2393 }
2394 return nd;
2395 }
2396
2397 static int
2398 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2399 enum netdev_flags on, enum netdev_flags *old_flagsp)
2400 {
2401 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2402 int old_flags, new_flags;
2403 int error = 0;
2404
2405 old_flags = netdev->ifi_flags;
2406 *old_flagsp = iff_to_nd_flags(old_flags);
2407 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2408 if (new_flags != old_flags) {
2409 error = set_flags(netdev_get_name(netdev_), new_flags);
2410 get_flags(netdev_, &netdev->ifi_flags);
2411 }
2412 return error;
2413 }
2414
2415 static unsigned int
2416 netdev_linux_change_seq(const struct netdev *netdev)
2417 {
2418 return netdev_linux_cast(netdev)->change_seq;
2419 }
2420
/* Expands to a positional initializer for a 'struct netdev_class'.  All
 * hooks are shared between the classes defined with this macro except the
 * ones passed as arguments: the class NAME and the CREATE, GET_STATS,
 * SET_STATS, GET_FEATURES and GET_STATUS functions.  Entry order is
 * significant because the initializer is positional. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS,  \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
                                                                \
    /* Provider-wide hooks. */                                  \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    /* Device lifetime and configuration. */                    \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
                                                                \
    /* Receive and transmit. */                                 \
    netdev_linux_rx_open,                                       \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    /* Link-level properties and statistics. */                 \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    SET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    /* Policing, QoS and queues. */                             \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    /* IP addressing, routing, status and ARP. */               \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_change_seq                                     \
}
2479
2480 const struct netdev_class netdev_linux_class =
2481 NETDEV_LINUX_CLASS(
2482 "system",
2483 netdev_linux_create,
2484 netdev_linux_get_stats,
2485 NULL, /* set_stats */
2486 netdev_linux_get_features,
2487 netdev_linux_get_status);
2488
2489 const struct netdev_class netdev_tap_class =
2490 NETDEV_LINUX_CLASS(
2491 "tap",
2492 netdev_linux_create_tap,
2493 netdev_tap_get_stats,
2494 NULL, /* set_stats */
2495 netdev_linux_get_features,
2496 netdev_linux_get_status);
2497
2498 const struct netdev_class netdev_internal_class =
2499 NETDEV_LINUX_CLASS(
2500 "internal",
2501 netdev_linux_create,
2502 netdev_internal_get_stats,
2503 netdev_internal_set_stats,
2504 NULL, /* get_features */
2505 netdev_internal_get_status);
2506
2507 static const struct netdev_rx_class netdev_rx_linux_class = {
2508 netdev_rx_linux_destroy,
2509 netdev_rx_linux_recv,
2510 netdev_rx_linux_wait,
2511 netdev_rx_linux_drain,
2512 };
2513 \f
2514 /* HTB traffic control class. */
2515
2516 #define HTB_N_QUEUES 0xf000
2517
2518 struct htb {
2519 struct tc tc;
2520 unsigned int max_rate; /* In bytes/s. */
2521 };
2522
2523 struct htb_class {
2524 struct tc_queue tc_queue;
2525 unsigned int min_rate; /* In bytes/s. */
2526 unsigned int max_rate; /* In bytes/s. */
2527 unsigned int burst; /* In bytes. */
2528 unsigned int priority; /* Lower values are higher priorities. */
2529 };
2530
2531 static struct htb *
2532 htb_get__(const struct netdev *netdev_)
2533 {
2534 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2535 return CONTAINER_OF(netdev->tc, struct htb, tc);
2536 }
2537
2538 static void
2539 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2540 {
2541 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2542 struct htb *htb;
2543
2544 htb = xmalloc(sizeof *htb);
2545 tc_init(&htb->tc, &tc_ops_htb);
2546 htb->max_rate = max_rate;
2547
2548 netdev->tc = &htb->tc;
2549 }
2550
2551 /* Create an HTB qdisc.
2552 *
2553 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2554 static int
2555 htb_setup_qdisc__(struct netdev *netdev)
2556 {
2557 size_t opt_offset;
2558 struct tc_htb_glob opt;
2559 struct ofpbuf request;
2560 struct tcmsg *tcmsg;
2561
2562 tc_del_qdisc(netdev);
2563
2564 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2565 NLM_F_EXCL | NLM_F_CREATE, &request);
2566 if (!tcmsg) {
2567 return ENODEV;
2568 }
2569 tcmsg->tcm_handle = tc_make_handle(1, 0);
2570 tcmsg->tcm_parent = TC_H_ROOT;
2571
2572 nl_msg_put_string(&request, TCA_KIND, "htb");
2573
2574 memset(&opt, 0, sizeof opt);
2575 opt.rate2quantum = 10;
2576 opt.version = 3;
2577 opt.defcls = 1;
2578
2579 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2580 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2581 nl_msg_end_nested(&request, opt_offset);
2582
2583 return tc_transact(&request, NULL);
2584 }
2585
2586 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2587 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2588 static int
2589 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2590 unsigned int parent, struct htb_class *class)
2591 {
2592 size_t opt_offset;
2593 struct tc_htb_opt opt;
2594 struct ofpbuf request;
2595 struct tcmsg *tcmsg;
2596 int error;
2597 int mtu;
2598
2599 error = netdev_get_mtu(netdev, &mtu);
2600 if (error) {
2601 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2602 netdev_get_name(netdev));
2603 return error;
2604 }
2605
2606 memset(&opt, 0, sizeof opt);
2607 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2608 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2609 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2610 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2611 opt.prio = class->priority;
2612
2613 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2614 if (!tcmsg) {
2615 return ENODEV;
2616 }
2617 tcmsg->tcm_handle = handle;
2618 tcmsg->tcm_parent = parent;
2619
2620 nl_msg_put_string(&request, TCA_KIND, "htb");
2621 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2622 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2623 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2624 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2625 nl_msg_end_nested(&request, opt_offset);
2626
2627 error = tc_transact(&request, NULL);
2628 if (error) {
2629 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2630 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2631 netdev_get_name(netdev),
2632 tc_get_major(handle), tc_get_minor(handle),
2633 tc_get_major(parent), tc_get_minor(parent),
2634 class->min_rate, class->max_rate,
2635 class->burst, class->priority, ovs_strerror(error));
2636 }
2637 return error;
2638 }
2639
2640 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2641 * description of them into 'details'. The description complies with the
2642 * specification given in the vswitch database documentation for linux-htb
2643 * queue details. */
2644 static int
2645 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2646 {
2647 static const struct nl_policy tca_htb_policy[] = {
2648 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2649 .min_len = sizeof(struct tc_htb_opt) },
2650 };
2651
2652 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2653 const struct tc_htb_opt *htb;
2654
2655 if (!nl_parse_nested(nl_options, tca_htb_policy,
2656 attrs, ARRAY_SIZE(tca_htb_policy))) {
2657 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2658 return EPROTO;
2659 }
2660
2661 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2662 class->min_rate = htb->rate.rate;
2663 class->max_rate = htb->ceil.rate;
2664 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2665 class->priority = htb->prio;
2666 return 0;
2667 }
2668
2669 static int
2670 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2671 struct htb_class *options,
2672 struct netdev_queue_stats *stats)
2673 {
2674 struct nlattr *nl_options;
2675 unsigned int handle;
2676 int error;
2677
2678 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2679 if (!error && queue_id) {
2680 unsigned int major = tc_get_major(handle);
2681 unsigned int minor = tc_get_minor(handle);
2682 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2683 *queue_id = minor - 1;
2684 } else {
2685 error = EPROTO;
2686 }
2687 }
2688 if (!error && options) {
2689 error = htb_parse_tca_options__(nl_options, options);
2690 }
2691 return error;
2692 }
2693
2694 static void
2695 htb_parse_qdisc_details__(struct netdev *netdev,
2696 const struct smap *details, struct htb_class *hc)
2697 {
2698 const char *max_rate_s;
2699
2700 max_rate_s = smap_get(details, "max-rate");
2701 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2702 if (!hc->max_rate) {
2703 enum netdev_features current;
2704
2705 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2706 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2707 }
2708 hc->min_rate = hc->max_rate;
2709 hc->burst = 0;
2710 hc->priority = 0;
2711 }
2712
2713 static int
2714 htb_parse_class_details__(struct netdev *netdev,
2715 const struct smap *details, struct htb_class *hc)
2716 {
2717 const struct htb *htb = htb_get__(netdev);
2718 const char *min_rate_s = smap_get(details, "min-rate");
2719 const char *max_rate_s = smap_get(details, "max-rate");
2720 const char *burst_s = smap_get(details, "burst");
2721 const char *priority_s = smap_get(details, "priority");
2722 int mtu, error;
2723
2724 error = netdev_get_mtu(netdev, &mtu);
2725 if (error) {
2726 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2727 netdev_get_name(netdev));
2728 return error;
2729 }
2730
2731 /* HTB requires at least an mtu sized min-rate to send any traffic even
2732 * on uncongested links. */
2733 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2734 hc->min_rate = MAX(hc->min_rate, mtu);
2735 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2736
2737 /* max-rate */
2738 hc->max_rate = (max_rate_s
2739 ? strtoull(max_rate_s, NULL, 10) / 8
2740 : htb->max_rate);
2741 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2742 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2743
2744 /* burst
2745 *
2746 * According to hints in the documentation that I've read, it is important
2747 * that 'burst' be at least as big as the largest frame that might be
2748 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2749 * but having it a bit too small is a problem. Since netdev_get_mtu()
2750 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2751 * the MTU. We actually add 64, instead of 14, as a guard against
2752 * additional headers get tacked on somewhere that we're not aware of. */
2753 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2754 hc->burst = MAX(hc->burst, mtu + 64);
2755
2756 /* priority */
2757 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2758
2759 return 0;
2760 }
2761
/* Queries class 'handle' (with parent 'parent') on 'netdev' and parses the
 * reply into 'options' and 'stats', either of which may be null as far as
 * htb_parse_tcmsg__() allows.  Returns 0 on success, otherwise the error from
 * the query or from parsing. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2777
2778 static int
2779 htb_tc_install(struct netdev *netdev, const struct smap *details)
2780 {
2781 int error;
2782
2783 error = htb_setup_qdisc__(netdev);
2784 if (!error) {
2785 struct htb_class hc;
2786
2787 htb_parse_qdisc_details__(netdev, details, &hc);
2788 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2789 tc_make_handle(1, 0), &hc);
2790 if (!error) {
2791 htb_install__(netdev, hc.max_rate);
2792 }
2793 }
2794 return error;
2795 }
2796
2797 static struct htb_class *
2798 htb_class_cast__(const struct tc_queue *queue)
2799 {
2800 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2801 }
2802
2803 static void
2804 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2805 const struct htb_class *hc)
2806 {
2807 struct htb *htb = htb_get__(netdev);
2808 size_t hash = hash_int(queue_id, 0);
2809 struct tc_queue *queue;
2810 struct htb_class *hcp;
2811
2812 queue = tc_find_queue__(netdev, queue_id, hash);
2813 if (queue) {
2814 hcp = htb_class_cast__(queue);
2815 } else {
2816 hcp = xmalloc(sizeof *hcp);
2817 queue = &hcp->tc_queue;
2818 queue->queue_id = queue_id;
2819 queue->created = time_msec();
2820 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2821 }
2822
2823 hcp->min_rate = hc->min_rate;
2824 hcp->max_rate = hc->max_rate;
2825 hcp->burst = hc->burst;
2826 hcp->priority = hc->priority;
2827 }
2828
2829 static int
2830 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2831 {
2832 struct ofpbuf msg;
2833 struct nl_dump dump;
2834 struct htb_class hc;
2835
2836 /* Get qdisc options. */
2837 hc.max_rate = 0;
2838 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2839 htb_install__(netdev, hc.max_rate);
2840
2841 /* Get queues. */
2842 if (!start_queue_dump(netdev, &dump)) {
2843 return ENODEV;
2844 }
2845 while (nl_dump_next(&dump, &msg)) {
2846 unsigned int queue_id;
2847
2848 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2849 htb_update_queue__(netdev, queue_id, &hc);
2850 }
2851 }
2852 nl_dump_done(&dump);
2853
2854 return 0;
2855 }
2856
2857 static void
2858 htb_tc_destroy(struct tc *tc)
2859 {
2860 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2861 struct htb_class *hc, *next;
2862
2863 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2864 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2865 free(hc);
2866 }
2867 tc_destroy(tc);
2868 free(htb);
2869 }
2870
2871 static int
2872 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2873 {
2874 const struct htb *htb = htb_get__(netdev);
2875 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
2876 return 0;
2877 }
2878
2879 static int
2880 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2881 {
2882 struct htb_class hc;
2883 int error;
2884
2885 htb_parse_qdisc_details__(netdev, details, &hc);
2886 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2887 tc_make_handle(1, 0), &hc);
2888 if (!error) {
2889 htb_get__(netdev)->max_rate = hc.max_rate;
2890 }
2891 return error;
2892 }
2893
2894 static int
2895 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2896 const struct tc_queue *queue, struct smap *details)
2897 {
2898 const struct htb_class *hc = htb_class_cast__(queue);
2899
2900 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2901 if (hc->min_rate != hc->max_rate) {
2902 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2903 }
2904 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2905 if (hc->priority) {
2906 smap_add_format(details, "priority", "%u", hc->priority);
2907 }
2908 return 0;
2909 }
2910
2911 static int
2912 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2913 const struct smap *details)
2914 {
2915 struct htb_class hc;
2916 int error;
2917
2918 error = htb_parse_class_details__(netdev, details, &hc);
2919 if (error) {
2920 return error;
2921 }
2922
2923 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2924 tc_make_handle(1, 0xfffe), &hc);
2925 if (error) {
2926 return error;
2927 }
2928
2929 htb_update_queue__(netdev, queue_id, &hc);
2930 return 0;
2931 }
2932
2933 static int
2934 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2935 {
2936 struct htb_class *hc = htb_class_cast__(queue);
2937 struct htb *htb = htb_get__(netdev);
2938 int error;
2939
2940 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2941 if (!error) {
2942 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2943 free(hc);
2944 }
2945 return error;
2946 }
2947
2948 static int
2949 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2950 struct netdev_queue_stats *stats)
2951 {
2952 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2953 tc_make_handle(1, 0xfffe), NULL, stats);
2954 }
2955
2956 static int
2957 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2958 const struct ofpbuf *nlmsg,
2959 netdev_dump_queue_stats_cb *cb, void *aux)
2960 {
2961 struct netdev_queue_stats stats;
2962 unsigned int handle, major, minor;
2963 int error;
2964
2965 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2966 if (error) {
2967 return error;
2968 }
2969
2970 major = tc_get_major(handle);
2971 minor = tc_get_minor(handle);
2972 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2973 (*cb)(minor - 1, &stats, aux);
2974 }
2975 return 0;
2976 }
2977
2978 static const struct tc_ops tc_ops_htb = {
2979 "htb", /* linux_name */
2980 "linux-htb", /* ovs_name */
2981 HTB_N_QUEUES, /* n_queues */
2982 htb_tc_install,
2983 htb_tc_load,
2984 htb_tc_destroy,
2985 htb_qdisc_get,
2986 htb_qdisc_set,
2987 htb_class_get,
2988 htb_class_set,
2989 htb_class_delete,
2990 htb_class_get_stats,
2991 htb_class_dump_stats
2992 };
2993 \f
2994 /* "linux-hfsc" traffic control class. */
2995
2996 #define HFSC_N_QUEUES 0xf000
2997
2998 struct hfsc {
2999 struct tc tc;
3000 uint32_t max_rate;
3001 };
3002
3003 struct hfsc_class {
3004 struct tc_queue tc_queue;
3005 uint32_t min_rate;
3006 uint32_t max_rate;
3007 };
3008
3009 static struct hfsc *
3010 hfsc_get__(const struct netdev *netdev_)
3011 {
3012 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3013 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3014 }
3015
3016 static struct hfsc_class *
3017 hfsc_class_cast__(const struct tc_queue *queue)
3018 {
3019 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3020 }
3021
3022 static void
3023 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3024 {
3025 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3026 struct hfsc *hfsc;
3027
3028 hfsc = xmalloc(sizeof *hfsc);
3029 tc_init(&hfsc->tc, &tc_ops_hfsc);
3030 hfsc->max_rate = max_rate;
3031 netdev->tc = &hfsc->tc;
3032 }
3033
3034 static void
3035 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3036 const struct hfsc_class *hc)
3037 {
3038 size_t hash;
3039 struct hfsc *hfsc;
3040 struct hfsc_class *hcp;
3041 struct tc_queue *queue;
3042
3043 hfsc = hfsc_get__(netdev);
3044 hash = hash_int(queue_id, 0);
3045
3046 queue = tc_find_queue__(netdev, queue_id, hash);
3047 if (queue) {
3048 hcp = hfsc_class_cast__(queue);
3049 } else {
3050 hcp = xmalloc(sizeof *hcp);
3051 queue = &hcp->tc_queue;
3052 queue->queue_id = queue_id;
3053 queue->created = time_msec();
3054 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3055 }
3056
3057 hcp->min_rate = hc->min_rate;
3058 hcp->max_rate = hc->max_rate;
3059 }
3060
3061 static int
3062 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3063 {
3064 const struct tc_service_curve *rsc, *fsc, *usc;
3065 static const struct nl_policy tca_hfsc_policy[] = {
3066 [TCA_HFSC_RSC] = {
3067 .type = NL_A_UNSPEC,
3068 .optional = false,
3069 .min_len = sizeof(struct tc_service_curve),
3070 },
3071 [TCA_HFSC_FSC] = {
3072 .type = NL_A_UNSPEC,
3073 .optional = false,
3074 .min_len = sizeof(struct tc_service_curve),
3075 },
3076 [TCA_HFSC_USC] = {
3077 .type = NL_A_UNSPEC,
3078 .optional = false,
3079 .min_len = sizeof(struct tc_service_curve),
3080 },
3081 };
3082 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3083
3084 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3085 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3086 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3087 return EPROTO;
3088 }
3089
3090 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3091 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3092 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3093
3094 if (rsc->m1 != 0 || rsc->d != 0 ||
3095 fsc->m1 != 0 || fsc->d != 0 ||
3096 usc->m1 != 0 || usc->d != 0) {
3097 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3098 "Non-linear service curves are not supported.");
3099 return EPROTO;
3100 }
3101
3102 if (rsc->m2 != fsc->m2) {
3103 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3104 "Real-time service curves are not supported ");
3105 return EPROTO;
3106 }
3107
3108 if (rsc->m2 > usc->m2) {
3109 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3110 "Min-rate service curve is greater than "
3111 "the max-rate service curve.");
3112 return EPROTO;
3113 }
3114
3115 class->min_rate = fsc->m2;
3116 class->max_rate = usc->m2;
3117 return 0;
3118 }
3119
3120 static int
3121 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3122 struct hfsc_class *options,
3123 struct netdev_queue_stats *stats)
3124 {
3125 int error;
3126 unsigned int handle;
3127 struct nlattr *nl_options;
3128
3129 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3130 if (error) {
3131 return error;
3132 }
3133
3134 if (queue_id) {
3135 unsigned int major, minor;
3136
3137 major = tc_get_major(handle);
3138 minor = tc_get_minor(handle);
3139 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3140 *queue_id = minor - 1;
3141 } else {
3142 return EPROTO;
3143 }
3144 }
3145
3146 if (options) {
3147 error = hfsc_parse_tca_options__(nl_options, options);
3148 }
3149
3150 return error;
3151 }
3152
/* Queries class 'handle' (with parent 'parent') on 'netdev' and parses the
 * reply into 'options' and 'stats', either of which may be null as far as
 * hfsc_parse_tcmsg__() allows.  Returns 0 on success, otherwise the error
 * from the query or from parsing. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3170
3171 static void
3172 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3173 struct hfsc_class *class)
3174 {
3175 uint32_t max_rate;
3176 const char *max_rate_s;
3177
3178 max_rate_s = smap_get(details, "max-rate");
3179 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3180
3181 if (!max_rate) {
3182 enum netdev_features current;
3183
3184 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3185 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3186 }
3187
3188 class->min_rate = max_rate;
3189 class->max_rate = max_rate;
3190 }
3191
3192 static int
3193 hfsc_parse_class_details__(struct netdev *netdev,
3194 const struct smap *details,
3195 struct hfsc_class * class)
3196 {
3197 const struct hfsc *hfsc;
3198 uint32_t min_rate, max_rate;
3199 const char *min_rate_s, *max_rate_s;
3200
3201 hfsc = hfsc_get__(netdev);
3202 min_rate_s = smap_get(details, "min-rate");
3203 max_rate_s = smap_get(details, "max-rate");
3204
3205 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3206 min_rate = MAX(min_rate, 1);
3207 min_rate = MIN(min_rate, hfsc->max_rate);
3208
3209 max_rate = (max_rate_s
3210 ? strtoull(max_rate_s, NULL, 10) / 8
3211 : hfsc->max_rate);
3212 max_rate = MAX(max_rate, min_rate);
3213 max_rate = MIN(max_rate, hfsc->max_rate);
3214
3215 class->min_rate = min_rate;
3216 class->max_rate = max_rate;
3217
3218 return 0;
3219 }
3220
3221 /* Create an HFSC qdisc.
3222 *
3223 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3224 static int
3225 hfsc_setup_qdisc__(struct netdev * netdev)
3226 {
3227 struct tcmsg *tcmsg;
3228 struct ofpbuf request;
3229 struct tc_hfsc_qopt opt;
3230
3231 tc_del_qdisc(netdev);
3232
3233 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3234 NLM_F_EXCL | NLM_F_CREATE, &request);
3235
3236 if (!tcmsg) {
3237 return ENODEV;
3238 }
3239
3240 tcmsg->tcm_handle = tc_make_handle(1, 0);
3241 tcmsg->tcm_parent = TC_H_ROOT;
3242
3243 memset(&opt, 0, sizeof opt);
3244 opt.defcls = 1;
3245
3246 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3247 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3248
3249 return tc_transact(&request, NULL);
3250 }
3251
3252 /* Create an HFSC class.
3253 *
3254 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3255 * sc rate <min_rate> ul rate <max_rate>" */
3256 static int
3257 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3258 unsigned int parent, struct hfsc_class *class)
3259 {
3260 int error;
3261 size_t opt_offset;
3262 struct tcmsg *tcmsg;
3263 struct ofpbuf request;
3264 struct tc_service_curve min, max;
3265
3266 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3267
3268 if (!tcmsg) {
3269 return ENODEV;
3270 }
3271
3272 tcmsg->tcm_handle = handle;
3273 tcmsg->tcm_parent = parent;
3274
3275 min.m1 = 0;
3276 min.d = 0;
3277 min.m2 = class->min_rate;
3278
3279 max.m1 = 0;
3280 max.d = 0;
3281 max.m2 = class->max_rate;
3282
3283 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3284 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3285 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3286 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3287 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3288 nl_msg_end_nested(&request, opt_offset);
3289
3290 error = tc_transact(&request, NULL);
3291 if (error) {
3292 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3293 "min-rate %ubps, max-rate %ubps (%s)",
3294 netdev_get_name(netdev),
3295 tc_get_major(handle), tc_get_minor(handle),
3296 tc_get_major(parent), tc_get_minor(parent),
3297 class->min_rate, class->max_rate, ovs_strerror(error));
3298 }
3299
3300 return error;
3301 }
3302
3303 static int
3304 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3305 {
3306 int error;
3307 struct hfsc_class class;
3308
3309 error = hfsc_setup_qdisc__(netdev);
3310
3311 if (error) {
3312 return error;
3313 }
3314
3315 hfsc_parse_qdisc_details__(netdev, details, &class);
3316 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3317 tc_make_handle(1, 0), &class);
3318
3319 if (error) {
3320 return error;
3321 }
3322
3323 hfsc_install__(netdev, class.max_rate);
3324 return 0;
3325 }
3326
3327 static int
3328 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3329 {
3330 struct ofpbuf msg;
3331 struct nl_dump dump;
3332 struct hfsc_class hc;
3333
3334 hc.max_rate = 0;
3335 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3336 hfsc_install__(netdev, hc.max_rate);
3337
3338 if (!start_queue_dump(netdev, &dump)) {
3339 return ENODEV;
3340 }
3341
3342 while (nl_dump_next(&dump, &msg)) {
3343 unsigned int queue_id;
3344
3345 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3346 hfsc_update_queue__(netdev, queue_id, &hc);
3347 }
3348 }
3349
3350 nl_dump_done(&dump);
3351 return 0;
3352 }
3353
3354 static void
3355 hfsc_tc_destroy(struct tc *tc)
3356 {
3357 struct hfsc *hfsc;
3358 struct hfsc_class *hc, *next;
3359
3360 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3361
3362 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3363 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3364 free(hc);
3365 }
3366
3367 tc_destroy(tc);
3368 free(hfsc);
3369 }
3370
3371 static int
3372 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3373 {
3374 const struct hfsc *hfsc;
3375 hfsc = hfsc_get__(netdev);
3376 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3377 return 0;
3378 }
3379
3380 static int
3381 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3382 {
3383 int error;
3384 struct hfsc_class class;
3385
3386 hfsc_parse_qdisc_details__(netdev, details, &class);
3387 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3388 tc_make_handle(1, 0), &class);
3389
3390 if (!error) {
3391 hfsc_get__(netdev)->max_rate = class.max_rate;
3392 }
3393
3394 return error;
3395 }
3396
3397 static int
3398 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3399 const struct tc_queue *queue, struct smap *details)
3400 {
3401 const struct hfsc_class *hc;
3402
3403 hc = hfsc_class_cast__(queue);
3404 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3405 if (hc->min_rate != hc->max_rate) {
3406 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3407 }
3408 return 0;
3409 }
3410
3411 static int
3412 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3413 const struct smap *details)
3414 {
3415 int error;
3416 struct hfsc_class class;
3417
3418 error = hfsc_parse_class_details__(netdev, details, &class);
3419 if (error) {
3420 return error;
3421 }
3422
3423 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3424 tc_make_handle(1, 0xfffe), &class);
3425 if (error) {
3426 return error;
3427 }
3428
3429 hfsc_update_queue__(netdev, queue_id, &class);
3430 return 0;
3431 }
3432
3433 static int
3434 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3435 {
3436 int error;
3437 struct hfsc *hfsc;
3438 struct hfsc_class *hc;
3439
3440 hc = hfsc_class_cast__(queue);
3441 hfsc = hfsc_get__(netdev);
3442
3443 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3444 if (!error) {
3445 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3446 free(hc);
3447 }
3448 return error;
3449 }
3450
3451 static int
3452 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3453 struct netdev_queue_stats *stats)
3454 {
3455 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3456 tc_make_handle(1, 0xfffe), NULL, stats);
3457 }
3458
3459 static int
3460 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3461 const struct ofpbuf *nlmsg,
3462 netdev_dump_queue_stats_cb *cb, void *aux)
3463 {
3464 struct netdev_queue_stats stats;
3465 unsigned int handle, major, minor;
3466 int error;
3467
3468 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3469 if (error) {
3470 return error;
3471 }
3472
3473 major = tc_get_major(handle);
3474 minor = tc_get_minor(handle);
3475 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3476 (*cb)(minor - 1, &stats, aux);
3477 }
3478 return 0;
3479 }
3480
3481 static const struct tc_ops tc_ops_hfsc = {
3482 "hfsc", /* linux_name */
3483 "linux-hfsc", /* ovs_name */
3484 HFSC_N_QUEUES, /* n_queues */
3485 hfsc_tc_install, /* tc_install */
3486 hfsc_tc_load, /* tc_load */
3487 hfsc_tc_destroy, /* tc_destroy */
3488 hfsc_qdisc_get, /* qdisc_get */
3489 hfsc_qdisc_set, /* qdisc_set */
3490 hfsc_class_get, /* class_get */
3491 hfsc_class_set, /* class_set */
3492 hfsc_class_delete, /* class_delete */
3493 hfsc_class_get_stats, /* class_get_stats */
3494 hfsc_class_dump_stats /* class_dump_stats */
3495 };
3496 \f
3497 /* "linux-default" traffic control class.
3498 *
3499 * This class represents the default, unnamed Linux qdisc. It corresponds to
3500 * the "" (empty string) QoS type in the OVS database. */
3501
3502 static void
3503 default_install__(struct netdev *netdev_)
3504 {
3505 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3506 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3507
3508 /* Nothing but a tc class implementation is allowed to write to a tc. This
3509 * class never does that, so we can legitimately use a const tc object. */
3510 netdev->tc = CONST_CAST(struct tc *, &tc);
3511 }
3512
3513 static int
3514 default_tc_install(struct netdev *netdev,
3515 const struct smap *details OVS_UNUSED)
3516 {
3517 default_install__(netdev);
3518 return 0;
3519 }
3520
3521 static int
3522 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3523 {
3524 default_install__(netdev);
3525 return 0;
3526 }
3527
3528 static const struct tc_ops tc_ops_default = {
3529 NULL, /* linux_name */
3530 "", /* ovs_name */
3531 0, /* n_queues */
3532 default_tc_install,
3533 default_tc_load,
3534 NULL, /* tc_destroy */
3535 NULL, /* qdisc_get */
3536 NULL, /* qdisc_set */
3537 NULL, /* class_get */
3538 NULL, /* class_set */
3539 NULL, /* class_delete */
3540 NULL, /* class_get_stats */
3541 NULL /* class_dump_stats */
3542 };
3543 \f
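/* "linux-other" traffic control class.
 *
 * This class stands in for a qdisc that could not be identified: one whose
 * kind is not recognized, or whose query or parse failed.  It can be loaded
 * but not installed or configured. */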
3547
3548 static int
3549 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3550 {
3551 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3552 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3553
3554 /* Nothing but a tc class implementation is allowed to write to a tc. This
3555 * class never does that, so we can legitimately use a const tc object. */
3556 netdev->tc = CONST_CAST(struct tc *, &tc);
3557 return 0;
3558 }
3559
3560 static const struct tc_ops tc_ops_other = {
3561 NULL, /* linux_name */
3562 "linux-other", /* ovs_name */
3563 0, /* n_queues */
3564 NULL, /* tc_install */
3565 other_tc_load,
3566 NULL, /* tc_destroy */
3567 NULL, /* qdisc_get */
3568 NULL, /* qdisc_set */
3569 NULL, /* class_get */
3570 NULL, /* class_set */
3571 NULL, /* class_delete */
3572 NULL, /* class_get_stats */
3573 NULL /* class_dump_stats */
3574 };
3575 \f
3576 /* Traffic control. */
3577
/* How many kernel "tc" ticks make up one second. */
static double ticks_per_s;

/* How many kernel "jiffies" make up one second.  Buffer sizes are computed
 * from this, because a kernel qdisc generally has to be able to buffer one
 * jiffy's worth of data.
 *
 * The value falls into one of two categories:
 *
 *    - A small number, roughly 100 to 1024: the kernel's real timer tick
 *      rate.  The qdisc really must be able to buffer that much data.
 *
 *    - An absurdly large number: the kernel has finely granular timers, so
 *      no extra buffer room needs to be fudged in.  (This case needs no
 *      special code.  'buffer_hz' is used as a divisor, so practically any
 *      number divided by it comes out as 0, and the small integer results
 *      that very large dividends produce have no real effect anyhow.)
 */
static unsigned int buffer_hz;
3599
/* Combines 'major' and 'minor' into the tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    const unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
3606
/* Extracts the major number from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    const unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3613
/* Extracts the minor number from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    const unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
3620
3621 static struct tcmsg *
3622 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3623 struct ofpbuf *request)
3624 {
3625 struct tcmsg *tcmsg;
3626 int ifindex;
3627 int error;
3628
3629 error = get_ifindex(netdev, &ifindex);
3630 if (error) {
3631 return NULL;
3632 }
3633
3634 ofpbuf_init(request, 512);
3635 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3636 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3637 tcmsg->tcm_family = AF_UNSPEC;
3638 tcmsg->tcm_ifindex = ifindex;
3639 /* Caller should fill in tcmsg->tcm_handle. */
3640 /* Caller should fill in tcmsg->tcm_parent. */
3641
3642 return tcmsg;
3643 }
3644
3645 static int
3646 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3647 {
3648 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3649 ofpbuf_uninit(request);
3650 return error;
3651 }
3652
3653 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3654 * policing configuration.
3655 *
3656 * This function is equivalent to running the following when 'add' is true:
3657 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3658 *
3659 * This function is equivalent to running the following when 'add' is false:
3660 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3661 *
3662 * The configuration and stats may be seen with the following command:
3663 * /sbin/tc -s qdisc show dev <devname>
3664 *
3665 * Returns 0 if successful, otherwise a positive errno value.
3666 */
3667 static int
3668 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3669 {
3670 struct ofpbuf request;
3671 struct tcmsg *tcmsg;
3672 int error;
3673 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3674 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3675
3676 tcmsg = tc_make_request(netdev, type, flags, &request);
3677 if (!tcmsg) {
3678 return ENODEV;
3679 }
3680 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3681 tcmsg->tcm_parent = TC_H_INGRESS;
3682 nl_msg_put_string(&request, TCA_KIND, "ingress");
3683 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3684
3685 error = tc_transact(&request, NULL);
3686 if (error) {
3687 /* If we're deleting the qdisc, don't worry about some of the
3688 * error conditions. */
3689 if (!add && (error == ENOENT || error == EINVAL)) {
3690 return 0;
3691 }
3692 return error;
3693 }
3694
3695 return 0;
3696 }
3697
3698 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3699 * of 'kbits_burst'.
3700 *
3701 * This function is equivalent to running:
3702 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3703 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3704 * mtu 65535 drop
3705 *
3706 * The configuration and stats may be seen with the following command:
3707 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3708 *
3709 * Returns 0 if successful, otherwise a positive errno value.
3710 */
3711 static int
3712 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3713 {
3714 struct tc_police tc_police;
3715 struct ofpbuf request;
3716 struct tcmsg *tcmsg;
3717 size_t basic_offset;
3718 size_t police_offset;
3719 int error;
3720 int mtu = 65535;
3721
3722 memset(&tc_police, 0, sizeof tc_police);
3723 tc_police.action = TC_POLICE_SHOT;
3724 tc_police.mtu = mtu;
3725 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3726 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3727 kbits_burst * 1024);
3728
3729 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3730 NLM_F_EXCL | NLM_F_CREATE, &request);
3731 if (!tcmsg) {
3732 return ENODEV;
3733 }
3734 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3735 tcmsg->tcm_info = tc_make_handle(49,
3736 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3737
3738 nl_msg_put_string(&request, TCA_KIND, "basic");
3739 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3740 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3741 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3742 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3743 nl_msg_end_nested(&request, police_offset);
3744 nl_msg_end_nested(&request, basic_offset);
3745
3746 error = tc_transact(&request, NULL);
3747 if (error) {
3748 return error;
3749 }
3750
3751 return 0;
3752 }
3753
3754 static void
3755 read_psched(void)
3756 {
3757 /* The values in psched are not individually very meaningful, but they are
3758 * important. The tables below show some values seen in the wild.
3759 *
3760 * Some notes:
3761 *
3762 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3763 * (Before that, there are hints that it was 1000000000.)
3764 *
3765 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3766 * above.
3767 *
3768 * /proc/net/psched
3769 * -----------------------------------
3770 * [1] 000c8000 000f4240 000f4240 00000064
3771 * [2] 000003e8 00000400 000f4240 3b9aca00
3772 * [3] 000003e8 00000400 000f4240 3b9aca00
3773 * [4] 000003e8 00000400 000f4240 00000064
3774 * [5] 000003e8 00000040 000f4240 3b9aca00
3775 * [6] 000003e8 00000040 000f4240 000000f9
3776 *
3777 * a b c d ticks_per_s buffer_hz
3778 * ------- --------- ---------- ------------- ----------- -------------
3779 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3780 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3781 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3782 * [4] 1,000 1,024 1,000,000 100 976,562 100
3783 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3784 * [6] 1,000 64 1,000,000 249 15,625,000 249
3785 *
3786 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3787 * [2] 2.6.26-1-686-bigmem from Debian lenny
3788 * [3] 2.6.26-2-sparc64 from Debian lenny
3789 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3790 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3791 * [6] 2.6.34 from kernel.org on KVM
3792 */
3793 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3794 static const char fn[] = "/proc/net/psched";
3795 unsigned int a, b, c, d;
3796 FILE *stream;
3797
3798 if (!ovsthread_once_start(&once)) {
3799 return;
3800 }
3801
3802 ticks_per_s = 1.0;
3803 buffer_hz = 100;
3804
3805 stream = fopen(fn, "r");
3806 if (!stream) {
3807 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3808 goto exit;
3809 }
3810
3811 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3812 VLOG_WARN("%s: read failed", fn);
3813 fclose(stream);
3814 goto exit;
3815 }
3816 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3817 fclose(stream);
3818
3819 if (!a || !c) {
3820 VLOG_WARN("%s: invalid scheduler parameters", fn);
3821 goto exit;
3822 }
3823
3824 ticks_per_s = (double) a * c / b;
3825 if (c == 1000000) {
3826 buffer_hz = d;
3827 } else {
3828 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3829 fn, a, b, c, d);
3830 }
3831 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3832
3833 exit:
3834 ovsthread_once_done(&once);
3835 }
3836
3837 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3838 * rate of 'rate' bytes per second. */
3839 static unsigned int
3840 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3841 {
3842 read_psched();
3843 return (rate * ticks) / ticks_per_s;
3844 }
3845
3846 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3847 * rate of 'rate' bytes per second. */
3848 static unsigned int
3849 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3850 {
3851 read_psched();
3852 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3853 }
3854
3855 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3856 * a transmission rate of 'rate' bytes per second. */
3857 static unsigned int
3858 tc_buffer_per_jiffy(unsigned int rate)
3859 {
3860 read_psched();
3861 return rate / buffer_hz;
3862 }
3863
3864 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3865 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3866 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3867 * stores NULL into it if it is absent.
3868 *
3869 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3870 * 'msg'.
3871 *
3872 * Returns 0 if successful, otherwise a positive errno value. */
3873 static int
3874 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3875 struct nlattr **options)
3876 {
3877 static const struct nl_policy tca_policy[] = {
3878 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3879 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3880 };
3881 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3882
3883 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3884 tca_policy, ta, ARRAY_SIZE(ta))) {
3885 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3886 goto error;
3887 }
3888
3889 if (kind) {
3890 *kind = nl_attr_get_string(ta[TCA_KIND]);
3891 }
3892
3893 if (options) {
3894 *options = ta[TCA_OPTIONS];
3895 }
3896
3897 return 0;
3898
3899 error:
3900 if (kind) {
3901 *kind = NULL;
3902 }
3903 if (options) {
3904 *options = NULL;
3905 }
3906 return EPROTO;
3907 }
3908
3909 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3910 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3911 * into '*options', and its queue statistics into '*stats'. Any of the output
3912 * arguments may be null.
3913 *
3914 * Returns 0 if successful, otherwise a positive errno value. */
3915 static int
3916 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3917 struct nlattr **options, struct netdev_queue_stats *stats)
3918 {
3919 static const struct nl_policy tca_policy[] = {
3920 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3921 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3922 };
3923 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3924
3925 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3926 tca_policy, ta, ARRAY_SIZE(ta))) {
3927 VLOG_WARN_RL(&rl, "failed to parse class message");
3928 goto error;
3929 }
3930
3931 if (handlep) {
3932 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3933 *handlep = tc->tcm_handle;
3934 }
3935
3936 if (options) {
3937 *options = ta[TCA_OPTIONS];
3938 }
3939
3940 if (stats) {
3941 const struct gnet_stats_queue *gsq;
3942 struct gnet_stats_basic gsb;
3943
3944 static const struct nl_policy stats_policy[] = {
3945 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3946 .min_len = sizeof gsb },
3947 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3948 .min_len = sizeof *gsq },
3949 };
3950 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3951
3952 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3953 sa, ARRAY_SIZE(sa))) {
3954 VLOG_WARN_RL(&rl, "failed to parse class stats");
3955 goto error;
3956 }
3957
3958 /* Alignment issues screw up the length of struct gnet_stats_basic on
3959 * some arch/bitsize combinations. Newer versions of Linux have a
3960 * struct gnet_stats_basic_packed, but we can't depend on that. The
3961 * easiest thing to do is just to make a copy. */
3962 memset(&gsb, 0, sizeof gsb);
3963 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3964 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3965 stats->tx_bytes = gsb.bytes;
3966 stats->tx_packets = gsb.packets;
3967
3968 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3969 stats->tx_errors = gsq->drops;
3970 }
3971
3972 return 0;
3973
3974 error:
3975 if (options) {
3976 *options = NULL;
3977 }
3978 if (stats) {
3979 memset(stats, 0, sizeof *stats);
3980 }
3981 return EPROTO;
3982 }
3983
3984 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3985 * on 'netdev'. */
3986 static int
3987 tc_query_class(const struct netdev *netdev,
3988 unsigned int handle, unsigned int parent,
3989 struct ofpbuf **replyp)
3990 {
3991 struct ofpbuf request;
3992 struct tcmsg *tcmsg;
3993 int error;
3994
3995 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3996 if (!tcmsg) {
3997 return ENODEV;
3998 }
3999 tcmsg->tcm_handle = handle;
4000 tcmsg->tcm_parent = parent;
4001
4002 error = tc_transact(&request, replyp);
4003 if (error) {
4004 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4005 netdev_get_name(netdev),
4006 tc_get_major(handle), tc_get_minor(handle),
4007 tc_get_major(parent), tc_get_minor(parent),
4008 ovs_strerror(error));
4009 }
4010 return error;
4011 }
4012
4013 /* Equivalent to "tc class del dev <name> handle <handle>". */
4014 static int
4015 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4016 {
4017 struct ofpbuf request;
4018 struct tcmsg *tcmsg;
4019 int error;
4020
4021 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4022 if (!tcmsg) {
4023 return ENODEV;
4024 }
4025 tcmsg->tcm_handle = handle;
4026 tcmsg->tcm_parent = 0;
4027
4028 error = tc_transact(&request, NULL);
4029 if (error) {
4030 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4031 netdev_get_name(netdev),
4032 tc_get_major(handle), tc_get_minor(handle),
4033 ovs_strerror(error));
4034 }
4035 return error;
4036 }
4037
4038 /* Equivalent to "tc qdisc del dev <name> root". */
4039 static int
4040 tc_del_qdisc(struct netdev *netdev_)
4041 {
4042 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4043 struct ofpbuf request;
4044 struct tcmsg *tcmsg;
4045 int error;
4046
4047 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4048 if (!tcmsg) {
4049 return ENODEV;
4050 }
4051 tcmsg->tcm_handle = tc_make_handle(1, 0);
4052 tcmsg->tcm_parent = TC_H_ROOT;
4053
4054 error = tc_transact(&request, NULL);
4055 if (error == EINVAL) {
4056 /* EINVAL probably means that the default qdisc was in use, in which
4057 * case we've accomplished our purpose. */
4058 error = 0;
4059 }
4060 if (!error && netdev->tc) {
4061 if (netdev->tc->ops->tc_destroy) {
4062 netdev->tc->ops->tc_destroy(netdev->tc);
4063 }
4064 netdev->tc = NULL;
4065 }
4066 return error;
4067 }
4068
4069 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4070 * kernel to determine what they are. Returns 0 if successful, otherwise a
4071 * positive errno value. */
4072 static int
4073 tc_query_qdisc(const struct netdev *netdev_)
4074 {
4075 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4076 struct ofpbuf request, *qdisc;
4077 const struct tc_ops *ops;
4078 struct tcmsg *tcmsg;
4079 int load_error;
4080 int error;
4081
4082 if (netdev->tc) {
4083 return 0;
4084 }
4085
4086 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4087 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4088 * 2.6.35 without that fix backported to it.
4089 *
4090 * To avoid the OOPS, we must not make a request that would attempt to dump
4091 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4092 * few others. There are a few ways that I can see to do this, but most of
4093 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4094 * technique chosen here is to assume that any non-default qdisc that we
4095 * create will have a class with handle 1:0. The built-in qdiscs only have
4096 * a class with handle 0:0.
4097 *
4098 * We could check for Linux 2.6.35+ and use a more straightforward method
4099 * there. */
4100 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4101 if (!tcmsg) {
4102 return ENODEV;
4103 }
4104 tcmsg->tcm_handle = tc_make_handle(1, 0);
4105 tcmsg->tcm_parent = 0;
4106
4107 /* Figure out what tc class to instantiate. */
4108 error = tc_transact(&request, &qdisc);
4109 if (!error) {
4110 const char *kind;
4111
4112 error = tc_parse_qdisc(qdisc, &kind, NULL);
4113 if (error) {
4114 ops = &tc_ops_other;
4115 } else {
4116 ops = tc_lookup_linux_name(kind);
4117 if (!ops) {
4118 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4119 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4120
4121 ops = &tc_ops_other;
4122 }
4123 }
4124 } else if (error == ENOENT) {
4125 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4126 * other entity that doesn't have a handle 1:0. We will assume
4127 * that it's the system default qdisc. */
4128 ops = &tc_ops_default;
4129 error = 0;
4130 } else {
4131 /* Who knows? Maybe the device got deleted. */
4132 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4133 netdev_get_name(netdev_), ovs_strerror(error));
4134 ops = &tc_ops_other;
4135 }
4136
4137 /* Instantiate it. */
4138 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4139 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4140 ofpbuf_delete(qdisc);
4141
4142 return error ? error : load_error;
4143 }
4144
4145 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4146 approximate the time to transmit packets of various lengths. For an MTU of
4147 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4148 represents two possible packet lengths; for a MTU of 513 through 1024, four
4149 possible lengths; and so on.
4150
4151 Returns, for the specified 'mtu', the number of bits that packet lengths
4152 need to be shifted right to fit within such a 256-entry table. */
4153 static int
4154 tc_calc_cell_log(unsigned int mtu)
4155 {
4156 int cell_log;
4157
4158 if (!mtu) {
4159 mtu = ETH_PAYLOAD_MAX;
4160 }
4161 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4162
4163 for (cell_log = 0; mtu >= 256; cell_log++) {
4164 mtu >>= 1;
4165 }
4166
4167 return cell_log;
4168 }
4169
4170 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4171 * of 'mtu'. */
4172 static void
4173 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4174 {
4175 memset(rate, 0, sizeof *rate);
4176 rate->cell_log = tc_calc_cell_log(mtu);
4177 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4178 /* rate->cell_align = 0; */ /* distro headers. */
4179 rate->mpu = ETH_TOTAL_MIN;
4180 rate->rate = Bps;
4181 }
4182
4183 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4184 * attribute of the specified "type".
4185 *
4186 * See tc_calc_cell_log() above for a description of "rtab"s. */
4187 static void
4188 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4189 {
4190 uint32_t *rtab;
4191 unsigned int i;
4192
4193 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4194 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4195 unsigned packet_size = (i + 1) << rate->cell_log;
4196 if (packet_size < rate->mpu) {
4197 packet_size = rate->mpu;
4198 }
4199 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4200 }
4201 }
4202
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes' (0 is fine if the user requested none).
 *
 * The burst used is never smaller than one jiffy's worth of data at 'Bps'
 * plus 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > floor_bytes ? burst_bytes : floor_bytes;

    return tc_bytes_to_ticks(Bps, burst);
}
4213 \f
4214 /* Linux-only functions declared in netdev-linux.h */
4215
4216 /* Returns a fd for an AF_INET socket or a negative errno value. */
4217 int
4218 netdev_linux_get_af_inet_sock(void)
4219 {
4220 int error = netdev_linux_init();
4221 return error ? -error : af_inet_sock;
4222 }
4223
4224 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4225 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4226 int
4227 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4228 const char *flag_name, bool enable)
4229 {
4230 const char *netdev_name = netdev_get_name(netdev);
4231 struct ethtool_value evalue;
4232 uint32_t new_flags;
4233 int error;
4234
4235 COVERAGE_INC(netdev_get_ethtool);
4236 memset(&evalue, 0, sizeof evalue);
4237 error = netdev_linux_do_ethtool(netdev_name,
4238 (struct ethtool_cmd *)&evalue,
4239 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4240 if (error) {
4241 return error;
4242 }
4243
4244 COVERAGE_INC(netdev_set_ethtool);
4245 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4246 error = netdev_linux_do_ethtool(netdev_name,
4247 (struct ethtool_cmd *)&evalue,
4248 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4249 if (error) {
4250 return error;
4251 }
4252
4253 COVERAGE_INC(netdev_get_ethtool);
4254 memset(&evalue, 0, sizeof evalue);
4255 error = netdev_linux_do_ethtool(netdev_name,
4256 (struct ethtool_cmd *)&evalue,
4257 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4258 if (error) {
4259 return error;
4260 }
4261
4262 if (new_flags != evalue.data) {
4263 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4264 "device %s failed", enable ? "enable" : "disable",
4265 flag_name, netdev_name);
4266 return EOPNOTSUPP;
4267 }
4268
4269 return 0;
4270 }
4271 \f
4272 /* Utility functions. */
4273
4274 /* Copies 'src' into 'dst', performing format conversion in the process. */
4275 static void
4276 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4277 const struct rtnl_link_stats *src)
4278 {
4279 dst->rx_packets = src->rx_packets;
4280 dst->tx_packets = src->tx_packets;
4281 dst->rx_bytes = src->rx_bytes;
4282 dst->tx_bytes = src->tx_bytes;
4283 dst->rx_errors = src->rx_errors;
4284 dst->tx_errors = src->tx_errors;
4285 dst->rx_dropped = src->rx_dropped;
4286 dst->tx_dropped = src->tx_dropped;
4287 dst->multicast = src->multicast;
4288 dst->collisions = src->collisions;
4289 dst->rx_length_errors = src->rx_length_errors;
4290 dst->rx_over_errors = src->rx_over_errors;
4291 dst->rx_crc_errors = src->rx_crc_errors;
4292 dst->rx_frame_errors = src->rx_frame_errors;
4293 dst->rx_fifo_errors = src->rx_fifo_errors;
4294 dst->rx_missed_errors = src->rx_missed_errors;
4295 dst->tx_aborted_errors = src->tx_aborted_errors;
4296 dst->tx_carrier_errors = src->tx_carrier_errors;
4297 dst->tx_fifo_errors = src->tx_fifo_errors;
4298 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4299 dst->tx_window_errors = src->tx_window_errors;
4300 }
4301
4302 static int
4303 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4304 {
4305 /* Policy for RTNLGRP_LINK messages.
4306 *
4307 * There are *many* more fields in these messages, but currently we only
4308 * care about these fields. */
4309 static const struct nl_policy rtnlgrp_link_policy[] = {
4310 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4311 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4312 .min_len = sizeof(struct rtnl_link_stats) },
4313 };
4314
4315 struct ofpbuf request;
4316 struct ofpbuf *reply;
4317 struct ifinfomsg *ifi;
4318 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4319 int error;
4320
4321 ofpbuf_init(&request, 0);
4322 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4323 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4324 ifi->ifi_family = PF_UNSPEC;
4325 ifi->ifi_index = ifindex;
4326 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4327 ofpbuf_uninit(&request);
4328 if (error) {
4329 return error;
4330 }
4331
4332 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4333 rtnlgrp_link_policy,
4334 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4335 ofpbuf_delete(reply);
4336 return EPROTO;
4337 }
4338
4339 if (!attrs[IFLA_STATS]) {
4340 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4341 ofpbuf_delete(reply);
4342 return EPROTO;
4343 }
4344
4345 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4346
4347 ofpbuf_delete(reply);
4348
4349 return 0;
4350 }
4351
4352 static int
4353 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4354 {
4355 static const char fn[] = "/proc/net/dev";
4356 char line[1024];
4357 FILE *stream;
4358 int ln;
4359
4360 stream = fopen(fn, "r");
4361 if (!stream) {
4362 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4363 return errno;
4364 }
4365
4366 ln = 0;
4367 while (fgets(line, sizeof line, stream)) {
4368 if (++ln >= 3) {
4369 char devname[16];
4370 #define X64 "%"SCNu64
4371 if (sscanf(line,
4372 " %15[^:]:"
4373 X64 X64 X64 X64 X64 X64 X64 "%*u"
4374 X64 X64 X64 X64 X64 X64 X64 "%*u",
4375 devname,
4376 &stats->rx_bytes,
4377 &stats->rx_packets,
4378 &stats->rx_errors,
4379 &stats->rx_dropped,
4380 &stats->rx_fifo_errors,
4381 &stats->rx_frame_errors,
4382 &stats->multicast,
4383 &stats->tx_bytes,
4384 &stats->tx_packets,
4385 &stats->tx_errors,
4386 &stats->tx_dropped,
4387 &stats->tx_fifo_errors,
4388 &stats->collisions,
4389 &stats->tx_carrier_errors) != 15) {
4390 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4391 } else if (!strcmp(devname, netdev_name)) {
4392 stats->rx_length_errors = UINT64_MAX;
4393 stats->rx_over_errors = UINT64_MAX;
4394 stats->rx_crc_errors = UINT64_MAX;
4395 stats->rx_missed_errors = UINT64_MAX;
4396 stats->tx_aborted_errors = UINT64_MAX;
4397 stats->tx_heartbeat_errors = UINT64_MAX;
4398 stats->tx_window_errors = UINT64_MAX;
4399 fclose(stream);
4400 return 0;
4401 }
4402 }
4403 }
4404 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4405 fclose(stream);
4406 return ENODEV;
4407 }
4408
4409 static int
4410 get_flags(const struct netdev *dev, unsigned int *flags)
4411 {
4412 struct ifreq ifr;
4413 int error;
4414
4415 *flags = 0;
4416 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4417 "SIOCGIFFLAGS");
4418 if (!error) {
4419 *flags = ifr.ifr_flags;
4420 }
4421 return error;
4422 }
4423
4424 static int
4425 set_flags(const char *name, unsigned int flags)
4426 {
4427 struct ifreq ifr;
4428
4429 ifr.ifr_flags = flags;
4430 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4431 }
4432
4433 static int
4434 do_get_ifindex(const char *netdev_name)
4435 {
4436 struct ifreq ifr;
4437
4438 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4439 COVERAGE_INC(netdev_get_ifindex);
4440 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4441 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4442 netdev_name, ovs_strerror(errno));
4443 return -errno;
4444 }
4445 return ifr.ifr_ifindex;
4446 }
4447
4448 static int
4449 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4450 {
4451 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4452
4453 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4454 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4455
4456 if (ifindex < 0) {
4457 netdev->get_ifindex_error = -ifindex;
4458 netdev->ifindex = 0;
4459 } else {
4460 netdev->get_ifindex_error = 0;
4461 netdev->ifindex = ifindex;
4462 }
4463 netdev->cache_valid |= VALID_IFINDEX;
4464 }
4465
4466 *ifindexp = netdev->ifindex;
4467 return netdev->get_ifindex_error;
4468 }
4469
4470 static int
4471 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4472 {
4473 struct ifreq ifr;
4474 int hwaddr_family;
4475
4476 memset(&ifr, 0, sizeof ifr);
4477 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4478 COVERAGE_INC(netdev_get_hwaddr);
4479 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4480 /* ENODEV probably means that a vif disappeared asynchronously and
4481 * hasn't been removed from the database yet, so reduce the log level
4482 * to INFO for that case. */
4483 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4484 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4485 netdev_name, ovs_strerror(errno));
4486 return errno;
4487 }
4488 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4489 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4490 VLOG_WARN("%s device has unknown hardware address family %d",
4491 netdev_name, hwaddr_family);
4492 }
4493 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4494 return 0;
4495 }
4496
4497 static int
4498 set_etheraddr(const char *netdev_name,
4499 const uint8_t mac[ETH_ADDR_LEN])
4500 {
4501 struct ifreq ifr;
4502
4503 memset(&ifr, 0, sizeof ifr);
4504 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4505 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4506 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4507 COVERAGE_INC(netdev_set_hwaddr);
4508 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4509 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4510 netdev_name, ovs_strerror(errno));
4511 return errno;
4512 }
4513 return 0;
4514 }
4515
4516 static int
4517 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4518 int cmd, const char *cmd_name)
4519 {
4520 struct ifreq ifr;
4521
4522 memset(&ifr, 0, sizeof ifr);
4523 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4524 ifr.ifr_data = (caddr_t) ecmd;
4525
4526 ecmd->cmd = cmd;
4527 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4528 return 0;
4529 } else {
4530 if (errno != EOPNOTSUPP) {
4531 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4532 "failed: %s", cmd_name, name, ovs_strerror(errno));
4533 } else {
4534 /* The device doesn't support this operation. That's pretty
4535 * common, so there's no point in logging anything. */
4536 }
4537 return errno;
4538 }
4539 }
4540
4541 static int
4542 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4543 const char *cmd_name)
4544 {
4545 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4546 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4547 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4548 ovs_strerror(errno));
4549 return errno;
4550 }
4551 return 0;
4552 }
4553
4554 static int
4555 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4556 int cmd, const char *cmd_name)
4557 {
4558 struct ifreq ifr;
4559 int error;
4560
4561 ifr.ifr_addr.sa_family = AF_INET;
4562 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4563 if (!error) {
4564 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4565 &ifr.ifr_addr);
4566 *ip = sin->sin_addr;
4567 }
4568 return error;
4569 }
4570
4571 /* Returns an AF_PACKET raw socket or a negative errno value. */
4572 static int
4573 af_packet_sock(void)
4574 {
4575 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4576 static int sock;
4577
4578 if (ovsthread_once_start(&once)) {
4579 sock = socket(AF_PACKET, SOCK_RAW, 0);
4580 if (sock >= 0) {
4581 int error = set_nonblocking(sock);
4582 if (error) {
4583 close(sock);
4584 sock = -error;
4585 }
4586 } else {
4587 sock = -errno;
4588 VLOG_ERR("failed to create packet socket: %s",
4589 ovs_strerror(errno));
4590 }
4591 ovsthread_once_done(&once);
4592 }
4593
4594 return sock;
4595 }