]> git.proxmox.com Git - ovs.git/blob - lib/netdev-linux.c
ba0d8633c3cc47dd61cd1e88f88c8ba3a6230d9b
[ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <arpa/inet.h>
24 #include <inttypes.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
41 #include <net/if.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
46 #include <poll.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <unistd.h>
50
51 #include "coverage.h"
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
55 #include "hash.h"
56 #include "hmap.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
61 #include "netlink.h"
62 #include "ofpbuf.h"
63 #include "openflow/openflow.h"
64 #include "packets.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
67 #include "shash.h"
68 #include "socket-util.h"
69 #include "sset.h"
70 #include "timer.h"
71 #include "unaligned.h"
72 #include "vlog.h"
73
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
83
84 \f
85 /* These were introduced in Linux 2.6.14, so they might be missing if we have
86 * old headers. */
87 #ifndef ADVERTISED_Pause
88 #define ADVERTISED_Pause (1 << 13)
89 #endif
90 #ifndef ADVERTISED_Asym_Pause
91 #define ADVERTISED_Asym_Pause (1 << 14)
92 #endif
93
94 /* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96 #ifndef ETHTOOL_GFLAGS
97 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #endif
99 #ifndef ETHTOOL_SFLAGS
100 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
101 #endif
102
103 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
104 * headers. */
105 #ifndef TC_RTAB_SIZE
106 #define TC_RTAB_SIZE 1024
107 #endif
108
109 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
110 static int cache_notifier_refcount;
111
112 enum { /* Bits for 'cache_valid' in struct netdev_linux. */
113 VALID_IFINDEX = 1 << 0,
114 VALID_ETHERADDR = 1 << 1,
115 VALID_IN4 = 1 << 2,
116 VALID_IN6 = 1 << 3,
117 VALID_MTU = 1 << 4,
118 VALID_POLICING = 1 << 5,
119 VALID_VPORT_STAT_ERROR = 1 << 6,
120 VALID_DRVINFO = 1 << 7, /* Kept across RTM_NEWLINK updates. */
121 VALID_FEATURES = 1 << 8,
122 };
123 \f
124 /* Traffic control. */
125
126 /* An instance of a traffic control class. Always associated with a particular
127 * network device.
128 *
129 * Each TC implementation subclasses this with whatever additional data it
130 * needs. */
131 struct tc {
132 const struct tc_ops *ops; /* Operations of the implementing TC class. */
133 struct hmap queues; /* Contains "struct tc_queue"s.
134 * Read by generic TC layer.
135 * Written only by TC implementation. */
136 };
137
138 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) } /* Static initializer; member order must match struct tc. */
139
140 /* One traffic control queue.
141 *
142 * Each TC implementation subclasses this with whatever additional data it
143 * needs. */
144 struct tc_queue {
145 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
146 unsigned int queue_id; /* OpenFlow queue ID. */
147 long long int created; /* Time queue was created, in msecs. */
148 };
149
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
152 *
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
156 struct tc_ops {
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
161
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
164
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
168
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
174 *
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
178 *
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
181 *
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct smap *details);
185
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
189 *
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
195 * 'netdev'.
196 *
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
200
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
203 * tc_destroy(tc).)
204 *
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
208 *
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
211
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
213 *
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
217 *
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
221 *
222 * This function may be null if 'tc' is not configurable.
223 */
224 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
225
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
228 *
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
232 *
233 * This function may be null if 'tc' is not configurable.
234 */
235 int (*qdisc_set)(struct netdev *, const struct smap *details);
236
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
239 *
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
243 *
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
247 *
248 * This function may be null if 'tc' does not have queues ('n_queues' is
249 * 0). */
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct smap *details);
252
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', performing any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
256 * 'n_queues'.
257 *
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
261 *
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct smap *details);
266
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
269 *
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
273
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
276 *
277 * On success, initializes '*stats'.
278 *
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
284
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
287 *
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
293 };
294
295 static void
296 tc_init(struct tc *tc, const struct tc_ops *ops)
297 {
298 tc->ops = ops;
299 hmap_init(&tc->queues);
300 }
301
302 static void
303 tc_destroy(struct tc *tc)
304 {
305 hmap_destroy(&tc->queues);
306 }
307
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
312
313 static const struct tc_ops *const tcs[] = { /* Null-terminated table of known TC classes. */
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
318 NULL
319 };
320
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
324
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
328
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
334 int kbits_burst);
335
336 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341 static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344 static int tc_delete_class(const struct netdev *, unsigned int handle);
345
346 static int tc_del_qdisc(struct netdev *netdev);
347 static int tc_query_qdisc(const struct netdev *netdev);
348
349 static int tc_calc_cell_log(unsigned int mtu);
350 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
354 \f
355 struct netdev_linux {
356 struct netdev up; /* Embedded generic netdev; see netdev_linux_cast(). */
357
358 unsigned int cache_valid; /* Bitmap of VALID_* flags. */
359 unsigned int change_seq; /* Bumped by netdev_linux_changed(), which skips 0. */
360
361 bool miimon; /* Link status of last poll. */
362 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
363 struct timer miimon_timer;
364
365 /* The following are figured out "on demand" only. They are only valid
366 * when the corresponding VALID_* bit in 'cache_valid' is set. */
367 int ifindex;
368 uint8_t etheraddr[ETH_ADDR_LEN];
369 struct in_addr address, netmask;
370 struct in6_addr in6;
371 int mtu;
372 unsigned int ifi_flags; /* Flags last passed to netdev_linux_changed(). */
373 long long int carrier_resets; /* Counts IFF_RUNNING transitions. */
374 uint32_t kbits_rate; /* Policing data. */
375 uint32_t kbits_burst;
376 int vport_stats_error; /* Cached error code from vport_get_stats().
377 0 or an errno value. */
378 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
379 int ether_addr_error; /* Cached error code from set/get etheraddr. */
380 int netdev_policing_error; /* Cached error code from set policing. */
381 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
382 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
383
384 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
385 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
387
388 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
389 struct tc *tc; /* Traffic control state, or null if none is set. */
390
391 /* For devices of class netdev_tap_class only. */
392 int tap_fd;
393 };
394
395 struct netdev_rx_linux {
396 struct netdev_rx up; /* Embedded generic rx; see netdev_rx_linux_cast(). */
397 bool is_tap; /* If true, 'fd' is the netdev's shared 'tap_fd' and is not closed on destroy. */
398 int fd; /* Tap fd, or a PF_PACKET socket bound to the device. */
399 };
400
401 static const struct netdev_rx_class netdev_rx_linux_class;
402
403 /* Sockets used for ioctl operations. */
404 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
405
406 /* This is set pretty low because we probably won't learn anything from the
407 * additional log messages. */
408 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
409
410 static int netdev_linux_init(void);
411
412 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
413 int cmd, const char *cmd_name);
414 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
415 const char *cmd_name);
416 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
417 int cmd, const char *cmd_name);
418 static int get_flags(const struct netdev *, unsigned int *flags);
419 static int set_flags(const char *, unsigned int flags);
420 static int do_get_ifindex(const char *netdev_name);
421 static int get_ifindex(const struct netdev *, int *ifindexp);
422 static int do_set_addr(struct netdev *netdev,
423 int ioctl_nr, const char *ioctl_name,
424 struct in_addr addr);
425 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
426 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
427 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
428 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
429 static int af_packet_sock(void);
430 static void netdev_linux_miimon_run(void);
431 static void netdev_linux_miimon_wait(void);
432
433 static bool
434 is_netdev_linux_class(const struct netdev_class *netdev_class)
435 {
436 return netdev_class->init == netdev_linux_init;
437 }
438
439 static bool
440 is_tap_netdev(const struct netdev *netdev)
441 {
442 return netdev_get_class(netdev) == &netdev_tap_class;
443 }
444
445 static struct netdev_linux *
446 netdev_linux_cast(const struct netdev *netdev)
447 {
448 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
449
450 return CONTAINER_OF(netdev, struct netdev_linux, up);
451 }
452
453 static struct netdev_rx_linux *
454 netdev_rx_linux_cast(const struct netdev_rx *rx)
455 {
456 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
457 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
458 }
459 \f
460 static int
461 netdev_linux_init(void)
462 {
463 static int status = -1;
464 if (status < 0) {
465 /* Create AF_INET socket. */
466 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
467 status = af_inet_sock >= 0 ? 0 : errno;
468 if (status) {
469 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
470 }
471 }
472 return status;
473 }
474
/* Periodic work for this netdev provider: runs rtnetlink link processing,
 * then the MII monitor. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();
    netdev_linux_miimon_run();
}
481
/* Counterpart of netdev_linux_run(): calls the wait functions of rtnetlink
 * link processing and of the MII monitor. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();
    netdev_linux_miimon_wait();
}
488
489 static void
490 netdev_linux_changed(struct netdev_linux *dev,
491 unsigned int ifi_flags, unsigned int mask)
492 {
493 dev->change_seq++;
494 if (!dev->change_seq) {
495 dev->change_seq++;
496 }
497
498 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
499 dev->carrier_resets++;
500 }
501 dev->ifi_flags = ifi_flags;
502
503 dev->cache_valid &= mask;
504 }
505
506 static void
507 netdev_linux_update(struct netdev_linux *dev,
508 const struct rtnetlink_link_change *change)
509 {
510 if (change->nlmsg_type == RTM_NEWLINK) {
511 /* Keep drv-info */
512 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
513
514 /* Update netdev from rtnl-change msg. */
515 if (change->mtu) {
516 dev->mtu = change->mtu;
517 dev->cache_valid |= VALID_MTU;
518 dev->netdev_mtu_error = 0;
519 }
520
521 if (!eth_addr_is_zero(change->addr)) {
522 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
523 dev->cache_valid |= VALID_ETHERADDR;
524 dev->ether_addr_error = 0;
525 }
526
527 dev->ifindex = change->ifi_index;
528 dev->cache_valid |= VALID_IFINDEX;
529 dev->get_ifindex_error = 0;
530
531 } else {
532 netdev_linux_changed(dev, change->ifi_flags, 0);
533 }
534 }
535
536 static void
537 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
538 void *aux OVS_UNUSED)
539 {
540 if (change) {
541 struct netdev *base_dev = netdev_from_name(change->ifname);
542 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
543 netdev_linux_update(netdev_linux_cast(base_dev), change);
544 }
545 } else {
546 struct shash device_shash;
547 struct shash_node *node;
548
549 shash_init(&device_shash);
550 netdev_get_devices(&netdev_linux_class, &device_shash);
551 SHASH_FOR_EACH (node, &device_shash) {
552 struct netdev *netdev = node->data;
553 struct netdev_linux *dev = netdev_linux_cast(netdev);
554 unsigned int flags;
555
556 get_flags(&dev->up, &flags);
557 netdev_linux_changed(dev, flags, 0);
558 }
559 shash_destroy(&device_shash);
560 }
561 }
562
563 static int
564 cache_notifier_ref(void)
565 {
566 if (!cache_notifier_refcount) {
567 ovs_assert(!netdev_linux_cache_notifier);
568
569 netdev_linux_cache_notifier =
570 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
571
572 if (!netdev_linux_cache_notifier) {
573 return EINVAL;
574 }
575 }
576 cache_notifier_refcount++;
577
578 return 0;
579 }
580
581 static void
582 cache_notifier_unref(void)
583 {
584 ovs_assert(cache_notifier_refcount > 0);
585 if (!--cache_notifier_refcount) {
586 ovs_assert(netdev_linux_cache_notifier);
587 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
588 netdev_linux_cache_notifier = NULL;
589 }
590 }
591
592 /* Creates system and internal devices. */
593 static int
594 netdev_linux_create(const struct netdev_class *class, const char *name,
595 struct netdev **netdevp)
596 {
597 struct netdev_linux *netdev;
598 int error;
599
600 error = cache_notifier_ref();
601 if (error) {
602 return error;
603 }
604
605 netdev = xzalloc(sizeof *netdev);
606 netdev->change_seq = 1;
607 netdev_init(&netdev->up, name, class);
608 error = get_flags(&netdev->up, &netdev->ifi_flags);
609 if (error == ENODEV) {
610 if (class != &netdev_internal_class) {
611 /* The device does not exist, so don't allow it to be opened. */
612 netdev_uninit(&netdev->up, false);
613 cache_notifier_unref();
614 free(netdev);
615 return ENODEV;
616 } else {
617 /* "Internal" netdevs have to be created as netdev objects before
618 * they exist in the kernel, because creating them in the kernel
619 * happens by passing a netdev object to dpif_port_add().
620 * Therefore, ignore the error. */
621 }
622 }
623
624 *netdevp = &netdev->up;
625 return 0;
626 }
627
628 /* For most types of netdevs we open the device for each call of
629 * netdev_open(). However, this is not the case with tap devices,
630 * since it is only possible to open the device once. In this
631 * situation we share a single file descriptor, and consequently
632 * buffers, across all readers. Therefore once data is read it will
633 * be unavailable to other reads for tap devices. */
634 static int
635 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
636 const char *name, struct netdev **netdevp)
637 {
638 struct netdev_linux *netdev;
639 static const char tap_dev[] = "/dev/net/tun";
640 struct ifreq ifr;
641 int error;
642
643 netdev = xzalloc(sizeof *netdev);
644 netdev->change_seq = 1;
645
646 error = cache_notifier_ref();
647 if (error) {
648 goto error;
649 }
650
651 /* Open tap device. */
652 netdev->tap_fd = open(tap_dev, O_RDWR);
653 if (netdev->tap_fd < 0) {
654 error = errno;
655 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
656 goto error_unref_notifier;
657 }
658
659 /* Create tap device. */
660 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
661 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
662 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
663 VLOG_WARN("%s: creating tap device failed: %s", name,
664 ovs_strerror(errno));
665 error = errno;
666 goto error_close;
667 }
668
669 /* Make non-blocking. */
670 error = set_nonblocking(netdev->tap_fd);
671 if (error) {
672 goto error_close;
673 }
674
675 netdev_init(&netdev->up, name, &netdev_tap_class);
676 *netdevp = &netdev->up;
677 return 0;
678
679 error_close:
680 close(netdev->tap_fd);
681 error_unref_notifier:
682 cache_notifier_unref();
683 error:
684 free(netdev);
685 return error;
686 }
687
688 static void
689 netdev_linux_destroy(struct netdev *netdev_)
690 {
691 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
692
693 if (netdev->tc && netdev->tc->ops->tc_destroy) {
694 netdev->tc->ops->tc_destroy(netdev->tc);
695 }
696
697 if (netdev_get_class(netdev_) == &netdev_tap_class
698 && netdev->tap_fd >= 0)
699 {
700 close(netdev->tap_fd);
701 }
702 free(netdev);
703
704 cache_notifier_unref();
705 }
706
707 static int
708 netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
709 {
710 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
711 bool is_tap = is_tap_netdev(netdev_);
712 struct netdev_rx_linux *rx;
713 int error;
714 int fd;
715
716 if (is_tap) {
717 fd = netdev->tap_fd;
718 } else {
719 struct sockaddr_ll sll;
720 int ifindex;
721 /* Result of tcpdump -dd inbound */
722 static struct sock_filter filt[] = {
723 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
724 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
725 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
726 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
727 };
728 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
729
730 /* Create file descriptor. */
731 fd = socket(PF_PACKET, SOCK_RAW, 0);
732 if (fd < 0) {
733 error = errno;
734 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
735 goto error;
736 }
737
738 /* Set non-blocking mode. */
739 error = set_nonblocking(fd);
740 if (error) {
741 goto error;
742 }
743
744 /* Get ethernet device index. */
745 error = get_ifindex(&netdev->up, &ifindex);
746 if (error) {
747 goto error;
748 }
749
750 /* Bind to specific ethernet device. */
751 memset(&sll, 0, sizeof sll);
752 sll.sll_family = AF_PACKET;
753 sll.sll_ifindex = ifindex;
754 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
755 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
756 error = errno;
757 VLOG_ERR("%s: failed to bind raw socket (%s)",
758 netdev_get_name(netdev_), ovs_strerror(error));
759 goto error;
760 }
761
762 /* Filter for only inbound packets. */
763 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
764 sizeof fprog);
765 if (error) {
766 error = errno;
767 VLOG_ERR("%s: failed attach filter (%s)",
768 netdev_get_name(netdev_), ovs_strerror(error));
769 goto error;
770 }
771 }
772
773 rx = xmalloc(sizeof *rx);
774 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
775 rx->is_tap = is_tap;
776 rx->fd = fd;
777
778 *rxp = &rx->up;
779 return 0;
780
781 error:
782 if (fd >= 0) {
783 close(fd);
784 }
785 return error;
786 }
787
788 static void
789 netdev_rx_linux_destroy(struct netdev_rx *rx_)
790 {
791 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
792
793 if (!rx->is_tap) {
794 close(rx->fd);
795 }
796 free(rx);
797 }
798
799 static int
800 netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
801 {
802 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
803 ssize_t retval;
804
805 do {
806 retval = (rx->is_tap
807 ? read(rx->fd, data, size)
808 : recv(rx->fd, data, size, MSG_TRUNC));
809 } while (retval < 0 && errno == EINTR);
810
811 if (retval >= 0) {
812 return retval > size ? -EMSGSIZE : retval;
813 } else {
814 if (errno != EAGAIN) {
815 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
816 ovs_strerror(errno), netdev_rx_get_name(rx_));
817 }
818 return -errno;
819 }
820 }
821
822 static void
823 netdev_rx_linux_wait(struct netdev_rx *rx_)
824 {
825 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
826 poll_fd_wait(rx->fd, POLLIN);
827 }
828
829 static int
830 netdev_rx_linux_drain(struct netdev_rx *rx_)
831 {
832 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
833 if (rx->is_tap) {
834 struct ifreq ifr;
835 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
836 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
837 if (error) {
838 return error;
839 }
840 drain_fd(rx->fd, ifr.ifr_qlen);
841 return 0;
842 } else {
843 return drain_rcvbuf(rx->fd);
844 }
845 }
846
847 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
848 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
849 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
850 * the packet is too big or too small to transmit on the device.
851 *
852 * The caller retains ownership of 'buffer' in all cases.
853 *
854 * The kernel maintains a packet transmission queue, so the caller is not
855 * expected to do additional queuing of packets. */
856 static int
857 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
858 {
859 for (;;) {
860 ssize_t retval;
861
862 if (!is_tap_netdev(netdev_)) {
863 /* Use our AF_PACKET socket to send to this device. */
864 struct sockaddr_ll sll;
865 struct msghdr msg;
866 struct iovec iov;
867 int ifindex;
868 int error;
869 int sock;
870
871 sock = af_packet_sock();
872 if (sock < 0) {
873 return -sock;
874 }
875
876 error = get_ifindex(netdev_, &ifindex);
877 if (error) {
878 return error;
879 }
880
881 /* We don't bother setting most fields in sockaddr_ll because the
882 * kernel ignores them for SOCK_RAW. */
883 memset(&sll, 0, sizeof sll);
884 sll.sll_family = AF_PACKET;
885 sll.sll_ifindex = ifindex;
886
887 iov.iov_base = CONST_CAST(void *, data);
888 iov.iov_len = size;
889
890 msg.msg_name = &sll;
891 msg.msg_namelen = sizeof sll;
892 msg.msg_iov = &iov;
893 msg.msg_iovlen = 1;
894 msg.msg_control = NULL;
895 msg.msg_controllen = 0;
896 msg.msg_flags = 0;
897
898 retval = sendmsg(sock, &msg, 0);
899 } else {
900 /* Use the tap fd to send to this device. This is essential for
901 * tap devices, because packets sent to a tap device with an
902 * AF_PACKET socket will loop back to be *received* again on the
903 * tap device. This doesn't occur on other interface types
904 * because we attach a socket filter to the rx socket. */
905 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
906
907 retval = write(netdev->tap_fd, data, size);
908 }
909
910 if (retval < 0) {
911 /* The Linux AF_PACKET implementation never blocks waiting for room
912 * for packets, instead returning ENOBUFS. Translate this into
913 * EAGAIN for the caller. */
914 if (errno == ENOBUFS) {
915 return EAGAIN;
916 } else if (errno == EINTR) {
917 continue;
918 } else if (errno != EAGAIN) {
919 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
920 netdev_get_name(netdev_), ovs_strerror(errno));
921 }
922 return errno;
923 } else if (retval != size) {
924 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
925 "%zu) on %s", retval, size, netdev_get_name(netdev_));
926 return EMSGSIZE;
927 } else {
928 return 0;
929 }
930 }
931 }
932
/* Registers with the poll loop to wake up from the next call to poll_block()
 * when the packet transmission queue has sufficient room to transmit a packet
 * with netdev_send().
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  Thus, this function is
 * unlikely to ever be used.  It is included for completeness.
 *
 * Only tap devices are handled here: they request an immediate wakeup.
 * Nothing is registered for other device types. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (!is_tap_netdev(netdev)) {
        return;
    }

    /* TAP device always accepts packets.*/
    poll_immediate_wake();
}
948
949 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
950 * otherwise a positive errno value. */
951 static int
952 netdev_linux_set_etheraddr(struct netdev *netdev_,
953 const uint8_t mac[ETH_ADDR_LEN])
954 {
955 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
956 struct netdev_saved_flags *sf = NULL;
957 int error;
958
959 if (netdev->cache_valid & VALID_ETHERADDR) {
960 if (netdev->ether_addr_error) {
961 return netdev->ether_addr_error;
962 }
963 if (eth_addr_equals(netdev->etheraddr, mac)) {
964 return 0;
965 }
966 netdev->cache_valid &= ~VALID_ETHERADDR;
967 }
968
969 /* Tap devices must be brought down before setting the address. */
970 if (is_tap_netdev(netdev_)) {
971 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
972 }
973 error = set_etheraddr(netdev_get_name(netdev_), mac);
974 if (!error || error == ENODEV) {
975 netdev->ether_addr_error = error;
976 netdev->cache_valid |= VALID_ETHERADDR;
977 if (!error) {
978 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
979 }
980 }
981
982 netdev_restore_flags(sf);
983
984 return error;
985 }
986
987 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
988 static int
989 netdev_linux_get_etheraddr(const struct netdev *netdev_,
990 uint8_t mac[ETH_ADDR_LEN])
991 {
992 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
993
994 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
995 int error = get_etheraddr(netdev_get_name(netdev_),
996 netdev->etheraddr);
997
998 netdev->ether_addr_error = error;
999 netdev->cache_valid |= VALID_ETHERADDR;
1000 }
1001
1002 if (!netdev->ether_addr_error) {
1003 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1004 }
1005
1006 return netdev->ether_addr_error;
1007 }
1008
1009 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1010 * in bytes, not including the hardware header; thus, this is typically 1500
1011 * bytes for Ethernet devices. */
1012 static int
1013 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1014 {
1015 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1016 if (!(netdev->cache_valid & VALID_MTU)) {
1017 struct ifreq ifr;
1018 int error;
1019
1020 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1021 SIOCGIFMTU, "SIOCGIFMTU");
1022
1023 netdev->netdev_mtu_error = error;
1024 netdev->mtu = ifr.ifr_mtu;
1025 netdev->cache_valid |= VALID_MTU;
1026 }
1027
1028 if (!netdev->netdev_mtu_error) {
1029 *mtup = netdev->mtu;
1030 }
1031 return netdev->netdev_mtu_error;
1032 }
1033
1034 /* Sets the maximum size of transmitted (MTU) for given device using linux
1035 * networking ioctl interface.
1036 */
1037 static int
1038 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1039 {
1040 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1041 struct ifreq ifr;
1042 int error;
1043
1044 if (netdev->cache_valid & VALID_MTU) {
1045 if (netdev->netdev_mtu_error) {
1046 return netdev->netdev_mtu_error;
1047 }
1048 if (netdev->mtu == mtu) {
1049 return 0;
1050 }
1051 netdev->cache_valid &= ~VALID_MTU;
1052 }
1053 ifr.ifr_mtu = mtu;
1054 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1055 SIOCSIFMTU, "SIOCSIFMTU");
1056 if (!error || error == ENODEV) {
1057 netdev->netdev_mtu_error = error;
1058 netdev->mtu = ifr.ifr_mtu;
1059 netdev->cache_valid |= VALID_MTU;
1060 }
1061 return error;
1062 }
1063
/* Returns the ifindex of 'netdev' as reported by get_ifindex() if that call
 * succeeds.  On failure, returns the negated error value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1074
1075 static int
1076 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1077 {
1078 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1079
1080 if (netdev->miimon_interval > 0) {
1081 *carrier = netdev->miimon;
1082 } else {
1083 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1084 }
1085
1086 return 0;
1087 }
1088
1089 static long long int
1090 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1091 {
1092 return netdev_linux_cast(netdev)->carrier_resets;
1093 }
1094
1095 static int
1096 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1097 struct mii_ioctl_data *data)
1098 {
1099 struct ifreq ifr;
1100 int error;
1101
1102 memset(&ifr, 0, sizeof ifr);
1103 memcpy(&ifr.ifr_data, data, sizeof *data);
1104 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1105 memcpy(data, &ifr.ifr_data, sizeof *data);
1106
1107 return error;
1108 }
1109
1110 static int
1111 netdev_linux_get_miimon(const char *name, bool *miimon)
1112 {
1113 struct mii_ioctl_data data;
1114 int error;
1115
1116 *miimon = false;
1117
1118 memset(&data, 0, sizeof data);
1119 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1120 if (!error) {
1121 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1122 data.reg_num = MII_BMSR;
1123 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1124 &data);
1125
1126 if (!error) {
1127 *miimon = !!(data.val_out & BMSR_LSTATUS);
1128 } else {
1129 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1130 }
1131 } else {
1132 struct ethtool_cmd ecmd;
1133
1134 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1135 name);
1136
1137 COVERAGE_INC(netdev_get_ethtool);
1138 memset(&ecmd, 0, sizeof ecmd);
1139 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1140 "ETHTOOL_GLINK");
1141 if (!error) {
1142 struct ethtool_value eval;
1143
1144 memcpy(&eval, &ecmd, sizeof eval);
1145 *miimon = !!eval.data;
1146 } else {
1147 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1148 }
1149 }
1150
1151 return error;
1152 }
1153
1154 static int
1155 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1156 long long int interval)
1157 {
1158 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1159
1160 interval = interval > 0 ? MAX(interval, 100) : 0;
1161 if (netdev->miimon_interval != interval) {
1162 netdev->miimon_interval = interval;
1163 timer_set_expired(&netdev->miimon_timer);
1164 }
1165
1166 return 0;
1167 }
1168
1169 static void
1170 netdev_linux_miimon_run(void)
1171 {
1172 struct shash device_shash;
1173 struct shash_node *node;
1174
1175 shash_init(&device_shash);
1176 netdev_get_devices(&netdev_linux_class, &device_shash);
1177 SHASH_FOR_EACH (node, &device_shash) {
1178 struct netdev *netdev = node->data;
1179 struct netdev_linux *dev = netdev_linux_cast(netdev);
1180 bool miimon;
1181
1182 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1183 continue;
1184 }
1185
1186 netdev_linux_get_miimon(dev->up.name, &miimon);
1187 if (miimon != dev->miimon) {
1188 dev->miimon = miimon;
1189 netdev_linux_changed(dev, dev->ifi_flags, 0);
1190 }
1191
1192 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1193 }
1194
1195 shash_destroy(&device_shash);
1196 }
1197
1198 static void
1199 netdev_linux_miimon_wait(void)
1200 {
1201 struct shash device_shash;
1202 struct shash_node *node;
1203
1204 shash_init(&device_shash);
1205 netdev_get_devices(&netdev_linux_class, &device_shash);
1206 SHASH_FOR_EACH (node, &device_shash) {
1207 struct netdev *netdev = node->data;
1208 struct netdev_linux *dev = netdev_linux_cast(netdev);
1209
1210 if (dev->miimon_interval > 0) {
1211 timer_wait(&dev->miimon_timer);
1212 }
1213 }
1214 shash_destroy(&device_shash);
1215 }
1216
1217 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1218 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1219 * enabled. */
1220 static bool
1221 check_for_working_netlink_stats(void)
1222 {
1223 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1224 * preferable, so if that works, we'll use it. */
1225 int ifindex = do_get_ifindex("lo");
1226 if (ifindex < 0) {
1227 VLOG_WARN("failed to get ifindex for lo, "
1228 "obtaining netdev stats from proc");
1229 return false;
1230 } else {
1231 struct netdev_stats stats;
1232 int error = get_stats_via_netlink(ifindex, &stats);
1233 if (!error) {
1234 VLOG_DBG("obtaining netdev stats via rtnetlink");
1235 return true;
1236 } else {
1237 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1238 "via proc (you are probably running a pre-2.6.19 "
1239 "kernel)", ovs_strerror(error));
1240 return false;
1241 }
1242 }
1243 }
1244
/* Exchanges the values stored in '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t first = *a;
    const uint64_t second = *b;

    *a = second;
    *b = first;
}
1252
1253 /* Copies 'src' into 'dst', performing format conversion in the process.
1254 *
1255 * 'src' is allowed to be misaligned. */
1256 static void
1257 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1258 const struct ovs_vport_stats *src)
1259 {
1260 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1261 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1262 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1263 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1264 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1265 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1266 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1267 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1268 dst->multicast = 0;
1269 dst->collisions = 0;
1270 dst->rx_length_errors = 0;
1271 dst->rx_over_errors = 0;
1272 dst->rx_crc_errors = 0;
1273 dst->rx_frame_errors = 0;
1274 dst->rx_fifo_errors = 0;
1275 dst->rx_missed_errors = 0;
1276 dst->tx_aborted_errors = 0;
1277 dst->tx_carrier_errors = 0;
1278 dst->tx_fifo_errors = 0;
1279 dst->tx_heartbeat_errors = 0;
1280 dst->tx_window_errors = 0;
1281 }
1282
1283 static int
1284 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1285 {
1286 struct dpif_linux_vport reply;
1287 struct ofpbuf *buf;
1288 int error;
1289
1290 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1291 if (error) {
1292 return error;
1293 } else if (!reply.stats) {
1294 ofpbuf_delete(buf);
1295 return EOPNOTSUPP;
1296 }
1297
1298 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1299
1300 ofpbuf_delete(buf);
1301
1302 return 0;
1303 }
1304
1305 static void
1306 get_stats_via_vport(const struct netdev *netdev_,
1307 struct netdev_stats *stats)
1308 {
1309 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1310
1311 if (!netdev->vport_stats_error ||
1312 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1313 int error;
1314
1315 error = get_stats_via_vport__(netdev_, stats);
1316 if (error && error != ENOENT) {
1317 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1318 "(%s)",
1319 netdev_get_name(netdev_), ovs_strerror(error));
1320 }
1321 netdev->vport_stats_error = error;
1322 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1323 }
1324 }
1325
1326 static int
1327 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1328 struct netdev_stats *stats)
1329 {
1330 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1331 static int use_netlink_stats;
1332 int error;
1333
1334 if (ovsthread_once_start(&once)) {
1335 use_netlink_stats = check_for_working_netlink_stats();
1336 ovsthread_once_done(&once);
1337 }
1338
1339 if (use_netlink_stats) {
1340 int ifindex;
1341
1342 error = get_ifindex(netdev_, &ifindex);
1343 if (!error) {
1344 error = get_stats_via_netlink(ifindex, stats);
1345 }
1346 } else {
1347 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1348 }
1349
1350 if (error) {
1351 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1352 netdev_get_name(netdev_), error);
1353 }
1354 return error;
1355
1356 }
1357
1358 /* Retrieves current device stats for 'netdev-linux'. */
1359 static int
1360 netdev_linux_get_stats(const struct netdev *netdev_,
1361 struct netdev_stats *stats)
1362 {
1363 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1364 struct netdev_stats dev_stats;
1365 int error;
1366
1367 get_stats_via_vport(netdev_, stats);
1368
1369 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1370
1371 if (error) {
1372 if (netdev->vport_stats_error) {
1373 return error;
1374 } else {
1375 return 0;
1376 }
1377 }
1378
1379 if (netdev->vport_stats_error) {
1380 /* stats not available from OVS then use ioctl stats. */
1381 *stats = dev_stats;
1382 } else {
1383 stats->rx_errors += dev_stats.rx_errors;
1384 stats->tx_errors += dev_stats.tx_errors;
1385 stats->rx_dropped += dev_stats.rx_dropped;
1386 stats->tx_dropped += dev_stats.tx_dropped;
1387 stats->multicast += dev_stats.multicast;
1388 stats->collisions += dev_stats.collisions;
1389 stats->rx_length_errors += dev_stats.rx_length_errors;
1390 stats->rx_over_errors += dev_stats.rx_over_errors;
1391 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1392 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1393 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1394 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1395 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1396 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1397 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1398 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1399 stats->tx_window_errors += dev_stats.tx_window_errors;
1400 }
1401 return 0;
1402 }
1403
1404 /* Retrieves current device stats for 'netdev-tap' netdev or
1405 * netdev-internal. */
1406 static int
1407 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1408 {
1409 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1410 struct netdev_stats dev_stats;
1411 int error;
1412
1413 get_stats_via_vport(netdev_, stats);
1414
1415 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1416 if (error) {
1417 if (netdev->vport_stats_error) {
1418 return error;
1419 } else {
1420 return 0;
1421 }
1422 }
1423
1424 /* If this port is an internal port then the transmit and receive stats
1425 * will appear to be swapped relative to the other ports since we are the
1426 * one sending the data, not a remote computer. For consistency, we swap
1427 * them back here. This does not apply if we are getting stats from the
1428 * vport layer because it always tracks stats from the perspective of the
1429 * switch. */
1430 if (netdev->vport_stats_error) {
1431 *stats = dev_stats;
1432 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1433 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1434 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1435 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1436 stats->rx_length_errors = 0;
1437 stats->rx_over_errors = 0;
1438 stats->rx_crc_errors = 0;
1439 stats->rx_frame_errors = 0;
1440 stats->rx_fifo_errors = 0;
1441 stats->rx_missed_errors = 0;
1442 stats->tx_aborted_errors = 0;
1443 stats->tx_carrier_errors = 0;
1444 stats->tx_fifo_errors = 0;
1445 stats->tx_heartbeat_errors = 0;
1446 stats->tx_window_errors = 0;
1447 } else {
1448 stats->rx_dropped += dev_stats.tx_dropped;
1449 stats->tx_dropped += dev_stats.rx_dropped;
1450
1451 stats->rx_errors += dev_stats.tx_errors;
1452 stats->tx_errors += dev_stats.rx_errors;
1453
1454 stats->multicast += dev_stats.multicast;
1455 stats->collisions += dev_stats.collisions;
1456 }
1457 return 0;
1458 }
1459
1460 static int
1461 netdev_internal_get_stats(const struct netdev *netdev_,
1462 struct netdev_stats *stats)
1463 {
1464 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1465
1466 get_stats_via_vport(netdev_, stats);
1467 return netdev->vport_stats_error;
1468 }
1469
1470 static int
1471 netdev_internal_set_stats(struct netdev *netdev,
1472 const struct netdev_stats *stats)
1473 {
1474 struct ovs_vport_stats vport_stats;
1475 struct dpif_linux_vport vport;
1476 int err;
1477
1478 vport_stats.rx_packets = stats->rx_packets;
1479 vport_stats.tx_packets = stats->tx_packets;
1480 vport_stats.rx_bytes = stats->rx_bytes;
1481 vport_stats.tx_bytes = stats->tx_bytes;
1482 vport_stats.rx_errors = stats->rx_errors;
1483 vport_stats.tx_errors = stats->tx_errors;
1484 vport_stats.rx_dropped = stats->rx_dropped;
1485 vport_stats.tx_dropped = stats->tx_dropped;
1486
1487 dpif_linux_vport_init(&vport);
1488 vport.cmd = OVS_VPORT_CMD_SET;
1489 vport.name = netdev_get_name(netdev);
1490 vport.stats = &vport_stats;
1491
1492 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1493
1494 /* If the vport layer doesn't know about the device, that doesn't mean it
1495 * doesn't exist (after all were able to open it when netdev_open() was
1496 * called), it just means that it isn't attached and we'll be getting
1497 * stats a different way. */
1498 if (err == ENODEV) {
1499 err = EOPNOTSUPP;
1500 }
1501
1502 return err;
1503 }
1504
1505 static void
1506 netdev_linux_read_features(struct netdev_linux *netdev)
1507 {
1508 struct ethtool_cmd ecmd;
1509 uint32_t speed;
1510 int error;
1511
1512 if (netdev->cache_valid & VALID_FEATURES) {
1513 return;
1514 }
1515
1516 COVERAGE_INC(netdev_get_ethtool);
1517 memset(&ecmd, 0, sizeof ecmd);
1518 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1519 ETHTOOL_GSET, "ETHTOOL_GSET");
1520 if (error) {
1521 goto out;
1522 }
1523
1524 /* Supported features. */
1525 netdev->supported = 0;
1526 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1527 netdev->supported |= NETDEV_F_10MB_HD;
1528 }
1529 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1530 netdev->supported |= NETDEV_F_10MB_FD;
1531 }
1532 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1533 netdev->supported |= NETDEV_F_100MB_HD;
1534 }
1535 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1536 netdev->supported |= NETDEV_F_100MB_FD;
1537 }
1538 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1539 netdev->supported |= NETDEV_F_1GB_HD;
1540 }
1541 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1542 netdev->supported |= NETDEV_F_1GB_FD;
1543 }
1544 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1545 netdev->supported |= NETDEV_F_10GB_FD;
1546 }
1547 if (ecmd.supported & SUPPORTED_TP) {
1548 netdev->supported |= NETDEV_F_COPPER;
1549 }
1550 if (ecmd.supported & SUPPORTED_FIBRE) {
1551 netdev->supported |= NETDEV_F_FIBER;
1552 }
1553 if (ecmd.supported & SUPPORTED_Autoneg) {
1554 netdev->supported |= NETDEV_F_AUTONEG;
1555 }
1556 if (ecmd.supported & SUPPORTED_Pause) {
1557 netdev->supported |= NETDEV_F_PAUSE;
1558 }
1559 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1560 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1561 }
1562
1563 /* Advertised features. */
1564 netdev->advertised = 0;
1565 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1566 netdev->advertised |= NETDEV_F_10MB_HD;
1567 }
1568 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1569 netdev->advertised |= NETDEV_F_10MB_FD;
1570 }
1571 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1572 netdev->advertised |= NETDEV_F_100MB_HD;
1573 }
1574 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1575 netdev->advertised |= NETDEV_F_100MB_FD;
1576 }
1577 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1578 netdev->advertised |= NETDEV_F_1GB_HD;
1579 }
1580 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1581 netdev->advertised |= NETDEV_F_1GB_FD;
1582 }
1583 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1584 netdev->advertised |= NETDEV_F_10GB_FD;
1585 }
1586 if (ecmd.advertising & ADVERTISED_TP) {
1587 netdev->advertised |= NETDEV_F_COPPER;
1588 }
1589 if (ecmd.advertising & ADVERTISED_FIBRE) {
1590 netdev->advertised |= NETDEV_F_FIBER;
1591 }
1592 if (ecmd.advertising & ADVERTISED_Autoneg) {
1593 netdev->advertised |= NETDEV_F_AUTONEG;
1594 }
1595 if (ecmd.advertising & ADVERTISED_Pause) {
1596 netdev->advertised |= NETDEV_F_PAUSE;
1597 }
1598 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1599 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1600 }
1601
1602 /* Current settings. */
1603 speed = ecmd.speed;
1604 if (speed == SPEED_10) {
1605 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1606 } else if (speed == SPEED_100) {
1607 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1608 } else if (speed == SPEED_1000) {
1609 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1610 } else if (speed == SPEED_10000) {
1611 netdev->current = NETDEV_F_10GB_FD;
1612 } else if (speed == 40000) {
1613 netdev->current = NETDEV_F_40GB_FD;
1614 } else if (speed == 100000) {
1615 netdev->current = NETDEV_F_100GB_FD;
1616 } else if (speed == 1000000) {
1617 netdev->current = NETDEV_F_1TB_FD;
1618 } else {
1619 netdev->current = 0;
1620 }
1621
1622 if (ecmd.port == PORT_TP) {
1623 netdev->current |= NETDEV_F_COPPER;
1624 } else if (ecmd.port == PORT_FIBRE) {
1625 netdev->current |= NETDEV_F_FIBER;
1626 }
1627
1628 if (ecmd.autoneg) {
1629 netdev->current |= NETDEV_F_AUTONEG;
1630 }
1631
1632 out:
1633 netdev->cache_valid |= VALID_FEATURES;
1634 netdev->get_features_error = error;
1635 }
1636
1637 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1638 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1639 * Returns 0 if successful, otherwise a positive errno value. */
1640 static int
1641 netdev_linux_get_features(const struct netdev *netdev_,
1642 enum netdev_features *current,
1643 enum netdev_features *advertised,
1644 enum netdev_features *supported,
1645 enum netdev_features *peer)
1646 {
1647 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1648
1649 netdev_linux_read_features(netdev);
1650
1651 if (!netdev->get_features_error) {
1652 *current = netdev->current;
1653 *advertised = netdev->advertised;
1654 *supported = netdev->supported;
1655 *peer = 0; /* XXX */
1656 }
1657 return netdev->get_features_error;
1658 }
1659
1660 /* Set the features advertised by 'netdev' to 'advertise'. */
1661 static int
1662 netdev_linux_set_advertisements(struct netdev *netdev,
1663 enum netdev_features advertise)
1664 {
1665 struct ethtool_cmd ecmd;
1666 int error;
1667
1668 COVERAGE_INC(netdev_get_ethtool);
1669 memset(&ecmd, 0, sizeof ecmd);
1670 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1671 ETHTOOL_GSET, "ETHTOOL_GSET");
1672 if (error) {
1673 return error;
1674 }
1675
1676 ecmd.advertising = 0;
1677 if (advertise & NETDEV_F_10MB_HD) {
1678 ecmd.advertising |= ADVERTISED_10baseT_Half;
1679 }
1680 if (advertise & NETDEV_F_10MB_FD) {
1681 ecmd.advertising |= ADVERTISED_10baseT_Full;
1682 }
1683 if (advertise & NETDEV_F_100MB_HD) {
1684 ecmd.advertising |= ADVERTISED_100baseT_Half;
1685 }
1686 if (advertise & NETDEV_F_100MB_FD) {
1687 ecmd.advertising |= ADVERTISED_100baseT_Full;
1688 }
1689 if (advertise & NETDEV_F_1GB_HD) {
1690 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1691 }
1692 if (advertise & NETDEV_F_1GB_FD) {
1693 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1694 }
1695 if (advertise & NETDEV_F_10GB_FD) {
1696 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1697 }
1698 if (advertise & NETDEV_F_COPPER) {
1699 ecmd.advertising |= ADVERTISED_TP;
1700 }
1701 if (advertise & NETDEV_F_FIBER) {
1702 ecmd.advertising |= ADVERTISED_FIBRE;
1703 }
1704 if (advertise & NETDEV_F_AUTONEG) {
1705 ecmd.advertising |= ADVERTISED_Autoneg;
1706 }
1707 if (advertise & NETDEV_F_PAUSE) {
1708 ecmd.advertising |= ADVERTISED_Pause;
1709 }
1710 if (advertise & NETDEV_F_PAUSE_ASYM) {
1711 ecmd.advertising |= ADVERTISED_Asym_Pause;
1712 }
1713 COVERAGE_INC(netdev_set_ethtool);
1714 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1715 ETHTOOL_SSET, "ETHTOOL_SSET");
1716 }
1717
1718 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1719 * successful, otherwise a positive errno value. */
1720 static int
1721 netdev_linux_set_policing(struct netdev *netdev_,
1722 uint32_t kbits_rate, uint32_t kbits_burst)
1723 {
1724 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1725 const char *netdev_name = netdev_get_name(netdev_);
1726 int error;
1727
1728
1729 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1730 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1731 : kbits_burst); /* Stick with user-specified value. */
1732
1733 if (netdev->cache_valid & VALID_POLICING) {
1734 if (netdev->netdev_policing_error) {
1735 return netdev->netdev_policing_error;
1736 }
1737
1738 if (netdev->kbits_rate == kbits_rate &&
1739 netdev->kbits_burst == kbits_burst) {
1740 /* Assume that settings haven't changed since we last set them. */
1741 return 0;
1742 }
1743 netdev->cache_valid &= ~VALID_POLICING;
1744 }
1745
1746 COVERAGE_INC(netdev_set_policing);
1747 /* Remove any existing ingress qdisc. */
1748 error = tc_add_del_ingress_qdisc(netdev_, false);
1749 if (error) {
1750 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1751 netdev_name, ovs_strerror(error));
1752 goto out;
1753 }
1754
1755 if (kbits_rate) {
1756 error = tc_add_del_ingress_qdisc(netdev_, true);
1757 if (error) {
1758 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1759 netdev_name, ovs_strerror(error));
1760 goto out;
1761 }
1762
1763 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1764 if (error){
1765 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1766 netdev_name, ovs_strerror(error));
1767 goto out;
1768 }
1769 }
1770
1771 netdev->kbits_rate = kbits_rate;
1772 netdev->kbits_burst = kbits_burst;
1773
1774 out:
1775 if (!error || error == ENODEV) {
1776 netdev->netdev_policing_error = error;
1777 netdev->cache_valid |= VALID_POLICING;
1778 }
1779 return error;
1780 }
1781
1782 static int
1783 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1784 struct sset *types)
1785 {
1786 const struct tc_ops *const *opsp;
1787
1788 for (opsp = tcs; *opsp != NULL; opsp++) {
1789 const struct tc_ops *ops = *opsp;
1790 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1791 sset_add(types, ops->ovs_name);
1792 }
1793 }
1794 return 0;
1795 }
1796
1797 static const struct tc_ops *
1798 tc_lookup_ovs_name(const char *name)
1799 {
1800 const struct tc_ops *const *opsp;
1801
1802 for (opsp = tcs; *opsp != NULL; opsp++) {
1803 const struct tc_ops *ops = *opsp;
1804 if (!strcmp(name, ops->ovs_name)) {
1805 return ops;
1806 }
1807 }
1808 return NULL;
1809 }
1810
1811 static const struct tc_ops *
1812 tc_lookup_linux_name(const char *name)
1813 {
1814 const struct tc_ops *const *opsp;
1815
1816 for (opsp = tcs; *opsp != NULL; opsp++) {
1817 const struct tc_ops *ops = *opsp;
1818 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1819 return ops;
1820 }
1821 }
1822 return NULL;
1823 }
1824
1825 static struct tc_queue *
1826 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1827 size_t hash)
1828 {
1829 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1830 struct tc_queue *queue;
1831
1832 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1833 if (queue->queue_id == queue_id) {
1834 return queue;
1835 }
1836 }
1837 return NULL;
1838 }
1839
/* Returns the queue of 'netdev' with ID 'queue_id', or NULL if there is
 * none.  The queue ID is hashed with hash_int() using a basis of 0. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1845
1846 static int
1847 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1848 const char *type,
1849 struct netdev_qos_capabilities *caps)
1850 {
1851 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1852 if (!ops) {
1853 return EOPNOTSUPP;
1854 }
1855 caps->n_queues = ops->n_queues;
1856 return 0;
1857 }
1858
1859 static int
1860 netdev_linux_get_qos(const struct netdev *netdev_,
1861 const char **typep, struct smap *details)
1862 {
1863 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1864 int error;
1865
1866 error = tc_query_qdisc(netdev_);
1867 if (error) {
1868 return error;
1869 }
1870
1871 *typep = netdev->tc->ops->ovs_name;
1872 return (netdev->tc->ops->qdisc_get
1873 ? netdev->tc->ops->qdisc_get(netdev_, details)
1874 : 0);
1875 }
1876
1877 static int
1878 netdev_linux_set_qos(struct netdev *netdev_,
1879 const char *type, const struct smap *details)
1880 {
1881 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1882 const struct tc_ops *new_ops;
1883 int error;
1884
1885 new_ops = tc_lookup_ovs_name(type);
1886 if (!new_ops || !new_ops->tc_install) {
1887 return EOPNOTSUPP;
1888 }
1889
1890 error = tc_query_qdisc(netdev_);
1891 if (error) {
1892 return error;
1893 }
1894
1895 if (new_ops == netdev->tc->ops) {
1896 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1897 } else {
1898 /* Delete existing qdisc. */
1899 error = tc_del_qdisc(netdev_);
1900 if (error) {
1901 return error;
1902 }
1903 ovs_assert(netdev->tc == NULL);
1904
1905 /* Install new qdisc. */
1906 error = new_ops->tc_install(netdev_, details);
1907 ovs_assert((error == 0) == (netdev->tc != NULL));
1908
1909 return error;
1910 }
1911 }
1912
1913 static int
1914 netdev_linux_get_queue(const struct netdev *netdev_,
1915 unsigned int queue_id, struct smap *details)
1916 {
1917 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1918 int error;
1919
1920 error = tc_query_qdisc(netdev_);
1921 if (error) {
1922 return error;
1923 } else {
1924 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1925 return (queue
1926 ? netdev->tc->ops->class_get(netdev_, queue, details)
1927 : ENOENT);
1928 }
1929 }
1930
1931 static int
1932 netdev_linux_set_queue(struct netdev *netdev_,
1933 unsigned int queue_id, const struct smap *details)
1934 {
1935 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1936 int error;
1937
1938 error = tc_query_qdisc(netdev_);
1939 if (error) {
1940 return error;
1941 } else if (queue_id >= netdev->tc->ops->n_queues
1942 || !netdev->tc->ops->class_set) {
1943 return EINVAL;
1944 }
1945
1946 return netdev->tc->ops->class_set(netdev_, queue_id, details);
1947 }
1948
1949 static int
1950 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
1951 {
1952 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1953 int error;
1954
1955 error = tc_query_qdisc(netdev_);
1956 if (error) {
1957 return error;
1958 } else if (!netdev->tc->ops->class_delete) {
1959 return EINVAL;
1960 } else {
1961 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1962 return (queue
1963 ? netdev->tc->ops->class_delete(netdev_, queue)
1964 : ENOENT);
1965 }
1966 }
1967
1968 static int
1969 netdev_linux_get_queue_stats(const struct netdev *netdev_,
1970 unsigned int queue_id,
1971 struct netdev_queue_stats *stats)
1972 {
1973 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1974 int error;
1975
1976 error = tc_query_qdisc(netdev_);
1977 if (error) {
1978 return error;
1979 } else if (!netdev->tc->ops->class_get_stats) {
1980 return EOPNOTSUPP;
1981 } else {
1982 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1983 if (!queue) {
1984 return ENOENT;
1985 }
1986 stats->created = queue->created;
1987 return netdev->tc->ops->class_get_stats(netdev_, queue, stats);
1988 }
1989 }
1990
1991 static bool
1992 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1993 {
1994 struct ofpbuf request;
1995 struct tcmsg *tcmsg;
1996
1997 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1998 if (!tcmsg) {
1999 return false;
2000 }
2001 tcmsg->tcm_parent = 0;
2002 nl_dump_start(dump, NETLINK_ROUTE, &request);
2003 ofpbuf_uninit(&request);
2004 return true;
2005 }
2006
2007 static int
2008 netdev_linux_dump_queues(const struct netdev *netdev_,
2009 netdev_dump_queues_cb *cb, void *aux)
2010 {
2011 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2012 struct tc_queue *queue, *next_queue;
2013 struct smap details;
2014 int last_error;
2015 int error;
2016
2017 error = tc_query_qdisc(netdev_);
2018 if (error) {
2019 return error;
2020 } else if (!netdev->tc->ops->class_get) {
2021 return EOPNOTSUPP;
2022 }
2023
2024 last_error = 0;
2025 smap_init(&details);
2026 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2027 &netdev->tc->queues) {
2028 smap_clear(&details);
2029
2030 error = netdev->tc->ops->class_get(netdev_, queue, &details);
2031 if (!error) {
2032 (*cb)(queue->queue_id, &details, aux);
2033 } else {
2034 last_error = error;
2035 }
2036 }
2037 smap_destroy(&details);
2038
2039 return last_error;
2040 }
2041
2042 static int
2043 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2044 netdev_dump_queue_stats_cb *cb, void *aux)
2045 {
2046 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2047 struct nl_dump dump;
2048 struct ofpbuf msg;
2049 int last_error;
2050 int error;
2051
2052 error = tc_query_qdisc(netdev_);
2053 if (error) {
2054 return error;
2055 } else if (!netdev->tc->ops->class_dump_stats) {
2056 return EOPNOTSUPP;
2057 }
2058
2059 last_error = 0;
2060 if (!start_queue_dump(netdev_, &dump)) {
2061 return ENODEV;
2062 }
2063 while (nl_dump_next(&dump, &msg)) {
2064 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
2065 if (error) {
2066 last_error = error;
2067 }
2068 }
2069
2070 error = nl_dump_done(&dump);
2071 return error ? error : last_error;
2072 }
2073
2074 static int
2075 netdev_linux_get_in4(const struct netdev *netdev_,
2076 struct in_addr *address, struct in_addr *netmask)
2077 {
2078 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2079
2080 if (!(netdev->cache_valid & VALID_IN4)) {
2081 int error;
2082
2083 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2084 SIOCGIFADDR, "SIOCGIFADDR");
2085 if (error) {
2086 return error;
2087 }
2088
2089 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2090 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2091 if (error) {
2092 return error;
2093 }
2094
2095 netdev->cache_valid |= VALID_IN4;
2096 }
2097 *address = netdev->address;
2098 *netmask = netdev->netmask;
2099 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2100 }
2101
2102 static int
2103 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2104 struct in_addr netmask)
2105 {
2106 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2107 int error;
2108
2109 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2110 if (!error) {
2111 netdev->cache_valid |= VALID_IN4;
2112 netdev->address = address;
2113 netdev->netmask = netmask;
2114 if (address.s_addr != INADDR_ANY) {
2115 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2116 "SIOCSIFNETMASK", netmask);
2117 }
2118 }
2119 return error;
2120 }
2121
/* Parses 'line' as 16 two-digit hexadecimal bytes (optionally preceded by
 * white space), followed by four hexadecimal fields that are skipped, followed
 * by a name of at most 16 characters.
 *
 * Stores the 16 bytes in 'in6' and the null-terminated name in 'ifname'.
 * Returns true only if all 17 stored fields were converted. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *octet = in6->s6_addr;
    int n_fields;

#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &octet[0], &octet[1], &octet[2], &octet[3],
                      &octet[4], &octet[5], &octet[6], &octet[7],
                      &octet[8], &octet[9], &octet[10], &octet[11],
                      &octet[12], &octet[13], &octet[14], &octet[15],
                      ifname);

    /* Fields suppressed with '*' are not counted by sscanf(). */
    return n_fields == 17;
}
2137
2138 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2139 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2140 static int
2141 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2142 {
2143 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2144 if (!(netdev->cache_valid & VALID_IN6)) {
2145 FILE *file;
2146 char line[128];
2147
2148 netdev->in6 = in6addr_any;
2149
2150 file = fopen("/proc/net/if_inet6", "r");
2151 if (file != NULL) {
2152 const char *name = netdev_get_name(netdev_);
2153 while (fgets(line, sizeof line, file)) {
2154 struct in6_addr in6_tmp;
2155 char ifname[16 + 1];
2156 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2157 && !strcmp(name, ifname))
2158 {
2159 netdev->in6 = in6_tmp;
2160 break;
2161 }
2162 }
2163 fclose(file);
2164 }
2165 netdev->cache_valid |= VALID_IN6;
2166 }
2167 *in6 = netdev->in6;
2168 return 0;
2169 }
2170
/* Fills '*sa' with an AF_INET socket address for 'addr' and port 0.  Every
 * byte of '*sa' not covered by the address is zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Start from all-zero storage on both sides so no stale bytes leak. */
    memset(sa, 0, sizeof *sa);
    memset(&in4, 0, sizeof in4);

    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memcpy(sa, &in4, sizeof in4);
}
2183
2184 static int
2185 do_set_addr(struct netdev *netdev,
2186 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2187 {
2188 struct ifreq ifr;
2189 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2190 make_in4_sockaddr(&ifr.ifr_addr, addr);
2191
2192 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2193 ioctl_name);
2194 }
2195
2196 /* Adds 'router' as a default IP gateway. */
2197 static int
2198 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2199 {
2200 struct in_addr any = { INADDR_ANY };
2201 struct rtentry rt;
2202 int error;
2203
2204 memset(&rt, 0, sizeof rt);
2205 make_in4_sockaddr(&rt.rt_dst, any);
2206 make_in4_sockaddr(&rt.rt_gateway, router);
2207 make_in4_sockaddr(&rt.rt_genmask, any);
2208 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2209 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2210 if (error) {
2211 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2212 }
2213 return error;
2214 }
2215
2216 static int
2217 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2218 char **netdev_name)
2219 {
2220 static const char fn[] = "/proc/net/route";
2221 FILE *stream;
2222 char line[256];
2223 int ln;
2224
2225 *netdev_name = NULL;
2226 stream = fopen(fn, "r");
2227 if (stream == NULL) {
2228 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2229 return errno;
2230 }
2231
2232 ln = 0;
2233 while (fgets(line, sizeof line, stream)) {
2234 if (++ln >= 2) {
2235 char iface[17];
2236 ovs_be32 dest, gateway, mask;
2237 int refcnt, metric, mtu;
2238 unsigned int flags, use, window, irtt;
2239
2240 if (sscanf(line,
2241 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2242 " %d %u %u\n",
2243 iface, &dest, &gateway, &flags, &refcnt,
2244 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2245
2246 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2247 fn, ln, line);
2248 continue;
2249 }
2250 if (!(flags & RTF_UP)) {
2251 /* Skip routes that aren't up. */
2252 continue;
2253 }
2254
2255 /* The output of 'dest', 'mask', and 'gateway' were given in
2256 * network byte order, so we don't need need any endian
2257 * conversions here. */
2258 if ((dest & mask) == (host->s_addr & mask)) {
2259 if (!gateway) {
2260 /* The host is directly reachable. */
2261 next_hop->s_addr = 0;
2262 } else {
2263 /* To reach the host, we must go through a gateway. */
2264 next_hop->s_addr = gateway;
2265 }
2266 *netdev_name = xstrdup(iface);
2267 fclose(stream);
2268 return 0;
2269 }
2270 }
2271 }
2272
2273 fclose(stream);
2274 return ENXIO;
2275 }
2276
2277 static int
2278 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2279 {
2280 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2281 int error = 0;
2282
2283 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2284 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2285
2286 COVERAGE_INC(netdev_get_ethtool);
2287 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2288 error = netdev_linux_do_ethtool(netdev->up.name,
2289 cmd,
2290 ETHTOOL_GDRVINFO,
2291 "ETHTOOL_GDRVINFO");
2292 if (!error) {
2293 netdev->cache_valid |= VALID_DRVINFO;
2294 }
2295 }
2296
2297 if (!error) {
2298 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2299 smap_add(smap, "driver_version", netdev->drvinfo.version);
2300 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2301 }
2302 return error;
2303 }
2304
2305 static int
2306 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2307 struct smap *smap)
2308 {
2309 smap_add(smap, "driver_name", "openvswitch");
2310 return 0;
2311 }
2312
2313 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2314 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2315 * returns 0. Otherwise, it returns a positive errno value; in particular,
2316 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2317 static int
2318 netdev_linux_arp_lookup(const struct netdev *netdev,
2319 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2320 {
2321 struct arpreq r;
2322 struct sockaddr_in sin;
2323 int retval;
2324
2325 memset(&r, 0, sizeof r);
2326 memset(&sin, 0, sizeof sin);
2327 sin.sin_family = AF_INET;
2328 sin.sin_addr.s_addr = ip;
2329 sin.sin_port = 0;
2330 memcpy(&r.arp_pa, &sin, sizeof sin);
2331 r.arp_ha.sa_family = ARPHRD_ETHER;
2332 r.arp_flags = 0;
2333 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2334 COVERAGE_INC(netdev_arp_lookup);
2335 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2336 if (!retval) {
2337 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2338 } else if (retval != ENXIO) {
2339 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2340 netdev_get_name(netdev), IP_ARGS(ip),
2341 ovs_strerror(retval));
2342 }
2343 return retval;
2344 }
2345
2346 static int
2347 nd_to_iff_flags(enum netdev_flags nd)
2348 {
2349 int iff = 0;
2350 if (nd & NETDEV_UP) {
2351 iff |= IFF_UP;
2352 }
2353 if (nd & NETDEV_PROMISC) {
2354 iff |= IFF_PROMISC;
2355 }
2356 return iff;
2357 }
2358
2359 static int
2360 iff_to_nd_flags(int iff)
2361 {
2362 enum netdev_flags nd = 0;
2363 if (iff & IFF_UP) {
2364 nd |= NETDEV_UP;
2365 }
2366 if (iff & IFF_PROMISC) {
2367 nd |= NETDEV_PROMISC;
2368 }
2369 return nd;
2370 }
2371
2372 static int
2373 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2374 enum netdev_flags on, enum netdev_flags *old_flagsp)
2375 {
2376 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2377 int old_flags, new_flags;
2378 int error = 0;
2379
2380 old_flags = netdev->ifi_flags;
2381 *old_flagsp = iff_to_nd_flags(old_flags);
2382 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2383 if (new_flags != old_flags) {
2384 error = set_flags(netdev_get_name(netdev_), new_flags);
2385 get_flags(netdev_, &netdev->ifi_flags);
2386 }
2387 return error;
2388 }
2389
2390 static unsigned int
2391 netdev_linux_change_seq(const struct netdev *netdev)
2392 {
2393 return netdev_linux_cast(netdev)->change_seq;
2394 }
2395
/* Expands to a brace-enclosed positional initializer for a
 * 'struct netdev_class' (see its uses just below).
 *
 * NAME is the class's name string.  CREATE, GET_STATS, SET_STATS,
 * GET_FEATURES and GET_STATUS fill the slots that differ between the classes
 * defined in this file; every other slot is either shared netdev_linux_*
 * code or NULL.  Because the initializer is positional, the order of entries
 * here must track the member order of the struct. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS,  \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
                                                                \
    netdev_linux_rx_open,                                       \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    SET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_change_seq                                     \
}
2454
/* The "system" netdev class: created with netdev_linux_create(), with
 * statistics, features and status provided by the netdev_linux_* functions.
 * It has no set_stats implementation. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_create,
        netdev_linux_get_stats,
        NULL,                    /* set_stats */
        netdev_linux_get_features,
        netdev_linux_get_status);
2463
/* The "tap" netdev class: differs from the "system" class in being created
 * with netdev_linux_create_tap() and in reading statistics through
 * netdev_tap_get_stats().  It has no set_stats implementation. */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_create_tap,
        netdev_tap_get_stats,
        NULL,                   /* set_stats */
        netdev_linux_get_features,
        netdev_linux_get_status);
2472
/* The "internal" netdev class: created like a "system" device, but with its
 * own get/set statistics functions and status callback.  It has no
 * get_features implementation. */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_create,
        netdev_internal_get_stats,
        netdev_internal_set_stats,
        NULL,                  /* get_features */
        netdev_internal_get_status);
2481
/* Table of the netdev_rx_linux_* functions that implement
 * 'struct netdev_rx_class' for this file.  The initializer is positional, so
 * the entries must stay in the struct's member order. */
static const struct netdev_rx_class netdev_rx_linux_class = {
    netdev_rx_linux_destroy,
    netdev_rx_linux_recv,
    netdev_rx_linux_wait,
    netdev_rx_linux_drain,
};
2488 \f
2489 /* HTB traffic control class. */
2490
/* Number of queues advertised for HTB (the 'n_queues' entry of tc_ops_htb).
 * It is also the largest class minor number accepted when mapping a class
 * handle 1:<minor> to queue ID <minor> - 1. */
#define HTB_N_QUEUES 0xf000

/* Per-device HTB qdisc state.  'tc' is the embedded generic traffic control
 * state; htb_get__() recovers the enclosing struct from it. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};
2497
/* Configuration of one HTB class.  'tc_queue' is the embedded generic queue
 * record; htb_class_cast__() recovers the enclosing struct from it. */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
2505
2506 static struct htb *
2507 htb_get__(const struct netdev *netdev_)
2508 {
2509 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2510 return CONTAINER_OF(netdev->tc, struct htb, tc);
2511 }
2512
2513 static void
2514 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2515 {
2516 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2517 struct htb *htb;
2518
2519 htb = xmalloc(sizeof *htb);
2520 tc_init(&htb->tc, &tc_ops_htb);
2521 htb->max_rate = max_rate;
2522
2523 netdev->tc = &htb->tc;
2524 }
2525
2526 /* Create an HTB qdisc.
2527 *
2528 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2529 static int
2530 htb_setup_qdisc__(struct netdev *netdev)
2531 {
2532 size_t opt_offset;
2533 struct tc_htb_glob opt;
2534 struct ofpbuf request;
2535 struct tcmsg *tcmsg;
2536
2537 tc_del_qdisc(netdev);
2538
2539 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2540 NLM_F_EXCL | NLM_F_CREATE, &request);
2541 if (!tcmsg) {
2542 return ENODEV;
2543 }
2544 tcmsg->tcm_handle = tc_make_handle(1, 0);
2545 tcmsg->tcm_parent = TC_H_ROOT;
2546
2547 nl_msg_put_string(&request, TCA_KIND, "htb");
2548
2549 memset(&opt, 0, sizeof opt);
2550 opt.rate2quantum = 10;
2551 opt.version = 3;
2552 opt.defcls = 1;
2553
2554 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2555 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2556 nl_msg_end_nested(&request, opt_offset);
2557
2558 return tc_transact(&request, NULL);
2559 }
2560
2561 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2562 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2563 static int
2564 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2565 unsigned int parent, struct htb_class *class)
2566 {
2567 size_t opt_offset;
2568 struct tc_htb_opt opt;
2569 struct ofpbuf request;
2570 struct tcmsg *tcmsg;
2571 int error;
2572 int mtu;
2573
2574 error = netdev_get_mtu(netdev, &mtu);
2575 if (error) {
2576 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2577 netdev_get_name(netdev));
2578 return error;
2579 }
2580
2581 memset(&opt, 0, sizeof opt);
2582 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2583 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2584 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2585 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2586 opt.prio = class->priority;
2587
2588 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2589 if (!tcmsg) {
2590 return ENODEV;
2591 }
2592 tcmsg->tcm_handle = handle;
2593 tcmsg->tcm_parent = parent;
2594
2595 nl_msg_put_string(&request, TCA_KIND, "htb");
2596 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2597 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2598 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2599 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2600 nl_msg_end_nested(&request, opt_offset);
2601
2602 error = tc_transact(&request, NULL);
2603 if (error) {
2604 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2605 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2606 netdev_get_name(netdev),
2607 tc_get_major(handle), tc_get_minor(handle),
2608 tc_get_major(parent), tc_get_minor(parent),
2609 class->min_rate, class->max_rate,
2610 class->burst, class->priority, ovs_strerror(error));
2611 }
2612 return error;
2613 }
2614
2615 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2616 * description of them into 'details'. The description complies with the
2617 * specification given in the vswitch database documentation for linux-htb
2618 * queue details. */
2619 static int
2620 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2621 {
2622 static const struct nl_policy tca_htb_policy[] = {
2623 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2624 .min_len = sizeof(struct tc_htb_opt) },
2625 };
2626
2627 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2628 const struct tc_htb_opt *htb;
2629
2630 if (!nl_parse_nested(nl_options, tca_htb_policy,
2631 attrs, ARRAY_SIZE(tca_htb_policy))) {
2632 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2633 return EPROTO;
2634 }
2635
2636 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2637 class->min_rate = htb->rate.rate;
2638 class->max_rate = htb->ceil.rate;
2639 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2640 class->priority = htb->prio;
2641 return 0;
2642 }
2643
2644 static int
2645 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2646 struct htb_class *options,
2647 struct netdev_queue_stats *stats)
2648 {
2649 struct nlattr *nl_options;
2650 unsigned int handle;
2651 int error;
2652
2653 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2654 if (!error && queue_id) {
2655 unsigned int major = tc_get_major(handle);
2656 unsigned int minor = tc_get_minor(handle);
2657 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2658 *queue_id = minor - 1;
2659 } else {
2660 error = EPROTO;
2661 }
2662 }
2663 if (!error && options) {
2664 error = htb_parse_tca_options__(nl_options, options);
2665 }
2666 return error;
2667 }
2668
2669 static void
2670 htb_parse_qdisc_details__(struct netdev *netdev,
2671 const struct smap *details, struct htb_class *hc)
2672 {
2673 const char *max_rate_s;
2674
2675 max_rate_s = smap_get(details, "max-rate");
2676 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2677 if (!hc->max_rate) {
2678 enum netdev_features current;
2679
2680 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2681 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2682 }
2683 hc->min_rate = hc->max_rate;
2684 hc->burst = 0;
2685 hc->priority = 0;
2686 }
2687
2688 static int
2689 htb_parse_class_details__(struct netdev *netdev,
2690 const struct smap *details, struct htb_class *hc)
2691 {
2692 const struct htb *htb = htb_get__(netdev);
2693 const char *min_rate_s = smap_get(details, "min-rate");
2694 const char *max_rate_s = smap_get(details, "max-rate");
2695 const char *burst_s = smap_get(details, "burst");
2696 const char *priority_s = smap_get(details, "priority");
2697 int mtu, error;
2698
2699 error = netdev_get_mtu(netdev, &mtu);
2700 if (error) {
2701 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2702 netdev_get_name(netdev));
2703 return error;
2704 }
2705
2706 /* HTB requires at least an mtu sized min-rate to send any traffic even
2707 * on uncongested links. */
2708 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2709 hc->min_rate = MAX(hc->min_rate, mtu);
2710 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2711
2712 /* max-rate */
2713 hc->max_rate = (max_rate_s
2714 ? strtoull(max_rate_s, NULL, 10) / 8
2715 : htb->max_rate);
2716 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2717 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2718
2719 /* burst
2720 *
2721 * According to hints in the documentation that I've read, it is important
2722 * that 'burst' be at least as big as the largest frame that might be
2723 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2724 * but having it a bit too small is a problem. Since netdev_get_mtu()
2725 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2726 * the MTU. We actually add 64, instead of 14, as a guard against
2727 * additional headers get tacked on somewhere that we're not aware of. */
2728 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2729 hc->burst = MAX(hc->burst, mtu + 64);
2730
2731 /* priority */
2732 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2733
2734 return 0;
2735 }
2736
2737 static int
2738 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2739 unsigned int parent, struct htb_class *options,
2740 struct netdev_queue_stats *stats)
2741 {
2742 struct ofpbuf *reply;
2743 int error;
2744
2745 error = tc_query_class(netdev, handle, parent, &reply);
2746 if (!error) {
2747 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2748 ofpbuf_delete(reply);
2749 }
2750 return error;
2751 }
2752
2753 static int
2754 htb_tc_install(struct netdev *netdev, const struct smap *details)
2755 {
2756 int error;
2757
2758 error = htb_setup_qdisc__(netdev);
2759 if (!error) {
2760 struct htb_class hc;
2761
2762 htb_parse_qdisc_details__(netdev, details, &hc);
2763 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2764 tc_make_handle(1, 0), &hc);
2765 if (!error) {
2766 htb_install__(netdev, hc.max_rate);
2767 }
2768 }
2769 return error;
2770 }
2771
2772 static struct htb_class *
2773 htb_class_cast__(const struct tc_queue *queue)
2774 {
2775 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2776 }
2777
2778 static void
2779 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2780 const struct htb_class *hc)
2781 {
2782 struct htb *htb = htb_get__(netdev);
2783 size_t hash = hash_int(queue_id, 0);
2784 struct tc_queue *queue;
2785 struct htb_class *hcp;
2786
2787 queue = tc_find_queue__(netdev, queue_id, hash);
2788 if (queue) {
2789 hcp = htb_class_cast__(queue);
2790 } else {
2791 hcp = xmalloc(sizeof *hcp);
2792 queue = &hcp->tc_queue;
2793 queue->queue_id = queue_id;
2794 queue->created = time_msec();
2795 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2796 }
2797
2798 hcp->min_rate = hc->min_rate;
2799 hcp->max_rate = hc->max_rate;
2800 hcp->burst = hc->burst;
2801 hcp->priority = hc->priority;
2802 }
2803
2804 static int
2805 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2806 {
2807 struct ofpbuf msg;
2808 struct nl_dump dump;
2809 struct htb_class hc;
2810
2811 /* Get qdisc options. */
2812 hc.max_rate = 0;
2813 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2814 htb_install__(netdev, hc.max_rate);
2815
2816 /* Get queues. */
2817 if (!start_queue_dump(netdev, &dump)) {
2818 return ENODEV;
2819 }
2820 while (nl_dump_next(&dump, &msg)) {
2821 unsigned int queue_id;
2822
2823 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2824 htb_update_queue__(netdev, queue_id, &hc);
2825 }
2826 }
2827 nl_dump_done(&dump);
2828
2829 return 0;
2830 }
2831
2832 static void
2833 htb_tc_destroy(struct tc *tc)
2834 {
2835 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2836 struct htb_class *hc, *next;
2837
2838 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2839 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2840 free(hc);
2841 }
2842 tc_destroy(tc);
2843 free(htb);
2844 }
2845
2846 static int
2847 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2848 {
2849 const struct htb *htb = htb_get__(netdev);
2850 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
2851 return 0;
2852 }
2853
2854 static int
2855 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2856 {
2857 struct htb_class hc;
2858 int error;
2859
2860 htb_parse_qdisc_details__(netdev, details, &hc);
2861 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2862 tc_make_handle(1, 0), &hc);
2863 if (!error) {
2864 htb_get__(netdev)->max_rate = hc.max_rate;
2865 }
2866 return error;
2867 }
2868
2869 static int
2870 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2871 const struct tc_queue *queue, struct smap *details)
2872 {
2873 const struct htb_class *hc = htb_class_cast__(queue);
2874
2875 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2876 if (hc->min_rate != hc->max_rate) {
2877 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2878 }
2879 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2880 if (hc->priority) {
2881 smap_add_format(details, "priority", "%u", hc->priority);
2882 }
2883 return 0;
2884 }
2885
2886 static int
2887 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2888 const struct smap *details)
2889 {
2890 struct htb_class hc;
2891 int error;
2892
2893 error = htb_parse_class_details__(netdev, details, &hc);
2894 if (error) {
2895 return error;
2896 }
2897
2898 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2899 tc_make_handle(1, 0xfffe), &hc);
2900 if (error) {
2901 return error;
2902 }
2903
2904 htb_update_queue__(netdev, queue_id, &hc);
2905 return 0;
2906 }
2907
2908 static int
2909 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2910 {
2911 struct htb_class *hc = htb_class_cast__(queue);
2912 struct htb *htb = htb_get__(netdev);
2913 int error;
2914
2915 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2916 if (!error) {
2917 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2918 free(hc);
2919 }
2920 return error;
2921 }
2922
2923 static int
2924 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2925 struct netdev_queue_stats *stats)
2926 {
2927 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2928 tc_make_handle(1, 0xfffe), NULL, stats);
2929 }
2930
2931 static int
2932 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2933 const struct ofpbuf *nlmsg,
2934 netdev_dump_queue_stats_cb *cb, void *aux)
2935 {
2936 struct netdev_queue_stats stats;
2937 unsigned int handle, major, minor;
2938 int error;
2939
2940 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2941 if (error) {
2942 return error;
2943 }
2944
2945 major = tc_get_major(handle);
2946 minor = tc_get_minor(handle);
2947 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2948 (*cb)(minor - 1, &stats, aux);
2949 }
2950 return 0;
2951 }
2952
/* Traffic control operations table for HTB.  htb_install__() passes it to
 * tc_init() when attaching HTB state to a device.  The initializer is
 * positional, so the entries must stay in the member order of
 * 'struct tc_ops'. */
static const struct tc_ops tc_ops_htb = {
    "htb",                      /* linux_name */
    "linux-htb",                /* ovs_name */
    HTB_N_QUEUES,               /* n_queues */
    htb_tc_install,
    htb_tc_load,
    htb_tc_destroy,
    htb_qdisc_get,
    htb_qdisc_set,
    htb_class_get,
    htb_class_set,
    htb_class_delete,
    htb_class_get_stats,
    htb_class_dump_stats
};
2968 \f
2969 /* "linux-hfsc" traffic control class. */
2970
/* Largest class minor number that hfsc_parse_tcmsg__() accepts when mapping
 * a class handle 1:<minor> to queue ID <minor> - 1. */
#define HFSC_N_QUEUES 0xf000

/* Per-device HFSC qdisc state.  'tc' is the embedded generic traffic control
 * state; hfsc_get__() recovers the enclosing struct from it. */
struct hfsc {
    struct tc tc;
    uint32_t max_rate;          /* Parsed from "max-rate" / 8; presumably
                                 * bytes/s like HTB -- TODO confirm. */
};
2977
/* Configuration of one HFSC class.  'tc_queue' is the embedded generic queue
 * record; hfsc_class_cast__() recovers the enclosing struct from it. */
struct hfsc_class {
    struct tc_queue tc_queue;
    uint32_t min_rate;          /* Read from the m2 of the class's FSC. */
    uint32_t max_rate;          /* Read from the m2 of the class's USC. */
};
2983
2984 static struct hfsc *
2985 hfsc_get__(const struct netdev *netdev_)
2986 {
2987 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2988 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
2989 }
2990
2991 static struct hfsc_class *
2992 hfsc_class_cast__(const struct tc_queue *queue)
2993 {
2994 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2995 }
2996
2997 static void
2998 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
2999 {
3000 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3001 struct hfsc *hfsc;
3002
3003 hfsc = xmalloc(sizeof *hfsc);
3004 tc_init(&hfsc->tc, &tc_ops_hfsc);
3005 hfsc->max_rate = max_rate;
3006 netdev->tc = &hfsc->tc;
3007 }
3008
3009 static void
3010 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3011 const struct hfsc_class *hc)
3012 {
3013 size_t hash;
3014 struct hfsc *hfsc;
3015 struct hfsc_class *hcp;
3016 struct tc_queue *queue;
3017
3018 hfsc = hfsc_get__(netdev);
3019 hash = hash_int(queue_id, 0);
3020
3021 queue = tc_find_queue__(netdev, queue_id, hash);
3022 if (queue) {
3023 hcp = hfsc_class_cast__(queue);
3024 } else {
3025 hcp = xmalloc(sizeof *hcp);
3026 queue = &hcp->tc_queue;
3027 queue->queue_id = queue_id;
3028 queue->created = time_msec();
3029 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3030 }
3031
3032 hcp->min_rate = hc->min_rate;
3033 hcp->max_rate = hc->max_rate;
3034 }
3035
3036 static int
3037 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3038 {
3039 const struct tc_service_curve *rsc, *fsc, *usc;
3040 static const struct nl_policy tca_hfsc_policy[] = {
3041 [TCA_HFSC_RSC] = {
3042 .type = NL_A_UNSPEC,
3043 .optional = false,
3044 .min_len = sizeof(struct tc_service_curve),
3045 },
3046 [TCA_HFSC_FSC] = {
3047 .type = NL_A_UNSPEC,
3048 .optional = false,
3049 .min_len = sizeof(struct tc_service_curve),
3050 },
3051 [TCA_HFSC_USC] = {
3052 .type = NL_A_UNSPEC,
3053 .optional = false,
3054 .min_len = sizeof(struct tc_service_curve),
3055 },
3056 };
3057 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3058
3059 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3060 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3061 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3062 return EPROTO;
3063 }
3064
3065 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3066 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3067 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3068
3069 if (rsc->m1 != 0 || rsc->d != 0 ||
3070 fsc->m1 != 0 || fsc->d != 0 ||
3071 usc->m1 != 0 || usc->d != 0) {
3072 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3073 "Non-linear service curves are not supported.");
3074 return EPROTO;
3075 }
3076
3077 if (rsc->m2 != fsc->m2) {
3078 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3079 "Real-time service curves are not supported ");
3080 return EPROTO;
3081 }
3082
3083 if (rsc->m2 > usc->m2) {
3084 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3085 "Min-rate service curve is greater than "
3086 "the max-rate service curve.");
3087 return EPROTO;
3088 }
3089
3090 class->min_rate = fsc->m2;
3091 class->max_rate = usc->m2;
3092 return 0;
3093 }
3094
3095 static int
3096 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3097 struct hfsc_class *options,
3098 struct netdev_queue_stats *stats)
3099 {
3100 int error;
3101 unsigned int handle;
3102 struct nlattr *nl_options;
3103
3104 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3105 if (error) {
3106 return error;
3107 }
3108
3109 if (queue_id) {
3110 unsigned int major, minor;
3111
3112 major = tc_get_major(handle);
3113 minor = tc_get_minor(handle);
3114 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3115 *queue_id = minor - 1;
3116 } else {
3117 return EPROTO;
3118 }
3119 }
3120
3121 if (options) {
3122 error = hfsc_parse_tca_options__(nl_options, options);
3123 }
3124
3125 return error;
3126 }
3127
3128 static int
3129 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3130 unsigned int parent, struct hfsc_class *options,
3131 struct netdev_queue_stats *stats)
3132 {
3133 int error;
3134 struct ofpbuf *reply;
3135
3136 error = tc_query_class(netdev, handle, parent, &reply);
3137 if (error) {
3138 return error;
3139 }
3140
3141 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3142 ofpbuf_delete(reply);
3143 return error;
3144 }
3145
3146 static void
3147 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3148 struct hfsc_class *class)
3149 {
3150 uint32_t max_rate;
3151 const char *max_rate_s;
3152
3153 max_rate_s = smap_get(details, "max-rate");
3154 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3155
3156 if (!max_rate) {
3157 enum netdev_features current;
3158
3159 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3160 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3161 }
3162
3163 class->min_rate = max_rate;
3164 class->max_rate = max_rate;
3165 }
3166
3167 static int
3168 hfsc_parse_class_details__(struct netdev *netdev,
3169 const struct smap *details,
3170 struct hfsc_class * class)
3171 {
3172 const struct hfsc *hfsc;
3173 uint32_t min_rate, max_rate;
3174 const char *min_rate_s, *max_rate_s;
3175
3176 hfsc = hfsc_get__(netdev);
3177 min_rate_s = smap_get(details, "min-rate");
3178 max_rate_s = smap_get(details, "max-rate");
3179
3180 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3181 min_rate = MAX(min_rate, 1);
3182 min_rate = MIN(min_rate, hfsc->max_rate);
3183
3184 max_rate = (max_rate_s
3185 ? strtoull(max_rate_s, NULL, 10) / 8
3186 : hfsc->max_rate);
3187 max_rate = MAX(max_rate, min_rate);
3188 max_rate = MIN(max_rate, hfsc->max_rate);
3189
3190 class->min_rate = min_rate;
3191 class->max_rate = max_rate;
3192
3193 return 0;
3194 }
3195
3196 /* Create an HFSC qdisc.
3197 *
3198 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3199 static int
3200 hfsc_setup_qdisc__(struct netdev * netdev)
3201 {
3202 struct tcmsg *tcmsg;
3203 struct ofpbuf request;
3204 struct tc_hfsc_qopt opt;
3205
3206 tc_del_qdisc(netdev);
3207
3208 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3209 NLM_F_EXCL | NLM_F_CREATE, &request);
3210
3211 if (!tcmsg) {
3212 return ENODEV;
3213 }
3214
3215 tcmsg->tcm_handle = tc_make_handle(1, 0);
3216 tcmsg->tcm_parent = TC_H_ROOT;
3217
3218 memset(&opt, 0, sizeof opt);
3219 opt.defcls = 1;
3220
3221 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3222 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3223
3224 return tc_transact(&request, NULL);
3225 }
3226
3227 /* Create an HFSC class.
3228 *
3229 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3230 * sc rate <min_rate> ul rate <max_rate>" */
3231 static int
3232 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3233 unsigned int parent, struct hfsc_class *class)
3234 {
3235 int error;
3236 size_t opt_offset;
3237 struct tcmsg *tcmsg;
3238 struct ofpbuf request;
3239 struct tc_service_curve min, max;
3240
3241 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3242
3243 if (!tcmsg) {
3244 return ENODEV;
3245 }
3246
3247 tcmsg->tcm_handle = handle;
3248 tcmsg->tcm_parent = parent;
3249
3250 min.m1 = 0;
3251 min.d = 0;
3252 min.m2 = class->min_rate;
3253
3254 max.m1 = 0;
3255 max.d = 0;
3256 max.m2 = class->max_rate;
3257
3258 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3259 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3260 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3261 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3262 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3263 nl_msg_end_nested(&request, opt_offset);
3264
3265 error = tc_transact(&request, NULL);
3266 if (error) {
3267 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3268 "min-rate %ubps, max-rate %ubps (%s)",
3269 netdev_get_name(netdev),
3270 tc_get_major(handle), tc_get_minor(handle),
3271 tc_get_major(parent), tc_get_minor(parent),
3272 class->min_rate, class->max_rate, ovs_strerror(error));
3273 }
3274
3275 return error;
3276 }
3277
3278 static int
3279 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3280 {
3281 int error;
3282 struct hfsc_class class;
3283
3284 error = hfsc_setup_qdisc__(netdev);
3285
3286 if (error) {
3287 return error;
3288 }
3289
3290 hfsc_parse_qdisc_details__(netdev, details, &class);
3291 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3292 tc_make_handle(1, 0), &class);
3293
3294 if (error) {
3295 return error;
3296 }
3297
3298 hfsc_install__(netdev, class.max_rate);
3299 return 0;
3300 }
3301
3302 static int
3303 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3304 {
3305 struct ofpbuf msg;
3306 struct nl_dump dump;
3307 struct hfsc_class hc;
3308
3309 hc.max_rate = 0;
3310 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3311 hfsc_install__(netdev, hc.max_rate);
3312
3313 if (!start_queue_dump(netdev, &dump)) {
3314 return ENODEV;
3315 }
3316
3317 while (nl_dump_next(&dump, &msg)) {
3318 unsigned int queue_id;
3319
3320 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3321 hfsc_update_queue__(netdev, queue_id, &hc);
3322 }
3323 }
3324
3325 nl_dump_done(&dump);
3326 return 0;
3327 }
3328
3329 static void
3330 hfsc_tc_destroy(struct tc *tc)
3331 {
3332 struct hfsc *hfsc;
3333 struct hfsc_class *hc, *next;
3334
3335 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3336
3337 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3338 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3339 free(hc);
3340 }
3341
3342 tc_destroy(tc);
3343 free(hfsc);
3344 }
3345
3346 static int
3347 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3348 {
3349 const struct hfsc *hfsc;
3350 hfsc = hfsc_get__(netdev);
3351 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3352 return 0;
3353 }
3354
3355 static int
3356 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3357 {
3358 int error;
3359 struct hfsc_class class;
3360
3361 hfsc_parse_qdisc_details__(netdev, details, &class);
3362 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3363 tc_make_handle(1, 0), &class);
3364
3365 if (!error) {
3366 hfsc_get__(netdev)->max_rate = class.max_rate;
3367 }
3368
3369 return error;
3370 }
3371
3372 static int
3373 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3374 const struct tc_queue *queue, struct smap *details)
3375 {
3376 const struct hfsc_class *hc;
3377
3378 hc = hfsc_class_cast__(queue);
3379 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3380 if (hc->min_rate != hc->max_rate) {
3381 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3382 }
3383 return 0;
3384 }
3385
3386 static int
3387 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3388 const struct smap *details)
3389 {
3390 int error;
3391 struct hfsc_class class;
3392
3393 error = hfsc_parse_class_details__(netdev, details, &class);
3394 if (error) {
3395 return error;
3396 }
3397
3398 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3399 tc_make_handle(1, 0xfffe), &class);
3400 if (error) {
3401 return error;
3402 }
3403
3404 hfsc_update_queue__(netdev, queue_id, &class);
3405 return 0;
3406 }
3407
3408 static int
3409 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3410 {
3411 int error;
3412 struct hfsc *hfsc;
3413 struct hfsc_class *hc;
3414
3415 hc = hfsc_class_cast__(queue);
3416 hfsc = hfsc_get__(netdev);
3417
3418 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3419 if (!error) {
3420 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3421 free(hc);
3422 }
3423 return error;
3424 }
3425
3426 static int
3427 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3428 struct netdev_queue_stats *stats)
3429 {
3430 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3431 tc_make_handle(1, 0xfffe), NULL, stats);
3432 }
3433
3434 static int
3435 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3436 const struct ofpbuf *nlmsg,
3437 netdev_dump_queue_stats_cb *cb, void *aux)
3438 {
3439 struct netdev_queue_stats stats;
3440 unsigned int handle, major, minor;
3441 int error;
3442
3443 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3444 if (error) {
3445 return error;
3446 }
3447
3448 major = tc_get_major(handle);
3449 minor = tc_get_minor(handle);
3450 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3451 (*cb)(minor - 1, &stats, aux);
3452 }
3453 return 0;
3454 }
3455
3456 static const struct tc_ops tc_ops_hfsc = {
3457 "hfsc", /* linux_name */
3458 "linux-hfsc", /* ovs_name */
3459 HFSC_N_QUEUES, /* n_queues */
3460 hfsc_tc_install, /* tc_install */
3461 hfsc_tc_load, /* tc_load */
3462 hfsc_tc_destroy, /* tc_destroy */
3463 hfsc_qdisc_get, /* qdisc_get */
3464 hfsc_qdisc_set, /* qdisc_set */
3465 hfsc_class_get, /* class_get */
3466 hfsc_class_set, /* class_set */
3467 hfsc_class_delete, /* class_delete */
3468 hfsc_class_get_stats, /* class_get_stats */
3469 hfsc_class_dump_stats /* class_dump_stats */
3470 };
3471 \f
3472 /* "linux-default" traffic control class.
3473 *
3474 * This class represents the default, unnamed Linux qdisc. It corresponds to
3475 * the "" (empty string) QoS type in the OVS database. */
3476
3477 static void
3478 default_install__(struct netdev *netdev_)
3479 {
3480 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3481 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3482
3483 /* Nothing but a tc class implementation is allowed to write to a tc. This
3484 * class never does that, so we can legitimately use a const tc object. */
3485 netdev->tc = CONST_CAST(struct tc *, &tc);
3486 }
3487
3488 static int
3489 default_tc_install(struct netdev *netdev,
3490 const struct smap *details OVS_UNUSED)
3491 {
3492 default_install__(netdev);
3493 return 0;
3494 }
3495
3496 static int
3497 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3498 {
3499 default_install__(netdev);
3500 return 0;
3501 }
3502
3503 static const struct tc_ops tc_ops_default = {
3504 NULL, /* linux_name */
3505 "", /* ovs_name */
3506 0, /* n_queues */
3507 default_tc_install,
3508 default_tc_load,
3509 NULL, /* tc_destroy */
3510 NULL, /* qdisc_get */
3511 NULL, /* qdisc_set */
3512 NULL, /* class_get */
3513 NULL, /* class_set */
3514 NULL, /* class_delete */
3515 NULL, /* class_get_stats */
3516 NULL /* class_dump_stats */
3517 };
3518 \f
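/* "linux-other" traffic control class.
 *
 * Fallback class that tc_query_qdisc() uses when it cannot identify the
 * qdisc on a device: the qdisc kind is not one we know, its message cannot be
 * parsed, or the query itself fails.  It has no queues and supports no
 * configuration. */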
3522
3523 static int
3524 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3525 {
3526 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3527 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3528
3529 /* Nothing but a tc class implementation is allowed to write to a tc. This
3530 * class never does that, so we can legitimately use a const tc object. */
3531 netdev->tc = CONST_CAST(struct tc *, &tc);
3532 return 0;
3533 }
3534
3535 static const struct tc_ops tc_ops_other = {
3536 NULL, /* linux_name */
3537 "linux-other", /* ovs_name */
3538 0, /* n_queues */
3539 NULL, /* tc_install */
3540 other_tc_load,
3541 NULL, /* tc_destroy */
3542 NULL, /* qdisc_get */
3543 NULL, /* qdisc_set */
3544 NULL, /* class_get */
3545 NULL, /* class_set */
3546 NULL, /* class_delete */
3547 NULL, /* class_get_stats */
3548 NULL /* class_dump_stats */
3549 };
3550 \f
3551 /* Traffic control. */
3552
/* Number of kernel "tc" ticks per second.  Set by read_psched(). */
static double ticks_per_s;

/* Number of kernel "jiffies" per second, used to compute buffer sizes:
 * generally a kernel qdisc needs to be able to buffer one jiffy's worth of
 * data.  Set by read_psched().
 *
 * The value falls into one of two ranges:
 *
 *    - A small number, roughly 100 to 1024: this is the kernel's real timer
 *      tick rate, and the qdisc really must be able to buffer that much
 *      data.
 *
 *    - An absurdly large number: the kernel has finely granular timers and
 *      no extra buffer room is needed.  This case needs no special handling
 *      because 'buffer_hz' is only ever used as a divisor, so practically
 *      any dividend comes out as 0.  (Small nonzero results for really high
 *      dividends have no real effect anyhow.)
 */
static unsigned int buffer_hz;
3574
/* Returns the tc handle 'major':'minor', i.e. 'major' in the upper 16 bits
 * and 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    const unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
3581
/* Returns the major number (the upper 16 bits) of 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    const unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3588
/* Returns the minor number (the lower 16 bits) of 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    const unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
3595
3596 static struct tcmsg *
3597 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3598 struct ofpbuf *request)
3599 {
3600 struct tcmsg *tcmsg;
3601 int ifindex;
3602 int error;
3603
3604 error = get_ifindex(netdev, &ifindex);
3605 if (error) {
3606 return NULL;
3607 }
3608
3609 ofpbuf_init(request, 512);
3610 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3611 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3612 tcmsg->tcm_family = AF_UNSPEC;
3613 tcmsg->tcm_ifindex = ifindex;
3614 /* Caller should fill in tcmsg->tcm_handle. */
3615 /* Caller should fill in tcmsg->tcm_parent. */
3616
3617 return tcmsg;
3618 }
3619
3620 static int
3621 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3622 {
3623 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3624 ofpbuf_uninit(request);
3625 return error;
3626 }
3627
3628 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3629 * policing configuration.
3630 *
3631 * This function is equivalent to running the following when 'add' is true:
3632 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3633 *
3634 * This function is equivalent to running the following when 'add' is false:
3635 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3636 *
3637 * The configuration and stats may be seen with the following command:
3638 * /sbin/tc -s qdisc show dev <devname>
3639 *
3640 * Returns 0 if successful, otherwise a positive errno value.
3641 */
3642 static int
3643 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3644 {
3645 struct ofpbuf request;
3646 struct tcmsg *tcmsg;
3647 int error;
3648 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3649 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3650
3651 tcmsg = tc_make_request(netdev, type, flags, &request);
3652 if (!tcmsg) {
3653 return ENODEV;
3654 }
3655 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3656 tcmsg->tcm_parent = TC_H_INGRESS;
3657 nl_msg_put_string(&request, TCA_KIND, "ingress");
3658 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3659
3660 error = tc_transact(&request, NULL);
3661 if (error) {
3662 /* If we're deleting the qdisc, don't worry about some of the
3663 * error conditions. */
3664 if (!add && (error == ENOENT || error == EINVAL)) {
3665 return 0;
3666 }
3667 return error;
3668 }
3669
3670 return 0;
3671 }
3672
3673 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3674 * of 'kbits_burst'.
3675 *
3676 * This function is equivalent to running:
3677 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3678 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3679 * mtu 65535 drop
3680 *
3681 * The configuration and stats may be seen with the following command:
3682 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3683 *
3684 * Returns 0 if successful, otherwise a positive errno value.
3685 */
3686 static int
3687 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3688 {
3689 struct tc_police tc_police;
3690 struct ofpbuf request;
3691 struct tcmsg *tcmsg;
3692 size_t basic_offset;
3693 size_t police_offset;
3694 int error;
3695 int mtu = 65535;
3696
3697 memset(&tc_police, 0, sizeof tc_police);
3698 tc_police.action = TC_POLICE_SHOT;
3699 tc_police.mtu = mtu;
3700 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3701 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3702 kbits_burst * 1024);
3703
3704 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3705 NLM_F_EXCL | NLM_F_CREATE, &request);
3706 if (!tcmsg) {
3707 return ENODEV;
3708 }
3709 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3710 tcmsg->tcm_info = tc_make_handle(49,
3711 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3712
3713 nl_msg_put_string(&request, TCA_KIND, "basic");
3714 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3715 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3716 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3717 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3718 nl_msg_end_nested(&request, police_offset);
3719 nl_msg_end_nested(&request, basic_offset);
3720
3721 error = tc_transact(&request, NULL);
3722 if (error) {
3723 return error;
3724 }
3725
3726 return 0;
3727 }
3728
3729 static void
3730 read_psched(void)
3731 {
3732 /* The values in psched are not individually very meaningful, but they are
3733 * important. The tables below show some values seen in the wild.
3734 *
3735 * Some notes:
3736 *
3737 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3738 * (Before that, there are hints that it was 1000000000.)
3739 *
3740 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3741 * above.
3742 *
3743 * /proc/net/psched
3744 * -----------------------------------
3745 * [1] 000c8000 000f4240 000f4240 00000064
3746 * [2] 000003e8 00000400 000f4240 3b9aca00
3747 * [3] 000003e8 00000400 000f4240 3b9aca00
3748 * [4] 000003e8 00000400 000f4240 00000064
3749 * [5] 000003e8 00000040 000f4240 3b9aca00
3750 * [6] 000003e8 00000040 000f4240 000000f9
3751 *
3752 * a b c d ticks_per_s buffer_hz
3753 * ------- --------- ---------- ------------- ----------- -------------
3754 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3755 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3756 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3757 * [4] 1,000 1,024 1,000,000 100 976,562 100
3758 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3759 * [6] 1,000 64 1,000,000 249 15,625,000 249
3760 *
3761 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3762 * [2] 2.6.26-1-686-bigmem from Debian lenny
3763 * [3] 2.6.26-2-sparc64 from Debian lenny
3764 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3765 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3766 * [6] 2.6.34 from kernel.org on KVM
3767 */
3768 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3769 static const char fn[] = "/proc/net/psched";
3770 unsigned int a, b, c, d;
3771 FILE *stream;
3772
3773 if (!ovsthread_once_start(&once)) {
3774 return;
3775 }
3776
3777 ticks_per_s = 1.0;
3778 buffer_hz = 100;
3779
3780 stream = fopen(fn, "r");
3781 if (!stream) {
3782 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3783 goto exit;
3784 }
3785
3786 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3787 VLOG_WARN("%s: read failed", fn);
3788 fclose(stream);
3789 goto exit;
3790 }
3791 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3792 fclose(stream);
3793
3794 if (!a || !c) {
3795 VLOG_WARN("%s: invalid scheduler parameters", fn);
3796 goto exit;
3797 }
3798
3799 ticks_per_s = (double) a * c / b;
3800 if (c == 1000000) {
3801 buffer_hz = d;
3802 } else {
3803 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3804 fn, a, b, c, d);
3805 }
3806 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3807
3808 exit:
3809 ovsthread_once_done(&once);
3810 }
3811
3812 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3813 * rate of 'rate' bytes per second. */
3814 static unsigned int
3815 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3816 {
3817 read_psched();
3818 return (rate * ticks) / ticks_per_s;
3819 }
3820
3821 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3822 * rate of 'rate' bytes per second. */
3823 static unsigned int
3824 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3825 {
3826 read_psched();
3827 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3828 }
3829
3830 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3831 * a transmission rate of 'rate' bytes per second. */
3832 static unsigned int
3833 tc_buffer_per_jiffy(unsigned int rate)
3834 {
3835 read_psched();
3836 return rate / buffer_hz;
3837 }
3838
3839 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3840 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3841 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3842 * stores NULL into it if it is absent.
3843 *
3844 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3845 * 'msg'.
3846 *
3847 * Returns 0 if successful, otherwise a positive errno value. */
3848 static int
3849 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3850 struct nlattr **options)
3851 {
3852 static const struct nl_policy tca_policy[] = {
3853 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3854 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3855 };
3856 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3857
3858 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3859 tca_policy, ta, ARRAY_SIZE(ta))) {
3860 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3861 goto error;
3862 }
3863
3864 if (kind) {
3865 *kind = nl_attr_get_string(ta[TCA_KIND]);
3866 }
3867
3868 if (options) {
3869 *options = ta[TCA_OPTIONS];
3870 }
3871
3872 return 0;
3873
3874 error:
3875 if (kind) {
3876 *kind = NULL;
3877 }
3878 if (options) {
3879 *options = NULL;
3880 }
3881 return EPROTO;
3882 }
3883
3884 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3885 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3886 * into '*options', and its queue statistics into '*stats'. Any of the output
3887 * arguments may be null.
3888 *
3889 * Returns 0 if successful, otherwise a positive errno value. */
3890 static int
3891 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3892 struct nlattr **options, struct netdev_queue_stats *stats)
3893 {
3894 static const struct nl_policy tca_policy[] = {
3895 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3896 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3897 };
3898 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3899
3900 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3901 tca_policy, ta, ARRAY_SIZE(ta))) {
3902 VLOG_WARN_RL(&rl, "failed to parse class message");
3903 goto error;
3904 }
3905
3906 if (handlep) {
3907 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3908 *handlep = tc->tcm_handle;
3909 }
3910
3911 if (options) {
3912 *options = ta[TCA_OPTIONS];
3913 }
3914
3915 if (stats) {
3916 const struct gnet_stats_queue *gsq;
3917 struct gnet_stats_basic gsb;
3918
3919 static const struct nl_policy stats_policy[] = {
3920 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3921 .min_len = sizeof gsb },
3922 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3923 .min_len = sizeof *gsq },
3924 };
3925 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3926
3927 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3928 sa, ARRAY_SIZE(sa))) {
3929 VLOG_WARN_RL(&rl, "failed to parse class stats");
3930 goto error;
3931 }
3932
3933 /* Alignment issues screw up the length of struct gnet_stats_basic on
3934 * some arch/bitsize combinations. Newer versions of Linux have a
3935 * struct gnet_stats_basic_packed, but we can't depend on that. The
3936 * easiest thing to do is just to make a copy. */
3937 memset(&gsb, 0, sizeof gsb);
3938 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3939 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3940 stats->tx_bytes = gsb.bytes;
3941 stats->tx_packets = gsb.packets;
3942
3943 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3944 stats->tx_errors = gsq->drops;
3945 }
3946
3947 return 0;
3948
3949 error:
3950 if (options) {
3951 *options = NULL;
3952 }
3953 if (stats) {
3954 memset(stats, 0, sizeof *stats);
3955 }
3956 return EPROTO;
3957 }
3958
3959 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3960 * on 'netdev'. */
3961 static int
3962 tc_query_class(const struct netdev *netdev,
3963 unsigned int handle, unsigned int parent,
3964 struct ofpbuf **replyp)
3965 {
3966 struct ofpbuf request;
3967 struct tcmsg *tcmsg;
3968 int error;
3969
3970 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3971 if (!tcmsg) {
3972 return ENODEV;
3973 }
3974 tcmsg->tcm_handle = handle;
3975 tcmsg->tcm_parent = parent;
3976
3977 error = tc_transact(&request, replyp);
3978 if (error) {
3979 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3980 netdev_get_name(netdev),
3981 tc_get_major(handle), tc_get_minor(handle),
3982 tc_get_major(parent), tc_get_minor(parent),
3983 ovs_strerror(error));
3984 }
3985 return error;
3986 }
3987
3988 /* Equivalent to "tc class del dev <name> handle <handle>". */
3989 static int
3990 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3991 {
3992 struct ofpbuf request;
3993 struct tcmsg *tcmsg;
3994 int error;
3995
3996 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3997 if (!tcmsg) {
3998 return ENODEV;
3999 }
4000 tcmsg->tcm_handle = handle;
4001 tcmsg->tcm_parent = 0;
4002
4003 error = tc_transact(&request, NULL);
4004 if (error) {
4005 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4006 netdev_get_name(netdev),
4007 tc_get_major(handle), tc_get_minor(handle),
4008 ovs_strerror(error));
4009 }
4010 return error;
4011 }
4012
4013 /* Equivalent to "tc qdisc del dev <name> root". */
4014 static int
4015 tc_del_qdisc(struct netdev *netdev_)
4016 {
4017 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4018 struct ofpbuf request;
4019 struct tcmsg *tcmsg;
4020 int error;
4021
4022 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4023 if (!tcmsg) {
4024 return ENODEV;
4025 }
4026 tcmsg->tcm_handle = tc_make_handle(1, 0);
4027 tcmsg->tcm_parent = TC_H_ROOT;
4028
4029 error = tc_transact(&request, NULL);
4030 if (error == EINVAL) {
4031 /* EINVAL probably means that the default qdisc was in use, in which
4032 * case we've accomplished our purpose. */
4033 error = 0;
4034 }
4035 if (!error && netdev->tc) {
4036 if (netdev->tc->ops->tc_destroy) {
4037 netdev->tc->ops->tc_destroy(netdev->tc);
4038 }
4039 netdev->tc = NULL;
4040 }
4041 return error;
4042 }
4043
4044 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4045 * kernel to determine what they are. Returns 0 if successful, otherwise a
4046 * positive errno value. */
4047 static int
4048 tc_query_qdisc(const struct netdev *netdev_)
4049 {
4050 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4051 struct ofpbuf request, *qdisc;
4052 const struct tc_ops *ops;
4053 struct tcmsg *tcmsg;
4054 int load_error;
4055 int error;
4056
4057 if (netdev->tc) {
4058 return 0;
4059 }
4060
4061 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4062 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4063 * 2.6.35 without that fix backported to it.
4064 *
4065 * To avoid the OOPS, we must not make a request that would attempt to dump
4066 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4067 * few others. There are a few ways that I can see to do this, but most of
4068 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4069 * technique chosen here is to assume that any non-default qdisc that we
4070 * create will have a class with handle 1:0. The built-in qdiscs only have
4071 * a class with handle 0:0.
4072 *
4073 * We could check for Linux 2.6.35+ and use a more straightforward method
4074 * there. */
4075 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4076 if (!tcmsg) {
4077 return ENODEV;
4078 }
4079 tcmsg->tcm_handle = tc_make_handle(1, 0);
4080 tcmsg->tcm_parent = 0;
4081
4082 /* Figure out what tc class to instantiate. */
4083 error = tc_transact(&request, &qdisc);
4084 if (!error) {
4085 const char *kind;
4086
4087 error = tc_parse_qdisc(qdisc, &kind, NULL);
4088 if (error) {
4089 ops = &tc_ops_other;
4090 } else {
4091 ops = tc_lookup_linux_name(kind);
4092 if (!ops) {
4093 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4094 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4095
4096 ops = &tc_ops_other;
4097 }
4098 }
4099 } else if (error == ENOENT) {
4100 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4101 * other entity that doesn't have a handle 1:0. We will assume
4102 * that it's the system default qdisc. */
4103 ops = &tc_ops_default;
4104 error = 0;
4105 } else {
4106 /* Who knows? Maybe the device got deleted. */
4107 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4108 netdev_get_name(netdev_), ovs_strerror(error));
4109 ops = &tc_ops_other;
4110 }
4111
4112 /* Instantiate it. */
4113 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4114 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4115 ofpbuf_delete(qdisc);
4116
4117 return error ? error : load_error;
4118 }
4119
4120 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4121 approximate the time to transmit packets of various lengths. For an MTU of
4122 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4123 represents two possible packet lengths; for a MTU of 513 through 1024, four
4124 possible lengths; and so on.
4125
4126 Returns, for the specified 'mtu', the number of bits that packet lengths
4127 need to be shifted right to fit within such a 256-entry table. */
4128 static int
4129 tc_calc_cell_log(unsigned int mtu)
4130 {
4131 int cell_log;
4132
4133 if (!mtu) {
4134 mtu = ETH_PAYLOAD_MAX;
4135 }
4136 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4137
4138 for (cell_log = 0; mtu >= 256; cell_log++) {
4139 mtu >>= 1;
4140 }
4141
4142 return cell_log;
4143 }
4144
4145 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4146 * of 'mtu'. */
4147 static void
4148 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4149 {
4150 memset(rate, 0, sizeof *rate);
4151 rate->cell_log = tc_calc_cell_log(mtu);
4152 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4153 /* rate->cell_align = 0; */ /* distro headers. */
4154 rate->mpu = ETH_TOTAL_MIN;
4155 rate->rate = Bps;
4156 }
4157
4158 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4159 * attribute of the specified "type".
4160 *
4161 * See tc_calc_cell_log() above for a description of "rtab"s. */
4162 static void
4163 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4164 {
4165 uint32_t *rtab;
4166 unsigned int i;
4167
4168 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4169 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4170 unsigned packet_size = (i + 1) << rate->cell_log;
4171 if (packet_size < rate->mpu) {
4172 packet_size = rate->mpu;
4173 }
4174 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4175 }
4176 }
4177
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst used is never less than one jiffy's worth of data at 'Bps' plus
 * one 'mtu'; the result is that burst converted to ticks. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > min_bytes ? burst_bytes : min_bytes;

    return tc_bytes_to_ticks(Bps, burst);
}
4188 \f
4189 /* Linux-only functions declared in netdev-linux.h */
4190
4191 /* Returns a fd for an AF_INET socket or a negative errno value. */
4192 int
4193 netdev_linux_get_af_inet_sock(void)
4194 {
4195 int error = netdev_linux_init();
4196 return error ? -error : af_inet_sock;
4197 }
4198
4199 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4200 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4201 int
4202 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4203 const char *flag_name, bool enable)
4204 {
4205 const char *netdev_name = netdev_get_name(netdev);
4206 struct ethtool_value evalue;
4207 uint32_t new_flags;
4208 int error;
4209
4210 COVERAGE_INC(netdev_get_ethtool);
4211 memset(&evalue, 0, sizeof evalue);
4212 error = netdev_linux_do_ethtool(netdev_name,
4213 (struct ethtool_cmd *)&evalue,
4214 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4215 if (error) {
4216 return error;
4217 }
4218
4219 COVERAGE_INC(netdev_set_ethtool);
4220 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4221 error = netdev_linux_do_ethtool(netdev_name,
4222 (struct ethtool_cmd *)&evalue,
4223 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4224 if (error) {
4225 return error;
4226 }
4227
4228 COVERAGE_INC(netdev_get_ethtool);
4229 memset(&evalue, 0, sizeof evalue);
4230 error = netdev_linux_do_ethtool(netdev_name,
4231 (struct ethtool_cmd *)&evalue,
4232 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4233 if (error) {
4234 return error;
4235 }
4236
4237 if (new_flags != evalue.data) {
4238 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4239 "device %s failed", enable ? "enable" : "disable",
4240 flag_name, netdev_name);
4241 return EOPNOTSUPP;
4242 }
4243
4244 return 0;
4245 }
4246 \f
4247 /* Utility functions. */
4248
4249 /* Copies 'src' into 'dst', performing format conversion in the process. */
4250 static void
4251 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4252 const struct rtnl_link_stats *src)
4253 {
4254 dst->rx_packets = src->rx_packets;
4255 dst->tx_packets = src->tx_packets;
4256 dst->rx_bytes = src->rx_bytes;
4257 dst->tx_bytes = src->tx_bytes;
4258 dst->rx_errors = src->rx_errors;
4259 dst->tx_errors = src->tx_errors;
4260 dst->rx_dropped = src->rx_dropped;
4261 dst->tx_dropped = src->tx_dropped;
4262 dst->multicast = src->multicast;
4263 dst->collisions = src->collisions;
4264 dst->rx_length_errors = src->rx_length_errors;
4265 dst->rx_over_errors = src->rx_over_errors;
4266 dst->rx_crc_errors = src->rx_crc_errors;
4267 dst->rx_frame_errors = src->rx_frame_errors;
4268 dst->rx_fifo_errors = src->rx_fifo_errors;
4269 dst->rx_missed_errors = src->rx_missed_errors;
4270 dst->tx_aborted_errors = src->tx_aborted_errors;
4271 dst->tx_carrier_errors = src->tx_carrier_errors;
4272 dst->tx_fifo_errors = src->tx_fifo_errors;
4273 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4274 dst->tx_window_errors = src->tx_window_errors;
4275 }
4276
4277 static int
4278 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4279 {
4280 /* Policy for RTNLGRP_LINK messages.
4281 *
4282 * There are *many* more fields in these messages, but currently we only
4283 * care about these fields. */
4284 static const struct nl_policy rtnlgrp_link_policy[] = {
4285 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4286 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4287 .min_len = sizeof(struct rtnl_link_stats) },
4288 };
4289
4290 struct ofpbuf request;
4291 struct ofpbuf *reply;
4292 struct ifinfomsg *ifi;
4293 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4294 int error;
4295
4296 ofpbuf_init(&request, 0);
4297 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4298 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4299 ifi->ifi_family = PF_UNSPEC;
4300 ifi->ifi_index = ifindex;
4301 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4302 ofpbuf_uninit(&request);
4303 if (error) {
4304 return error;
4305 }
4306
4307 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4308 rtnlgrp_link_policy,
4309 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4310 ofpbuf_delete(reply);
4311 return EPROTO;
4312 }
4313
4314 if (!attrs[IFLA_STATS]) {
4315 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4316 ofpbuf_delete(reply);
4317 return EPROTO;
4318 }
4319
4320 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4321
4322 ofpbuf_delete(reply);
4323
4324 return 0;
4325 }
4326
4327 static int
4328 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4329 {
4330 static const char fn[] = "/proc/net/dev";
4331 char line[1024];
4332 FILE *stream;
4333 int ln;
4334
4335 stream = fopen(fn, "r");
4336 if (!stream) {
4337 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4338 return errno;
4339 }
4340
4341 ln = 0;
4342 while (fgets(line, sizeof line, stream)) {
4343 if (++ln >= 3) {
4344 char devname[16];
4345 #define X64 "%"SCNu64
4346 if (sscanf(line,
4347 " %15[^:]:"
4348 X64 X64 X64 X64 X64 X64 X64 "%*u"
4349 X64 X64 X64 X64 X64 X64 X64 "%*u",
4350 devname,
4351 &stats->rx_bytes,
4352 &stats->rx_packets,
4353 &stats->rx_errors,
4354 &stats->rx_dropped,
4355 &stats->rx_fifo_errors,
4356 &stats->rx_frame_errors,
4357 &stats->multicast,
4358 &stats->tx_bytes,
4359 &stats->tx_packets,
4360 &stats->tx_errors,
4361 &stats->tx_dropped,
4362 &stats->tx_fifo_errors,
4363 &stats->collisions,
4364 &stats->tx_carrier_errors) != 15) {
4365 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4366 } else if (!strcmp(devname, netdev_name)) {
4367 stats->rx_length_errors = UINT64_MAX;
4368 stats->rx_over_errors = UINT64_MAX;
4369 stats->rx_crc_errors = UINT64_MAX;
4370 stats->rx_missed_errors = UINT64_MAX;
4371 stats->tx_aborted_errors = UINT64_MAX;
4372 stats->tx_heartbeat_errors = UINT64_MAX;
4373 stats->tx_window_errors = UINT64_MAX;
4374 fclose(stream);
4375 return 0;
4376 }
4377 }
4378 }
4379 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4380 fclose(stream);
4381 return ENODEV;
4382 }
4383
4384 static int
4385 get_flags(const struct netdev *dev, unsigned int *flags)
4386 {
4387 struct ifreq ifr;
4388 int error;
4389
4390 *flags = 0;
4391 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4392 "SIOCGIFFLAGS");
4393 if (!error) {
4394 *flags = ifr.ifr_flags;
4395 }
4396 return error;
4397 }
4398
4399 static int
4400 set_flags(const char *name, unsigned int flags)
4401 {
4402 struct ifreq ifr;
4403
4404 ifr.ifr_flags = flags;
4405 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4406 }
4407
4408 static int
4409 do_get_ifindex(const char *netdev_name)
4410 {
4411 struct ifreq ifr;
4412
4413 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4414 COVERAGE_INC(netdev_get_ifindex);
4415 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4416 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4417 netdev_name, ovs_strerror(errno));
4418 return -errno;
4419 }
4420 return ifr.ifr_ifindex;
4421 }
4422
4423 static int
4424 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4425 {
4426 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4427
4428 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4429 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4430
4431 if (ifindex < 0) {
4432 netdev->get_ifindex_error = -ifindex;
4433 netdev->ifindex = 0;
4434 } else {
4435 netdev->get_ifindex_error = 0;
4436 netdev->ifindex = ifindex;
4437 }
4438 netdev->cache_valid |= VALID_IFINDEX;
4439 }
4440
4441 *ifindexp = netdev->ifindex;
4442 return netdev->get_ifindex_error;
4443 }
4444
4445 static int
4446 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4447 {
4448 struct ifreq ifr;
4449 int hwaddr_family;
4450
4451 memset(&ifr, 0, sizeof ifr);
4452 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4453 COVERAGE_INC(netdev_get_hwaddr);
4454 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4455 /* ENODEV probably means that a vif disappeared asynchronously and
4456 * hasn't been removed from the database yet, so reduce the log level
4457 * to INFO for that case. */
4458 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4459 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4460 netdev_name, ovs_strerror(errno));
4461 return errno;
4462 }
4463 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4464 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4465 VLOG_WARN("%s device has unknown hardware address family %d",
4466 netdev_name, hwaddr_family);
4467 }
4468 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4469 return 0;
4470 }
4471
4472 static int
4473 set_etheraddr(const char *netdev_name,
4474 const uint8_t mac[ETH_ADDR_LEN])
4475 {
4476 struct ifreq ifr;
4477
4478 memset(&ifr, 0, sizeof ifr);
4479 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4480 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4481 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4482 COVERAGE_INC(netdev_set_hwaddr);
4483 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4484 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4485 netdev_name, ovs_strerror(errno));
4486 return errno;
4487 }
4488 return 0;
4489 }
4490
4491 static int
4492 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4493 int cmd, const char *cmd_name)
4494 {
4495 struct ifreq ifr;
4496
4497 memset(&ifr, 0, sizeof ifr);
4498 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4499 ifr.ifr_data = (caddr_t) ecmd;
4500
4501 ecmd->cmd = cmd;
4502 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4503 return 0;
4504 } else {
4505 if (errno != EOPNOTSUPP) {
4506 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4507 "failed: %s", cmd_name, name, ovs_strerror(errno));
4508 } else {
4509 /* The device doesn't support this operation. That's pretty
4510 * common, so there's no point in logging anything. */
4511 }
4512 return errno;
4513 }
4514 }
4515
4516 static int
4517 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4518 const char *cmd_name)
4519 {
4520 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4521 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4522 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4523 ovs_strerror(errno));
4524 return errno;
4525 }
4526 return 0;
4527 }
4528
4529 static int
4530 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4531 int cmd, const char *cmd_name)
4532 {
4533 struct ifreq ifr;
4534 int error;
4535
4536 ifr.ifr_addr.sa_family = AF_INET;
4537 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4538 if (!error) {
4539 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4540 &ifr.ifr_addr);
4541 *ip = sin->sin_addr;
4542 }
4543 return error;
4544 }
4545
4546 /* Returns an AF_PACKET raw socket or a negative errno value. */
4547 static int
4548 af_packet_sock(void)
4549 {
4550 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4551 static int sock;
4552
4553 if (ovsthread_once_start(&once)) {
4554 sock = socket(AF_PACKET, SOCK_RAW, 0);
4555 if (sock >= 0) {
4556 int error = set_nonblocking(sock);
4557 if (error) {
4558 close(sock);
4559 sock = -error;
4560 }
4561 } else {
4562 sock = -errno;
4563 VLOG_ERR("failed to create packet socket: %s",
4564 ovs_strerror(errno));
4565 }
4566 ovsthread_once_done(&once);
4567 }
4568
4569 return sock;
4570 }