]> git.proxmox.com Git - ovs.git/blob - lib/netdev-linux.c
298ccd6a974f97fdba32fec59070add4366862a1
[ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <arpa/inet.h>
24 #include <inttypes.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
40 #include <net/if.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
45 #include <poll.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <unistd.h>
49
50 #include "coverage.h"
51 #include "dpif-linux.h"
52 #include "dpif-netdev.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
55 #include "hash.h"
56 #include "hmap.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
61 #include "netlink.h"
62 #include "ofpbuf.h"
63 #include "openflow/openflow.h"
64 #include "ovs-atomic.h"
65 #include "packet-dpif.h"
66 #include "packets.h"
67 #include "poll-loop.h"
68 #include "rtnetlink-link.h"
69 #include "shash.h"
70 #include "socket-util.h"
71 #include "sset.h"
72 #include "timer.h"
73 #include "unaligned.h"
74 #include "vlog.h"
75
76 VLOG_DEFINE_THIS_MODULE(netdev_linux);
77
78 COVERAGE_DEFINE(netdev_set_policing);
79 COVERAGE_DEFINE(netdev_arp_lookup);
80 COVERAGE_DEFINE(netdev_get_ifindex);
81 COVERAGE_DEFINE(netdev_get_hwaddr);
82 COVERAGE_DEFINE(netdev_set_hwaddr);
83 COVERAGE_DEFINE(netdev_get_ethtool);
84 COVERAGE_DEFINE(netdev_set_ethtool);
85
86 \f
87 /* These were introduced in Linux 2.6.14, so they might be missing if we have
88 * old headers. */
89 #ifndef ADVERTISED_Pause
90 #define ADVERTISED_Pause (1 << 13)
91 #endif
92 #ifndef ADVERTISED_Asym_Pause
93 #define ADVERTISED_Asym_Pause (1 << 14)
94 #endif
95
96 /* These were introduced in Linux 2.6.24, so they might be missing if we
97 * have old headers. */
98 #ifndef ETHTOOL_GFLAGS
99 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
100 #endif
101 #ifndef ETHTOOL_SFLAGS
102 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 #endif
104
105 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
106 * headers. */
107 #ifndef TC_RTAB_SIZE
108 #define TC_RTAB_SIZE 1024
109 #endif
110
111 /* Linux 2.6.21 introduced struct tpacket_auxdata.
112 * Linux 2.6.27 added the tp_vlan_tci member.
113 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
114 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
115 * TP_STATUS_VLAN_TPID_VALID.
116 *
117 * With all this churn it's easiest to unconditionally define a replacement
118 * structure that has everything we want.
119 */
120 #ifndef PACKET_AUXDATA
121 #define PACKET_AUXDATA 8
122 #endif
123 #ifndef TP_STATUS_VLAN_VALID
124 #define TP_STATUS_VLAN_VALID (1 << 4)
125 #endif
126 #ifndef TP_STATUS_VLAN_TPID_VALID
127 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
128 #endif
129 #undef tpacket_auxdata
130 #define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;     /* TP_STATUS_* bits, e.g. TP_STATUS_VLAN_VALID. */
    uint32_t tp_len;        /* Length of the packet on the wire. */
    uint32_t tp_snaplen;    /* Number of bytes actually captured. */
    uint16_t tp_mac;        /* Frame offsets; not used by this file. */
    uint16_t tp_net;
    uint16_t tp_vlan_tci;   /* VLAN TCI stripped by the kernel; meaningful if
                             * auxdata_has_vlan_tci() returns true. */
    uint16_t tp_vlan_tpid;  /* VLAN TPID (Linux 3.13+); meaningful only when
                             * tp_status has TP_STATUS_VLAN_TPID_VALID. */
};
140
/* Bits for 'cache_valid' in struct netdev_linux: each bit records that the
 * correspondingly named cached member(s) are currently up to date. */
enum {
    VALID_IFINDEX = 1 << 0,            /* 'ifindex'. */
    VALID_ETHERADDR = 1 << 1,          /* 'etheraddr'. */
    VALID_IN4 = 1 << 2,                /* 'address' and 'netmask'. */
    VALID_IN6 = 1 << 3,                /* 'in6'. */
    VALID_MTU = 1 << 4,                /* 'mtu'. */
    VALID_POLICING = 1 << 5,           /* 'kbits_rate' and 'kbits_burst'. */
    VALID_VPORT_STAT_ERROR = 1 << 6,   /* 'vport_stats_error'. */
    VALID_DRVINFO = 1 << 7,            /* 'drvinfo'. */
    VALID_FEATURES = 1 << 8,           /* 'current', 'advertised', 'supported'. */
};
152 \f
153 /* Traffic control. */
154
155 /* An instance of a traffic control class. Always associated with a particular
156 * network device.
157 *
158 * Each TC implementation subclasses this with whatever additional data it
159 * needs. */
struct tc {
    const struct tc_ops *ops;   /* Functions implementing this qdisc type. */
    struct hmap queues;         /* Contains "struct tc_queue"s.
                                 * Read by generic TC layer.
                                 * Written only by TC implementation. */
};
166
167 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
168
169 /* One traffic control queue.
170 *
171 * Each TC implementation subclasses this with whatever additional data it
172 * needs. */
173 struct tc_queue {
174 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
175 unsigned int queue_id; /* OpenFlow queue ID. */
176 long long int created; /* Time queue was created, in msecs. */
177 };
178
179 /* A particular kind of traffic control. Each implementation generally maps to
180 * one particular Linux qdisc class.
181 *
182 * The functions below return 0 if successful or a positive errno value on
183 * failure, except where otherwise noted. All of them must be provided, except
184 * where otherwise noted. */
185 struct tc_ops {
186 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
187 * This is null for tc_ops_default and tc_ops_other, for which there are no
188 * appropriate values. */
189 const char *linux_name;
190
191 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
192 const char *ovs_name;
193
194 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
195 * queues. The queues are numbered 0 through n_queues - 1. */
196 unsigned int n_queues;
197
198 /* Called to install this TC class on 'netdev'. The implementation should
199 * make the Netlink calls required to set up 'netdev' with the right qdisc
200 * and configure it according to 'details'. The implementation may assume
201 * that the current qdisc is the default; that is, there is no need for it
202 * to delete the current qdisc before installing itself.
203 *
204 * The contents of 'details' should be documented as valid for 'ovs_name'
205 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
206 * (which is built as ovs-vswitchd.conf.db(8)).
207 *
208 * This function must return 0 if and only if it sets 'netdev->tc' to an
209 * initialized 'struct tc'.
210 *
211 * (This function is null for tc_ops_other, which cannot be installed. For
212 * other TC classes it should always be nonnull.) */
213 int (*tc_install)(struct netdev *netdev, const struct smap *details);
214
215 /* Called when the netdev code determines (through a Netlink query) that
216 * this TC class's qdisc is installed on 'netdev', but we didn't install
217 * it ourselves and so don't know any of the details.
218 *
219 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
220 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
221 * implementation should parse the other attributes of 'nlmsg' as
222 * necessary to determine its configuration. If necessary it should also
223 * use Netlink queries to determine the configuration of queues on
224 * 'netdev'.
225 *
226 * This function must return 0 if and only if it sets 'netdev->tc' to an
227 * initialized 'struct tc'. */
228 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
229
230 /* Destroys the data structures allocated by the implementation as part of
231 * 'tc'. (This includes destroying 'tc->queues' by calling
232 * tc_destroy(tc).
233 *
234 * The implementation should not need to perform any Netlink calls. If
235 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
236 * (But it may not be desirable.)
237 *
238 * This function may be null if 'tc' is trivial. */
239 void (*tc_destroy)(struct tc *tc);
240
241 /* Retrieves details of 'netdev->tc' configuration into 'details'.
242 *
243 * The implementation should not need to perform any Netlink calls, because
244 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
245 * cached the configuration.
246 *
247 * The contents of 'details' should be documented as valid for 'ovs_name'
248 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
249 * (which is built as ovs-vswitchd.conf.db(8)).
250 *
251 * This function may be null if 'tc' is not configurable.
252 */
253 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
254
255 /* Reconfigures 'netdev->tc' according to 'details', performing any
256 * required Netlink calls to complete the reconfiguration.
257 *
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
260 * (which is built as ovs-vswitchd.conf.db(8)).
261 *
262 * This function may be null if 'tc' is not configurable.
263 */
264 int (*qdisc_set)(struct netdev *, const struct smap *details);
265
266 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
267 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
268 *
269 * The contents of 'details' should be documented as valid for 'ovs_name'
270 * in the "other_config" column in the "Queue" table in
271 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
272 *
273 * The implementation should not need to perform any Netlink calls, because
274 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
275 * cached the queue configuration.
276 *
277 * This function may be null if 'tc' does not have queues ('n_queues' is
278 * 0). */
279 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
280 struct smap *details);
281
282 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
283 * 'details', performing any required Netlink calls to complete the
284 * reconfiguration. The caller ensures that 'queue_id' is less than
285 * 'n_queues'.
286 *
287 * The contents of 'details' should be documented as valid for 'ovs_name'
288 * in the "other_config" column in the "Queue" table in
289 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
290 *
291 * This function may be null if 'tc' does not have queues or its queues are
292 * not configurable. */
293 int (*class_set)(struct netdev *, unsigned int queue_id,
294 const struct smap *details);
295
296 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
297 * tc_queue's within 'netdev->tc->queues'.
298 *
299 * This function may be null if 'tc' does not have queues or its queues
300 * cannot be deleted. */
301 int (*class_delete)(struct netdev *, struct tc_queue *queue);
302
303 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
304 * 'struct tc_queue's within 'netdev->tc->queues'.
305 *
306 * On success, initializes '*stats'.
307 *
308 * This function may be null if 'tc' does not have queues or if it cannot
309 * report queue statistics. */
310 int (*class_get_stats)(const struct netdev *netdev,
311 const struct tc_queue *queue,
312 struct netdev_queue_stats *stats);
313
314 /* Extracts queue stats from 'nlmsg', which is a response to a
315 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
316 *
317 * This function may be null if 'tc' does not have queues or if it cannot
318 * report queue statistics. */
319 int (*class_dump_stats)(const struct netdev *netdev,
320 const struct ofpbuf *nlmsg,
321 netdev_dump_queue_stats_cb *cb, void *aux);
322 };
323
324 static void
325 tc_init(struct tc *tc, const struct tc_ops *ops)
326 {
327 tc->ops = ops;
328 hmap_init(&tc->queues);
329 }
330
/* Releases the generic part of 'tc': the 'queues' hmap.  Per the tc_ops
 * contract, TC implementations call this from their own tc_destroy. */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
336
337 static const struct tc_ops tc_ops_htb;
338 static const struct tc_ops tc_ops_hfsc;
339 static const struct tc_ops tc_ops_default;
340 static const struct tc_ops tc_ops_other;
341
342 static const struct tc_ops *const tcs[] = {
343 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
344 &tc_ops_hfsc, /* Hierarchical fair service curve. */
345 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
346 &tc_ops_other, /* Some other qdisc. */
347 NULL
348 };
349
350 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
351 static unsigned int tc_get_major(unsigned int handle);
352 static unsigned int tc_get_minor(unsigned int handle);
353
354 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
355 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
356 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
357
358 static struct tcmsg *tc_make_request(const struct netdev *, int type,
359 unsigned int flags, struct ofpbuf *);
360 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
361 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
362 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
363 int kbits_burst);
364
365 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
366 struct nlattr **options);
367 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
368 struct nlattr **options,
369 struct netdev_queue_stats *);
370 static int tc_query_class(const struct netdev *,
371 unsigned int handle, unsigned int parent,
372 struct ofpbuf **replyp);
373 static int tc_delete_class(const struct netdev *, unsigned int handle);
374
375 static int tc_del_qdisc(struct netdev *netdev);
376 static int tc_query_qdisc(const struct netdev *netdev);
377
378 static int tc_calc_cell_log(unsigned int mtu);
379 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
380 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
381 const struct tc_ratespec *rate);
382 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
383 \f
384 struct netdev_linux {
385 struct netdev up;
386
387 /* Protects all members below. */
388 struct ovs_mutex mutex;
389
390 unsigned int cache_valid;
391
392 bool miimon; /* Link status of last poll. */
393 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
394 struct timer miimon_timer;
395
396 /* The following are figured out "on demand" only. They are only valid
397 * when the corresponding VALID_* bit in 'cache_valid' is set. */
398 int ifindex;
399 uint8_t etheraddr[ETH_ADDR_LEN];
400 struct in_addr address, netmask;
401 struct in6_addr in6;
402 int mtu;
403 unsigned int ifi_flags;
404 long long int carrier_resets;
405 uint32_t kbits_rate; /* Policing data. */
406 uint32_t kbits_burst;
407 int vport_stats_error; /* Cached error code from vport_get_stats().
408 0 or an errno value. */
409 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
410 int ether_addr_error; /* Cached error code from set/get etheraddr. */
411 int netdev_policing_error; /* Cached error code from set policing. */
412 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
413 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
414
415 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
416 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
417 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
418
419 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
420 struct tc *tc;
421
422 /* For devices of class netdev_tap_class only. */
423 int tap_fd;
424 };
425
426 struct netdev_rxq_linux {
427 struct netdev_rxq up;
428 bool is_tap;
429 int fd;
430 };
431
432 /* This is set pretty low because we probably won't learn anything from the
433 * additional log messages. */
434 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
435
436 /* Polling miimon status for all ports causes performance degradation when
437 * handling a large number of ports. If there are no devices using miimon, then
438 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
439 *
440 * Readers do not depend on this variable synchronizing with the related
441 * changes in the device miimon status, so we can use atomic_count. */
442 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
443
444 static void netdev_linux_run(void);
445
446 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
447 int cmd, const char *cmd_name);
448 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
449 int cmd, const char *cmd_name);
450 static int get_flags(const struct netdev *, unsigned int *flags);
451 static int set_flags(const char *, unsigned int flags);
452 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
453 enum netdev_flags on, enum netdev_flags *old_flagsp)
454 OVS_REQUIRES(netdev->mutex);
455 static int do_get_ifindex(const char *netdev_name);
456 static int get_ifindex(const struct netdev *, int *ifindexp);
457 static int do_set_addr(struct netdev *netdev,
458 int ioctl_nr, const char *ioctl_name,
459 struct in_addr addr);
460 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
461 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
462 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
463 static int af_packet_sock(void);
464 static bool netdev_linux_miimon_enabled(void);
465 static void netdev_linux_miimon_run(void);
466 static void netdev_linux_miimon_wait(void);
467 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
468
/* Returns true if 'netdev_class' is one of the Linux-based classes defined in
 * this file.  All such classes share netdev_linux_run() as their 'run'
 * callback, so a pointer comparison is sufficient. */
static bool
is_netdev_linux_class(const struct netdev_class *netdev_class)
{
    return netdev_class->run == netdev_linux_run;
}
474
/* Returns true if 'netdev' is a tap device (belongs to netdev_tap_class). */
static bool
is_tap_netdev(const struct netdev *netdev)
{
    return netdev_get_class(netdev) == &netdev_tap_class;
}
480
/* Downcasts 'netdev' to its enclosing netdev_linux.  Asserts that 'netdev'
 * really belongs to one of this file's Linux-based classes. */
static struct netdev_linux *
netdev_linux_cast(const struct netdev *netdev)
{
    ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));

    return CONTAINER_OF(netdev, struct netdev_linux, up);
}
488
/* Downcasts 'rx' to its enclosing netdev_rxq_linux, asserting that the
 * owning netdev is one of this file's Linux-based classes. */
static struct netdev_rxq_linux *
netdev_rxq_linux_cast(const struct netdev_rxq *rx)
{
    ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
    return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
}
495 \f
496 static void netdev_linux_update(struct netdev_linux *netdev,
497 const struct rtnetlink_link_change *)
498 OVS_REQUIRES(netdev->mutex);
499 static void netdev_linux_changed(struct netdev_linux *netdev,
500 unsigned int ifi_flags, unsigned int mask)
501 OVS_REQUIRES(netdev->mutex);
502
503 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
504 * if no such socket could be created. */
505 static struct nl_sock *
506 netdev_linux_notify_sock(void)
507 {
508 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
509 static struct nl_sock *sock;
510
511 if (ovsthread_once_start(&once)) {
512 int error;
513
514 error = nl_sock_create(NETLINK_ROUTE, &sock);
515 if (!error) {
516 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
517 if (error) {
518 nl_sock_destroy(sock);
519 sock = NULL;
520 }
521 }
522 ovsthread_once_done(&once);
523 }
524
525 return sock;
526 }
527
/* Returns true if at least one open device currently uses miimon link-state
 * polling, i.e. netdev_linux_miimon_run()/_wait() have work to do. */
static bool
netdev_linux_miimon_enabled(void)
{
    return atomic_count_get(&miimon_cnt) > 0;
}
533
/* netdev_class 'run' callback shared by all Linux-based classes.
 *
 * Services the miimon timers (if any device uses miimon), then drains the
 * rtnetlink notification socket, applying each link-change message to the
 * matching open netdev.  Loops until nl_sock_recv() reports an error
 * (normally EAGAIN, meaning no more notifications are pending). */
static void
netdev_linux_run(void)
{
    struct nl_sock *sock;
    int error;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_run();
    }

    sock = netdev_linux_notify_sock();
    if (!sock) {
        return;
    }

    do {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
        uint64_t buf_stub[4096 / 8];
        struct ofpbuf buf;

        ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
        error = nl_sock_recv(sock, &buf, false);
        if (!error) {
            struct rtnetlink_link_change change;

            if (rtnetlink_link_parse(&buf, &change)) {
                struct netdev *netdev_ = netdev_from_name(change.ifname);
                if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
                    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

                    ovs_mutex_lock(&netdev->mutex);
                    netdev_linux_update(netdev, &change);
                    ovs_mutex_unlock(&netdev->mutex);
                }
                /* Release the reference from netdev_from_name() ('netdev_'
                 * may be null here if the name is not open). */
                netdev_close(netdev_);
            }
        } else if (error == ENOBUFS) {
            /* The kernel dropped notifications, so we do not know which
             * devices changed: refresh the flags of every device of the
             * system class and invalidate all of their cached state
             * (mask 0 in netdev_linux_changed()). */
            struct shash device_shash;
            struct shash_node *node;

            nl_sock_drain(sock);

            shash_init(&device_shash);
            netdev_get_devices(&netdev_linux_class, &device_shash);
            SHASH_FOR_EACH (node, &device_shash) {
                struct netdev *netdev_ = node->data;
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
                unsigned int flags;

                ovs_mutex_lock(&netdev->mutex);
                get_flags(netdev_, &flags);
                netdev_linux_changed(netdev, flags, 0);
                ovs_mutex_unlock(&netdev->mutex);

                netdev_close(netdev_);
            }
            shash_destroy(&device_shash);
        } else if (error != EAGAIN) {
            VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
                         ovs_strerror(error));
        }
        ofpbuf_uninit(&buf);
    } while (!error);
}
598
/* netdev_class 'wait' callback: arranges for the current poll loop to wake
 * when a miimon timer expires (if enabled) or when a link-change
 * notification arrives on the shared rtnetlink socket. */
static void
netdev_linux_wait(void)
{
    struct nl_sock *sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    }
    sock = netdev_linux_notify_sock();
    if (sock) {
        nl_sock_wait(sock, POLLIN);
    }
}
612
613 static void
614 netdev_linux_changed(struct netdev_linux *dev,
615 unsigned int ifi_flags, unsigned int mask)
616 OVS_REQUIRES(dev->mutex)
617 {
618 netdev_change_seq_changed(&dev->up);
619
620 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
621 dev->carrier_resets++;
622 }
623 dev->ifi_flags = ifi_flags;
624
625 dev->cache_valid &= mask;
626 }
627
/* Applies rtnetlink notification 'change' to 'dev'.
 *
 * For RTM_NEWLINK, refreshes the cached MTU, Ethernet address, and ifindex
 * directly from the message (all other cached state except drvinfo is
 * invalidated).  For any other message type, records the new flags and
 * invalidates all cached state. */
static void
netdev_linux_update(struct netdev_linux *dev,
                    const struct rtnetlink_link_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (change->nlmsg_type == RTM_NEWLINK) {
        /* Keep drv-info */
        netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);

        /* Update netdev from rtnl-change msg.  A zero MTU or all-zeros
         * address means the message did not carry that attribute. */
        if (change->mtu) {
            dev->mtu = change->mtu;
            dev->cache_valid |= VALID_MTU;
            dev->netdev_mtu_error = 0;
        }

        if (!eth_addr_is_zero(change->addr)) {
            memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
            dev->cache_valid |= VALID_ETHERADDR;
            dev->ether_addr_error = 0;
        }

        dev->ifindex = change->ifi_index;
        dev->cache_valid |= VALID_IFINDEX;
        dev->get_ifindex_error = 0;

    } else {
        netdev_linux_changed(dev, change->ifi_flags, 0);
    }
}
658
/* netdev_class 'alloc' callback: allocates a zero-filled netdev_linux and
 * returns its embedded generic netdev. */
static struct netdev *
netdev_linux_alloc(void)
{
    struct netdev_linux *netdev = xzalloc(sizeof *netdev);
    return &netdev->up;
}
665
/* Initialization shared by all kinds of netdev_linux devices: currently just
 * the mutex that guards the cached state. */
static void
netdev_linux_common_construct(struct netdev_linux *netdev)
{
    ovs_mutex_init(&netdev->mutex);
}
671
672 /* Creates system and internal devices. */
673 static int
674 netdev_linux_construct(struct netdev *netdev_)
675 {
676 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
677 int error;
678
679 netdev_linux_common_construct(netdev);
680
681 error = get_flags(&netdev->up, &netdev->ifi_flags);
682 if (error == ENODEV) {
683 if (netdev->up.netdev_class != &netdev_internal_class) {
684 /* The device does not exist, so don't allow it to be opened. */
685 return ENODEV;
686 } else {
687 /* "Internal" netdevs have to be created as netdev objects before
688 * they exist in the kernel, because creating them in the kernel
689 * happens by passing a netdev object to dpif_port_add().
690 * Therefore, ignore the error. */
691 }
692 }
693
694 return 0;
695 }
696
697 /* For most types of netdevs we open the device for each call of
698 * netdev_open(). However, this is not the case with tap devices,
699 * since it is only possible to open the device once. In this
700 * situation we share a single file descriptor, and consequently
701 * buffers, across all readers. Therefore once data is read it will
702 * be unavailable to other reads for tap devices. */
703 static int
704 netdev_linux_construct_tap(struct netdev *netdev_)
705 {
706 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
707 static const char tap_dev[] = "/dev/net/tun";
708 const char *name = netdev_->name;
709 struct ifreq ifr;
710 int error;
711
712 netdev_linux_common_construct(netdev);
713
714 /* Open tap device. */
715 netdev->tap_fd = open(tap_dev, O_RDWR);
716 if (netdev->tap_fd < 0) {
717 error = errno;
718 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
719 return error;
720 }
721
722 /* Create tap device. */
723 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
724 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
725 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
726 VLOG_WARN("%s: creating tap device failed: %s", name,
727 ovs_strerror(errno));
728 error = errno;
729 goto error_close;
730 }
731
732 /* Make non-blocking. */
733 error = set_nonblocking(netdev->tap_fd);
734 if (error) {
735 goto error_close;
736 }
737
738 return 0;
739
740 error_close:
741 close(netdev->tap_fd);
742 return error;
743 }
744
/* netdev_class 'destruct' callback: releases everything construction and
 * later use acquired -- the TC state, the tap fd (tap devices only), this
 * device's contribution to the miimon device count, and the mutex. */
static void
netdev_linux_destruct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (netdev->tc && netdev->tc->ops->tc_destroy) {
        netdev->tc->ops->tc_destroy(netdev->tc);
    }

    if (netdev_get_class(netdev_) == &netdev_tap_class
        && netdev->tap_fd >= 0)
    {
        close(netdev->tap_fd);
    }

    if (netdev->miimon_interval > 0) {
        /* This device was counted in miimon_cnt; drop its reference. */
        atomic_count_dec(&miimon_cnt);
    }

    ovs_mutex_destroy(&netdev->mutex);
}
766
/* netdev_class 'dealloc' callback: frees the storage obtained by
 * netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
773
/* netdev_class 'rxq_alloc' callback: allocates a zero-filled receive queue
 * and returns its embedded generic netdev_rxq. */
static struct netdev_rxq *
netdev_linux_rxq_alloc(void)
{
    struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
    return &rx->up;
}
780
/* netdev_class 'rxq_construct' callback.
 *
 * For a tap device, reuses the tap fd that was opened at construction time.
 * Otherwise opens an AF_PACKET raw socket bound to the device, with
 * PACKET_AUXDATA enabled (so kernel-stripped VLAN tags can be recovered) and
 * a BPF filter attached that accepts inbound packets only.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0]: load packet type. */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4: is it PACKET_OUTGOING? */
            { 0x6, 0, 0, 0x00000000 },  /* ret #0: outgoing, drop it. */
            { 0x6, 0, 0, 0x0000ffff }   /* ret #65535: inbound, accept. */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor.  Protocol 0 means no packets are
         * delivered until bind() below selects the device. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets.  (setsockopt() returns -1 or 0;
         * any nonzero value takes the error path, where 'error' is then
         * replaced with errno.) */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
868
/* netdev_class 'rxq_destruct' callback: closes the raw packet socket.  A
 * tap device's fd is shared with the netdev itself, so it is left open. */
static void
netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

    if (!rx->is_tap) {
        close(rx->fd);
    }
}
878
/* netdev_class 'rxq_dealloc' callback: frees the storage obtained by
 * netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
886
887 static ovs_be16
888 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
889 {
890 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
891 return htons(aux->tp_vlan_tpid);
892 } else {
893 return htons(ETH_TYPE_VLAN);
894 }
895 }
896
897 static bool
898 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
899 {
900 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
901 }
902
/* Receives one packet from AF_PACKET socket 'fd' into the tailroom of
 * 'buffer'.  If the kernel stripped a VLAN tag from the frame, re-inserts it
 * from the PACKET_AUXDATA control message so that 'buffer' again holds the
 * complete Ethernet frame.
 *
 * Returns 0 if successful, EMSGSIZE if the frame did not fit in 'buffer',
 * or another positive errno value on failure. */
static int
netdev_linux_rxq_recv_sock(int fd, struct ofpbuf *buffer)
{
    size_t size;
    ssize_t retval;
    struct iovec iov;
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffer;
    struct msghdr msgh;

    /* Reserve headroom for a single VLAN tag */
    ofpbuf_reserve(buffer, VLAN_HEADER_LEN);
    size = ofpbuf_tailroom(buffer);

    iov.iov_base = ofpbuf_data(buffer);
    iov.iov_len = size;
    msgh.msg_name = NULL;
    msgh.msg_namelen = 0;
    msgh.msg_iov = &iov;
    msgh.msg_iovlen = 1;
    msgh.msg_control = &cmsg_buffer;
    msgh.msg_controllen = sizeof cmsg_buffer;
    msgh.msg_flags = 0;

    /* Retry if interrupted by a signal. */
    do {
        retval = recvmsg(fd, &msgh, MSG_TRUNC);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    } else if (retval > size) {
        /* With MSG_TRUNC, recvmsg() reports the full frame length even when
         * the frame was truncated to fit, so this detects truncation. */
        return EMSGSIZE;
    }

    ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);

    /* Scan the control messages for the packet's auxdata; if it carries a
     * VLAN TCI, push the tag back into the frame. */
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
        const struct tpacket_auxdata *aux;

        if (cmsg->cmsg_level != SOL_PACKET
            || cmsg->cmsg_type != PACKET_AUXDATA
            || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
            continue;
        }

        aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
        if (auxdata_has_vlan_tci(aux)) {
            if (retval < ETH_HEADER_LEN) {
                return EINVAL;
            }

            /* Uses the headroom reserved above. */
            eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
                          htons(aux->tp_vlan_tci));
            break;
        }
    }

    return 0;
}
965
/* Receives one packet from tap file descriptor 'fd' into the tailroom of
 * 'buffer'.  Returns 0 if successful, EMSGSIZE if the frame did not fit, or
 * another positive errno value on failure. */
static int
netdev_linux_rxq_recv_tap(int fd, struct ofpbuf *buffer)
{
    size_t size = ofpbuf_tailroom(buffer);
    ssize_t retval;

    /* Retry if interrupted by a signal. */
    do {
        retval = read(fd, ofpbuf_data(buffer), size);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    }
    if (retval > size) {
        return EMSGSIZE;
    }

    ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
    return 0;
}
985
986 static int
987 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dpif_packet **packets,
988 int *c)
989 {
990 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
991 struct netdev *netdev = rx->up.netdev;
992 struct dpif_packet *packet;
993 struct ofpbuf *buffer;
994 ssize_t retval;
995 int mtu;
996
997 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
998 mtu = ETH_PAYLOAD_MAX;
999 }
1000
1001 packet = dpif_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1002 DP_NETDEV_HEADROOM);
1003 buffer = &packet->ofpbuf;
1004
1005 retval = (rx->is_tap
1006 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1007 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1008
1009 if (retval) {
1010 if (retval != EAGAIN && retval != EMSGSIZE) {
1011 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1012 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
1013 }
1014 dpif_packet_delete(packet);
1015 } else {
1016 dp_packet_pad(buffer);
1017 dpif_packet_set_dp_hash(packet, 0);
1018 packets[0] = packet;
1019 *c = 1;
1020 }
1021
1022 return retval;
1023 }
1024
1025 static void
1026 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1027 {
1028 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1029 poll_fd_wait(rx->fd, POLLIN);
1030 }
1031
1032 static int
1033 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1034 {
1035 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1036 if (rx->is_tap) {
1037 struct ifreq ifr;
1038 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1039 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1040 if (error) {
1041 return error;
1042 }
1043 drain_fd(rx->fd, ifr.ifr_qlen);
1044 return 0;
1045 } else {
1046 return drain_rcvbuf(rx->fd);
1047 }
1048 }
1049
1050 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1051 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1052 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1053 * the packet is too big or too small to transmit on the device.
1054 *
1055 * The caller retains ownership of 'buffer' in all cases.
1056 *
1057 * The kernel maintains a packet transmission queue, so the caller is not
1058 * expected to do additional queuing of packets. */
1059 static int
1060 netdev_linux_send(struct netdev *netdev_, struct dpif_packet **pkts, int cnt,
1061 bool may_steal)
1062 {
1063 int i;
1064 int error = 0;
1065
1066 /* 'i' is incremented only if there's no error */
1067 for (i = 0; i < cnt;) {
1068 const void *data = ofpbuf_data(&pkts[i]->ofpbuf);
1069 size_t size = ofpbuf_size(&pkts[i]->ofpbuf);
1070 ssize_t retval;
1071
1072 if (!is_tap_netdev(netdev_)) {
1073 /* Use our AF_PACKET socket to send to this device. */
1074 struct sockaddr_ll sll;
1075 struct msghdr msg;
1076 struct iovec iov;
1077 int ifindex;
1078 int sock;
1079
1080 sock = af_packet_sock();
1081 if (sock < 0) {
1082 return -sock;
1083 }
1084
1085 ifindex = netdev_get_ifindex(netdev_);
1086 if (ifindex < 0) {
1087 return -ifindex;
1088 }
1089
1090 /* We don't bother setting most fields in sockaddr_ll because the
1091 * kernel ignores them for SOCK_RAW. */
1092 memset(&sll, 0, sizeof sll);
1093 sll.sll_family = AF_PACKET;
1094 sll.sll_ifindex = ifindex;
1095
1096 iov.iov_base = CONST_CAST(void *, data);
1097 iov.iov_len = size;
1098
1099 msg.msg_name = &sll;
1100 msg.msg_namelen = sizeof sll;
1101 msg.msg_iov = &iov;
1102 msg.msg_iovlen = 1;
1103 msg.msg_control = NULL;
1104 msg.msg_controllen = 0;
1105 msg.msg_flags = 0;
1106
1107 retval = sendmsg(sock, &msg, 0);
1108 } else {
1109 /* Use the tap fd to send to this device. This is essential for
1110 * tap devices, because packets sent to a tap device with an
1111 * AF_PACKET socket will loop back to be *received* again on the
1112 * tap device. This doesn't occur on other interface types
1113 * because we attach a socket filter to the rx socket. */
1114 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1115
1116 retval = write(netdev->tap_fd, data, size);
1117 }
1118
1119 if (retval < 0) {
1120 /* The Linux AF_PACKET implementation never blocks waiting for room
1121 * for packets, instead returning ENOBUFS. Translate this into
1122 * EAGAIN for the caller. */
1123 error = errno == ENOBUFS ? EAGAIN : errno;
1124 if (error == EINTR) {
1125 /* continue without incrementing 'i', i.e. retry this packet */
1126 continue;
1127 }
1128 break;
1129 } else if (retval != size) {
1130 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1131 " of %"PRIuSIZE") on %s", retval, size,
1132 netdev_get_name(netdev_));
1133 error = EMSGSIZE;
1134 break;
1135 }
1136
1137 /* Process the next packet in the batch */
1138 i++;
1139 }
1140
1141 if (may_steal) {
1142 for (i = 0; i < cnt; i++) {
1143 dpif_packet_delete(pkts[i]);
1144 }
1145 }
1146
1147 if (error && error != EAGAIN) {
1148 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1149 netdev_get_name(netdev_), ovs_strerror(error));
1150 }
1151
1152 return error;
1153
1154 }
1155
/* Registers with the poll loop to wake up from the next call to poll_block()
 * when the packet transmission queue has sufficient room to transmit a packet
 * with netdev_send().
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  Thus, this function is
 * unlikely to ever be used.  It is included for completeness. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (!is_tap_netdev(netdev)) {
        return;
    }
    /* A TAP device can always accept another packet, so wake immediately. */
    poll_immediate_wake();
}
1171
/* Attempts to set 'netdev''s MAC address to 'mac'.  Returns 0 if successful,
 * otherwise a positive errno value.
 *
 * The result (including a failure) is cached under VALID_ETHERADDR so that
 * repeated calls with the same address do not re-issue the ioctl. */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_,
                           const uint8_t mac[ETH_ADDR_LEN])
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        /* Nothing to do if the last attempt failed (it would presumably fail
         * again) or the device already has this address. */
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    if (!error || error == ENODEV) {
        /* Cache success, and also ENODEV: the device is gone, so retrying
         * would not help. */
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
        }
    }

    /* Bring a tap device back up if it was up before we touched it. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1213
1214 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1215 static int
1216 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1217 uint8_t mac[ETH_ADDR_LEN])
1218 {
1219 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1220 int error;
1221
1222 ovs_mutex_lock(&netdev->mutex);
1223 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1224 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1225 netdev->etheraddr);
1226 netdev->cache_valid |= VALID_ETHERADDR;
1227 }
1228
1229 error = netdev->ether_addr_error;
1230 if (!error) {
1231 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1232 }
1233 ovs_mutex_unlock(&netdev->mutex);
1234
1235 return error;
1236 }
1237
1238 static int
1239 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1240 {
1241 int error;
1242
1243 if (!(netdev->cache_valid & VALID_MTU)) {
1244 struct ifreq ifr;
1245
1246 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1247 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1248 netdev->mtu = ifr.ifr_mtu;
1249 netdev->cache_valid |= VALID_MTU;
1250 }
1251
1252 error = netdev->netdev_mtu_error;
1253 if (!error) {
1254 *mtup = netdev->mtu;
1255 }
1256
1257 return error;
1258 }
1259
1260 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1261 * in bytes, not including the hardware header; thus, this is typically 1500
1262 * bytes for Ethernet devices. */
1263 static int
1264 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1265 {
1266 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1267 int error;
1268
1269 ovs_mutex_lock(&netdev->mutex);
1270 error = netdev_linux_get_mtu__(netdev, mtup);
1271 ovs_mutex_unlock(&netdev->mutex);
1272
1273 return error;
1274 }
1275
/* Sets the maximum size of transmitted (MTU) for given device using linux
 * networking ioctl interface.
 *
 * Returns 0 if successful, otherwise a positive errno value.  The result is
 * cached under VALID_MTU so that setting the same MTU repeatedly does not
 * re-issue the ioctl.
 */
static int
netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ifreq ifr;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev->cache_valid & VALID_MTU) {
        error = netdev->netdev_mtu_error;
        /* Skip the ioctl if the last attempt failed or the MTU already
         * matches. */
        if (error || netdev->mtu == mtu) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_MTU;
    }
    ifr.ifr_mtu = mtu;
    error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
                                SIOCSIFMTU, "SIOCSIFMTU");
    if (!error || error == ENODEV) {
        /* Cache success, and also ENODEV (the device is gone; retrying would
         * not help). */
        netdev->netdev_mtu_error = error;
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }
exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1306
1307 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1308 * On failure, returns a negative errno value. */
1309 static int
1310 netdev_linux_get_ifindex(const struct netdev *netdev_)
1311 {
1312 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1313 int ifindex, error;
1314
1315 ovs_mutex_lock(&netdev->mutex);
1316 error = get_ifindex(netdev_, &ifindex);
1317 ovs_mutex_unlock(&netdev->mutex);
1318
1319 return error ? -error : ifindex;
1320 }
1321
1322 static int
1323 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1324 {
1325 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1326
1327 ovs_mutex_lock(&netdev->mutex);
1328 if (netdev->miimon_interval > 0) {
1329 *carrier = netdev->miimon;
1330 } else {
1331 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1332 }
1333 ovs_mutex_unlock(&netdev->mutex);
1334
1335 return 0;
1336 }
1337
1338 static long long int
1339 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1340 {
1341 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1342 long long int carrier_resets;
1343
1344 ovs_mutex_lock(&netdev->mutex);
1345 carrier_resets = netdev->carrier_resets;
1346 ovs_mutex_unlock(&netdev->mutex);
1347
1348 return carrier_resets;
1349 }
1350
/* Issues MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name',
 * using 'data' for both input and output.  Returns 0 if successful, otherwise
 * a positive errno value. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    /* NOTE(review): the MII data is copied into the ifreq itself (over the
     * 'ifr_data' storage) rather than pointed to by 'ifr_data'; the kernel's
     * MII ioctls appear to read and write the mii_ioctl_data embedded in the
     * ifreq union -- confirm against the kernel's if_mii() helper. */
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    /* Copy back whatever the kernel wrote (e.g. phy_id, val_out). */
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
1365
/* Sets '*miimon' to true if the link on device 'name' is up, false otherwise.
 * Queries the MII registers first and falls back to ethtool's ETHTOOL_GLINK
 * when MII is unavailable.  Returns 0 if successful, otherwise a positive
 * errno value (in which case '*miimon' remains false). */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            /* Basic Mode Status Register: link status bit. */
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        } else {
            VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
        }
    } else {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* ETHTOOL_GLINK returns a struct ethtool_value, but the ioctl
             * was issued through an ethtool_cmd buffer; copy its leading
             * bytes out as an ethtool_value to read the link status. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
1409
/* Sets the MII monitoring interval for 'netdev_' to 'interval' milliseconds.
 * A positive interval is clamped to at least 100 ms; zero or negative
 * disables miimon for this device.  Always returns 0. */
static int
netdev_linux_set_miimon_interval(struct netdev *netdev_,
                                 long long int interval)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    ovs_mutex_lock(&netdev->mutex);
    interval = interval > 0 ? MAX(interval, 100) : 0;
    if (netdev->miimon_interval != interval) {
        /* Keep the global count of devices with miimon enabled in sync; it
         * changes only on the off->on and on->off transitions. */
        if (interval && !netdev->miimon_interval) {
            atomic_count_inc(&miimon_cnt);
        } else if (!interval && netdev->miimon_interval) {
            atomic_count_dec(&miimon_cnt);
        }

        netdev->miimon_interval = interval;
        /* Expire the timer so the next netdev_linux_miimon_run() polls this
         * device immediately. */
        timer_set_expired(&netdev->miimon_timer);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;
}
1432
/* Polls the MII link status of every netdev-linux device whose miimon timer
 * has expired, recording changes and notifying watchers via
 * netdev_linux_changed(). */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            /* netdev_linux_get_miimon() sets 'miimon' to false first, so it
             * is defined even if the query fails. */
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                dev->miimon = miimon;
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            /* Re-arm the timer for the next poll. */
            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* netdev_get_devices() took a reference on each device; drop it. */
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
1462
/* Registers with the poll loop to wake up when any device's miimon timer
 * next expires. */
static void
netdev_linux_miimon_wait(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0) {
            timer_wait(&dev->miimon_timer);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* Drop the reference taken by netdev_get_devices(). */
        netdev_close(netdev);
    }
    shash_destroy(&device_shash);
}
1484
/* Exchanges the values of '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved = *b;

    *b = *a;
    *a = saved;
}
1492
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is allowed to be misaligned. */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct ovs_vport_stats *src)
{
    /* The vport layer tracks only these eight basic counters; read them with
     * get_32aligned_u64() since 'src' may be only 32-bit aligned. */
    dst->rx_packets = get_32aligned_u64(&src->rx_packets);
    dst->tx_packets = get_32aligned_u64(&src->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&src->rx_errors);
    dst->tx_errors = get_32aligned_u64(&src->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
    /* Counters the vport layer does not track are reported as zero. */
    dst->multicast = 0;
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
}
1522
1523 static int
1524 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1525 {
1526 struct dpif_linux_vport reply;
1527 struct ofpbuf *buf;
1528 int error;
1529
1530 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1531 if (error) {
1532 return error;
1533 } else if (!reply.stats) {
1534 ofpbuf_delete(buf);
1535 return EOPNOTSUPP;
1536 }
1537
1538 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1539
1540 ofpbuf_delete(buf);
1541
1542 return 0;
1543 }
1544
/* Attempts to fill 'stats' from the vport layer, recording the outcome in
 * netdev->vport_stats_error (cached under VALID_VPORT_STAT_ERROR).  A
 * previous failure is retried only until the error has been cached. */
static void
get_stats_via_vport(const struct netdev *netdev_,
                    struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (!netdev->vport_stats_error ||
        !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
        int error;

        error = get_stats_via_vport__(netdev_, stats);
        /* ENOENT simply means the device is not an OVS vport, which is
         * normal, so don't log it. */
        if (error && error != ENOENT) {
            VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
                         "(%s)",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
        netdev->vport_stats_error = error;
        netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
    }
}
1565
/* Retrieves current device stats for 'netdev-linux'.
 *
 * Merges two sources: vport-layer stats (via get_stats_via_vport(), which
 * fills 'stats' directly) and kernel netdev stats over netlink.  Succeeds if
 * either source is available. */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink stats unavailable; still succeed if the vport stats
         * already placed in 'stats' are valid. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Use kernel netdev's packet and byte counts since vport's counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO are
         * enabled. */
        stats->rx_packets = dev_stats.rx_packets;
        stats->rx_bytes = dev_stats.rx_bytes;
        stats->tx_packets = dev_stats.tx_packets;
        stats->tx_bytes = dev_stats.tx_bytes;

        /* Error counters are accumulated from both sources. */
        stats->rx_errors += dev_stats.rx_errors;
        stats->tx_errors += dev_stats.tx_errors;
        stats->rx_dropped += dev_stats.rx_dropped;
        stats->tx_dropped += dev_stats.tx_dropped;
        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
        stats->rx_length_errors += dev_stats.rx_length_errors;
        stats->rx_over_errors += dev_stats.rx_over_errors;
        stats->rx_crc_errors += dev_stats.rx_crc_errors;
        stats->rx_frame_errors += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1616
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink stats unavailable; still succeed if the vport stats
         * already placed in 'stats' are valid. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer.  For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        /* Detailed error counters are meaningless from the switch's
         * perspective for a tap/internal device, so report zeroes. */
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped += dev_stats.tx_dropped;
        stats->tx_dropped += dev_stats.rx_dropped;

        stats->rx_errors += dev_stats.tx_errors;
        stats->tx_errors += dev_stats.rx_errors;

        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1678
1679 static int
1680 netdev_internal_get_stats(const struct netdev *netdev_,
1681 struct netdev_stats *stats)
1682 {
1683 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1684 int error;
1685
1686 ovs_mutex_lock(&netdev->mutex);
1687 get_stats_via_vport(netdev_, stats);
1688 error = netdev->vport_stats_error;
1689 ovs_mutex_unlock(&netdev->mutex);
1690
1691 return error;
1692 }
1693
/* Pushes 'stats' down to the vport layer for 'netdev' via an
 * OVS_VPORT_CMD_SET transaction.  Returns 0 if successful, otherwise a
 * positive errno value (ENODEV is translated to EOPNOTSUPP, see below). */
static int
netdev_internal_set_stats(struct netdev *netdev,
                          const struct netdev_stats *stats)
{
    struct ovs_vport_stats vport_stats;
    struct dpif_linux_vport vport;
    int err;

    /* Convert to the vport stats layout; the destination may be only 32-bit
     * aligned, hence put_32aligned_u64(). */
    put_32aligned_u64(&vport_stats.rx_packets, stats->rx_packets);
    put_32aligned_u64(&vport_stats.tx_packets, stats->tx_packets);
    put_32aligned_u64(&vport_stats.rx_bytes, stats->rx_bytes);
    put_32aligned_u64(&vport_stats.tx_bytes, stats->tx_bytes);
    put_32aligned_u64(&vport_stats.rx_errors, stats->rx_errors);
    put_32aligned_u64(&vport_stats.tx_errors, stats->tx_errors);
    put_32aligned_u64(&vport_stats.rx_dropped, stats->rx_dropped);
    put_32aligned_u64(&vport_stats.tx_dropped, stats->tx_dropped);

    dpif_linux_vport_init(&vport);
    vport.cmd = OVS_VPORT_CMD_SET;
    vport.name = netdev_get_name(netdev);
    vport.stats = &vport_stats;

    err = dpif_linux_vport_transact(&vport, NULL, NULL);

    /* If the vport layer doesn't know about the device, that doesn't mean it
     * doesn't exist (after all were able to open it when netdev_open() was
     * called), it just means that it isn't attached and we'll be getting
     * stats a different way. */
    if (err == ENODEV) {
        err = EOPNOTSUPP;
    }

    return err;
}
1728
/* Reads the device's link-mode information via the ETHTOOL_GSET ioctl and
 * caches it as NETDEV_F_* bitmaps in netdev->supported, netdev->advertised
 * and netdev->current.  The outcome (including any error) is cached under
 * VALID_FEATURES; on error the feature fields are left untouched and
 * netdev->get_features_error records the errno value.  The caller must hold
 * 'netdev->mutex'. */
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
    struct ethtool_cmd ecmd;
    uint32_t speed;
    int error;

    /* Already cached (successfully or not); nothing to do. */
    if (netdev->cache_valid & VALID_FEATURES) {
        return;
    }

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto out;
    }

    /* Supported features. */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    }
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Half) {
        netdev->supported |= NETDEV_F_100MB_HD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Full) {
        netdev->supported |= NETDEV_F_1GB_FD;
    }
    if (ecmd.supported & SUPPORTED_10000baseT_Full) {
        netdev->supported |= NETDEV_F_10GB_FD;
    }
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    }
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    }
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    }
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    }
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    }

    /* Advertised features. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    }
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    }
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    }
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    }
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    }

    /* Current settings.
     *
     * NOTE(review): on kernels where ethtool_cmd splits the speed into
     * 'speed' and 'speed_hi', only the low bits are consulted here, so the
     * 40G/100G/1T comparisons below may never match -- confirm against the
     * kernel's ethtool_cmd_speed() helper. */
    speed = ecmd.speed;
    if (speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    } else if (speed == 40000) {
        /* Literal values because SPEED_40000 and friends may be missing from
         * older kernel headers. */
        netdev->current = NETDEV_F_40GB_FD;
    } else if (speed == 100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (speed == 1000000) {
        netdev->current = NETDEV_F_1TB_FD;
    } else {
        netdev->current = 0;
    }

    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
    }

    if (ecmd.autoneg) {
        netdev->current |= NETDEV_F_AUTONEG;
    }

out:
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
}
1860
1861 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1862 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1863 * Returns 0 if successful, otherwise a positive errno value. */
1864 static int
1865 netdev_linux_get_features(const struct netdev *netdev_,
1866 enum netdev_features *current,
1867 enum netdev_features *advertised,
1868 enum netdev_features *supported,
1869 enum netdev_features *peer)
1870 {
1871 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1872 int error;
1873
1874 ovs_mutex_lock(&netdev->mutex);
1875 netdev_linux_read_features(netdev);
1876 if (!netdev->get_features_error) {
1877 *current = netdev->current;
1878 *advertised = netdev->advertised;
1879 *supported = netdev->supported;
1880 *peer = 0; /* XXX */
1881 }
1882 error = netdev->get_features_error;
1883 ovs_mutex_unlock(&netdev->mutex);
1884
1885 return error;
1886 }
1887
/* Set the features advertised by 'netdev' to 'advertise'.
 *
 * Reads the current ethtool settings (ETHTOOL_GSET), rewrites the
 * 'advertising' bitmap from the NETDEV_F_* bits in 'advertise', and writes
 * the settings back (ETHTOOL_SSET).  Returns 0 if successful, otherwise a
 * positive errno value. */
static int
netdev_linux_set_advertisements(struct netdev *netdev_,
                                enum netdev_features advertise)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ethtool_cmd ecmd;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    /* Fetch current settings first so that all fields other than
     * 'advertising' are written back unchanged. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto exit;
    }

    /* Translate NETDEV_F_* bits to the kernel's ADVERTISED_* bits. */
    ecmd.advertising = 0;
    if (advertise & NETDEV_F_10MB_HD) {
        ecmd.advertising |= ADVERTISED_10baseT_Half;
    }
    if (advertise & NETDEV_F_10MB_FD) {
        ecmd.advertising |= ADVERTISED_10baseT_Full;
    }
    if (advertise & NETDEV_F_100MB_HD) {
        ecmd.advertising |= ADVERTISED_100baseT_Half;
    }
    if (advertise & NETDEV_F_100MB_FD) {
        ecmd.advertising |= ADVERTISED_100baseT_Full;
    }
    if (advertise & NETDEV_F_1GB_HD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Half;
    }
    if (advertise & NETDEV_F_1GB_FD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Full;
    }
    if (advertise & NETDEV_F_10GB_FD) {
        ecmd.advertising |= ADVERTISED_10000baseT_Full;
    }
    if (advertise & NETDEV_F_COPPER) {
        ecmd.advertising |= ADVERTISED_TP;
    }
    if (advertise & NETDEV_F_FIBER) {
        ecmd.advertising |= ADVERTISED_FIBRE;
    }
    if (advertise & NETDEV_F_AUTONEG) {
        ecmd.advertising |= ADVERTISED_Autoneg;
    }
    if (advertise & NETDEV_F_PAUSE) {
        ecmd.advertising |= ADVERTISED_Pause;
    }
    if (advertise & NETDEV_F_PAUSE_ASYM) {
        ecmd.advertising |= ADVERTISED_Asym_Pause;
    }
    COVERAGE_INC(netdev_set_ethtool);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_SSET, "ETHTOOL_SSET");

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1952
/* Attempts to set input rate limiting (policing) policy.  Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * 'kbits_rate' of 0 disables policing.  The settings are applied by removing
 * any existing ingress qdisc and, if a rate was given, installing a fresh
 * ingress qdisc with a policer attached.  The outcome is cached under
 * VALID_POLICING so identical settings are not re-applied. */
static int
netdev_linux_set_policing(struct netdev *netdev_,
                          uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int error;

    kbits_burst = (!kbits_rate ? 0       /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
                   : kbits_burst);       /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev->cache_valid & VALID_POLICING) {
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kbits_burst == kbits_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        netdev->cache_valid &= ~VALID_POLICING;
    }

    COVERAGE_INC(netdev_set_policing);
    /* Remove any existing ingress qdisc. */
    error = tc_add_del_ingress_qdisc(netdev_, false);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate) {
        /* Install a fresh ingress qdisc, then attach the policer to it. */
        error = tc_add_del_ingress_qdisc(netdev_, true);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }
    }

    netdev->kbits_rate = kbits_rate;
    netdev->kbits_burst = kbits_burst;

out:
    /* Cache success and ENODEV; other errors are retried next time. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2014
2015 static int
2016 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2017 struct sset *types)
2018 {
2019 const struct tc_ops *const *opsp;
2020
2021 for (opsp = tcs; *opsp != NULL; opsp++) {
2022 const struct tc_ops *ops = *opsp;
2023 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2024 sset_add(types, ops->ovs_name);
2025 }
2026 }
2027 return 0;
2028 }
2029
2030 static const struct tc_ops *
2031 tc_lookup_ovs_name(const char *name)
2032 {
2033 const struct tc_ops *const *opsp;
2034
2035 for (opsp = tcs; *opsp != NULL; opsp++) {
2036 const struct tc_ops *ops = *opsp;
2037 if (!strcmp(name, ops->ovs_name)) {
2038 return ops;
2039 }
2040 }
2041 return NULL;
2042 }
2043
2044 static const struct tc_ops *
2045 tc_lookup_linux_name(const char *name)
2046 {
2047 const struct tc_ops *const *opsp;
2048
2049 for (opsp = tcs; *opsp != NULL; opsp++) {
2050 const struct tc_ops *ops = *opsp;
2051 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2052 return ops;
2053 }
2054 }
2055 return NULL;
2056 }
2057
/* Looks up the queue with id 'queue_id' in 'netdev_''s installed traffic
 * control state, using the caller-computed 'hash' (must be
 * hash_int(queue_id, 0)).  Returns the queue, or NULL if none exists.
 * Caller must hold netdev->mutex and 'netdev->tc' must be populated. */
static struct tc_queue *
tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
                size_t hash)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct tc_queue *queue;

    HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
        if (queue->queue_id == queue_id) {
            return queue;
        }
    }
    return NULL;
}
2072
/* Convenience wrapper for tc_find_queue__() that computes the hash from
 * 'queue_id' itself. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2078
2079 static int
2080 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2081 const char *type,
2082 struct netdev_qos_capabilities *caps)
2083 {
2084 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2085 if (!ops) {
2086 return EOPNOTSUPP;
2087 }
2088 caps->n_queues = ops->n_queues;
2089 return 0;
2090 }
2091
/* Retrieves the OVS name of 'netdev_''s installed QoS discipline into
 * '*typep' and its configuration into 'details'.  Returns 0 on success,
 * otherwise a positive errno value from querying the kernel qdisc. */
static int
netdev_linux_get_qos(const struct netdev *netdev_,
                     const char **typep, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        *typep = netdev->tc->ops->ovs_name;
        /* A discipline without a 'qdisc_get' hook has no configuration to
         * report, so 'details' is left empty in that case. */
        error = (netdev->tc->ops->qdisc_get
                 ? netdev->tc->ops->qdisc_get(netdev_, details)
                 : 0);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2111
/* Replaces 'netdev_''s QoS configuration with discipline 'type' configured
 * according to 'details'.  If 'type' is already installed, it is
 * reconfigured in place; otherwise the existing qdisc is deleted and a new
 * one installed.  Returns 0 on success, EOPNOTSUPP if 'type' is unknown or
 * cannot be installed, or another positive errno value. */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same discipline: reconfigure in place if supported; a discipline
         * with no 'qdisc_set' hook has nothing to reconfigure. */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            goto exit;
        }
        /* tc_del_qdisc() is expected to clear the cached tc state. */
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc. */
        error = new_ops->tc_install(netdev_, details);
        /* 'tc_install' must populate 'netdev->tc' exactly on success. */
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2150
/* Retrieves the configuration of queue 'queue_id' on 'netdev_' into
 * 'details'.  Returns 0 on success, ENOENT if the queue does not exist, or
 * another positive errno value from querying the qdisc. */
static int
netdev_linux_get_queue(const struct netdev *netdev_,
                       unsigned int queue_id, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
        error = (queue
                ? netdev->tc->ops->class_get(netdev_, queue, details)
                : ENOENT);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2170
/* Configures (creating if necessary) queue 'queue_id' on 'netdev_' from
 * 'details'.  Returns 0 on success, EINVAL if 'queue_id' is out of range or
 * the installed discipline does not support per-queue configuration, or
 * another positive errno value. */
static int
netdev_linux_set_queue(struct netdev *netdev_,
                       unsigned int queue_id, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        error = (queue_id < netdev->tc->ops->n_queues
                 && netdev->tc->ops->class_set
                 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
                 : EINVAL);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2190
/* Deletes queue 'queue_id' from 'netdev_'.  Returns 0 on success, ENOENT if
 * no such queue exists, EINVAL if the installed discipline does not support
 * deleting queues, or another positive errno value. */
static int
netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_delete) {
            struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
            error = (queue
                     ? netdev->tc->ops->class_delete(netdev_, queue)
                     : ENOENT);
        } else {
            error = EINVAL;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2213
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into 'stats'.
 * Returns 0 on success, ENOENT if the queue does not exist, EOPNOTSUPP if
 * the installed discipline cannot report per-queue statistics, or another
 * positive errno value. */
static int
netdev_linux_get_queue_stats(const struct netdev *netdev_,
                             unsigned int queue_id,
                             struct netdev_queue_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get_stats) {
            const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
            if (queue) {
                /* The creation time is tracked in userspace, not by the
                 * kernel, so fill it in before delegating. */
                stats->created = queue->created;
                error = netdev->tc->ops->class_get_stats(netdev_, queue,
                                                         stats);
            } else {
                error = ENOENT;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2242
/* State for an in-progress Netlink dump of a netdev's traffic classes.
 * Initialized by start_queue_dump() and released by finish_queue_dump(). */
struct queue_dump_state {
    struct nl_dump dump;        /* Ongoing RTM_GETTCLASS dump. */
    struct ofpbuf buf;          /* Reusable receive buffer for the dump. */
};
2247
/* Starts a Netlink dump of all traffic classes (queues) on 'netdev',
 * initializing '*state'.  Returns true on success; returns false if the
 * dump request could not be composed (tc_make_request() failed).  On
 * success the caller must eventually call finish_queue_dump(). */
static bool
start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
    if (!tcmsg) {
        return false;
    }
    tcmsg->tcm_parent = 0;
    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
    ofpbuf_uninit(&request);

    ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
    return true;
}
2265
/* Completes a queue dump started by start_queue_dump(), freeing its buffer.
 * Returns 0 if the dump completed cleanly, otherwise a positive errno
 * value from nl_dump_done(). */
static int
finish_queue_dump(struct queue_dump_state *state)
{
    ofpbuf_uninit(&state->buf);
    return nl_dump_done(&state->dump);
}
2272
/* Iterator state for netdev_linux_queue_dump_{start,next,done}(): a
 * snapshot of the queue IDs taken at dump start. */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Array of queue IDs (owned). */
    size_t cur_queue;           /* Index of next queue to visit. */
    size_t n_queues;            /* Number of elements in 'queues'. */
};
2278
/* Begins iterating over 'netdev_''s queues, storing iterator state in
 * '*statep'.  Takes a snapshot of the current queue IDs so that iteration
 * is robust against concurrent queue changes; each ID is re-looked-up in
 * netdev_linux_queue_dump_next().  Returns 0 on success, EOPNOTSUPP if the
 * installed discipline cannot report queue configuration, or another
 * positive errno value. */
static int
netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
{
    const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get) {
            struct netdev_linux_queue_state *state;
            struct tc_queue *queue;
            size_t i;

            *statep = state = xmalloc(sizeof *state);
            state->n_queues = hmap_count(&netdev->tc->queues);
            state->cur_queue = 0;
            state->queues = xmalloc(state->n_queues * sizeof *state->queues);

            i = 0;
            HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
                state->queues[i++] = queue->queue_id;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2310
/* Advances the queue iterator in 'state_', storing the next queue's ID in
 * '*queue_idp' and its configuration in 'details'.  Queues deleted since
 * the snapshot was taken are silently skipped.  Returns 0 on success, EOF
 * when iteration is complete, or a positive errno value from 'class_get'. */
static int
netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
                             unsigned int *queue_idp, struct smap *details)
{
    const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_linux_queue_state *state = state_;
    int error = EOF;

    ovs_mutex_lock(&netdev->mutex);
    while (state->cur_queue < state->n_queues) {
        unsigned int queue_id = state->queues[state->cur_queue++];
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);

        if (queue) {
            *queue_idp = queue_id;
            error = netdev->tc->ops->class_get(netdev_, queue, details);
            break;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2334
/* Releases queue iterator state allocated by
 * netdev_linux_queue_dump_start().  Always returns 0. */
static int
netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
                             void *state_)
{
    struct netdev_linux_queue_state *state = state_;

    free(state->queues);
    free(state);
    return 0;
}
2345
/* Invokes 'cb' (with auxiliary data 'aux') once for the statistics of each
 * queue on 'netdev_', by dumping the kernel's traffic classes and letting
 * the installed discipline's 'class_dump_stats' hook decode each message.
 * Returns 0 on success, EOPNOTSUPP if the discipline cannot dump stats,
 * ENODEV if the dump could not be started, or another positive errno
 * value; per-class decode errors do not stop the dump, the last one wins. */
static int
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
                              netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct queue_dump_state state;

        if (!netdev->tc->ops->class_dump_stats) {
            error = EOPNOTSUPP;
        } else if (!start_queue_dump(netdev_, &state)) {
            error = ENODEV;
        } else {
            struct ofpbuf msg;
            int retval;

            while (nl_dump_next(&state.dump, &msg, &state.buf)) {
                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
                                                           cb, aux);
                if (retval) {
                    error = retval;
                }
            }

            retval = finish_queue_dump(&state);
            if (retval) {
                error = retval;
            }
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2384
/* Retrieves 'netdev_''s IPv4 address and netmask into '*address' and
 * '*netmask', caching the result (VALID_IN4) so the ioctls run only once.
 * Returns 0 on success, EADDRNOTAVAIL if no address is assigned, or a
 * positive errno value from the SIOCGIFADDR/SIOCGIFNETMASK ioctls. */
static int
netdev_linux_get_in4(const struct netdev *netdev_,
                     struct in_addr *address, struct in_addr *netmask)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_IN4)) {
        error = netdev_linux_get_ipv4(netdev_, &netdev->address,
                                      SIOCGIFADDR, "SIOCGIFADDR");
        if (!error) {
            error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
                                          SIOCGIFNETMASK, "SIOCGIFNETMASK");
            if (!error) {
                netdev->cache_valid |= VALID_IN4;
            }
        }
    } else {
        error = 0;
    }

    if (!error) {
        /* A cached INADDR_ANY means "no address assigned". */
        if (netdev->address.s_addr != INADDR_ANY) {
            *address = netdev->address;
            *netmask = netdev->netmask;
        } else {
            error = EADDRNOTAVAIL;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2419
2420 static int
2421 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2422 struct in_addr netmask)
2423 {
2424 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2425 int error;
2426
2427 ovs_mutex_lock(&netdev->mutex);
2428 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2429 if (!error) {
2430 netdev->cache_valid |= VALID_IN4;
2431 netdev->address = address;
2432 netdev->netmask = netmask;
2433 if (address.s_addr != INADDR_ANY) {
2434 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2435 "SIOCSIFNETMASK", netmask);
2436 }
2437 }
2438 ovs_mutex_unlock(&netdev->mutex);
2439
2440 return error;
2441 }
2442
/* Parses one line of /proc/net/if_inet6: 32 hex digits of IPv6 address,
 * four fields that are skipped (index, prefix length, scope, flags), then
 * the interface name.  On success stores the address into '*in6' and the
 * name into 'ifname' (at most 16 bytes plus a terminator) and returns
 * true; returns false if the line does not match. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
#define X8 "%2"SCNx8
    return ovs_scan(line,
                    " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                    "%*x %*x %*x %*x %16s\n",
                    &s6[0], &s6[1], &s6[2], &s6[3],
                    &s6[4], &s6[5], &s6[6], &s6[7],
                    &s6[8], &s6[9], &s6[10], &s6[11],
                    &s6[12], &s6[13], &s6[14], &s6[15],
                    ifname);
}
2458
/* Stores 'netdev_''s IPv6 address into '*in6', reading it from
 * /proc/net/if_inet6 on the first call and caching it (VALID_IN6)
 * afterward.  If the device has no IPv6 address, or /proc/net/if_inet6
 * cannot be opened, '*in6' is set to in6addr_any.  Always returns 0. */
static int
netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_IN6)) {
        FILE *file;
        char line[128];

        netdev->in6 = in6addr_any;

        file = fopen("/proc/net/if_inet6", "r");
        if (file != NULL) {
            const char *name = netdev_get_name(netdev_);
            while (fgets(line, sizeof line, file)) {
                struct in6_addr in6_tmp;
                char ifname[16 + 1];
                if (parse_if_inet6_line(line, &in6_tmp, ifname)
                    && !strcmp(name, ifname))
                {
                    /* First matching line wins. */
                    netdev->in6 = in6_tmp;
                    break;
                }
            }
            fclose(file);
        }
        netdev->cache_valid |= VALID_IN6;
    }
    *in6 = netdev->in6;
    ovs_mutex_unlock(&netdev->mutex);

    return 0;
}
2495
/* Fills '*sa' with an AF_INET socket address holding 'addr' and a zero
 * port number; any trailing bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2508
/* Issues interface ioctl 'ioctl_nr' ('ioctl_name' is used for logging by
 * the helper) on 'netdev' with 'addr' as the AF_INET address argument.
 * Returns 0 on success or a positive errno value. */
static int
do_set_addr(struct netdev *netdev,
            int ioctl_nr, const char *ioctl_name, struct in_addr addr)
{
    struct ifreq ifr;

    make_in4_sockaddr(&ifr.ifr_addr, addr);
    return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
                               ioctl_name);
}
2519
/* Adds 'router' as a default IP gateway (destination 0.0.0.0/0) via the
 * SIOCADDRT ioctl.  The 'netdev' argument is unused: the kernel chooses
 * the output device from the gateway address.  Returns 0 on success or a
 * positive errno value (also logged). */
static int
netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
{
    struct in_addr any = { INADDR_ANY };
    struct rtentry rt;
    int error;

    memset(&rt, 0, sizeof rt);
    make_in4_sockaddr(&rt.rt_dst, any);
    make_in4_sockaddr(&rt.rt_gateway, router);
    make_in4_sockaddr(&rt.rt_genmask, any);
    rt.rt_flags = RTF_UP | RTF_GATEWAY;
    error = af_inet_ioctl(SIOCADDRT, &rt);
    if (error) {
        VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
    }
    return error;
}
2539
/* Looks up the next hop for 'host' by scanning /proc/net/route.  On a
 * match, stores the gateway address (or 0 if 'host' is directly reachable)
 * into '*next_hop', a malloc'd copy of the output device name into
 * '*netdev_name' (caller frees), and returns 0.  Returns ENXIO if no route
 * matches, or errno if the route table cannot be opened.  The first
 * matching table entry wins; no longest-prefix ordering is applied here. */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        if (++ln >= 2) {
            /* Lines 2+ are route entries; line 1 is the column header. */
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                             fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* The output of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need any endian
             * conversions here. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    fclose(stream);
    return ENXIO;
}
2599
/* Populates 'smap' with driver name, driver version, and firmware version
 * for 'netdev_', fetched once via the ETHTOOL_GDRVINFO ioctl and cached
 * (VALID_DRVINFO) thereafter.  Returns 0 on success or a positive errno
 * value from the ethtool request. */
static int
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* The ethtool helper takes a 'struct ethtool_cmd *', but
         * ETHTOOL_GDRVINFO actually fills in a 'struct ethtool_drvinfo';
         * 'drvinfo' is stored that way and reinterpreted here. */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        cmd,
                                        ETHTOOL_GDRVINFO,
                                        "ETHTOOL_GDRVINFO");
        if (!error) {
            netdev->cache_valid |= VALID_DRVINFO;
        }
    }

    if (!error) {
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
        smap_add(smap, "driver_version", netdev->drvinfo.version);
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2630
/* Status hook for "internal" devices: these are implemented by Open
 * vSwitch itself, so report a fixed driver name.  Always returns 0. */
static int
netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
                           struct smap *smap)
{
    smap_add(smap, "driver_name", "openvswitch");
    return 0;
}
2638
/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
 * returns 0. Otherwise, it returns a positive errno value; in particular,
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
static int
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
{
    struct arpreq r;
    struct sockaddr_in sin;
    int retval;

    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    sin.sin_port = 0;
    memcpy(&r.arp_pa, &sin, sizeof sin);
    r.arp_ha.sa_family = ARPHRD_ETHER;
    r.arp_flags = 0;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
    if (!retval) {
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO ("no such entry") is an expected outcome, so only log
         * other failures. */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
    }
    return retval;
}
2671
2672 static int
2673 nd_to_iff_flags(enum netdev_flags nd)
2674 {
2675 int iff = 0;
2676 if (nd & NETDEV_UP) {
2677 iff |= IFF_UP;
2678 }
2679 if (nd & NETDEV_PROMISC) {
2680 iff |= IFF_PROMISC;
2681 }
2682 if (nd & NETDEV_LOOPBACK) {
2683 iff |= IFF_LOOPBACK;
2684 }
2685 return iff;
2686 }
2687
2688 static int
2689 iff_to_nd_flags(int iff)
2690 {
2691 enum netdev_flags nd = 0;
2692 if (iff & IFF_UP) {
2693 nd |= NETDEV_UP;
2694 }
2695 if (iff & IFF_PROMISC) {
2696 nd |= NETDEV_PROMISC;
2697 }
2698 if (iff & IFF_LOOPBACK) {
2699 nd |= NETDEV_LOOPBACK;
2700 }
2701 return nd;
2702 }
2703
/* Clears the flags in 'off' and sets those in 'on' on 'netdev', storing the
 * previous flags in '*old_flagsp'.  No ioctl is issued if the result equals
 * the current flags.  Returns 0 on success or a positive errno value from
 * set_flags().  Caller must hold netdev->mutex. */
static int
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
{
    int old_flags, new_flags;
    int error = 0;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* Best-effort refresh of the cached flags; the get_flags() return
         * value is deliberately ignored here. */
        get_flags(&netdev->up, &netdev->ifi_flags);
    }

    return error;
}
2722
/* Provider 'update_flags' hook: locking wrapper around update_flags(). */
static int
netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
                          enum netdev_flags on, enum netdev_flags *old_flagsp)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = update_flags(netdev, off, on, old_flagsp);
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2736
/* Expands to a 'struct netdev_class' initializer for a Linux-based netdev
 * class named NAME, parameterized by the hooks that differ among the
 * "system", "tap", and "internal" classes.  The entries are positional, so
 * their order must match the member order of 'struct netdev_class' in
 * netdev-provider.h. */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS,  \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
                                                                \
    NULL,                                                       \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    netdev_linux_alloc,                                         \
    CONSTRUCT,                                                  \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
    NULL,                       /* get_numa_id */               \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    SET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \
}
2804
/* The "system" class: ordinary Linux network devices. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_construct,
        netdev_linux_get_stats,
        NULL,                    /* set_stats */
        netdev_linux_get_features,
        netdev_linux_get_status);

/* The "tap" class: userspace TUN/TAP devices. */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_construct_tap,
        netdev_tap_get_stats,
        NULL,                   /* set_stats */
        netdev_linux_get_features,
        netdev_linux_get_status);

/* The "internal" class: devices implemented by Open vSwitch itself. */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_construct,
        netdev_internal_get_stats,
        netdev_internal_set_stats,
        NULL,                  /* get_features */
        netdev_internal_get_status);
2831 \f
2832 /* HTB traffic control class. */
2833
/* Number of queue IDs supported by the HTB discipline (queue N maps to
 * class minor number N + 1). */
#define HTB_N_QUEUES 0xf000

/* HTB instantiation of 'struct tc': per-netdev HTB qdisc state. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};

/* One HTB class (queue) and its configuration. */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
2848
/* Returns the 'struct htb' embedding 'netdev_''s installed tc state.
 * Only valid when the HTB discipline is installed on 'netdev_'. */
static struct htb *
htb_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct htb, tc);
}
2855
/* Allocates and records HTB tc state for 'netdev_' with the given
 * 'max_rate' (bytes/s).  NOTE(review): 'max_rate' is uint64_t here but is
 * stored in the unsigned int 'htb->max_rate'; rates above UINT_MAX bytes/s
 * would silently truncate — confirm against callers' value ranges. */
static void
htb_install__(struct netdev *netdev_, uint64_t max_rate)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct htb *htb;

    htb = xmalloc(sizeof *htb);
    tc_init(&htb->tc, &tc_ops_htb);
    htb->max_rate = max_rate;

    netdev->tc = &htb->tc;
}
2868
/* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 * Any existing root qdisc is deleted first.  Returns 0 on success, ENODEV
 * if the request could not be composed, or another positive errno value
 * from the Netlink transaction. */
static int
htb_setup_qdisc__(struct netdev *netdev)
{
    size_t opt_offset;
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = 10;
    opt.version = 3;
    opt.defcls = 1;             /* Unclassified traffic goes to class 1:1. */

    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    return tc_transact(&request, NULL);
}
2903
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * Returns 0 on success, ENODEV if the request could not be composed, or
 * another positive errno value (failures are also logged). */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    /* The MTU is needed to compute rate tables and burst buffers. */
    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
2957
/* Parses Netlink attributes in 'nl_options' for HTB parameters and stores
 * them into 'class' (min_rate, max_rate, burst, priority), matching the
 * specification given in the vswitch database documentation for linux-htb
 * queue details.  Returns 0 on success or EPROTO if the attributes cannot
 * be parsed. */
static int
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
{
    static const struct nl_policy tca_htb_policy[] = {
        [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
                            .min_len = sizeof(struct tc_htb_opt) },
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
    const struct tc_htb_opt *htb;

    if (!nl_parse_nested(nl_options, tca_htb_policy,
                         attrs, ARRAY_SIZE(tca_htb_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HTB class options");
        return EPROTO;
    }

    htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
    class->min_rate = htb->rate.rate;
    class->max_rate = htb->ceil.rate;
    /* The kernel reports the burst as a time-based 'buffer'; convert it
     * back to bytes using the configured rate. */
    class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
    class->priority = htb->prio;
    return 0;
}
2986
/* Parses an RTM_NEWTCLASS-style message in 'tcmsg' describing one HTB
 * class.  If 'queue_id' is nonnull, derives the queue ID from the class
 * handle (major must be 1, minor maps to queue minor - 1).  If 'options'
 * or 'stats' is nonnull, fills it from the message.  Returns 0 on success
 * or EPROTO on a malformed message or unexpected handle. */
static int
htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                  struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct nlattr *nl_options;
    unsigned int handle;
    int error;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (!error && queue_id) {
        unsigned int major = tc_get_major(handle);
        unsigned int minor = tc_get_minor(handle);
        if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
            *queue_id = minor - 1;
        } else {
            error = EPROTO;
        }
    }
    if (!error && options) {
        error = htb_parse_tca_options__(nl_options, options);
    }
    return error;
}
3011
/* Parses qdisc-level HTB configuration from 'details' into 'hc'.  Only
 * "max-rate" (bits/s, converted here to bytes/s) is honored; when absent
 * or zero, the link's current speed is used, defaulting to 100 Mbps when
 * features are unavailable.  min_rate is pinned to max_rate and burst and
 * priority are zeroed (unused at the qdisc level). */
static void
htb_parse_qdisc_details__(struct netdev *netdev_,
                          const struct smap *details, struct htb_class *hc)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *max_rate_s;

    max_rate_s = smap_get(details, "max-rate");
    hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
    if (!hc->max_rate) {
        enum netdev_features current;

        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }
    hc->min_rate = hc->max_rate;
    hc->burst = 0;
    hc->priority = 0;
}
3032
/* Parses per-queue HTB configuration from 'details' into 'hc': "min-rate",
 * "max-rate", and "burst" in bits (converted here to bytes), "priority" as
 * a plain integer.  Values are clamped against the device MTU and the
 * qdisc's overall max_rate.  Returns 0 on success or a positive errno
 * value if the device's MTU cannot be read. */
static int
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
{
    const struct htb *htb = htb_get__(netdev);
    const char *min_rate_s = smap_get(details, "min-rate");
    const char *max_rate_s = smap_get(details, "max-rate");
    const char *burst_s = smap_get(details, "burst");
    const char *priority_s = smap_get(details, "priority");
    int mtu, error;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate */
    hc->max_rate = (max_rate_s
                    ? strtoull(max_rate_s, NULL, 10) / 8
                    : htb->max_rate);
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority */
    hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;

    return 0;
}
3081
/* Queries the kernel for the HTB class identified by 'handle'/'parent' on
 * 'netdev' and parses the reply into 'options' and/or 'stats' (either may
 * be null).  Returns 0 on success or a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = htb_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3097
/* 'tc_install' hook for HTB: creates the root HTB qdisc, sets up the
 * default class 1:fffe from 'details', and on success records HTB state
 * on 'netdev'.  Returns 0 on success or a positive errno value. */
static int
htb_tc_install(struct netdev *netdev, const struct smap *details)
{
    int error;

    error = htb_setup_qdisc__(netdev);
    if (!error) {
        struct htb_class hc;

        htb_parse_qdisc_details__(netdev, details, &hc);
        error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
                                  tc_make_handle(1, 0), &hc);
        if (!error) {
            htb_install__(netdev, hc.max_rate);
        }
    }
    return error;
}
3116
/* Returns the 'struct htb_class' embedding 'queue'. */
static struct htb_class *
htb_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct htb_class, tc_queue);
}
3122
/* Records the configuration in 'hc' for queue 'queue_id' in 'netdev''s HTB
 * state, creating the queue's in-memory record (with its creation time) if
 * it does not already exist. */
static void
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
                   const struct htb_class *hc)
{
    struct htb *htb = htb_get__(netdev);
    size_t hash = hash_int(queue_id, 0);
    struct tc_queue *queue;
    struct htb_class *hcp;

    queue = tc_find_queue__(netdev, queue_id, hash);
    if (queue) {
        hcp = htb_class_cast__(queue);
    } else {
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
    }

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
    hcp->burst = hc->burst;
    hcp->priority = hc->priority;
}
3148
/* 'tc_load' hook for HTB: reconstructs userspace HTB state from the
 * kernel's existing configuration.  The qdisc max_rate is read from the
 * default class 1:fffe (falling back to 0 if the query fails), then every
 * class is dumped and recorded as a queue.  Returns 0 on success or ENODEV
 * if the class dump could not be started. */
static int
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct htb_class hc;

    /* Get qdisc options. */
    hc.max_rate = 0;
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    htb_install__(netdev, hc.max_rate);

    /* Get queues. */
    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Messages that do not parse as HTB classes are skipped. */
        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            htb_update_queue__(netdev, queue_id, &hc);
        }
    }
    finish_queue_dump(&state);

    return 0;
}
3176
3177 static void
3178 htb_tc_destroy(struct tc *tc)
3179 {
3180 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3181 struct htb_class *hc, *next;
3182
3183 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3184 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3185 free(hc);
3186 }
3187 tc_destroy(tc);
3188 free(htb);
3189 }
3190
3191 static int
3192 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3193 {
3194 const struct htb *htb = htb_get__(netdev);
3195 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
3196 return 0;
3197 }
3198
3199 static int
3200 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3201 {
3202 struct htb_class hc;
3203 int error;
3204
3205 htb_parse_qdisc_details__(netdev, details, &hc);
3206 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3207 tc_make_handle(1, 0), &hc);
3208 if (!error) {
3209 htb_get__(netdev)->max_rate = hc.max_rate;
3210 }
3211 return error;
3212 }
3213
3214 static int
3215 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3216 const struct tc_queue *queue, struct smap *details)
3217 {
3218 const struct htb_class *hc = htb_class_cast__(queue);
3219
3220 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3221 if (hc->min_rate != hc->max_rate) {
3222 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3223 }
3224 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3225 if (hc->priority) {
3226 smap_add_format(details, "priority", "%u", hc->priority);
3227 }
3228 return 0;
3229 }
3230
3231 static int
3232 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3233 const struct smap *details)
3234 {
3235 struct htb_class hc;
3236 int error;
3237
3238 error = htb_parse_class_details__(netdev, details, &hc);
3239 if (error) {
3240 return error;
3241 }
3242
3243 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3244 tc_make_handle(1, 0xfffe), &hc);
3245 if (error) {
3246 return error;
3247 }
3248
3249 htb_update_queue__(netdev, queue_id, &hc);
3250 return 0;
3251 }
3252
3253 static int
3254 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3255 {
3256 struct htb_class *hc = htb_class_cast__(queue);
3257 struct htb *htb = htb_get__(netdev);
3258 int error;
3259
3260 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3261 if (!error) {
3262 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3263 free(hc);
3264 }
3265 return error;
3266 }
3267
3268 static int
3269 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3270 struct netdev_queue_stats *stats)
3271 {
3272 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3273 tc_make_handle(1, 0xfffe), NULL, stats);
3274 }
3275
3276 static int
3277 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3278 const struct ofpbuf *nlmsg,
3279 netdev_dump_queue_stats_cb *cb, void *aux)
3280 {
3281 struct netdev_queue_stats stats;
3282 unsigned int handle, major, minor;
3283 int error;
3284
3285 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3286 if (error) {
3287 return error;
3288 }
3289
3290 major = tc_get_major(handle);
3291 minor = tc_get_minor(handle);
3292 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3293 (*cb)(minor - 1, &stats, aux);
3294 }
3295 return 0;
3296 }
3297
/* Traffic-control operations for the "linux-htb" QoS type. */
static const struct tc_ops tc_ops_htb = {
    "htb",                      /* linux_name */
    "linux-htb",                /* ovs_name */
    HTB_N_QUEUES,               /* n_queues */
    htb_tc_install,             /* tc_install */
    htb_tc_load,                /* tc_load */
    htb_tc_destroy,             /* tc_destroy */
    htb_qdisc_get,              /* qdisc_get */
    htb_qdisc_set,              /* qdisc_set */
    htb_class_get,              /* class_get */
    htb_class_set,              /* class_set */
    htb_class_delete,           /* class_delete */
    htb_class_get_stats,        /* class_get_stats */
    htb_class_dump_stats        /* class_dump_stats */
};
3313 \f
/* "linux-hfsc" traffic control class. */

/* Maximum number of queues: class minor numbers 1 through 0xf000. */
#define HFSC_N_QUEUES 0xf000

/* In-memory representation of an HFSC qdisc. */
struct hfsc {
    struct tc tc;
    uint32_t max_rate;          /* Overall rate cap, in bytes/s. */
};

/* One HFSC leaf class, i.e. one OVS queue. */
struct hfsc_class {
    struct tc_queue tc_queue;
    uint32_t min_rate;          /* In bytes/s. */
    uint32_t max_rate;          /* In bytes/s. */
};
3328
3329 static struct hfsc *
3330 hfsc_get__(const struct netdev *netdev_)
3331 {
3332 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3333 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3334 }
3335
/* Returns the 'struct hfsc_class' that embeds generic queue 'queue'. */
static struct hfsc_class *
hfsc_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
}
3341
3342 static void
3343 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3344 {
3345 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3346 struct hfsc *hfsc;
3347
3348 hfsc = xmalloc(sizeof *hfsc);
3349 tc_init(&hfsc->tc, &tc_ops_hfsc);
3350 hfsc->max_rate = max_rate;
3351 netdev->tc = &hfsc->tc;
3352 }
3353
3354 static void
3355 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3356 const struct hfsc_class *hc)
3357 {
3358 size_t hash;
3359 struct hfsc *hfsc;
3360 struct hfsc_class *hcp;
3361 struct tc_queue *queue;
3362
3363 hfsc = hfsc_get__(netdev);
3364 hash = hash_int(queue_id, 0);
3365
3366 queue = tc_find_queue__(netdev, queue_id, hash);
3367 if (queue) {
3368 hcp = hfsc_class_cast__(queue);
3369 } else {
3370 hcp = xmalloc(sizeof *hcp);
3371 queue = &hcp->tc_queue;
3372 queue->queue_id = queue_id;
3373 queue->created = time_msec();
3374 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3375 }
3376
3377 hcp->min_rate = hc->min_rate;
3378 hcp->max_rate = hc->max_rate;
3379 }
3380
/* Decodes the TCA_OPTIONS attribute 'nl_options' of an HFSC class into
 * 'class'.
 *
 * OVS only creates classes whose service curves are linear, whose real-time
 * and fair-share curves are identical (both carry min-rate), and whose
 * upper-limit curve carries max-rate (see hfsc_setup_class__()); anything
 * else is rejected as unsupported.  Returns 0 on success, EPROTO on any
 * parse or validation failure.
 *
 * NOTE(review): the policy marks all three curves, including TCA_HFSC_USC,
 * as mandatory, so a class created outside OVS without an upper-limit curve
 * would fail to parse here -- confirm that is the intended behavior. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    /* A curve with nonzero 'm1' or 'd' is non-linear; OVS only writes
     * curves where just the steady-state rate 'm2' is set. */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    /* OVS always writes identical real-time and fair-share curves; a
     * mismatch means the class was configured with a distinct real-time
     * curve, which OVS does not model. */
    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
3439
/* Parses a Netlink class message 'tcmsg' describing an HFSC class.
 *
 * If 'queue_id' is nonnull, stores the OVS queue number (class minor minus
 * one) there, returning EPROTO for handles outside major 1 with minor
 * 1..HFSC_N_QUEUES; this also filters out the default class 1:0xfffe, whose
 * minor exceeds HFSC_N_QUEUES.  If 'options' is nonnull, decodes the class's
 * service curves into it.  If 'stats' is nonnull, fills in queue statistics.
 *
 * Returns 0 on success, a positive errno value otherwise. */
static int
hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                   struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    int error;
    unsigned int handle;
    struct nlattr *nl_options;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (error) {
        return error;
    }

    if (queue_id) {
        unsigned int major, minor;

        major = tc_get_major(handle);
        minor = tc_get_minor(handle);
        if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
            *queue_id = minor - 1;
        } else {
            return EPROTO;
        }
    }

    if (options) {
        error = hfsc_parse_tca_options__(nl_options, options);
    }

    return error;
}
3472
3473 static int
3474 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3475 unsigned int parent, struct hfsc_class *options,
3476 struct netdev_queue_stats *stats)
3477 {
3478 int error;
3479 struct ofpbuf *reply;
3480
3481 error = tc_query_class(netdev, handle, parent, &reply);
3482 if (error) {
3483 return error;
3484 }
3485
3486 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3487 ofpbuf_delete(reply);
3488 return error;
3489 }
3490
/* Fills 'class' from qdisc-level 'details'.  "max-rate" is given in bits/s
 * and stored in bytes/s; when absent or zero, it falls back to the link's
 * advertised speed, or 100 Mbps if that cannot be determined.  The default
 * class uses identical min and max rates. */
static void
hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
                           struct hfsc_class *class)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    uint32_t max_rate;
    const char *max_rate_s;

    max_rate_s = smap_get(details, "max-rate");
    max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;

    if (!max_rate) {
        enum netdev_features current;

        /* Fall back to the interface's advertised features, treating a
         * probing failure as "no features" so the 100 Mbps default wins. */
        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }

    class->min_rate = max_rate;
    class->max_rate = max_rate;
}
3513
/* Fills 'class' from per-queue 'details'.  "min-rate" and "max-rate" are
 * given in bits/s and stored in bytes/s.  min-rate is clamped into
 * [1, hfsc->max_rate]; max-rate defaults to the qdisc cap and is clamped to
 * be at least min-rate and at most the qdisc cap.  Always returns 0. */
static int
hfsc_parse_class_details__(struct netdev *netdev,
                           const struct smap *details,
                           struct hfsc_class * class)
{
    const struct hfsc *hfsc;
    uint32_t min_rate, max_rate;
    const char *min_rate_s, *max_rate_s;

    hfsc = hfsc_get__(netdev);
    min_rate_s = smap_get(details, "min-rate");
    max_rate_s = smap_get(details, "max-rate");

    /* Clamp order matters: first raise to at least 1, then cap at the
     * qdisc's overall max-rate. */
    min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
    min_rate = MAX(min_rate, 1);
    min_rate = MIN(min_rate, hfsc->max_rate);

    max_rate = (max_rate_s
                ? strtoull(max_rate_s, NULL, 10) / 8
                : hfsc->max_rate);
    max_rate = MAX(max_rate, min_rate);
    max_rate = MIN(max_rate, hfsc->max_rate);

    class->min_rate = min_rate;
    class->max_rate = max_rate;

    return 0;
}
3542
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1".
 * Returns 0 on success, a positive errno value otherwise. */
static int
hfsc_setup_qdisc__(struct netdev * netdev)
{
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    /* Remove any existing root qdisc first; the result is deliberately
     * ignored since there may not be one to delete. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    opt.defcls = 1;             /* The "default 1" in the command above. */

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    return tc_transact(&request, NULL);
}
3573
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>".  Returns 0 on success, a positive
 * errno value otherwise (after logging a rate-limited warning). */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Linear service curves: only the steady-state rate 'm2' is set.  A
     * nonzero 'm1' or 'd' would make the curve non-linear, which
     * hfsc_parse_tca_options__() rejects. */
    min.m1 = 0;
    min.d = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d = 0;
    max.m2 = class->max_rate;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    /* The real-time and fair-share curves both carry min-rate; the
     * upper-limit curve carries max-rate.  hfsc_parse_tca_options__()
     * depends on this layout when reading classes back. */
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
3624
3625 static int
3626 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3627 {
3628 int error;
3629 struct hfsc_class class;
3630
3631 error = hfsc_setup_qdisc__(netdev);
3632
3633 if (error) {
3634 return error;
3635 }
3636
3637 hfsc_parse_qdisc_details__(netdev, details, &class);
3638 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3639 tc_make_handle(1, 0), &class);
3640
3641 if (error) {
3642 return error;
3643 }
3644
3645 hfsc_install__(netdev, class.max_rate);
3646 return 0;
3647 }
3648
/* Adopts an HFSC configuration that already exists in the kernel: recovers
 * the rate cap from the default class 1:fffe, installs the in-memory
 * representation, then dumps every kernel class into the queue map.  Returns
 * 0 on success, ENODEV if the queue dump cannot be started. */
static int
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct hfsc_class hc;

    /* 'hc.max_rate' keeps this zero default if the query fails, so
     * hfsc_install__() still receives a defined value. */
    hc.max_rate = 0;
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    hfsc_install__(netdev, hc.max_rate);

    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }

    /* Classes that fail to parse are silently skipped. */
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            hfsc_update_queue__(netdev, queue_id, &hc);
        }
    }

    finish_queue_dump(&state);
    return 0;
}
3675
3676 static void
3677 hfsc_tc_destroy(struct tc *tc)
3678 {
3679 struct hfsc *hfsc;
3680 struct hfsc_class *hc, *next;
3681
3682 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3683
3684 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3685 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3686 free(hc);
3687 }
3688
3689 tc_destroy(tc);
3690 free(hfsc);
3691 }
3692
3693 static int
3694 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3695 {
3696 const struct hfsc *hfsc;
3697 hfsc = hfsc_get__(netdev);
3698 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3699 return 0;
3700 }
3701
3702 static int
3703 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3704 {
3705 int error;
3706 struct hfsc_class class;
3707
3708 hfsc_parse_qdisc_details__(netdev, details, &class);
3709 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3710 tc_make_handle(1, 0), &class);
3711
3712 if (!error) {
3713 hfsc_get__(netdev)->max_rate = class.max_rate;
3714 }
3715
3716 return error;
3717 }
3718
3719 static int
3720 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3721 const struct tc_queue *queue, struct smap *details)
3722 {
3723 const struct hfsc_class *hc;
3724
3725 hc = hfsc_class_cast__(queue);
3726 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3727 if (hc->min_rate != hc->max_rate) {
3728 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3729 }
3730 return 0;
3731 }
3732
3733 static int
3734 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3735 const struct smap *details)
3736 {
3737 int error;
3738 struct hfsc_class class;
3739
3740 error = hfsc_parse_class_details__(netdev, details, &class);
3741 if (error) {
3742 return error;
3743 }
3744
3745 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3746 tc_make_handle(1, 0xfffe), &class);
3747 if (error) {
3748 return error;
3749 }
3750
3751 hfsc_update_queue__(netdev, queue_id, &class);
3752 return 0;
3753 }
3754
3755 static int
3756 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3757 {
3758 int error;
3759 struct hfsc *hfsc;
3760 struct hfsc_class *hc;
3761
3762 hc = hfsc_class_cast__(queue);
3763 hfsc = hfsc_get__(netdev);
3764
3765 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3766 if (!error) {
3767 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3768 free(hc);
3769 }
3770 return error;
3771 }
3772
3773 static int
3774 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3775 struct netdev_queue_stats *stats)
3776 {
3777 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3778 tc_make_handle(1, 0xfffe), NULL, stats);
3779 }
3780
3781 static int
3782 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3783 const struct ofpbuf *nlmsg,
3784 netdev_dump_queue_stats_cb *cb, void *aux)
3785 {
3786 struct netdev_queue_stats stats;
3787 unsigned int handle, major, minor;
3788 int error;
3789
3790 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3791 if (error) {
3792 return error;
3793 }
3794
3795 major = tc_get_major(handle);
3796 minor = tc_get_minor(handle);
3797 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3798 (*cb)(minor - 1, &stats, aux);
3799 }
3800 return 0;
3801 }
3802
/* Traffic-control operations for the "linux-hfsc" QoS type. */
static const struct tc_ops tc_ops_hfsc = {
    "hfsc",                     /* linux_name */
    "linux-hfsc",               /* ovs_name */
    HFSC_N_QUEUES,              /* n_queues */
    hfsc_tc_install,            /* tc_install */
    hfsc_tc_load,               /* tc_load */
    hfsc_tc_destroy,            /* tc_destroy */
    hfsc_qdisc_get,             /* qdisc_get */
    hfsc_qdisc_set,             /* qdisc_set */
    hfsc_class_get,             /* class_get */
    hfsc_class_set,             /* class_set */
    hfsc_class_delete,          /* class_delete */
    hfsc_class_get_stats,       /* class_get_stats */
    hfsc_class_dump_stats       /* class_dump_stats */
};
3818 \f
3819 /* "linux-default" traffic control class.
3820 *
3821 * This class represents the default, unnamed Linux qdisc. It corresponds to
3822 * the "" (empty string) QoS type in the OVS database. */
3823
/* Records the default (unnamed) qdisc as 'netdev_''s active tc.  A single
 * shared, immutable tc object suffices for every netdev since this class
 * keeps no per-device state. */
static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
3834
/* Implements tc_ops 'tc_install' for the default qdisc: there is nothing to
 * configure in the kernel, so just adopt the shared tc object. */
static int
default_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
3842
/* Implements tc_ops 'tc_load' for the default qdisc: nothing to read back,
 * so just adopt the shared tc object. */
static int
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
3849
/* Traffic-control operations for the "" (default) QoS type.  Most hooks are
 * null because the default qdisc has no queues and nothing to configure or
 * tear down. */
static const struct tc_ops tc_ops_default = {
    NULL,                       /* linux_name */
    "",                         /* ovs_name */
    0,                          /* n_queues */
    default_tc_install,         /* tc_install */
    default_tc_load,            /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
3865 \f
/* "linux-other" traffic control class.
 *
 * This class stands in for any qdisc that OVS does not itself manage.  It is
 * effectively read-only: all of the configuration hooks in 'tc_ops_other' are
 * null, so such a qdisc can be detected but not modified through OVS. */
3869
/* Implements tc_ops 'tc_load' for unrecognized qdiscs: just records the
 * shared read-only tc object for 'netdev_'. */
static int
other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
    return 0;
}
3881
/* Traffic-control operations for the "linux-other" QoS type.  Only 'tc_load'
 * is implemented: an unrecognized qdisc can be observed but not installed,
 * configured, or queried for queues through OVS. */
static const struct tc_ops tc_ops_other = {
    NULL,                       /* linux_name */
    "linux-other",              /* ovs_name */
    0,                          /* n_queues */
    NULL,                       /* tc_install */
    other_tc_load,              /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
3897 \f
/* Traffic control. */

/* Number of kernel "tc" ticks per second.  Initialized (once) by
 * read_psched() from /proc/net/psched; defaults to 1.0 if that fails. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * Initialized (once) by read_psched(); defaults to 100 if /proc cannot be
 * read.  There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
3921
/* Returns tc handle 'major':'minor' (major in the high 16 bits, minor in the
 * low 16 bits). */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
3928
/* Returns the major number (high 16 bits) from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3935
/* Returns the minor number (low 16 bits) from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3942
/* Composes a route-family Netlink request for a tc operation on 'netdev'.
 * 'type' is an RTM_* message type and 'flags' are additional NLM_F_* flags
 * (NLM_F_REQUEST is always included).  On success, initializes '*request'
 * and returns a pointer to the tcmsg inside it; the caller must fill in
 * tcm_handle and tcm_parent and eventually uninitialize '*request'
 * (tc_transact() does the latter).  Returns NULL, leaving '*request'
 * untouched, if 'netdev' has no ifindex. */
static struct tcmsg *
tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
                struct ofpbuf *request)
{
    struct tcmsg *tcmsg;
    int ifindex;
    int error;

    error = get_ifindex(netdev, &ifindex);
    if (error) {
        return NULL;
    }

    ofpbuf_init(request, 512);
    nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
    tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
    tcmsg->tcm_family = AF_UNSPEC;
    tcmsg->tcm_ifindex = ifindex;
    /* Caller should fill in tcmsg->tcm_handle. */
    /* Caller should fill in tcmsg->tcm_parent. */

    return tcmsg;
}
3966
/* Sends 'request' over the route Netlink protocol and waits for the reply.
 * If 'replyp' is nonnull, stores the reply there and the caller takes
 * ownership (freeing it with ofpbuf_delete()).  Always uninitializes
 * 'request', whether or not the transaction succeeds.  Returns 0 on
 * success, a positive errno value otherwise. */
static int
tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
{
    int error = nl_transact(NETLINK_ROUTE, request, replyp);
    ofpbuf_uninit(request);
    return error;
}
3974
3975 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3976 * policing configuration.
3977 *
3978 * This function is equivalent to running the following when 'add' is true:
3979 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3980 *
3981 * This function is equivalent to running the following when 'add' is false:
3982 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3983 *
3984 * The configuration and stats may be seen with the following command:
3985 * /sbin/tc -s qdisc show dev <devname>
3986 *
3987 * Returns 0 if successful, otherwise a positive errno value.
3988 */
3989 static int
3990 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3991 {
3992 struct ofpbuf request;
3993 struct tcmsg *tcmsg;
3994 int error;
3995 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3996 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3997
3998 tcmsg = tc_make_request(netdev, type, flags, &request);
3999 if (!tcmsg) {
4000 return ENODEV;
4001 }
4002 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4003 tcmsg->tcm_parent = TC_H_INGRESS;
4004 nl_msg_put_string(&request, TCA_KIND, "ingress");
4005 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4006
4007 error = tc_transact(&request, NULL);
4008 if (error) {
4009 /* If we're deleting the qdisc, don't worry about some of the
4010 * error conditions. */
4011 if (!add && (error == ENOENT || error == EINVAL)) {
4012 return 0;
4013 }
4014 return error;
4015 }
4016
4017 return 0;
4018 }
4019
4020 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4021 * of 'kbits_burst'.
4022 *
4023 * This function is equivalent to running:
4024 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4025 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4026 * mtu 65535 drop
4027 *
4028 * The configuration and stats may be seen with the following command:
4029 * /sbin/tc -s filter show <devname> eth0 parent ffff:
4030 *
4031 * Returns 0 if successful, otherwise a positive errno value.
4032 */
4033 static int
4034 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
4035 {
4036 struct tc_police tc_police;
4037 struct ofpbuf request;
4038 struct tcmsg *tcmsg;
4039 size_t basic_offset;
4040 size_t police_offset;
4041 int error;
4042 int mtu = 65535;
4043
4044 memset(&tc_police, 0, sizeof tc_police);
4045 tc_police.action = TC_POLICE_SHOT;
4046 tc_police.mtu = mtu;
4047 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4048 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
4049 kbits_burst * 1024);
4050
4051 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4052 NLM_F_EXCL | NLM_F_CREATE, &request);
4053 if (!tcmsg) {
4054 return ENODEV;
4055 }
4056 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4057 tcmsg->tcm_info = tc_make_handle(49,
4058 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4059
4060 nl_msg_put_string(&request, TCA_KIND, "basic");
4061 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4062 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4063 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4064 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4065 nl_msg_end_nested(&request, police_offset);
4066 nl_msg_end_nested(&request, basic_offset);
4067
4068 error = tc_transact(&request, NULL);
4069 if (error) {
4070 return error;
4071 }
4072
4073 return 0;
4074 }
4075
/* Reads kernel scheduler parameters from /proc/net/psched, computing
 * 'ticks_per_s' and 'buffer_hz'.  Thread-safe and idempotent: the file is
 * read at most once and the results cached.  On any failure the defaults
 * (ticks_per_s = 1.0, buffer_hz = 100) remain in effect. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *     (Before that, there are hints that it was 1000000000.)
     *
     *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *     above.
     *
     *                        /proc/net/psched
     *     -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d ticks_per_s     buffer_hz
     *     ------- --------- ---------- ------------- ----------- -------------
     * [1] 819,200 1,000,000  1,000,000           100     819,200           100
     * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [4]   1,000     1,024  1,000,000           100     976,562           100
     * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
     * [6]   1,000        64  1,000,000           249  15,625,000           249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    if (!ovsthread_once_start(&once)) {
        return;
    }

    /* Fallback defaults, used when /proc is unreadable or malformed. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    if (!a || !c) {
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
4158
4159 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4160 * rate of 'rate' bytes per second. */
4161 static unsigned int
4162 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4163 {
4164 read_psched();
4165 return (rate * ticks) / ticks_per_s;
4166 }
4167
/* Returns the number of ticks that it would take to transmit 'size' bytes at a
 * rate of 'rate' bytes per second.  Returns 0 when 'rate' is zero, avoiding
 * division by zero.
 *
 * NOTE(review): 'ticks_per_s' is truncated to an integer before the
 * multiplication, discarding any fractional ticks-per-second -- confirm this
 * rounding is acceptable for the burst computations that use it. */
static unsigned int
tc_bytes_to_ticks(unsigned int rate, unsigned int size)
{
    read_psched();
    return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
}
4176
/* Returns the number of bytes that need to be reserved for qdisc buffering at
 * a transmission rate of 'rate' bytes per second.
 *
 * NOTE(review): assumes 'buffer_hz' is nonzero.  read_psched() initializes it
 * to 100 but may overwrite it with the raw "d" value from /proc/net/psched --
 * confirm the kernel can never report zero there. */
static unsigned int
tc_buffer_per_jiffy(unsigned int rate)
{
    read_psched();
    return rate / buffer_hz;
}
4185
4186 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4187 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4188 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4189 * stores NULL into it if it is absent.
4190 *
4191 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4192 * 'msg'.
4193 *
4194 * Returns 0 if successful, otherwise a positive errno value. */
4195 static int
4196 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4197 struct nlattr **options)
4198 {
4199 static const struct nl_policy tca_policy[] = {
4200 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4201 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4202 };
4203 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4204
4205 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4206 tca_policy, ta, ARRAY_SIZE(ta))) {
4207 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4208 goto error;
4209 }
4210
4211 if (kind) {
4212 *kind = nl_attr_get_string(ta[TCA_KIND]);
4213 }
4214
4215 if (options) {
4216 *options = ta[TCA_OPTIONS];
4217 }
4218
4219 return 0;
4220
4221 error:
4222 if (kind) {
4223 *kind = NULL;
4224 }
4225 if (options) {
4226 *options = NULL;
4227 }
4228 return EPROTO;
4229 }
4230
/* Given Netlink 'msg' that describes a class, extracts the class's full tc
 * handle (major and minor combined, as stored in tcm_handle) into '*handlep',
 * its TCA_OPTIONS attribute into '*options', and its queue statistics into
 * '*stats'.  Any of the output arguments may be null.  (Note that despite
 * the name callers sometimes give it, the value stored is the whole handle,
 * not just the queue ID; use tc_get_major()/tc_get_minor() to split it.)
 *
 * Returns 0 if successful, otherwise a positive errno value (in which case
 * '*options' is nulled and '*stats' zeroed, if provided). */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        /* Queue drops are reported as transmit errors. */
        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
4305
4306 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4307 * on 'netdev'. */
4308 static int
4309 tc_query_class(const struct netdev *netdev,
4310 unsigned int handle, unsigned int parent,
4311 struct ofpbuf **replyp)
4312 {
4313 struct ofpbuf request;
4314 struct tcmsg *tcmsg;
4315 int error;
4316
4317 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4318 if (!tcmsg) {
4319 return ENODEV;
4320 }
4321 tcmsg->tcm_handle = handle;
4322 tcmsg->tcm_parent = parent;
4323
4324 error = tc_transact(&request, replyp);
4325 if (error) {
4326 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4327 netdev_get_name(netdev),
4328 tc_get_major(handle), tc_get_minor(handle),
4329 tc_get_major(parent), tc_get_minor(parent),
4330 ovs_strerror(error));
4331 }
4332 return error;
4333 }
4334
4335 /* Equivalent to "tc class del dev <name> handle <handle>". */
4336 static int
4337 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4338 {
4339 struct ofpbuf request;
4340 struct tcmsg *tcmsg;
4341 int error;
4342
4343 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4344 if (!tcmsg) {
4345 return ENODEV;
4346 }
4347 tcmsg->tcm_handle = handle;
4348 tcmsg->tcm_parent = 0;
4349
4350 error = tc_transact(&request, NULL);
4351 if (error) {
4352 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4353 netdev_get_name(netdev),
4354 tc_get_major(handle), tc_get_minor(handle),
4355 ovs_strerror(error));
4356 }
4357 return error;
4358 }
4359
4360 /* Equivalent to "tc qdisc del dev <name> root". */
4361 static int
4362 tc_del_qdisc(struct netdev *netdev_)
4363 {
4364 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4365 struct ofpbuf request;
4366 struct tcmsg *tcmsg;
4367 int error;
4368
4369 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4370 if (!tcmsg) {
4371 return ENODEV;
4372 }
4373 tcmsg->tcm_handle = tc_make_handle(1, 0);
4374 tcmsg->tcm_parent = TC_H_ROOT;
4375
4376 error = tc_transact(&request, NULL);
4377 if (error == EINVAL) {
4378 /* EINVAL probably means that the default qdisc was in use, in which
4379 * case we've accomplished our purpose. */
4380 error = 0;
4381 }
4382 if (!error && netdev->tc) {
4383 if (netdev->tc->ops->tc_destroy) {
4384 netdev->tc->ops->tc_destroy(netdev->tc);
4385 }
4386 netdev->tc = NULL;
4387 }
4388 return error;
4389 }
4390
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value. */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    /* 'netdev->tc' doubles as the "already queried" flag: once it is set the
     * qdisc is known and there is nothing to do. */
    if (netdev->tc) {
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * We could check for Linux 2.6.35+ and use a more straightforward method
     * there. */
    tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            /* Reply was unparseable: treat it as a qdisc we don't manage. */
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if (error == ENOENT) {
        /* Either it's a built-in qdisc, or it's a qdisc set up by some
         * other entity that doesn't have a handle 1:0.  We will assume
         * that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it.  NOTE(review): on a tc_transact() error 'qdisc' is
     * presumably NULL here -- tc_load implementations appear to tolerate
     * that; confirm against tc_transact(). */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    /* A transact/parse error takes precedence over a load error. */
    return error ? error : load_error;
}
4466
4467 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4468 approximate the time to transmit packets of various lengths. For an MTU of
4469 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4470 represents two possible packet lengths; for a MTU of 513 through 1024, four
4471 possible lengths; and so on.
4472
4473 Returns, for the specified 'mtu', the number of bits that packet lengths
4474 need to be shifted right to fit within such a 256-entry table. */
4475 static int
4476 tc_calc_cell_log(unsigned int mtu)
4477 {
4478 int cell_log;
4479
4480 if (!mtu) {
4481 mtu = ETH_PAYLOAD_MAX;
4482 }
4483 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4484
4485 for (cell_log = 0; mtu >= 256; cell_log++) {
4486 mtu >>= 1;
4487 }
4488
4489 return cell_log;
4490 }
4491
4492 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4493 * of 'mtu'. */
4494 static void
4495 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4496 {
4497 memset(rate, 0, sizeof *rate);
4498 rate->cell_log = tc_calc_cell_log(mtu);
4499 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4500 /* rate->cell_align = 0; */ /* distro headers. */
4501 rate->mpu = ETH_TOTAL_MIN;
4502 rate->rate = Bps;
4503 }
4504
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    const unsigned int n_entries = TC_RTAB_SIZE / sizeof(uint32_t);
    uint32_t *rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    unsigned int i;

    for (i = 0; i < n_entries; i++) {
        /* Entry i covers packets of length (i + 1) << cell_log, clamped below
         * by the minimum packet unit. */
        unsigned int size = (i + 1) << rate->cell_log;
        rtab[i] = tc_bytes_to_ticks(rate->rate, MAX(size, rate->mpu));
    }
}
4524
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Never use a burst smaller than one jiffy's worth of data plus an MTU;
     * take the user's request only when it exceeds that floor. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > min_burst ? burst_bytes : min_burst;

    return tc_bytes_to_ticks(Bps, burst);
}
4535 \f
4536 /* Linux-only functions declared in netdev-linux.h */
4537
4538 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4539 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4540 int
4541 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4542 const char *flag_name, bool enable)
4543 {
4544 const char *netdev_name = netdev_get_name(netdev);
4545 struct ethtool_value evalue;
4546 uint32_t new_flags;
4547 int error;
4548
4549 COVERAGE_INC(netdev_get_ethtool);
4550 memset(&evalue, 0, sizeof evalue);
4551 error = netdev_linux_do_ethtool(netdev_name,
4552 (struct ethtool_cmd *)&evalue,
4553 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4554 if (error) {
4555 return error;
4556 }
4557
4558 COVERAGE_INC(netdev_set_ethtool);
4559 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4560 error = netdev_linux_do_ethtool(netdev_name,
4561 (struct ethtool_cmd *)&evalue,
4562 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4563 if (error) {
4564 return error;
4565 }
4566
4567 COVERAGE_INC(netdev_get_ethtool);
4568 memset(&evalue, 0, sizeof evalue);
4569 error = netdev_linux_do_ethtool(netdev_name,
4570 (struct ethtool_cmd *)&evalue,
4571 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4572 if (error) {
4573 return error;
4574 }
4575
4576 if (new_flags != evalue.data) {
4577 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4578 "device %s failed", enable ? "enable" : "disable",
4579 flag_name, netdev_name);
4580 return EOPNOTSUPP;
4581 }
4582
4583 return 0;
4584 }
4585 \f
4586 /* Utility functions. */
4587
4588 /* Copies 'src' into 'dst', performing format conversion in the process. */
4589 static void
4590 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4591 const struct rtnl_link_stats *src)
4592 {
4593 dst->rx_packets = src->rx_packets;
4594 dst->tx_packets = src->tx_packets;
4595 dst->rx_bytes = src->rx_bytes;
4596 dst->tx_bytes = src->tx_bytes;
4597 dst->rx_errors = src->rx_errors;
4598 dst->tx_errors = src->tx_errors;
4599 dst->rx_dropped = src->rx_dropped;
4600 dst->tx_dropped = src->tx_dropped;
4601 dst->multicast = src->multicast;
4602 dst->collisions = src->collisions;
4603 dst->rx_length_errors = src->rx_length_errors;
4604 dst->rx_over_errors = src->rx_over_errors;
4605 dst->rx_crc_errors = src->rx_crc_errors;
4606 dst->rx_frame_errors = src->rx_frame_errors;
4607 dst->rx_fifo_errors = src->rx_fifo_errors;
4608 dst->rx_missed_errors = src->rx_missed_errors;
4609 dst->tx_aborted_errors = src->tx_aborted_errors;
4610 dst->tx_carrier_errors = src->tx_carrier_errors;
4611 dst->tx_fifo_errors = src->tx_fifo_errors;
4612 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4613 dst->tx_window_errors = src->tx_window_errors;
4614 }
4615
4616 static int
4617 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
4618 {
4619 struct ofpbuf request;
4620 struct ofpbuf *reply;
4621 int error;
4622
4623 ofpbuf_init(&request, 0);
4624 nl_msg_put_nlmsghdr(&request,
4625 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
4626 RTM_GETLINK, NLM_F_REQUEST);
4627 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
4628 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
4629 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4630 ofpbuf_uninit(&request);
4631 if (error) {
4632 return error;
4633 }
4634
4635 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
4636 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
4637 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
4638 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
4639 error = 0;
4640 } else {
4641 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4642 error = EPROTO;
4643 }
4644 } else {
4645 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
4646 error = EPROTO;
4647 }
4648
4649
4650 ofpbuf_delete(reply);
4651 return error;
4652 }
4653
4654 static int
4655 get_flags(const struct netdev *dev, unsigned int *flags)
4656 {
4657 struct ifreq ifr;
4658 int error;
4659
4660 *flags = 0;
4661 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4662 if (!error) {
4663 *flags = ifr.ifr_flags;
4664 }
4665 return error;
4666 }
4667
4668 static int
4669 set_flags(const char *name, unsigned int flags)
4670 {
4671 struct ifreq ifr;
4672
4673 ifr.ifr_flags = flags;
4674 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4675 }
4676
4677 static int
4678 do_get_ifindex(const char *netdev_name)
4679 {
4680 struct ifreq ifr;
4681 int error;
4682
4683 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4684 COVERAGE_INC(netdev_get_ifindex);
4685
4686 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4687 if (error) {
4688 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4689 netdev_name, ovs_strerror(error));
4690 return -error;
4691 }
4692 return ifr.ifr_ifindex;
4693 }
4694
4695 static int
4696 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4697 {
4698 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4699
4700 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4701 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4702
4703 if (ifindex < 0) {
4704 netdev->get_ifindex_error = -ifindex;
4705 netdev->ifindex = 0;
4706 } else {
4707 netdev->get_ifindex_error = 0;
4708 netdev->ifindex = ifindex;
4709 }
4710 netdev->cache_valid |= VALID_IFINDEX;
4711 }
4712
4713 *ifindexp = netdev->ifindex;
4714 return netdev->get_ifindex_error;
4715 }
4716
4717 static int
4718 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4719 {
4720 struct ifreq ifr;
4721 int hwaddr_family;
4722 int error;
4723
4724 memset(&ifr, 0, sizeof ifr);
4725 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4726 COVERAGE_INC(netdev_get_hwaddr);
4727 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4728 if (error) {
4729 /* ENODEV probably means that a vif disappeared asynchronously and
4730 * hasn't been removed from the database yet, so reduce the log level
4731 * to INFO for that case. */
4732 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4733 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4734 netdev_name, ovs_strerror(error));
4735 return error;
4736 }
4737 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4738 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4739 VLOG_WARN("%s device has unknown hardware address family %d",
4740 netdev_name, hwaddr_family);
4741 }
4742 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4743 return 0;
4744 }
4745
4746 static int
4747 set_etheraddr(const char *netdev_name,
4748 const uint8_t mac[ETH_ADDR_LEN])
4749 {
4750 struct ifreq ifr;
4751 int error;
4752
4753 memset(&ifr, 0, sizeof ifr);
4754 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4755 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4756 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4757 COVERAGE_INC(netdev_set_hwaddr);
4758 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4759 if (error) {
4760 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4761 netdev_name, ovs_strerror(error));
4762 }
4763 return error;
4764 }
4765
4766 static int
4767 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4768 int cmd, const char *cmd_name)
4769 {
4770 struct ifreq ifr;
4771 int error;
4772
4773 memset(&ifr, 0, sizeof ifr);
4774 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4775 ifr.ifr_data = (caddr_t) ecmd;
4776
4777 ecmd->cmd = cmd;
4778 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4779 if (error) {
4780 if (error != EOPNOTSUPP) {
4781 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4782 "failed: %s", cmd_name, name, ovs_strerror(error));
4783 } else {
4784 /* The device doesn't support this operation. That's pretty
4785 * common, so there's no point in logging anything. */
4786 }
4787 }
4788 return error;
4789 }
4790
4791 static int
4792 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4793 int cmd, const char *cmd_name)
4794 {
4795 struct ifreq ifr;
4796 int error;
4797
4798 ifr.ifr_addr.sa_family = AF_INET;
4799 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4800 if (!error) {
4801 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4802 &ifr.ifr_addr);
4803 *ip = sin->sin_addr;
4804 }
4805 return error;
4806 }
4807
4808 /* Returns an AF_PACKET raw socket or a negative errno value. */
4809 static int
4810 af_packet_sock(void)
4811 {
4812 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4813 static int sock;
4814
4815 if (ovsthread_once_start(&once)) {
4816 sock = socket(AF_PACKET, SOCK_RAW, 0);
4817 if (sock >= 0) {
4818 int error = set_nonblocking(sock);
4819 if (error) {
4820 close(sock);
4821 sock = -error;
4822 }
4823 } else {
4824 sock = -errno;
4825 VLOG_ERR("failed to create packet socket: %s",
4826 ovs_strerror(errno));
4827 }
4828 ovsthread_once_done(&once);
4829 }
4830
4831 return sock;
4832 }