]> git.proxmox.com Git - mirror_ovs.git/blob - lib/netdev-linux.c
ofp-print: Print bucket ids of OpenFlow 1.5 group messages.
[mirror_ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <arpa/inet.h>
24 #include <inttypes.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
40 #include <net/if.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
45 #include <poll.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <unistd.h>
49
50 #include "coverage.h"
51 #include "dpif-netlink.h"
52 #include "dpif-netdev.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
55 #include "hash.h"
56 #include "hmap.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
61 #include "netlink.h"
62 #include "ofpbuf.h"
63 #include "openflow/openflow.h"
64 #include "ovs-atomic.h"
65 #include "packet-dpif.h"
66 #include "packets.h"
67 #include "poll-loop.h"
68 #include "rtnetlink-link.h"
69 #include "shash.h"
70 #include "socket-util.h"
71 #include "sset.h"
72 #include "timer.h"
73 #include "unaligned.h"
74 #include "vlog.h"
75
76 VLOG_DEFINE_THIS_MODULE(netdev_linux);
77
78 COVERAGE_DEFINE(netdev_set_policing);
79 COVERAGE_DEFINE(netdev_arp_lookup);
80 COVERAGE_DEFINE(netdev_get_ifindex);
81 COVERAGE_DEFINE(netdev_get_hwaddr);
82 COVERAGE_DEFINE(netdev_set_hwaddr);
83 COVERAGE_DEFINE(netdev_get_ethtool);
84 COVERAGE_DEFINE(netdev_set_ethtool);
85
86 \f
87 /* These were introduced in Linux 2.6.14, so they might be missing if we have
88 * old headers. */
89 #ifndef ADVERTISED_Pause
90 #define ADVERTISED_Pause (1 << 13)
91 #endif
92 #ifndef ADVERTISED_Asym_Pause
93 #define ADVERTISED_Asym_Pause (1 << 14)
94 #endif
95
96 /* These were introduced in Linux 2.6.24, so they might be missing if we
97 * have old headers. */
98 #ifndef ETHTOOL_GFLAGS
99 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
100 #endif
101 #ifndef ETHTOOL_SFLAGS
102 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103 #endif
104
105 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
106 * headers. */
107 #ifndef TC_RTAB_SIZE
108 #define TC_RTAB_SIZE 1024
109 #endif
110
/* Linux 2.6.21 introduced struct tpacket_auxdata.
 * Linux 2.6.27 added the tp_vlan_tci member.
 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
 * TP_STATUS_VLAN_TPID_VALID.
 *
 * With all this churn it's easiest to unconditionally define a replacement
 * structure that has everything we want.
 */
#ifndef PACKET_AUXDATA
#define PACKET_AUXDATA                  8
#endif
#ifndef TP_STATUS_VLAN_VALID
#define TP_STATUS_VLAN_VALID            (1 << 4)
#endif
#ifndef TP_STATUS_VLAN_TPID_VALID
#define TP_STATUS_VLAN_TPID_VALID       (1 << 6)
#endif
/* Shadow any system definition with our replacement (see comment above). */
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;         /* TP_STATUS_* flags. */
    uint32_t tp_len;            /* Original frame length. */
    uint32_t tp_snaplen;        /* Captured frame length. */
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;       /* VLAN TCI stripped by the kernel, if any. */
    uint16_t tp_vlan_tpid;      /* VLAN TPID (Linux 3.13+ only). */
};
140
/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
 *
 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
 * 2.6.32-431.29.2.el6.x86_64 (see report at
 * http://openvswitch.org/pipermail/dev/2014-October/047978.html).  Maybe
 * if_link.h is not self-contained on those kernels.  It is easiest to
 * unconditionally define a replacement. */
#ifndef IFLA_STATS64
#define IFLA_STATS64 23
#endif
/* Replacement for the kernel's 64-bit link statistics structure; the member
 * layout must match the kernel's rtnl_link_stats64 exactly, since netlink
 * replies are copied into it directly. */
#define rtnl_link_stats64 rpl_rtnl_link_stats64
struct rtnl_link_stats64 {
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Detailed receive errors. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Detailed transmit errors. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    /* Compression-related (cslip etc.). */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
180
/* Bits for 'cache_valid' in struct netdev_linux: each bit says that the
 * corresponding cached member (and its cached error code, if any) is
 * up to date and need not be re-queried from the kernel. */
enum {
    VALID_IFINDEX           = 1 << 0,   /* 'ifindex'. */
    VALID_ETHERADDR         = 1 << 1,   /* 'etheraddr'. */
    VALID_IN4               = 1 << 2,   /* 'address', 'netmask'. */
    VALID_IN6               = 1 << 3,   /* 'in6'. */
    VALID_MTU               = 1 << 4,   /* 'mtu'. */
    VALID_POLICING          = 1 << 5,   /* 'kbits_rate', 'kbits_burst'. */
    VALID_VPORT_STAT_ERROR  = 1 << 6,   /* 'vport_stats_error'. */
    VALID_DRVINFO           = 1 << 7,   /* 'drvinfo'. */
    VALID_FEATURES          = 1 << 8,   /* 'current', 'advertised', 'supported'. */
};
192 \f
193 /* Traffic control. */
194
/* An instance of a traffic control class.  Always associated with a particular
 * network device.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc {
    const struct tc_ops *ops;   /* Implementation vtable. */
    struct hmap queues;         /* Contains "struct tc_queue"s.
                                 * Read by generic TC layer.
                                 * Written only by TC implementation. */
};

/* Static initializer equivalent to tc_init(TC, OPS). */
#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
208
/* One traffic control queue.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc_queue {
    struct hmap_node hmap_node; /* In struct tc's "queues" hmap,
                                 * hashed by 'queue_id'. */
    unsigned int queue_id;      /* OpenFlow queue ID. */
    long long int created;      /* Time queue was created, in msecs. */
};
218
/* A particular kind of traffic control.  Each implementation generally maps to
 * one particular Linux qdisc class.
 *
 * The functions below return 0 if successful or a positive errno value on
 * failure, except where otherwise noted.  All of them must be provided, except
 * where otherwise noted. */
struct tc_ops {
    /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
     * This is null for tc_ops_default and tc_ops_other, for which there are no
     * appropriate values. */
    const char *linux_name;

    /* Name used in OVS database, e.g. "linux-htb".  Must be nonnull. */
    const char *ovs_name;

    /* Number of supported OpenFlow queues, 0 for qdiscs that have no
     * queues.  The queues are numbered 0 through n_queues - 1. */
    unsigned int n_queues;

    /* Called to install this TC class on 'netdev'.  The implementation should
     * make the Netlink calls required to set up 'netdev' with the right qdisc
     * and configure it according to 'details'.  The implementation may assume
     * that the current qdisc is the default; that is, there is no need for it
     * to delete the current qdisc before installing itself.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'.
     *
     * (This function is null for tc_ops_other, which cannot be installed.  For
     * other TC classes it should always be nonnull.) */
    int (*tc_install)(struct netdev *netdev, const struct smap *details);

    /* Called when the netdev code determines (through a Netlink query) that
     * this TC class's qdisc is installed on 'netdev', but we didn't install
     * it ourselves and so don't know any of the details.
     *
     * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
     * 'netdev'.  The TCA_KIND attribute of 'nlmsg' is 'linux_name'.  The
     * implementation should parse the other attributes of 'nlmsg' as
     * necessary to determine its configuration.  If necessary it should also
     * use Netlink queries to determine the configuration of queues on
     * 'netdev'.
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'. */
    int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);

    /* Destroys the data structures allocated by the implementation as part of
     * 'tc'.  (This includes destroying 'tc->queues' by calling
     * tc_destroy(tc).)
     *
     * The implementation should not need to perform any Netlink calls.  If
     * desirable, the caller is responsible for deconfiguring the kernel qdisc.
     * (But it may not be desirable.)
     *
     * This function may be null if 'tc' is trivial. */
    void (*tc_destroy)(struct tc *tc);

    /* Retrieves details of 'netdev->tc' configuration into 'details'.
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the configuration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_get)(const struct netdev *netdev, struct smap *details);

    /* Reconfigures 'netdev->tc' according to 'details', performing any
     * required Netlink calls to complete the reconfiguration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_set)(struct netdev *, const struct smap *details);

    /* Retrieves details of 'queue' on 'netdev->tc' into 'details'.  'queue' is
     * one of the 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the queue configuration.
     *
     * This function may be null if 'tc' does not have queues ('n_queues' is
     * 0). */
    int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
                     struct smap *details);

    /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
     * 'details', performing any required Netlink calls to complete the
     * reconfiguration.  The caller ensures that 'queue_id' is less than
     * 'n_queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' does not have queues or its queues are
     * not configurable. */
    int (*class_set)(struct netdev *, unsigned int queue_id,
                     const struct smap *details);

    /* Deletes 'queue' from 'netdev->tc'.  'queue' is one of the 'struct
     * tc_queue's within 'netdev->tc->queues'.
     *
     * This function may be null if 'tc' does not have queues or its queues
     * cannot be deleted. */
    int (*class_delete)(struct netdev *, struct tc_queue *queue);

    /* Obtains stats for 'queue' from 'netdev->tc'.  'queue' is one of the
     * 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * On success, initializes '*stats'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_get_stats)(const struct netdev *netdev,
                           const struct tc_queue *queue,
                           struct netdev_queue_stats *stats);

    /* Extracts queue stats from 'nlmsg', which is a response to a
     * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_dump_stats)(const struct netdev *netdev,
                            const struct ofpbuf *nlmsg,
                            netdev_dump_queue_stats_cb *cb, void *aux);
};
363
364 static void
365 tc_init(struct tc *tc, const struct tc_ops *ops)
366 {
367 tc->ops = ops;
368 hmap_init(&tc->queues);
369 }
370
371 static void
372 tc_destroy(struct tc *tc)
373 {
374 hmap_destroy(&tc->queues);
375 }
376
377 static const struct tc_ops tc_ops_htb;
378 static const struct tc_ops tc_ops_hfsc;
379 static const struct tc_ops tc_ops_default;
380 static const struct tc_ops tc_ops_other;
381
382 static const struct tc_ops *const tcs[] = {
383 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
384 &tc_ops_hfsc, /* Hierarchical fair service curve. */
385 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
386 &tc_ops_other, /* Some other qdisc. */
387 NULL
388 };
389
390 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
391 static unsigned int tc_get_major(unsigned int handle);
392 static unsigned int tc_get_minor(unsigned int handle);
393
394 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
395 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
396 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
397
398 static struct tcmsg *tc_make_request(const struct netdev *, int type,
399 unsigned int flags, struct ofpbuf *);
400 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
401 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
402 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
403 int kbits_burst);
404
405 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
406 struct nlattr **options);
407 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
408 struct nlattr **options,
409 struct netdev_queue_stats *);
410 static int tc_query_class(const struct netdev *,
411 unsigned int handle, unsigned int parent,
412 struct ofpbuf **replyp);
413 static int tc_delete_class(const struct netdev *, unsigned int handle);
414
415 static int tc_del_qdisc(struct netdev *netdev);
416 static int tc_query_qdisc(const struct netdev *netdev);
417
418 static int tc_calc_cell_log(unsigned int mtu);
419 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
420 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
421 const struct tc_ratespec *rate);
422 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
423 \f
/* Per-device state for a Linux network device, embedding the generic
 * 'struct netdev' as 'up'.  Shared by the system, internal, and tap netdev
 * classes.  All members below 'up' are protected by 'mutex'. */
struct netdev_linux {
    struct netdev up;

    /* Protects all members below. */
    struct ovs_mutex mutex;

    /* Bitmap of VALID_* flags indicating which cached members below (and
     * their cached error codes) are currently trustworthy. */
    unsigned int cache_valid;

    bool miimon;                    /* Link status of last poll. */
    long long int miimon_interval;  /* Miimon Poll rate. Disabled if <= 0. */
    struct timer miimon_timer;      /* Fires when the next poll is due. */

    /* The following are figured out "on demand" only.  They are only valid
     * when the corresponding VALID_* bit in 'cache_valid' is set. */
    int ifindex;                            /* Kernel interface index. */
    uint8_t etheraddr[ETH_ADDR_LEN];        /* Ethernet hardware address. */
    struct in_addr address, netmask;        /* IPv4 address and netmask. */
    struct in6_addr in6;                    /* IPv6 address. */
    int mtu;
    unsigned int ifi_flags;                 /* IFF_* flags from the kernel. */
    long long int carrier_resets;           /* Count of IFF_RUNNING flips. */
    uint32_t kbits_rate;        /* Policing data. */
    uint32_t kbits_burst;
    int vport_stats_error;      /* Cached error code from vport_get_stats().
                                   0 or an errno value. */
    int netdev_mtu_error;       /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
    int ether_addr_error;       /* Cached error code from set/get etheraddr. */
    int netdev_policing_error;  /* Cached error code from set policing. */
    int get_features_error;     /* Cached error code from ETHTOOL_GSET. */
    int get_ifindex_error;      /* Cached error code from SIOCGIFINDEX. */

    enum netdev_features current;    /* Cached from ETHTOOL_GSET. */
    enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
    enum netdev_features supported;  /* Cached from ETHTOOL_GSET. */

    struct ethtool_drvinfo drvinfo;  /* Cached from ETHTOOL_GDRVINFO. */
    struct tc *tc;                   /* Traffic control state, or NULL. */

    /* For devices of class netdev_tap_class only. */
    int tap_fd;                      /* FD on /dev/net/tun, shared by all
                                      * readers of this tap device. */
};
465
/* A receive queue on a Linux netdev, embedding the generic 'struct
 * netdev_rxq' as 'up'. */
struct netdev_rxq_linux {
    struct netdev_rxq up;
    bool is_tap;        /* True for tap devices, which share the device's
                         * tap_fd instead of owning a packet socket. */
    int fd;             /* Tap fd (borrowed) or AF_PACKET socket (owned). */
};
471
472 /* This is set pretty low because we probably won't learn anything from the
473 * additional log messages. */
474 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
475
476 /* Polling miimon status for all ports causes performance degradation when
477 * handling a large number of ports. If there are no devices using miimon, then
478 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
479 *
480 * Readers do not depend on this variable synchronizing with the related
481 * changes in the device miimon status, so we can use atomic_count. */
482 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
483
484 static void netdev_linux_run(void);
485
486 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
487 int cmd, const char *cmd_name);
488 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
489 int cmd, const char *cmd_name);
490 static int get_flags(const struct netdev *, unsigned int *flags);
491 static int set_flags(const char *, unsigned int flags);
492 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
493 enum netdev_flags on, enum netdev_flags *old_flagsp)
494 OVS_REQUIRES(netdev->mutex);
495 static int do_get_ifindex(const char *netdev_name);
496 static int get_ifindex(const struct netdev *, int *ifindexp);
497 static int do_set_addr(struct netdev *netdev,
498 int ioctl_nr, const char *ioctl_name,
499 struct in_addr addr);
500 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
501 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
502 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
503 static int af_packet_sock(void);
504 static bool netdev_linux_miimon_enabled(void);
505 static void netdev_linux_miimon_run(void);
506 static void netdev_linux_miimon_wait(void);
507 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
508
509 static bool
510 is_netdev_linux_class(const struct netdev_class *netdev_class)
511 {
512 return netdev_class->run == netdev_linux_run;
513 }
514
515 static bool
516 is_tap_netdev(const struct netdev *netdev)
517 {
518 return netdev_get_class(netdev) == &netdev_tap_class;
519 }
520
521 static struct netdev_linux *
522 netdev_linux_cast(const struct netdev *netdev)
523 {
524 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
525
526 return CONTAINER_OF(netdev, struct netdev_linux, up);
527 }
528
529 static struct netdev_rxq_linux *
530 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
531 {
532 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
533 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
534 }
535 \f
536 static void netdev_linux_update(struct netdev_linux *netdev,
537 const struct rtnetlink_link_change *)
538 OVS_REQUIRES(netdev->mutex);
539 static void netdev_linux_changed(struct netdev_linux *netdev,
540 unsigned int ifi_flags, unsigned int mask)
541 OVS_REQUIRES(netdev->mutex);
542
/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
 * if no such socket could be created.
 *
 * The socket is created once, on first call, and shared by all threads for
 * the lifetime of the process. */
static struct nl_sock *
netdev_linux_notify_sock(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static struct nl_sock *sock;

    if (ovsthread_once_start(&once)) {
        int error;

        error = nl_sock_create(NETLINK_ROUTE, &sock);
        if (!error) {
            error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
            if (error) {
                /* Joining the link-change multicast group failed, so the
                 * socket is useless; destroy it and report failure via
                 * NULL. */
                nl_sock_destroy(sock);
                sock = NULL;
            }
        }
        ovsthread_once_done(&once);
    }

    return sock;
}
567
568 static bool
569 netdev_linux_miimon_enabled(void)
570 {
571 return atomic_count_get(&miimon_cnt) > 0;
572 }
573
/* Periodic work shared by all Linux-based netdev classes: polls miimon link
 * status (when any device uses it) and drains the rtnetlink notification
 * socket, applying each link-change message to the matching open netdev's
 * cached state. */
static void
netdev_linux_run(void)
{
    struct nl_sock *sock;
    int error;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_run();
    }

    sock = netdev_linux_notify_sock();
    if (!sock) {
        /* No notification socket could be created; nothing to drain. */
        return;
    }

    /* Consume rtnetlink messages until the socket would block. */
    do {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
        uint64_t buf_stub[4096 / 8];
        struct ofpbuf buf;

        ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
        error = nl_sock_recv(sock, &buf, false);
        if (!error) {
            struct rtnetlink_link_change change;

            if (rtnetlink_link_parse(&buf, &change)) {
                /* Apply the change if we have this device open. */
                struct netdev *netdev_ = netdev_from_name(change.ifname);
                if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
                    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

                    ovs_mutex_lock(&netdev->mutex);
                    netdev_linux_update(netdev, &change);
                    ovs_mutex_unlock(&netdev->mutex);
                }
                netdev_close(netdev_);
            }
        } else if (error == ENOBUFS) {
            /* The kernel dropped notifications because we fell behind.  We
             * don't know what changed, so refresh the flags of every open
             * device of this class and invalidate all cached state. */
            struct shash device_shash;
            struct shash_node *node;

            nl_sock_drain(sock);

            shash_init(&device_shash);
            netdev_get_devices(&netdev_linux_class, &device_shash);
            SHASH_FOR_EACH (node, &device_shash) {
                struct netdev *netdev_ = node->data;
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
                unsigned int flags;

                ovs_mutex_lock(&netdev->mutex);
                get_flags(netdev_, &flags);
                netdev_linux_changed(netdev, flags, 0);
                ovs_mutex_unlock(&netdev->mutex);

                netdev_close(netdev_);
            }
            shash_destroy(&device_shash);
        } else if (error != EAGAIN) {
            VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
                         ovs_strerror(error));
        }
        ofpbuf_uninit(&buf);
    } while (!error);
}
638
639 static void
640 netdev_linux_wait(void)
641 {
642 struct nl_sock *sock;
643
644 if (netdev_linux_miimon_enabled()) {
645 netdev_linux_miimon_wait();
646 }
647 sock = netdev_linux_notify_sock();
648 if (sock) {
649 nl_sock_wait(sock, POLLIN);
650 }
651 }
652
653 static void
654 netdev_linux_changed(struct netdev_linux *dev,
655 unsigned int ifi_flags, unsigned int mask)
656 OVS_REQUIRES(dev->mutex)
657 {
658 netdev_change_seq_changed(&dev->up);
659
660 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
661 dev->carrier_resets++;
662 }
663 dev->ifi_flags = ifi_flags;
664
665 dev->cache_valid &= mask;
666 }
667
/* Updates 'dev' from the parsed rtnetlink message 'change'.
 *
 * For RTM_NEWLINK, invalidates all cached state except the driver info, then
 * refills the MTU, Ethernet address, and ifindex caches from the values the
 * message carries.  For any other message type (i.e. RTM_DELLINK), simply
 * invalidates everything. */
static void
netdev_linux_update(struct netdev_linux *dev,
                    const struct rtnetlink_link_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (change->nlmsg_type == RTM_NEWLINK) {
        /* Keep drv-info */
        netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);

        /* Update netdev from rtnl-change msg. */
        if (change->mtu) {
            dev->mtu = change->mtu;
            dev->cache_valid |= VALID_MTU;
            dev->netdev_mtu_error = 0;
        }

        /* An all-zeros address means the message carried none. */
        if (!eth_addr_is_zero(change->addr)) {
            memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
            dev->cache_valid |= VALID_ETHERADDR;
            dev->ether_addr_error = 0;
        }

        dev->ifindex = change->ifi_index;
        dev->cache_valid |= VALID_IFINDEX;
        dev->get_ifindex_error = 0;

    } else {
        netdev_linux_changed(dev, change->ifi_flags, 0);
    }
}
698
699 static struct netdev *
700 netdev_linux_alloc(void)
701 {
702 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
703 return &netdev->up;
704 }
705
706 static void
707 netdev_linux_common_construct(struct netdev_linux *netdev)
708 {
709 ovs_mutex_init(&netdev->mutex);
710 }
711
712 /* Creates system and internal devices. */
713 static int
714 netdev_linux_construct(struct netdev *netdev_)
715 {
716 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
717 int error;
718
719 netdev_linux_common_construct(netdev);
720
721 error = get_flags(&netdev->up, &netdev->ifi_flags);
722 if (error == ENODEV) {
723 if (netdev->up.netdev_class != &netdev_internal_class) {
724 /* The device does not exist, so don't allow it to be opened. */
725 return ENODEV;
726 } else {
727 /* "Internal" netdevs have to be created as netdev objects before
728 * they exist in the kernel, because creating them in the kernel
729 * happens by passing a netdev object to dpif_port_add().
730 * Therefore, ignore the error. */
731 }
732 }
733
734 return 0;
735 }
736
737 /* For most types of netdevs we open the device for each call of
738 * netdev_open(). However, this is not the case with tap devices,
739 * since it is only possible to open the device once. In this
740 * situation we share a single file descriptor, and consequently
741 * buffers, across all readers. Therefore once data is read it will
742 * be unavailable to other reads for tap devices. */
743 static int
744 netdev_linux_construct_tap(struct netdev *netdev_)
745 {
746 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
747 static const char tap_dev[] = "/dev/net/tun";
748 const char *name = netdev_->name;
749 struct ifreq ifr;
750 int error;
751
752 netdev_linux_common_construct(netdev);
753
754 /* Open tap device. */
755 netdev->tap_fd = open(tap_dev, O_RDWR);
756 if (netdev->tap_fd < 0) {
757 error = errno;
758 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
759 return error;
760 }
761
762 /* Create tap device. */
763 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
764 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
765 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
766 VLOG_WARN("%s: creating tap device failed: %s", name,
767 ovs_strerror(errno));
768 error = errno;
769 goto error_close;
770 }
771
772 /* Make non-blocking. */
773 error = set_nonblocking(netdev->tap_fd);
774 if (error) {
775 goto error_close;
776 }
777
778 return 0;
779
780 error_close:
781 close(netdev->tap_fd);
782 return error;
783 }
784
785 static void
786 netdev_linux_destruct(struct netdev *netdev_)
787 {
788 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
789
790 if (netdev->tc && netdev->tc->ops->tc_destroy) {
791 netdev->tc->ops->tc_destroy(netdev->tc);
792 }
793
794 if (netdev_get_class(netdev_) == &netdev_tap_class
795 && netdev->tap_fd >= 0)
796 {
797 close(netdev->tap_fd);
798 }
799
800 if (netdev->miimon_interval > 0) {
801 atomic_count_dec(&miimon_cnt);
802 }
803
804 ovs_mutex_destroy(&netdev->mutex);
805 }
806
/* Frees the memory of a netdev allocated by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
813
814 static struct netdev_rxq *
815 netdev_linux_rxq_alloc(void)
816 {
817 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
818 return &rx->up;
819 }
820
/* Sets up the receive queue 'rxq_'.  Tap devices borrow the device's shared
 * tap fd; other devices get their own AF_PACKET socket bound to the device
 * and filtered to inbound traffic only.
 *
 * Returns 0 on success, otherwise a positive errno value. */
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        /* All rxqs on a tap device share the single tap fd. */
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4     jt 2  jf 3 */
            { 0x6, 0, 0, 0x00000000 },  /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff }   /* ret #65535 */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        /* Request VLAN auxdata so the recv path can reconstruct tags the
         * kernel strips (see netdev_linux_rxq_recv_sock()). */
        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    /* Close the socket only if we created one; a tap rx never reaches
     * here, so rx->fd can only be our own socket (or negative). */
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
908
909 static void
910 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
911 {
912 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
913
914 if (!rx->is_tap) {
915 close(rx->fd);
916 }
917 }
918
/* Frees the memory of an rxq allocated by netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
926
927 static ovs_be16
928 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
929 {
930 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
931 return htons(aux->tp_vlan_tpid);
932 } else {
933 return htons(ETH_TYPE_VLAN);
934 }
935 }
936
937 static bool
938 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
939 {
940 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
941 }
942
/* Receives one packet from the AF_PACKET socket 'fd' into 'buffer'.
 *
 * Uses recvmsg() with PACKET_AUXDATA ancillary data so that a VLAN tag the
 * kernel stripped from the frame can be reconstructed and pushed back into
 * the packet data.
 *
 * Returns 0 on success, EMSGSIZE if the frame was longer than the buffer's
 * tailroom, EINVAL if a tagged frame is too short to hold an Ethernet
 * header, or another positive errno value on receive failure. */
static int
netdev_linux_rxq_recv_sock(int fd, struct ofpbuf *buffer)
{
    size_t size;
    ssize_t retval;
    struct iovec iov;
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffer;
    struct msghdr msgh;

    /* Reserve headroom for a single VLAN tag */
    ofpbuf_reserve(buffer, VLAN_HEADER_LEN);
    size = ofpbuf_tailroom(buffer);

    iov.iov_base = ofpbuf_data(buffer);
    iov.iov_len = size;
    msgh.msg_name = NULL;
    msgh.msg_namelen = 0;
    msgh.msg_iov = &iov;
    msgh.msg_iovlen = 1;
    msgh.msg_control = &cmsg_buffer;
    msgh.msg_controllen = sizeof cmsg_buffer;
    msgh.msg_flags = 0;

    /* Retry receives interrupted by signals.  MSG_TRUNC makes retval the
     * full frame length even if it did not fit, so truncation is
     * detectable below. */
    do {
        retval = recvmsg(fd, &msgh, MSG_TRUNC);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    } else if (retval > size) {
        return EMSGSIZE;
    }

    ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);

    /* Scan the ancillary data for the auxdata message; if it reports a
     * stripped VLAN tag, push the tag back into the packet. */
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
        const struct tpacket_auxdata *aux;

        if (cmsg->cmsg_level != SOL_PACKET
            || cmsg->cmsg_type != PACKET_AUXDATA
            || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
            continue;
        }

        aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
        if (auxdata_has_vlan_tci(aux)) {
            if (retval < ETH_HEADER_LEN) {
                /* Too short to even hold an Ethernet header; cannot
                 * re-insert a tag sanely. */
                return EINVAL;
            }

            eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
                          htons(aux->tp_vlan_tci));
            break;
        }
    }

    return 0;
}
1005
/* Reads one packet from the tap device fd 'fd' into 'buffer'.
 *
 * Returns 0 if successful, otherwise a positive errno value (EMSGSIZE if
 * the packet did not fit in the buffer's tailroom). */
static int
netdev_linux_rxq_recv_tap(int fd, struct ofpbuf *buffer)
{
    size_t room = ofpbuf_tailroom(buffer);
    ssize_t n;

    do {
        n = read(fd, ofpbuf_data(buffer), room);
    } while (n < 0 && errno == EINTR);

    if (n < 0) {
        return errno;
    }
    if (n > room) {
        return EMSGSIZE;
    }

    ofpbuf_set_size(buffer, ofpbuf_size(buffer) + n);
    return 0;
}
1025
1026 static int
1027 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dpif_packet **packets,
1028 int *c)
1029 {
1030 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1031 struct netdev *netdev = rx->up.netdev;
1032 struct dpif_packet *packet;
1033 struct ofpbuf *buffer;
1034 ssize_t retval;
1035 int mtu;
1036
1037 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1038 mtu = ETH_PAYLOAD_MAX;
1039 }
1040
1041 packet = dpif_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1042 DP_NETDEV_HEADROOM);
1043 buffer = &packet->ofpbuf;
1044
1045 retval = (rx->is_tap
1046 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1047 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1048
1049 if (retval) {
1050 if (retval != EAGAIN && retval != EMSGSIZE) {
1051 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1052 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
1053 }
1054 dpif_packet_delete(packet);
1055 } else {
1056 dp_packet_pad(buffer);
1057 dpif_packet_set_dp_hash(packet, 0);
1058 packets[0] = packet;
1059 *c = 1;
1060 }
1061
1062 return retval;
1063 }
1064
1065 static void
1066 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1067 {
1068 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1069 poll_fd_wait(rx->fd, POLLIN);
1070 }
1071
/* Discards any packets waiting to be received on 'rxq_'.
 *
 * For a tap device the number of buffered packets is bounded by the
 * interface's queue length (SIOCGIFTXQLEN), so that many reads are
 * attempted; for an AF_PACKET socket the receive buffer is drained
 * directly.  Returns 0 if successful, otherwise a positive errno value. */
static int
netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    if (rx->is_tap) {
        struct ifreq ifr;
        int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
                                        SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
        if (error) {
            return error;
        }
        drain_fd(rx->fd, ifr.ifr_qlen);
        return 0;
    } else {
        return drain_rcvbuf(rx->fd);
    }
}
1089
1090 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1091 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1092 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1093 * the packet is too big or too small to transmit on the device.
1094 *
1095 * The caller retains ownership of 'buffer' in all cases.
1096 *
1097 * The kernel maintains a packet transmission queue, so the caller is not
1098 * expected to do additional queuing of packets. */
1099 static int
1100 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1101 struct dpif_packet **pkts, int cnt, bool may_steal)
1102 {
1103 int i;
1104 int error = 0;
1105
1106 /* 'i' is incremented only if there's no error */
1107 for (i = 0; i < cnt;) {
1108 const void *data = ofpbuf_data(&pkts[i]->ofpbuf);
1109 size_t size = ofpbuf_size(&pkts[i]->ofpbuf);
1110 ssize_t retval;
1111
1112 if (!is_tap_netdev(netdev_)) {
1113 /* Use our AF_PACKET socket to send to this device. */
1114 struct sockaddr_ll sll;
1115 struct msghdr msg;
1116 struct iovec iov;
1117 int ifindex;
1118 int sock;
1119
1120 sock = af_packet_sock();
1121 if (sock < 0) {
1122 return -sock;
1123 }
1124
1125 ifindex = netdev_get_ifindex(netdev_);
1126 if (ifindex < 0) {
1127 return -ifindex;
1128 }
1129
1130 /* We don't bother setting most fields in sockaddr_ll because the
1131 * kernel ignores them for SOCK_RAW. */
1132 memset(&sll, 0, sizeof sll);
1133 sll.sll_family = AF_PACKET;
1134 sll.sll_ifindex = ifindex;
1135
1136 iov.iov_base = CONST_CAST(void *, data);
1137 iov.iov_len = size;
1138
1139 msg.msg_name = &sll;
1140 msg.msg_namelen = sizeof sll;
1141 msg.msg_iov = &iov;
1142 msg.msg_iovlen = 1;
1143 msg.msg_control = NULL;
1144 msg.msg_controllen = 0;
1145 msg.msg_flags = 0;
1146
1147 retval = sendmsg(sock, &msg, 0);
1148 } else {
1149 /* Use the tap fd to send to this device. This is essential for
1150 * tap devices, because packets sent to a tap device with an
1151 * AF_PACKET socket will loop back to be *received* again on the
1152 * tap device. This doesn't occur on other interface types
1153 * because we attach a socket filter to the rx socket. */
1154 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1155
1156 retval = write(netdev->tap_fd, data, size);
1157 }
1158
1159 if (retval < 0) {
1160 /* The Linux AF_PACKET implementation never blocks waiting for room
1161 * for packets, instead returning ENOBUFS. Translate this into
1162 * EAGAIN for the caller. */
1163 error = errno == ENOBUFS ? EAGAIN : errno;
1164 if (error == EINTR) {
1165 /* continue without incrementing 'i', i.e. retry this packet */
1166 continue;
1167 }
1168 break;
1169 } else if (retval != size) {
1170 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1171 " of %"PRIuSIZE") on %s", retval, size,
1172 netdev_get_name(netdev_));
1173 error = EMSGSIZE;
1174 break;
1175 }
1176
1177 /* Process the next packet in the batch */
1178 i++;
1179 }
1180
1181 if (may_steal) {
1182 for (i = 0; i < cnt; i++) {
1183 dpif_packet_delete(pkts[i]);
1184 }
1185 }
1186
1187 if (error && error != EAGAIN) {
1188 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1189 netdev_get_name(netdev_), ovs_strerror(error));
1190 }
1191
1192 return error;
1193
1194 }
1195
1196 /* Registers with the poll loop to wake up from the next call to poll_block()
1197 * when the packet transmission queue has sufficient room to transmit a packet
1198 * with netdev_send().
1199 *
1200 * The kernel maintains a packet transmission queue, so the client is not
1201 * expected to do additional queuing of packets. Thus, this function is
1202 * unlikely to ever be used. It is included for completeness. */
1203 static void
1204 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1205 {
1206 if (is_tap_netdev(netdev)) {
1207 /* TAP device always accepts packets.*/
1208 poll_immediate_wake();
1209 }
1210 }
1211
/* Attempts to set 'netdev''s MAC address to 'mac'.  Returns 0 if successful,
 * otherwise a positive errno value.
 *
 * The result (success or ENODEV) is cached under VALID_ETHERADDR so that a
 * repeated request for the same address is a no-op. */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_,
                           const uint8_t mac[ETH_ADDR_LEN])
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    if (netdev->cache_valid & VALID_ETHERADDR) {
        /* A cached error, or an address already equal to 'mac', means there
         * is nothing to do. */
        error = netdev->ether_addr_error;
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    if (!error || error == ENODEV) {
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
        }
    }

    /* Restore the tap device's up state if we brought it down above. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1253
/* Copies 'netdev''s MAC address into 'mac'.  Returns 0 if successful,
 * otherwise a positive errno value.
 *
 * The address (or the error from fetching it) is cached under
 * VALID_ETHERADDR, so the kernel is queried only once per cache
 * invalidation. */
static int
netdev_linux_get_etheraddr(const struct netdev *netdev_,
                           uint8_t mac[ETH_ADDR_LEN])
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
        netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
                                                 netdev->etheraddr);
        netdev->cache_valid |= VALID_ETHERADDR;
    }

    error = netdev->ether_addr_error;
    if (!error) {
        memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1277
1278 static int
1279 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1280 {
1281 int error;
1282
1283 if (!(netdev->cache_valid & VALID_MTU)) {
1284 struct ifreq ifr;
1285
1286 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1287 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1288 netdev->mtu = ifr.ifr_mtu;
1289 netdev->cache_valid |= VALID_MTU;
1290 }
1291
1292 error = netdev->netdev_mtu_error;
1293 if (!error) {
1294 *mtup = netdev->mtu;
1295 }
1296
1297 return error;
1298 }
1299
1300 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1301 * in bytes, not including the hardware header; thus, this is typically 1500
1302 * bytes for Ethernet devices. */
1303 static int
1304 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1305 {
1306 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1307 int error;
1308
1309 ovs_mutex_lock(&netdev->mutex);
1310 error = netdev_linux_get_mtu__(netdev, mtup);
1311 ovs_mutex_unlock(&netdev->mutex);
1312
1313 return error;
1314 }
1315
/* Sets the maximum size of transmitted (MTU) for given device using linux
 * networking ioctl interface.
 *
 * The result (success or ENODEV) and the new MTU are cached under
 * VALID_MTU, so setting the same MTU again is a no-op. */
static int
netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ifreq ifr;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev->cache_valid & VALID_MTU) {
        /* Cached error, or already at the requested MTU: nothing to do. */
        error = netdev->netdev_mtu_error;
        if (error || netdev->mtu == mtu) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_MTU;
    }
    ifr.ifr_mtu = mtu;
    error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
                                SIOCSIFMTU, "SIOCSIFMTU");
    if (!error || error == ENODEV) {
        netdev->netdev_mtu_error = error;
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }
exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1346
1347 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1348 * On failure, returns a negative errno value. */
1349 static int
1350 netdev_linux_get_ifindex(const struct netdev *netdev_)
1351 {
1352 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1353 int ifindex, error;
1354
1355 ovs_mutex_lock(&netdev->mutex);
1356 error = get_ifindex(netdev_, &ifindex);
1357 ovs_mutex_unlock(&netdev->mutex);
1358
1359 return error ? -error : ifindex;
1360 }
1361
1362 static int
1363 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1364 {
1365 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1366
1367 ovs_mutex_lock(&netdev->mutex);
1368 if (netdev->miimon_interval > 0) {
1369 *carrier = netdev->miimon;
1370 } else {
1371 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1372 }
1373 ovs_mutex_unlock(&netdev->mutex);
1374
1375 return 0;
1376 }
1377
1378 static long long int
1379 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1380 {
1381 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1382 long long int carrier_resets;
1383
1384 ovs_mutex_lock(&netdev->mutex);
1385 carrier_resets = netdev->carrier_resets;
1386 ovs_mutex_unlock(&netdev->mutex);
1387
1388 return carrier_resets;
1389 }
1390
/* Issues MII ioctl 'cmd' (named 'cmd_name' for error logging) on device
 * 'name'.  'data' is both input and output: its bytes are copied into the
 * ifreq before the call and copied back out afterward.
 *
 * NOTE(review): this copies the struct's bytes into the ifr_data member
 * itself rather than pointing ifr_data at 'data'; it relies on
 * sizeof(struct mii_ioctl_data) fitting in the ifreq's union storage --
 * confirm on all supported ABIs.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
1405
/* Determines link status for device 'name', preferring MII: reads the PHY's
 * basic mode status register and reports link via its BMSR_LSTATUS bit.  If
 * the MII ioctls are unsupported, falls back to ETHTOOL_GLINK.
 *
 * On success stores the link state in '*miimon' and returns 0; otherwise
 * returns a positive errno value with '*miimon' left false. */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        } else {
            VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
        }
    } else {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* ETHTOOL_GLINK fills in a struct ethtool_value; 'ecmd' was
             * passed as scratch storage, so reinterpret its bytes. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
1449
/* Sets the MII monitoring interval for 'netdev_' to 'interval' (in
 * milliseconds).  A positive interval is clamped to at least 100 ms; zero
 * or negative disables monitoring.  Always returns 0.
 *
 * 'miimon_cnt' tracks how many devices have monitoring enabled, so it is
 * adjusted only on enable/disable transitions.  The timer is forced
 * expired so the new interval takes effect on the next miimon run. */
static int
netdev_linux_set_miimon_interval(struct netdev *netdev_,
                                 long long int interval)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    ovs_mutex_lock(&netdev->mutex);
    interval = interval > 0 ? MAX(interval, 100) : 0;
    if (netdev->miimon_interval != interval) {
        if (interval && !netdev->miimon_interval) {
            atomic_count_inc(&miimon_cnt);
        } else if (!interval && netdev->miimon_interval) {
            atomic_count_dec(&miimon_cnt);
        }

        netdev->miimon_interval = interval;
        timer_set_expired(&netdev->miimon_timer);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;
}
1472
/* Polls MII link state for every netdev-linux device whose miimon timer has
 * expired, recording any up/down transition and rearming the timer. */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                /* Link state changed; let the common change handler record
                 * it. */
                dev->miimon = miimon;
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* Release the reference obtained via netdev_get_devices(). */
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
1502
/* Registers with the poll loop so that poll_block() wakes up when any
 * device's miimon timer expires. */
static void
netdev_linux_miimon_wait(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0) {
            timer_wait(&dev->miimon_timer);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* Release the reference obtained via netdev_get_devices(). */
        netdev_close(netdev);
    }
    shash_destroy(&device_shash);
}
1524
/* Exchanges the values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved_a = *a;

    *a = *b;
    *b = saved_a;
}
1532
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is allowed to be misaligned (hence get_32aligned_u64()).  The vport
 * stats structure carries only the eight counters copied below; every other
 * netdev counter is explicitly zeroed. */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct ovs_vport_stats *src)
{
    dst->rx_packets = get_32aligned_u64(&src->rx_packets);
    dst->tx_packets = get_32aligned_u64(&src->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&src->rx_errors);
    dst->tx_errors = get_32aligned_u64(&src->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
    dst->multicast = 0;
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
}
1562
1563 static int
1564 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1565 {
1566 struct dpif_netlink_vport reply;
1567 struct ofpbuf *buf;
1568 int error;
1569
1570 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1571 if (error) {
1572 return error;
1573 } else if (!reply.stats) {
1574 ofpbuf_delete(buf);
1575 return EOPNOTSUPP;
1576 }
1577
1578 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1579
1580 ofpbuf_delete(buf);
1581
1582 return 0;
1583 }
1584
/* Attempts to fill '*stats' from the datapath's vport layer, recording the
 * outcome in 'netdev->vport_stats_error'.
 *
 * The query is skipped only when a previous attempt failed and that failure
 * has been cached (VALID_VPORT_STAT_ERROR); after a success the vport is
 * re-queried on every call so the stats stay fresh.  Caller must hold
 * 'netdev->mutex'. */
static void
get_stats_via_vport(const struct netdev *netdev_,
                    struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (!netdev->vport_stats_error ||
        !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
        int error;

        error = get_stats_via_vport__(netdev_, stats);
        if (error && error != ENOENT) {
            /* ENOENT simply means the netdev is not a datapath port, which
             * is not worth warning about. */
            VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
                         "(%s)",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
        netdev->vport_stats_error = error;
        netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
    }
}
1605
/* Retrieves current device stats for 'netdev-linux'.
 *
 * Combines two sources: the datapath vport layer and the kernel netdev
 * (netlink).  When both are available, the kernel's packet/byte counts
 * replace the vport's and its error counters are added to the vport's. */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink stats unavailable; if the vport stats succeeded, '*stats'
         * already holds them and this is not an error. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Use kernel netdev's packet and byte counts since vport's counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO are
         * enabled. */
        stats->rx_packets = dev_stats.rx_packets;
        stats->rx_bytes = dev_stats.rx_bytes;
        stats->tx_packets = dev_stats.tx_packets;
        stats->tx_bytes = dev_stats.tx_bytes;

        stats->rx_errors += dev_stats.rx_errors;
        stats->tx_errors += dev_stats.tx_errors;
        stats->rx_dropped += dev_stats.rx_dropped;
        stats->tx_dropped += dev_stats.tx_dropped;
        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
        stats->rx_length_errors += dev_stats.rx_length_errors;
        stats->rx_over_errors += dev_stats.rx_over_errors;
        stats->rx_crc_errors += dev_stats.rx_crc_errors;
        stats->rx_frame_errors += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1656
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal.
 *
 * Like netdev_linux_get_stats(), but the kernel netdev's rx/tx counters are
 * swapped because the switch sits on the "far" side of a tap/internal
 * device. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink stats unavailable; vport stats (if any) already fill
         * '*stats'. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer.  For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped += dev_stats.tx_dropped;
        stats->tx_dropped += dev_stats.rx_dropped;

        stats->rx_errors += dev_stats.tx_errors;
        stats->tx_errors += dev_stats.rx_errors;

        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1718
1719 static int
1720 netdev_internal_get_stats(const struct netdev *netdev_,
1721 struct netdev_stats *stats)
1722 {
1723 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1724 int error;
1725
1726 ovs_mutex_lock(&netdev->mutex);
1727 get_stats_via_vport(netdev_, stats);
1728 error = netdev->vport_stats_error;
1729 ovs_mutex_unlock(&netdev->mutex);
1730
1731 return error;
1732 }
1733
1734 static void
1735 netdev_linux_read_features(struct netdev_linux *netdev)
1736 {
1737 struct ethtool_cmd ecmd;
1738 uint32_t speed;
1739 int error;
1740
1741 if (netdev->cache_valid & VALID_FEATURES) {
1742 return;
1743 }
1744
1745 COVERAGE_INC(netdev_get_ethtool);
1746 memset(&ecmd, 0, sizeof ecmd);
1747 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1748 ETHTOOL_GSET, "ETHTOOL_GSET");
1749 if (error) {
1750 goto out;
1751 }
1752
1753 /* Supported features. */
1754 netdev->supported = 0;
1755 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1756 netdev->supported |= NETDEV_F_10MB_HD;
1757 }
1758 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1759 netdev->supported |= NETDEV_F_10MB_FD;
1760 }
1761 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1762 netdev->supported |= NETDEV_F_100MB_HD;
1763 }
1764 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1765 netdev->supported |= NETDEV_F_100MB_FD;
1766 }
1767 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1768 netdev->supported |= NETDEV_F_1GB_HD;
1769 }
1770 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1771 netdev->supported |= NETDEV_F_1GB_FD;
1772 }
1773 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1774 netdev->supported |= NETDEV_F_10GB_FD;
1775 }
1776 if (ecmd.supported & SUPPORTED_TP) {
1777 netdev->supported |= NETDEV_F_COPPER;
1778 }
1779 if (ecmd.supported & SUPPORTED_FIBRE) {
1780 netdev->supported |= NETDEV_F_FIBER;
1781 }
1782 if (ecmd.supported & SUPPORTED_Autoneg) {
1783 netdev->supported |= NETDEV_F_AUTONEG;
1784 }
1785 if (ecmd.supported & SUPPORTED_Pause) {
1786 netdev->supported |= NETDEV_F_PAUSE;
1787 }
1788 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1789 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1790 }
1791
1792 /* Advertised features. */
1793 netdev->advertised = 0;
1794 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1795 netdev->advertised |= NETDEV_F_10MB_HD;
1796 }
1797 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1798 netdev->advertised |= NETDEV_F_10MB_FD;
1799 }
1800 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1801 netdev->advertised |= NETDEV_F_100MB_HD;
1802 }
1803 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1804 netdev->advertised |= NETDEV_F_100MB_FD;
1805 }
1806 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1807 netdev->advertised |= NETDEV_F_1GB_HD;
1808 }
1809 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1810 netdev->advertised |= NETDEV_F_1GB_FD;
1811 }
1812 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1813 netdev->advertised |= NETDEV_F_10GB_FD;
1814 }
1815 if (ecmd.advertising & ADVERTISED_TP) {
1816 netdev->advertised |= NETDEV_F_COPPER;
1817 }
1818 if (ecmd.advertising & ADVERTISED_FIBRE) {
1819 netdev->advertised |= NETDEV_F_FIBER;
1820 }
1821 if (ecmd.advertising & ADVERTISED_Autoneg) {
1822 netdev->advertised |= NETDEV_F_AUTONEG;
1823 }
1824 if (ecmd.advertising & ADVERTISED_Pause) {
1825 netdev->advertised |= NETDEV_F_PAUSE;
1826 }
1827 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1828 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1829 }
1830
1831 /* Current settings. */
1832 speed = ecmd.speed;
1833 if (speed == SPEED_10) {
1834 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1835 } else if (speed == SPEED_100) {
1836 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1837 } else if (speed == SPEED_1000) {
1838 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1839 } else if (speed == SPEED_10000) {
1840 netdev->current = NETDEV_F_10GB_FD;
1841 } else if (speed == 40000) {
1842 netdev->current = NETDEV_F_40GB_FD;
1843 } else if (speed == 100000) {
1844 netdev->current = NETDEV_F_100GB_FD;
1845 } else if (speed == 1000000) {
1846 netdev->current = NETDEV_F_1TB_FD;
1847 } else {
1848 netdev->current = 0;
1849 }
1850
1851 if (ecmd.port == PORT_TP) {
1852 netdev->current |= NETDEV_F_COPPER;
1853 } else if (ecmd.port == PORT_FIBRE) {
1854 netdev->current |= NETDEV_F_FIBER;
1855 }
1856
1857 if (ecmd.autoneg) {
1858 netdev->current |= NETDEV_F_AUTONEG;
1859 }
1860
1861 out:
1862 netdev->cache_valid |= VALID_FEATURES;
1863 netdev->get_features_error = error;
1864 }
1865
/* Stores the features supported by 'netdev' into '*current', '*advertised',
 * '*supported', and '*peer'.  Each value is a bitmap of NETDEV_* bits.
 * Returns 0 if successful, otherwise a positive errno value, in which case
 * the output parameters are left unmodified. */
static int
netdev_linux_get_features(const struct netdev *netdev_,
                          enum netdev_features *current,
                          enum netdev_features *advertised,
                          enum netdev_features *supported,
                          enum netdev_features *peer)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    /* Fills the cached feature fields; no-op if already cached. */
    netdev_linux_read_features(netdev);
    if (!netdev->get_features_error) {
        *current = netdev->current;
        *advertised = netdev->advertised;
        *supported = netdev->supported;
        *peer = 0; /* XXX */
    }
    error = netdev->get_features_error;
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1892
1893 /* Set the features advertised by 'netdev' to 'advertise'. */
1894 static int
1895 netdev_linux_set_advertisements(struct netdev *netdev_,
1896 enum netdev_features advertise)
1897 {
1898 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1899 struct ethtool_cmd ecmd;
1900 int error;
1901
1902 ovs_mutex_lock(&netdev->mutex);
1903
1904 COVERAGE_INC(netdev_get_ethtool);
1905 memset(&ecmd, 0, sizeof ecmd);
1906 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1907 ETHTOOL_GSET, "ETHTOOL_GSET");
1908 if (error) {
1909 goto exit;
1910 }
1911
1912 ecmd.advertising = 0;
1913 if (advertise & NETDEV_F_10MB_HD) {
1914 ecmd.advertising |= ADVERTISED_10baseT_Half;
1915 }
1916 if (advertise & NETDEV_F_10MB_FD) {
1917 ecmd.advertising |= ADVERTISED_10baseT_Full;
1918 }
1919 if (advertise & NETDEV_F_100MB_HD) {
1920 ecmd.advertising |= ADVERTISED_100baseT_Half;
1921 }
1922 if (advertise & NETDEV_F_100MB_FD) {
1923 ecmd.advertising |= ADVERTISED_100baseT_Full;
1924 }
1925 if (advertise & NETDEV_F_1GB_HD) {
1926 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1927 }
1928 if (advertise & NETDEV_F_1GB_FD) {
1929 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1930 }
1931 if (advertise & NETDEV_F_10GB_FD) {
1932 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1933 }
1934 if (advertise & NETDEV_F_COPPER) {
1935 ecmd.advertising |= ADVERTISED_TP;
1936 }
1937 if (advertise & NETDEV_F_FIBER) {
1938 ecmd.advertising |= ADVERTISED_FIBRE;
1939 }
1940 if (advertise & NETDEV_F_AUTONEG) {
1941 ecmd.advertising |= ADVERTISED_Autoneg;
1942 }
1943 if (advertise & NETDEV_F_PAUSE) {
1944 ecmd.advertising |= ADVERTISED_Pause;
1945 }
1946 if (advertise & NETDEV_F_PAUSE_ASYM) {
1947 ecmd.advertising |= ADVERTISED_Asym_Pause;
1948 }
1949 COVERAGE_INC(netdev_set_ethtool);
1950 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1951 ETHTOOL_SSET, "ETHTOOL_SSET");
1952
1953 exit:
1954 ovs_mutex_unlock(&netdev->mutex);
1955 return error;
1956 }
1957
/* Attempts to set input rate limiting (policing) policy.  Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * The policy is implemented as an ingress qdisc plus a policer action; any
 * existing ingress qdisc is removed first.  'kbits_rate' of 0 disables
 * policing.  The effective settings (or an ENODEV error) are cached under
 * VALID_POLICING so repeated identical requests are no-ops. */
static int
netdev_linux_set_policing(struct netdev *netdev_,
                          uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int error;

    kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
                   : kbits_burst);       /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev->cache_valid & VALID_POLICING) {
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kbits_burst == kbits_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        netdev->cache_valid &= ~VALID_POLICING;
    }

    COVERAGE_INC(netdev_set_policing);
    /* Remove any existing ingress qdisc. */
    error = tc_add_del_ingress_qdisc(netdev_, false);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate) {
        /* Recreate the ingress qdisc, then attach the policer to it. */
        error = tc_add_del_ingress_qdisc(netdev_, true);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }
    }

    netdev->kbits_rate = kbits_rate;
    netdev->kbits_burst = kbits_burst;

out:
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2019
2020 static int
2021 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2022 struct sset *types)
2023 {
2024 const struct tc_ops *const *opsp;
2025
2026 for (opsp = tcs; *opsp != NULL; opsp++) {
2027 const struct tc_ops *ops = *opsp;
2028 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2029 sset_add(types, ops->ovs_name);
2030 }
2031 }
2032 return 0;
2033 }
2034
2035 static const struct tc_ops *
2036 tc_lookup_ovs_name(const char *name)
2037 {
2038 const struct tc_ops *const *opsp;
2039
2040 for (opsp = tcs; *opsp != NULL; opsp++) {
2041 const struct tc_ops *ops = *opsp;
2042 if (!strcmp(name, ops->ovs_name)) {
2043 return ops;
2044 }
2045 }
2046 return NULL;
2047 }
2048
2049 static const struct tc_ops *
2050 tc_lookup_linux_name(const char *name)
2051 {
2052 const struct tc_ops *const *opsp;
2053
2054 for (opsp = tcs; *opsp != NULL; opsp++) {
2055 const struct tc_ops *ops = *opsp;
2056 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2057 return ops;
2058 }
2059 }
2060 return NULL;
2061 }
2062
/* Returns the tc_queue in 'netdev_''s current tc whose id is 'queue_id', or
 * NULL if there is none.  'hash' must be the value hash_int(queue_id, 0);
 * callers that already have it pass it in to avoid recomputing it. */
static struct tc_queue *
tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
                size_t hash)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct tc_queue *queue;

    HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
        if (queue->queue_id == queue_id) {
            return queue;
        }
    }
    return NULL;
}
2077
/* Convenience wrapper for tc_find_queue__() that computes the hash of
 * 'queue_id' itself. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2083
2084 static int
2085 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2086 const char *type,
2087 struct netdev_qos_capabilities *caps)
2088 {
2089 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2090 if (!ops) {
2091 return EOPNOTSUPP;
2092 }
2093 caps->n_queues = ops->n_queues;
2094 return 0;
2095 }
2096
2097 static int
2098 netdev_linux_get_qos(const struct netdev *netdev_,
2099 const char **typep, struct smap *details)
2100 {
2101 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2102 int error;
2103
2104 ovs_mutex_lock(&netdev->mutex);
2105 error = tc_query_qdisc(netdev_);
2106 if (!error) {
2107 *typep = netdev->tc->ops->ovs_name;
2108 error = (netdev->tc->ops->qdisc_get
2109 ? netdev->tc->ops->qdisc_get(netdev_, details)
2110 : 0);
2111 }
2112 ovs_mutex_unlock(&netdev->mutex);
2113
2114 return error;
2115 }
2116
/* Installs or reconfigures QoS implementation 'type' on 'netdev_' with the
 * parameters in 'details'.  If 'type' is already installed it is
 * reconfigured in place; otherwise the existing qdisc is deleted and the
 * new one installed.  Returns EOPNOTSUPP for unknown or uninstallable
 * types. */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same implementation already active: reconfigure in place when the
         * implementation supports it; otherwise there is nothing to do. */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            goto exit;
        }
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc.  tc_install is expected to set netdev->tc on
         * success and leave it null on failure, which the assert checks. */
        error = new_ops->tc_install(netdev_, details);
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2155
2156 static int
2157 netdev_linux_get_queue(const struct netdev *netdev_,
2158 unsigned int queue_id, struct smap *details)
2159 {
2160 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2161 int error;
2162
2163 ovs_mutex_lock(&netdev->mutex);
2164 error = tc_query_qdisc(netdev_);
2165 if (!error) {
2166 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2167 error = (queue
2168 ? netdev->tc->ops->class_get(netdev_, queue, details)
2169 : ENOENT);
2170 }
2171 ovs_mutex_unlock(&netdev->mutex);
2172
2173 return error;
2174 }
2175
2176 static int
2177 netdev_linux_set_queue(struct netdev *netdev_,
2178 unsigned int queue_id, const struct smap *details)
2179 {
2180 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2181 int error;
2182
2183 ovs_mutex_lock(&netdev->mutex);
2184 error = tc_query_qdisc(netdev_);
2185 if (!error) {
2186 error = (queue_id < netdev->tc->ops->n_queues
2187 && netdev->tc->ops->class_set
2188 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2189 : EINVAL);
2190 }
2191 ovs_mutex_unlock(&netdev->mutex);
2192
2193 return error;
2194 }
2195
2196 static int
2197 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2198 {
2199 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2200 int error;
2201
2202 ovs_mutex_lock(&netdev->mutex);
2203 error = tc_query_qdisc(netdev_);
2204 if (!error) {
2205 if (netdev->tc->ops->class_delete) {
2206 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2207 error = (queue
2208 ? netdev->tc->ops->class_delete(netdev_, queue)
2209 : ENOENT);
2210 } else {
2211 error = EINVAL;
2212 }
2213 }
2214 ovs_mutex_unlock(&netdev->mutex);
2215
2216 return error;
2217 }
2218
2219 static int
2220 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2221 unsigned int queue_id,
2222 struct netdev_queue_stats *stats)
2223 {
2224 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2225 int error;
2226
2227 ovs_mutex_lock(&netdev->mutex);
2228 error = tc_query_qdisc(netdev_);
2229 if (!error) {
2230 if (netdev->tc->ops->class_get_stats) {
2231 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2232 if (queue) {
2233 stats->created = queue->created;
2234 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2235 stats);
2236 } else {
2237 error = ENOENT;
2238 }
2239 } else {
2240 error = EOPNOTSUPP;
2241 }
2242 }
2243 ovs_mutex_unlock(&netdev->mutex);
2244
2245 return error;
2246 }
2247
/* State for dumping traffic classes over rtnetlink: the in-progress Netlink
 * dump plus a receive buffer that nl_dump_next() fills. */
struct queue_dump_state {
    struct nl_dump dump;   /* Ongoing RTM_GETTCLASS dump. */
    struct ofpbuf buf;     /* Reply buffer, NL_DUMP_BUFSIZE bytes. */
};
2252
/* Begins a Netlink dump of 'netdev''s traffic classes into 'state'.
 * Returns false if the dump request could not be built (e.g. the device
 * could not be resolved); returns true otherwise, in which case the caller
 * must eventually call finish_queue_dump(). */
static bool
start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
    if (!tcmsg) {
        return false;
    }
    tcmsg->tcm_parent = 0;
    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
    /* nl_dump_start() copies the request, so it can be freed now. */
    ofpbuf_uninit(&request);

    ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
    return true;
}
2270
2271 static int
2272 finish_queue_dump(struct queue_dump_state *state)
2273 {
2274 ofpbuf_uninit(&state->buf);
2275 return nl_dump_done(&state->dump);
2276 }
2277
/* Iteration state shared by netdev_linux_queue_dump_{start,next,done}(). */
struct netdev_linux_queue_state {
    unsigned int *queues;  /* Snapshot of all queue ids at dump start. */
    size_t cur_queue;      /* Index of the next id to visit. */
    size_t n_queues;       /* Number of elements in 'queues'. */
};
2283
2284 static int
2285 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2286 {
2287 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2288 int error;
2289
2290 ovs_mutex_lock(&netdev->mutex);
2291 error = tc_query_qdisc(netdev_);
2292 if (!error) {
2293 if (netdev->tc->ops->class_get) {
2294 struct netdev_linux_queue_state *state;
2295 struct tc_queue *queue;
2296 size_t i;
2297
2298 *statep = state = xmalloc(sizeof *state);
2299 state->n_queues = hmap_count(&netdev->tc->queues);
2300 state->cur_queue = 0;
2301 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2302
2303 i = 0;
2304 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2305 state->queues[i++] = queue->queue_id;
2306 }
2307 } else {
2308 error = EOPNOTSUPP;
2309 }
2310 }
2311 ovs_mutex_unlock(&netdev->mutex);
2312
2313 return error;
2314 }
2315
2316 static int
2317 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2318 unsigned int *queue_idp, struct smap *details)
2319 {
2320 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2321 struct netdev_linux_queue_state *state = state_;
2322 int error = EOF;
2323
2324 ovs_mutex_lock(&netdev->mutex);
2325 while (state->cur_queue < state->n_queues) {
2326 unsigned int queue_id = state->queues[state->cur_queue++];
2327 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2328
2329 if (queue) {
2330 *queue_idp = queue_id;
2331 error = netdev->tc->ops->class_get(netdev_, queue, details);
2332 break;
2333 }
2334 }
2335 ovs_mutex_unlock(&netdev->mutex);
2336
2337 return error;
2338 }
2339
2340 static int
2341 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2342 void *state_)
2343 {
2344 struct netdev_linux_queue_state *state = state_;
2345
2346 free(state->queues);
2347 free(state);
2348 return 0;
2349 }
2350
/* Invokes 'cb' with 'aux' for the statistics of each of 'netdev_''s queues,
 * by dumping the kernel's traffic classes and letting the QoS
 * implementation parse each message.  Per-class failures are folded into
 * the final return value but do not stop the dump. */
static int
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
                              netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct queue_dump_state state;

        if (!netdev->tc->ops->class_dump_stats) {
            error = EOPNOTSUPP;
        } else if (!start_queue_dump(netdev_, &state)) {
            error = ENODEV;
        } else {
            struct ofpbuf msg;
            int retval;

            while (nl_dump_next(&state.dump, &msg, &state.buf)) {
                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
                                                           cb, aux);
                if (retval) {
                    /* Remember the failure but keep processing classes. */
                    error = retval;
                }
            }

            retval = finish_queue_dump(&state);
            if (retval) {
                error = retval;
            }
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2389
/* Retrieves 'netdev_''s IPv4 address and netmask into '*address' and
 * '*netmask', caching them after the first successful lookup.  Returns
 * EADDRNOTAVAIL if no address is assigned. */
static int
netdev_linux_get_in4(const struct netdev *netdev_,
                     struct in_addr *address, struct in_addr *netmask)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_IN4)) {
        /* Fetch address and netmask via ioctl; the cache becomes valid only
         * if both succeed. */
        error = netdev_linux_get_ipv4(netdev_, &netdev->address,
                                      SIOCGIFADDR, "SIOCGIFADDR");
        if (!error) {
            error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
                                          SIOCGIFNETMASK, "SIOCGIFNETMASK");
            if (!error) {
                netdev->cache_valid |= VALID_IN4;
            }
        }
    } else {
        error = 0;
    }

    if (!error) {
        /* A cached INADDR_ANY means "no address assigned". */
        if (netdev->address.s_addr != INADDR_ANY) {
            *address = netdev->address;
            *netmask = netdev->netmask;
        } else {
            error = EADDRNOTAVAIL;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2424
/* Assigns 'address' and 'netmask' as 'netdev_''s IPv4 configuration and
 * updates the in-memory cache to match. */
static int
netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
                     struct in_addr netmask)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
    if (!error) {
        /* NOTE(review): the cache is marked valid and the netmask stored
         * before the SIOCSIFNETMASK ioctl below is known to succeed, so a
         * netmask failure leaves the cache ahead of the kernel -- confirm
         * this is intentional. */
        netdev->cache_valid |= VALID_IN4;
        netdev->address = address;
        netdev->netmask = netmask;
        if (address.s_addr != INADDR_ANY) {
            error = do_set_addr(netdev_, SIOCSIFNETMASK,
                                "SIOCSIFNETMASK", netmask);
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2447
/* Parses one line of /proc/net/if_inet6, storing the 16 address bytes in
 * '*in6' and the interface name in 'ifname' (at most 16 characters plus a
 * terminator).  Returns true only if every field matched.  Four numeric
 * fields between the address and the name are scanned but discarded. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
#define X8 "%2"SCNx8
    return ovs_scan(line,
                    " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                    "%*x %*x %*x %*x %16s\n",
                    &s6[0], &s6[1], &s6[2], &s6[3],
                    &s6[4], &s6[5], &s6[6], &s6[7],
                    &s6[8], &s6[9], &s6[10], &s6[11],
                    &s6[12], &s6[13], &s6[14], &s6[15],
                    ifname);
}
2463
/* Copies 'netdev_''s IPv6 address into '*in6'.  If the device has no
 * assigned IPv6 address, stores the all-zeros address.  Always returns 0. */
static int
netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_IN6)) {
        FILE *file;
        char line[128];

        /* Default to the all-zeros address in case no entry is found. */
        netdev->in6 = in6addr_any;

        file = fopen("/proc/net/if_inet6", "r");
        if (file != NULL) {
            const char *name = netdev_get_name(netdev_);
            /* Scan for the first line whose interface name matches ours. */
            while (fgets(line, sizeof line, file)) {
                struct in6_addr in6_tmp;
                char ifname[16 + 1];
                if (parse_if_inet6_line(line, &in6_tmp, ifname)
                    && !strcmp(name, ifname))
                {
                    netdev->in6 = in6_tmp;
                    break;
                }
            }
            fclose(file);
        }
        /* Cached even if the file could not be opened, so the lookup is not
         * repeated on every call. */
        netdev->cache_valid |= VALID_IN6;
    }
    *in6 = netdev->in6;
    ovs_mutex_unlock(&netdev->mutex);

    return 0;
}
2500
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0,
 * zeroing any trailing bytes of the generic sockaddr. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;

    memset(sa, 0, sizeof *sa);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_port = 0;
    sin.sin_addr = addr;
    memcpy(sa, &sin, sizeof sin);
}
2513
2514 static int
2515 do_set_addr(struct netdev *netdev,
2516 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2517 {
2518 struct ifreq ifr;
2519
2520 make_in4_sockaddr(&ifr.ifr_addr, addr);
2521 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2522 ioctl_name);
2523 }
2524
/* Adds 'router' as a default IP gateway: destination and genmask are the
 * all-zeros address, so the route matches everything. */
static int
netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
{
    struct in_addr any = { INADDR_ANY };
    struct rtentry rt;
    int error;

    memset(&rt, 0, sizeof rt);
    make_in4_sockaddr(&rt.rt_dst, any);
    make_in4_sockaddr(&rt.rt_gateway, router);
    make_in4_sockaddr(&rt.rt_genmask, any);
    rt.rt_flags = RTF_UP | RTF_GATEWAY;
    error = af_inet_ioctl(SIOCADDRT, &rt);
    if (error) {
        VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
    }
    return error;
}
2544
/* Resolves the next hop toward 'host' by scanning /proc/net/route for the
 * first "up" route whose masked destination matches.  On success stores the
 * gateway (or 0 if the host is directly reachable) in '*next_hop', a
 * malloc'd copy of the outgoing device name in '*netdev_name', and returns
 * 0.  Returns ENXIO if no route matches, or errno if the table cannot be
 * opened. */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        /* The first line is a column header; skip it. */
        if (++ln >= 2) {
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                        fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* The output of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need any endian
             * conversions here. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    fclose(stream);
    return ENXIO;
}
2604
/* Fills 'smap' with driver name/version and firmware version obtained via
 * ETHTOOL_GDRVINFO, caching the driver info after the first successful
 * query. */
static int
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* netdev_linux_do_ethtool() takes a struct ethtool_cmd *, so the
         * drvinfo struct is passed through that pointer type. */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        cmd,
                                        ETHTOOL_GDRVINFO,
                                        "ETHTOOL_GDRVINFO");
        if (!error) {
            netdev->cache_valid |= VALID_DRVINFO;
        }
    }

    if (!error) {
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
        smap_add(smap, "driver_version", netdev->drvinfo.version);
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2635
/* get_status implementation for "internal" devices, which have no hardware
 * driver to query; only a fixed driver name is reported. */
static int
netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
                           struct smap *smap)
{
    smap_add(smap, "driver_name", "openvswitch");
    return 0;
}
2643
2644 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2645 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2646 * returns 0. Otherwise, it returns a positive errno value; in particular,
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
static int
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
{
    struct arpreq r;
    struct sockaddr_in sin;
    int retval;

    /* Build the SIOCGARP request: protocol address is 'ip', hardware type
     * Ethernet, scoped to this device. */
    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    sin.sin_port = 0;
    memcpy(&r.arp_pa, &sin, sizeof sin);
    r.arp_ha.sa_family = ARPHRD_ETHER;
    r.arp_flags = 0;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
    if (!retval) {
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO just means "no entry", so only other errors are logged. */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
    }
    return retval;
}
2676
2677 static int
2678 nd_to_iff_flags(enum netdev_flags nd)
2679 {
2680 int iff = 0;
2681 if (nd & NETDEV_UP) {
2682 iff |= IFF_UP;
2683 }
2684 if (nd & NETDEV_PROMISC) {
2685 iff |= IFF_PROMISC;
2686 }
2687 if (nd & NETDEV_LOOPBACK) {
2688 iff |= IFF_LOOPBACK;
2689 }
2690 return iff;
2691 }
2692
2693 static int
2694 iff_to_nd_flags(int iff)
2695 {
2696 enum netdev_flags nd = 0;
2697 if (iff & IFF_UP) {
2698 nd |= NETDEV_UP;
2699 }
2700 if (iff & IFF_PROMISC) {
2701 nd |= NETDEV_PROMISC;
2702 }
2703 if (iff & IFF_LOOPBACK) {
2704 nd |= NETDEV_LOOPBACK;
2705 }
2706 return nd;
2707 }
2708
/* Clears the interface flags in 'off' and sets those in 'on' for 'netdev',
 * storing the previous flags (as netdev_flags) in '*old_flagsp'.  The
 * kernel is only touched when the flags would actually change. */
static int
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
{
    int old_flags, new_flags;
    int error = 0;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* NOTE(review): get_flags()'s return value is ignored, so the
         * cached ifi_flags may be stale if the refresh fails -- confirm
         * best-effort behavior is intended here. */
        get_flags(&netdev->up, &netdev->ifi_flags);
    }

    return error;
}
2727
2728 static int
2729 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2730 enum netdev_flags on, enum netdev_flags *old_flagsp)
2731 {
2732 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2733 int error;
2734
2735 ovs_mutex_lock(&netdev->mutex);
2736 error = update_flags(netdev, off, on, old_flagsp);
2737 ovs_mutex_unlock(&netdev->mutex);
2738
2739 return error;
2740 }
2741
/* Expands to a 'struct netdev_class' initializer for a Linux-backed netdev
 * class.  The parameters supply the operations that differ between the
 * "system", "tap", and "internal" classes; all other operations are the
 * shared netdev_linux_* implementations below. */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
                                                                \
    NULL,                                                       \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    netdev_linux_alloc,                                         \
    CONSTRUCT,                                                  \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
    NULL,                       /* get_numa_id */               \
    NULL,                       /* set_multiq */                \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \
}
2809
/* "system" devices: ordinary kernel network devices. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_construct,
        netdev_linux_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);

/* "tap" devices: differ from "system" in construction and in how their
 * statistics are gathered. */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_construct_tap,
        netdev_tap_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);

/* "internal" devices: no feature query; status reports a fixed driver. */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_construct,
        netdev_internal_get_stats,
        NULL,                  /* get_features */
        netdev_internal_get_status);
2833 \f
2834 /* HTB traffic control class. */
2835
/* Upper bound on HTB queue ids; queue N maps to class minor number N + 1. */
#define HTB_N_QUEUES 0xf000

/* HTB qdisc state; embeds the generic tc so it can be recovered with
 * CONTAINER_OF. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};
2842
/* Per-queue HTB class parameters; embeds the generic tc_queue. */
struct htb_class {
    struct tc_queue tc_queue;   /* Base queue record (id, creation time). */
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
2850
2851 static struct htb *
2852 htb_get__(const struct netdev *netdev_)
2853 {
2854 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2855 return CONTAINER_OF(netdev->tc, struct htb, tc);
2856 }
2857
2858 static void
2859 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2860 {
2861 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2862 struct htb *htb;
2863
2864 htb = xmalloc(sizeof *htb);
2865 tc_init(&htb->tc, &tc_ops_htb);
2866 htb->max_rate = max_rate;
2867
2868 netdev->tc = &htb->tc;
2869 }
2870
/* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
static int
htb_setup_qdisc__(struct netdev *netdev)
{
    size_t opt_offset;
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    /* Any existing root qdisc is removed first (best effort). */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* Handle 1:0, attached at the root, matching the "handle 1:" above. */
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = 10;
    opt.version = 3;
    opt.defcls = 1;              /* Unclassified traffic goes to class 1:1. */

    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    return tc_transact(&request, NULL);
}
2905
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * The device MTU is needed to size the rate tables, so the call fails if it
 * cannot be determined. */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* Translate the byte rates and burst into kernel rate-table form. */
    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
2959
/* Parses Netlink attributes in 'nl_options' for HTB parameters and stores
 * them into 'class'.  Returns EPROTO if the attributes cannot be parsed. */
static int
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
{
    static const struct nl_policy tca_htb_policy[] = {
        [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
                            .min_len = sizeof(struct tc_htb_opt) },
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
    const struct tc_htb_opt *htb;

    if (!nl_parse_nested(nl_options, tca_htb_policy,
                         attrs, ARRAY_SIZE(tca_htb_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HTB class options");
        return EPROTO;
    }

    htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
    class->min_rate = htb->rate.rate;
    class->max_rate = htb->ceil.rate;
    /* The kernel reports the burst in time ticks; convert back to bytes. */
    class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
    class->priority = htb->prio;
    return 0;
}
2988
2989 static int
2990 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2991 struct htb_class *options,
2992 struct netdev_queue_stats *stats)
2993 {
2994 struct nlattr *nl_options;
2995 unsigned int handle;
2996 int error;
2997
2998 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2999 if (!error && queue_id) {
3000 unsigned int major = tc_get_major(handle);
3001 unsigned int minor = tc_get_minor(handle);
3002 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3003 *queue_id = minor - 1;
3004 } else {
3005 error = EPROTO;
3006 }
3007 }
3008 if (!error && options) {
3009 error = htb_parse_tca_options__(nl_options, options);
3010 }
3011 return error;
3012 }
3013
/* Extracts qdisc-level HTB settings from 'details' into 'hc'.  "max-rate"
 * is given in bits/s in the database and stored here as bytes/s; when it is
 * absent or zero, the device's current link speed (100 Mbps if unknown) is
 * used instead.  min_rate is pinned to max_rate for the default class. */
static void
htb_parse_qdisc_details__(struct netdev *netdev_,
                          const struct smap *details, struct htb_class *hc)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *max_rate_s;

    max_rate_s = smap_get(details, "max-rate");
    hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
    if (!hc->max_rate) {
        enum netdev_features current;

        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }
    hc->min_rate = hc->max_rate;
    hc->burst = 0;
    hc->priority = 0;
}
3034
/* Extracts per-queue HTB settings from 'details' into 'hc', clamping them
 * to sane ranges.  All database rates are bits/s; internal values are
 * bytes/s.  Fails only when the device MTU cannot be determined. */
static int
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
{
    const struct htb *htb = htb_get__(netdev);
    const char *min_rate_s = smap_get(details, "min-rate");
    const char *max_rate_s = smap_get(details, "max-rate");
    const char *burst_s = smap_get(details, "burst");
    const char *priority_s = smap_get(details, "priority");
    int mtu, error;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate: defaults to the qdisc's rate and never exceeds it, nor
     * falls below min-rate. */
    hc->max_rate = (max_rate_s
                    ? strtoull(max_rate_s, NULL, 10) / 8
                    : htb->max_rate);
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority: defaults to 0 (highest). */
    hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;

    return 0;
}
3083
3084 static int
3085 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3086 unsigned int parent, struct htb_class *options,
3087 struct netdev_queue_stats *stats)
3088 {
3089 struct ofpbuf *reply;
3090 int error;
3091
3092 error = tc_query_class(netdev, handle, parent, &reply);
3093 if (!error) {
3094 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3095 ofpbuf_delete(reply);
3096 }
3097 return error;
3098 }
3099
3100 static int
3101 htb_tc_install(struct netdev *netdev, const struct smap *details)
3102 {
3103 int error;
3104
3105 error = htb_setup_qdisc__(netdev);
3106 if (!error) {
3107 struct htb_class hc;
3108
3109 htb_parse_qdisc_details__(netdev, details, &hc);
3110 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3111 tc_make_handle(1, 0), &hc);
3112 if (!error) {
3113 htb_install__(netdev, hc.max_rate);
3114 }
3115 }
3116 return error;
3117 }
3118
/* Returns the htb_class that embeds generic queue record 'queue'. */
static struct htb_class *
htb_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct htb_class, tc_queue);
}
3124
/* Creates or updates the cached htb_class record for 'queue_id' on 'netdev'
 * with the parameters in 'hc'.  A new record is inserted into the tc queue
 * map when none exists yet. */
static void
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
                   const struct htb_class *hc)
{
    struct htb *htb = htb_get__(netdev);
    size_t hash = hash_int(queue_id, 0);
    struct tc_queue *queue;
    struct htb_class *hcp;

    queue = tc_find_queue__(netdev, queue_id, hash);
    if (queue) {
        hcp = htb_class_cast__(queue);
    } else {
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
    }

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
    hcp->burst = hc->burst;
    hcp->priority = hc->priority;
}
3150
/* Loads the state of an HTB qdisc already present in the kernel: queries
 * the default class for the qdisc max-rate, then dumps all classes and
 * records each valid one as a queue. */
static int
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct htb_class hc;

    /* Get qdisc options.  Errors are ignored here: hc.max_rate stays 0 if
     * the default class cannot be queried. */
    hc.max_rate = 0;
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    htb_install__(netdev, hc.max_rate);

    /* Get queues. */
    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Unparseable classes are silently skipped. */
        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            htb_update_queue__(netdev, queue_id, &hc);
        }
    }
    finish_queue_dump(&state);

    return 0;
}
3178
3179 static void
3180 htb_tc_destroy(struct tc *tc)
3181 {
3182 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3183 struct htb_class *hc, *next;
3184
3185 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3186 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3187 free(hc);
3188 }
3189 tc_destroy(tc);
3190 free(htb);
3191 }
3192
3193 static int
3194 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3195 {
3196 const struct htb *htb = htb_get__(netdev);
3197 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
3198 return 0;
3199 }
3200
3201 static int
3202 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3203 {
3204 struct htb_class hc;
3205 int error;
3206
3207 htb_parse_qdisc_details__(netdev, details, &hc);
3208 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3209 tc_make_handle(1, 0), &hc);
3210 if (!error) {
3211 htb_get__(netdev)->max_rate = hc.max_rate;
3212 }
3213 return error;
3214 }
3215
3216 static int
3217 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3218 const struct tc_queue *queue, struct smap *details)
3219 {
3220 const struct htb_class *hc = htb_class_cast__(queue);
3221
3222 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3223 if (hc->min_rate != hc->max_rate) {
3224 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3225 }
3226 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3227 if (hc->priority) {
3228 smap_add_format(details, "priority", "%u", hc->priority);
3229 }
3230 return 0;
3231 }
3232
3233 static int
3234 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3235 const struct smap *details)
3236 {
3237 struct htb_class hc;
3238 int error;
3239
3240 error = htb_parse_class_details__(netdev, details, &hc);
3241 if (error) {
3242 return error;
3243 }
3244
3245 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3246 tc_make_handle(1, 0xfffe), &hc);
3247 if (error) {
3248 return error;
3249 }
3250
3251 htb_update_queue__(netdev, queue_id, &hc);
3252 return 0;
3253 }
3254
3255 static int
3256 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3257 {
3258 struct htb_class *hc = htb_class_cast__(queue);
3259 struct htb *htb = htb_get__(netdev);
3260 int error;
3261
3262 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3263 if (!error) {
3264 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3265 free(hc);
3266 }
3267 return error;
3268 }
3269
3270 static int
3271 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3272 struct netdev_queue_stats *stats)
3273 {
3274 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3275 tc_make_handle(1, 0xfffe), NULL, stats);
3276 }
3277
3278 static int
3279 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3280 const struct ofpbuf *nlmsg,
3281 netdev_dump_queue_stats_cb *cb, void *aux)
3282 {
3283 struct netdev_queue_stats stats;
3284 unsigned int handle, major, minor;
3285 int error;
3286
3287 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3288 if (error) {
3289 return error;
3290 }
3291
3292 major = tc_get_major(handle);
3293 minor = tc_get_minor(handle);
3294 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3295 (*cb)(minor - 1, &stats, aux);
3296 }
3297 return 0;
3298 }
3299
/* Callbacks implementing the "linux-htb" QoS type. */
static const struct tc_ops tc_ops_htb = {
    "htb", /* linux_name */
    "linux-htb", /* ovs_name */
    HTB_N_QUEUES, /* n_queues */
    htb_tc_install, /* tc_install */
    htb_tc_load, /* tc_load */
    htb_tc_destroy, /* tc_destroy */
    htb_qdisc_get, /* qdisc_get */
    htb_qdisc_set, /* qdisc_set */
    htb_class_get, /* class_get */
    htb_class_set, /* class_set */
    htb_class_delete, /* class_delete */
    htb_class_get_stats, /* class_get_stats */
    htb_class_dump_stats /* class_dump_stats */
};
3315 \f
3316 /* "linux-hfsc" traffic control class. */
3317
/* Maximum number of queues supported by the "linux-hfsc" implementation. */
#define HFSC_N_QUEUES 0xf000

/* Per-netdev state for the "linux-hfsc" qdisc. */
struct hfsc {
    struct tc tc;               /* Common traffic-control state. */
    uint32_t max_rate;          /* Qdisc-wide maximum rate, in bytes/s. */
};

/* Per-queue (tc class) state for "linux-hfsc". */
struct hfsc_class {
    struct tc_queue tc_queue;   /* Common per-queue state. */
    uint32_t min_rate;          /* In bytes/s. */
    uint32_t max_rate;          /* In bytes/s. */
};
3330
/* Returns the HFSC state embedded in 'netdev_''s current tc.  Callers must
 * ensure the netdev's qdisc is actually linux-hfsc. */
static struct hfsc *
hfsc_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct hfsc, tc);
}
3337
/* Converts the generic per-queue state 'queue' into its containing HFSC
 * class. */
static struct hfsc_class *
hfsc_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
}
3343
3344 static void
3345 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3346 {
3347 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3348 struct hfsc *hfsc;
3349
3350 hfsc = xmalloc(sizeof *hfsc);
3351 tc_init(&hfsc->tc, &tc_ops_hfsc);
3352 hfsc->max_rate = max_rate;
3353 netdev->tc = &hfsc->tc;
3354 }
3355
3356 static void
3357 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3358 const struct hfsc_class *hc)
3359 {
3360 size_t hash;
3361 struct hfsc *hfsc;
3362 struct hfsc_class *hcp;
3363 struct tc_queue *queue;
3364
3365 hfsc = hfsc_get__(netdev);
3366 hash = hash_int(queue_id, 0);
3367
3368 queue = tc_find_queue__(netdev, queue_id, hash);
3369 if (queue) {
3370 hcp = hfsc_class_cast__(queue);
3371 } else {
3372 hcp = xmalloc(sizeof *hcp);
3373 queue = &hcp->tc_queue;
3374 queue->queue_id = queue_id;
3375 queue->created = time_msec();
3376 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3377 }
3378
3379 hcp->min_rate = hc->min_rate;
3380 hcp->max_rate = hc->max_rate;
3381 }
3382
/* Parses the TCA_OPTIONS attribute 'nl_options' of an HFSC class message into
 * 'class'.  Only the shapes this module itself configures (see
 * hfsc_setup_class__()) are accepted: linear service curves (m1 == 0, d == 0)
 * whose real-time and link-share slopes match.
 *
 * Returns 0 if successful, otherwise EPROTO. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);   /* Real-time curve. */
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);   /* Link-share curve. */
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);   /* Upper-limit curve. */

    /* A nonzero m1 or d would describe a two-slope (non-linear) curve. */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
3441
/* Parses Netlink message 'tcmsg' describing an HFSC class.  Stores the OVS
 * queue ID in '*queue_id', the configuration in '*options', and statistics in
 * '*stats'.  Any output argument may be NULL to skip it.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                   struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    int error;
    unsigned int handle;
    struct nlattr *nl_options;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (error) {
        return error;
    }

    if (queue_id) {
        unsigned int major, minor;

        major = tc_get_major(handle);
        minor = tc_get_minor(handle);
        /* Class minors 1..HFSC_N_QUEUES under major 1 map to OVS queue IDs
         * 0..HFSC_N_QUEUES-1; anything else is not one of our queues. */
        if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
            *queue_id = minor - 1;
        } else {
            return EPROTO;
        }
    }

    if (options) {
        error = hfsc_parse_tca_options__(nl_options, options);
    }

    return error;
}
3474
3475 static int
3476 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3477 unsigned int parent, struct hfsc_class *options,
3478 struct netdev_queue_stats *stats)
3479 {
3480 int error;
3481 struct ofpbuf *reply;
3482
3483 error = tc_query_class(netdev, handle, parent, &reply);
3484 if (error) {
3485 return error;
3486 }
3487
3488 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3489 ofpbuf_delete(reply);
3490 return error;
3491 }
3492
/* Extracts the qdisc-wide "max-rate" from 'details' into 'class', converting
 * bits/s (database units) to bytes/s.  When the key is absent or zero, falls
 * back to the device's link speed, or 100 Mbps if that is unknown. */
static void
hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
                           struct hfsc_class *class)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    uint32_t max_rate;
    const char *max_rate_s;

    max_rate_s = smap_get(details, "max-rate");
    max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;

    if (!max_rate) {
        enum netdev_features current;

        /* Fall back to the current link speed reported by the device. */
        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }

    /* The qdisc-wide default class uses the same rate for min and max. */
    class->min_rate = max_rate;
    class->max_rate = max_rate;
}
3515
/* Extracts per-queue "min-rate" and "max-rate" from 'details' into 'class',
 * converting bits/s to bytes/s and clamping both to the qdisc-wide maximum.
 * Always returns 0. */
static int
hfsc_parse_class_details__(struct netdev *netdev,
                           const struct smap *details,
                           struct hfsc_class * class)
{
    const struct hfsc *hfsc;
    uint32_t min_rate, max_rate;
    const char *min_rate_s, *max_rate_s;

    hfsc = hfsc_get__(netdev);
    min_rate_s = smap_get(details, "min-rate");
    max_rate_s = smap_get(details, "max-rate");

    /* min-rate is clamped into [1, qdisc max_rate]. */
    min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
    min_rate = MAX(min_rate, 1);
    min_rate = MIN(min_rate, hfsc->max_rate);

    /* max-rate defaults to the qdisc maximum and is clamped into
     * [min_rate, qdisc max_rate]. */
    max_rate = (max_rate_s
                ? strtoull(max_rate_s, NULL, 10) / 8
                : hfsc->max_rate);
    max_rate = MAX(max_rate, min_rate);
    max_rate = MIN(max_rate, hfsc->max_rate);

    class->min_rate = min_rate;
    class->max_rate = max_rate;

    return 0;
}
3544
3545 /* Create an HFSC qdisc.
3546 *
3547 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
static int
hfsc_setup_qdisc__(struct netdev * netdev)
{
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    /* Remove any existing root qdisc first; ignore the result. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    opt.defcls = 1;             /* Unclassified traffic goes to class 1:1. */

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    /* tc_transact() also uninitializes 'request'. */
    return tc_transact(&request, NULL);
}
3575
3576 /* Create an HFSC class.
3577 *
3578 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3579 * sc rate <min_rate> ul rate <max_rate>" */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Linear service curves: constant slope m2, no initial burst segment
     * (m1 = 0, d = 0). */
    min.m1 = 0;
    min.d = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d = 0;
    max.m2 = class->max_rate;

    /* 'min' is used for both the real-time (RSC) and link-share (FSC)
     * curves ("sc rate"); 'max' for the upper limit (USC, "ul rate"). */
    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
3626
3627 static int
3628 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3629 {
3630 int error;
3631 struct hfsc_class class;
3632
3633 error = hfsc_setup_qdisc__(netdev);
3634
3635 if (error) {
3636 return error;
3637 }
3638
3639 hfsc_parse_qdisc_details__(netdev, details, &class);
3640 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3641 tc_make_handle(1, 0), &class);
3642
3643 if (error) {
3644 return error;
3645 }
3646
3647 hfsc_install__(netdev, class.max_rate);
3648 return 0;
3649 }
3650
/* Loads the kernel's current HFSC configuration for 'netdev' into fresh
 * in-memory state: first the qdisc-wide max-rate (from the 1:fffe default
 * class), then every per-queue class.  Returns 0 if successful, otherwise a
 * positive errno value. */
static int
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct hfsc_class hc;

    /* On query failure, max_rate stays 0. */
    hc.max_rate = 0;
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    hfsc_install__(netdev, hc.max_rate);

    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }

    /* Classes that fail to parse are silently skipped. */
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            hfsc_update_queue__(netdev, queue_id, &hc);
        }
    }

    finish_queue_dump(&state);
    return 0;
}
3677
3678 static void
3679 hfsc_tc_destroy(struct tc *tc)
3680 {
3681 struct hfsc *hfsc;
3682 struct hfsc_class *hc, *next;
3683
3684 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3685
3686 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3687 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3688 free(hc);
3689 }
3690
3691 tc_destroy(tc);
3692 free(hfsc);
3693 }
3694
3695 static int
3696 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3697 {
3698 const struct hfsc *hfsc;
3699 hfsc = hfsc_get__(netdev);
3700 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3701 return 0;
3702 }
3703
3704 static int
3705 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3706 {
3707 int error;
3708 struct hfsc_class class;
3709
3710 hfsc_parse_qdisc_details__(netdev, details, &class);
3711 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3712 tc_make_handle(1, 0), &class);
3713
3714 if (!error) {
3715 hfsc_get__(netdev)->max_rate = class.max_rate;
3716 }
3717
3718 return error;
3719 }
3720
3721 static int
3722 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3723 const struct tc_queue *queue, struct smap *details)
3724 {
3725 const struct hfsc_class *hc;
3726
3727 hc = hfsc_class_cast__(queue);
3728 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3729 if (hc->min_rate != hc->max_rate) {
3730 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3731 }
3732 return 0;
3733 }
3734
3735 static int
3736 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3737 const struct smap *details)
3738 {
3739 int error;
3740 struct hfsc_class class;
3741
3742 error = hfsc_parse_class_details__(netdev, details, &class);
3743 if (error) {
3744 return error;
3745 }
3746
3747 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3748 tc_make_handle(1, 0xfffe), &class);
3749 if (error) {
3750 return error;
3751 }
3752
3753 hfsc_update_queue__(netdev, queue_id, &class);
3754 return 0;
3755 }
3756
3757 static int
3758 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3759 {
3760 int error;
3761 struct hfsc *hfsc;
3762 struct hfsc_class *hc;
3763
3764 hc = hfsc_class_cast__(queue);
3765 hfsc = hfsc_get__(netdev);
3766
3767 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3768 if (!error) {
3769 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3770 free(hc);
3771 }
3772 return error;
3773 }
3774
3775 static int
3776 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3777 struct netdev_queue_stats *stats)
3778 {
3779 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3780 tc_make_handle(1, 0xfffe), NULL, stats);
3781 }
3782
3783 static int
3784 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3785 const struct ofpbuf *nlmsg,
3786 netdev_dump_queue_stats_cb *cb, void *aux)
3787 {
3788 struct netdev_queue_stats stats;
3789 unsigned int handle, major, minor;
3790 int error;
3791
3792 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3793 if (error) {
3794 return error;
3795 }
3796
3797 major = tc_get_major(handle);
3798 minor = tc_get_minor(handle);
3799 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3800 (*cb)(minor - 1, &stats, aux);
3801 }
3802 return 0;
3803 }
3804
/* Callbacks implementing the "linux-hfsc" QoS type. */
static const struct tc_ops tc_ops_hfsc = {
    "hfsc", /* linux_name */
    "linux-hfsc", /* ovs_name */
    HFSC_N_QUEUES, /* n_queues */
    hfsc_tc_install, /* tc_install */
    hfsc_tc_load, /* tc_load */
    hfsc_tc_destroy, /* tc_destroy */
    hfsc_qdisc_get, /* qdisc_get */
    hfsc_qdisc_set, /* qdisc_set */
    hfsc_class_get, /* class_get */
    hfsc_class_set, /* class_set */
    hfsc_class_delete, /* class_delete */
    hfsc_class_get_stats, /* class_get_stats */
    hfsc_class_dump_stats /* class_dump_stats */
};
3820 \f
3821 /* "linux-default" traffic control class.
3822 *
3823 * This class represents the default, unnamed Linux qdisc. It corresponds to
3824 * the "" (empty string) QoS type in the OVS database. */
3825
/* Points 'netdev_''s traffic-control state at a shared, immutable tc object
 * representing the default qdisc. */
static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc. This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
3836
/* "Installs" the default qdisc: no kernel interaction is needed, so this just
 * records the state and always succeeds. */
static int
default_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
3844
/* Loads state for a netdev already using the default qdisc; no kernel
 * interaction is needed. */
static int
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
3851
/* Callbacks for the "" (default) QoS type.  Most operations are unsupported
 * and left NULL. */
static const struct tc_ops tc_ops_default = {
    NULL, /* linux_name */
    "", /* ovs_name */
    0, /* n_queues */
    default_tc_install,
    default_tc_load,
    NULL, /* tc_destroy */
    NULL, /* qdisc_get */
    NULL, /* qdisc_set */
    NULL, /* class_get */
    NULL, /* class_set */
    NULL, /* class_delete */
    NULL, /* class_get_stats */
    NULL /* class_dump_stats */
};
3867 \f
/* "linux-other" traffic control class.
 *
 * This type represents any Linux qdisc that this module does not know how to
 * manage.  It is effectively read-only: OVS records its presence but cannot
 * install, configure, or query it (all such callbacks are NULL). */
3871
/* Records that 'netdev_' is using some qdisc this module cannot manage, via a
 * shared, immutable tc object.  Always succeeds. */
static int
other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);

    /* Nothing but a tc class implementation is allowed to write to a tc. This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
    return 0;
}
3883
/* Callbacks for the "linux-other" QoS type.  Only loading existing state is
 * supported; everything else is NULL. */
static const struct tc_ops tc_ops_other = {
    NULL, /* linux_name */
    "linux-other", /* ovs_name */
    0, /* n_queues */
    NULL, /* tc_install */
    other_tc_load,
    NULL, /* tc_destroy */
    NULL, /* qdisc_get */
    NULL, /* qdisc_set */
    NULL, /* class_get */
    NULL, /* class_set */
    NULL, /* class_delete */
    NULL, /* class_get_stats */
    NULL /* class_dump_stats */
};
3899 \f
3900 /* Traffic control. */
3901
/* Number of kernel "tc" ticks per second.  Initialized by read_psched(). */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.  Initialized by read_psched().
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
3923
/* Builds and returns the tc handle written "major:minor". */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
3930
/* Extracts and returns the major number from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3937
/* Extracts and returns the minor number from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return handle & TC_H_MIN_MASK;
}
3944
/* Composes a traffic-control Netlink request of the given 'type' (e.g.
 * RTM_NEWQDISC) with 'flags' (in addition to NLM_F_REQUEST) for 'netdev',
 * initializing 'request' to hold it.  Returns the embedded tcmsg, whose
 * tcm_handle and tcm_parent the caller must fill in, or NULL if 'netdev''s
 * ifindex cannot be determined. */
static struct tcmsg *
tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
                struct ofpbuf *request)
{
    struct tcmsg *tcmsg;
    int ifindex;
    int error;

    error = get_ifindex(netdev, &ifindex);
    if (error) {
        return NULL;
    }

    ofpbuf_init(request, 512);
    nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
    tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
    tcmsg->tcm_family = AF_UNSPEC;
    tcmsg->tcm_ifindex = ifindex;
    /* Caller should fill in tcmsg->tcm_handle. */
    /* Caller should fill in tcmsg->tcm_parent. */

    return tcmsg;
}
3968
3969 static int
3970 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3971 {
3972 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3973 ofpbuf_uninit(request);
3974 return error;
3975 }
3976
3977 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3978 * policing configuration.
3979 *
3980 * This function is equivalent to running the following when 'add' is true:
3981 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3982 *
3983 * This function is equivalent to running the following when 'add' is false:
3984 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3985 *
3986 * The configuration and stats may be seen with the following command:
3987 * /sbin/tc -s qdisc show dev <devname>
3988 *
3989 * Returns 0 if successful, otherwise a positive errno value.
3990 */
3991 static int
3992 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3993 {
3994 struct ofpbuf request;
3995 struct tcmsg *tcmsg;
3996 int error;
3997 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3998 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3999
4000 tcmsg = tc_make_request(netdev, type, flags, &request);
4001 if (!tcmsg) {
4002 return ENODEV;
4003 }
4004 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4005 tcmsg->tcm_parent = TC_H_INGRESS;
4006 nl_msg_put_string(&request, TCA_KIND, "ingress");
4007 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4008
4009 error = tc_transact(&request, NULL);
4010 if (error) {
4011 /* If we're deleting the qdisc, don't worry about some of the
4012 * error conditions. */
4013 if (!add && (error == ENOENT || error == EINVAL)) {
4014 return 0;
4015 }
4016 return error;
4017 }
4018
4019 return 0;
4020 }
4021
4022 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4023 * of 'kbits_burst'.
4024 *
4025 * This function is equivalent to running:
4026 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4027 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4028 * mtu 65535 drop
4029 *
4030 * The configuration and stats may be seen with the following command:
4031 * /sbin/tc -s filter show <devname> eth0 parent ffff:
4032 *
4033 * Returns 0 if successful, otherwise a positive errno value.
4034 */
static int
tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
{
    struct tc_police tc_police;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    size_t basic_offset;
    size_t police_offset;
    int error;
    int mtu = 65535;

    /* Drop packets that exceed the rate; kbits_rate is converted from
     * kilobits/s to bytes/s. */
    memset(&tc_police, 0, sizeof tc_police);
    tc_police.action = TC_POLICE_SHOT;
    tc_police.mtu = mtu;
    tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
    tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
                                        kbits_burst * 1024);

    tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* Attach the filter to the ingress qdisc (handle ffff:) at priority 49,
     * matching all protocols. */
    tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
    tcmsg->tcm_info = tc_make_handle(49,
                                     (OVS_FORCE uint16_t) htons(ETH_P_ALL));

    /* Nested attributes: TCA_OPTIONS > TCA_BASIC_POLICE > police params. */
    nl_msg_put_string(&request, TCA_KIND, "basic");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
    nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
    tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
    nl_msg_end_nested(&request, police_offset);
    nl_msg_end_nested(&request, basic_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        return error;
    }

    return 0;
}
4077
/* Reads /proc/net/psched once (thread-safely) and derives 'ticks_per_s' and
 * 'buffer_hz' from it, falling back to 1.0 and 100 respectively if the file
 * cannot be read or looks invalid.  Subsequent calls return immediately. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *     (Before that, there are hints that it was 1000000000.)
     *
     *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *     above.
     *
     *                        /proc/net/psched
     *     -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d ticks_per_s     buffer_hz
     *     ------- --------- ---------- ------------- ----------- -------------
     * [1] 819,200 1,000,000  1,000,000           100     819,200           100
     * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [4]   1,000     1,024  1,000,000           100     976,562           100
     * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
     * [6]   1,000        64  1,000,000           249  15,625,000           249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    if (!ovsthread_once_start(&once)) {
        return;
    }

    /* Defaults, used if the file is unreadable or malformed. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    if (!a || !c) {
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
4160
4161 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4162 * rate of 'rate' bytes per second. */
4163 static unsigned int
4164 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4165 {
4166 read_psched();
4167 return (rate * ticks) / ticks_per_s;
4168 }
4169
4170 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4171 * rate of 'rate' bytes per second. */
4172 static unsigned int
4173 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4174 {
4175 read_psched();
4176 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4177 }
4178
/* Returns the number of bytes that need to be reserved for qdisc buffering at
 * a transmission rate of 'rate' bytes per second. */
static unsigned int
tc_buffer_per_jiffy(unsigned int rate)
{
    read_psched();              /* Ensures 'buffer_hz' is initialized. */
    return rate / buffer_hz;
}
4187
4188 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4189 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4190 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4191 * stores NULL into it if it is absent.
4192 *
4193 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4194 * 'msg'.
4195 *
4196 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
               struct nlattr **options)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    /* Attributes start after the netlink header and the tcmsg. */
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");
        goto error;
    }

    if (kind) {
        *kind = nl_attr_get_string(ta[TCA_KIND]);
    }

    if (options) {
        /* NULL if TCA_OPTIONS was absent (it is optional). */
        *options = ta[TCA_OPTIONS];
    }

    return 0;

error:
    /* Leave the outputs in a well-defined state on failure. */
    if (kind) {
        *kind = NULL;
    }
    if (options) {
        *options = NULL;
    }
    return EPROTO;
}
4232
4233 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4234 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4235 * into '*options', and its queue statistics into '*stats'. Any of the output
4236 * arguments may be null.
4237 *
4238 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    /* Attributes start after the netlink header and the tcmsg. */
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        /* The full class handle (major:minor) comes from the tcmsg header. */
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        /* Queue drops are reported as transmit errors. */
        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    /* Leave the outputs in a well-defined state on failure. */
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
4307
4308 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4309 * on 'netdev'. */
4310 static int
4311 tc_query_class(const struct netdev *netdev,
4312 unsigned int handle, unsigned int parent,
4313 struct ofpbuf **replyp)
4314 {
4315 struct ofpbuf request;
4316 struct tcmsg *tcmsg;
4317 int error;
4318
4319 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4320 if (!tcmsg) {
4321 return ENODEV;
4322 }
4323 tcmsg->tcm_handle = handle;
4324 tcmsg->tcm_parent = parent;
4325
4326 error = tc_transact(&request, replyp);
4327 if (error) {
4328 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4329 netdev_get_name(netdev),
4330 tc_get_major(handle), tc_get_minor(handle),
4331 tc_get_major(parent), tc_get_minor(parent),
4332 ovs_strerror(error));
4333 }
4334 return error;
4335 }
4336
4337 /* Equivalent to "tc class del dev <name> handle <handle>". */
4338 static int
4339 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4340 {
4341 struct ofpbuf request;
4342 struct tcmsg *tcmsg;
4343 int error;
4344
4345 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4346 if (!tcmsg) {
4347 return ENODEV;
4348 }
4349 tcmsg->tcm_handle = handle;
4350 tcmsg->tcm_parent = 0;
4351
4352 error = tc_transact(&request, NULL);
4353 if (error) {
4354 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4355 netdev_get_name(netdev),
4356 tc_get_major(handle), tc_get_minor(handle),
4357 ovs_strerror(error));
4358 }
4359 return error;
4360 }
4361
4362 /* Equivalent to "tc qdisc del dev <name> root". */
4363 static int
4364 tc_del_qdisc(struct netdev *netdev_)
4365 {
4366 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4367 struct ofpbuf request;
4368 struct tcmsg *tcmsg;
4369 int error;
4370
4371 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4372 if (!tcmsg) {
4373 return ENODEV;
4374 }
4375 tcmsg->tcm_handle = tc_make_handle(1, 0);
4376 tcmsg->tcm_parent = TC_H_ROOT;
4377
4378 error = tc_transact(&request, NULL);
4379 if (error == EINVAL) {
4380 /* EINVAL probably means that the default qdisc was in use, in which
4381 * case we've accomplished our purpose. */
4382 error = 0;
4383 }
4384 if (!error && netdev->tc) {
4385 if (netdev->tc->ops->tc_destroy) {
4386 netdev->tc->ops->tc_destroy(netdev->tc);
4387 }
4388 netdev->tc = NULL;
4389 }
4390 return error;
4391 }
4392
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value. */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    /* Already cached: nothing to do. */
    if (netdev->tc) {
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * We could check for Linux 2.6.35+ and use a more straightforward method
     * there. */
    tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* Handle 1:0 per the workaround described above; NLM_F_ECHO makes the
     * kernel echo the matching qdisc back to us. */
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if (error == ENOENT) {
        /* Either it's a built-in qdisc, or it's a qdisc set up by some
         * other entity that doesn't have a handle 1:0.  We will assume
         * that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it.  tc_load must set netdev->tc exactly when it succeeds,
     * which the assertion below checks. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
4468
4469 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4470 approximate the time to transmit packets of various lengths. For an MTU of
4471 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4472 represents two possible packet lengths; for a MTU of 513 through 1024, four
4473 possible lengths; and so on.
4474
4475 Returns, for the specified 'mtu', the number of bits that packet lengths
4476 need to be shifted right to fit within such a 256-entry table. */
4477 static int
4478 tc_calc_cell_log(unsigned int mtu)
4479 {
4480 int cell_log;
4481
4482 if (!mtu) {
4483 mtu = ETH_PAYLOAD_MAX;
4484 }
4485 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4486
4487 for (cell_log = 0; mtu >= 256; cell_log++) {
4488 mtu >>= 1;
4489 }
4490
4491 return cell_log;
4492 }
4493
/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'.
 *
 * NOTE(review): struct tc_ratespec's 'rate' member is 32 bits wide, so the
 * assignment below truncates a 'Bps' above UINT32_MAX -- presumably callers
 * never pass such a rate; confirm. */
static void
tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
{
    memset(rate, 0, sizeof *rate);
    rate->cell_log = tc_calc_cell_log(mtu);
    /* rate->overhead = 0; */           /* New in 2.6.24, not yet in some */
    /* rate->cell_align = 0; */         /* distro headers. */
    rate->mpu = ETH_TOTAL_MIN;          /* Minimum packet size for billing. */
    rate->rate = Bps;
}
4506
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    const unsigned int n_entries = TC_RTAB_SIZE / sizeof(uint32_t);
    uint32_t *table;
    unsigned int idx;

    table = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (idx = 0; idx < n_entries; idx++) {
        /* Entry 'idx' covers packet lengths up to (idx + 1) << cell_log,
         * billed at no less than the minimum packet unit. */
        unsigned int len = (idx + 1) << rate->cell_log;

        if (len < rate->mpu) {
            len = rate->mpu;
        }
        table[idx] = tc_bytes_to_ticks(rate->rate, len);
    }
}
4526
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Never allow a burst smaller than what one jiffy of traffic plus a full
     * packet requires. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;

    return burst_bytes > min_burst
           ? tc_bytes_to_ticks(Bps, burst_bytes)
           : tc_bytes_to_ticks(Bps, min_burst);
}
4537 \f
4538 /* Linux-only functions declared in netdev-linux.h */
4539
/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'.  If
 * 'enable' is true, the bit is set.  Otherwise, it is cleared.
 *
 * Works in three steps: read the current flags, write them back with 'flag'
 * updated, then read them again to verify the device accepted the change
 * (some drivers silently ignore unsupported flags).  Returns 0 if
 * successful, EOPNOTSUPP if the device did not accept the change, or another
 * positive errno value if an ethtool operation failed. */
int
netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
                              const char *flag_name, bool enable)
{
    const char *netdev_name = netdev_get_name(netdev);
    struct ethtool_value evalue;
    uint32_t new_flags;
    int error;

    /* Step 1: fetch the device's current ethtool flags. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    /* Step 2: set or clear 'flag' and write the flags back. */
    COVERAGE_INC(netdev_set_ethtool);
    evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
    if (error) {
        return error;
    }

    /* Step 3: re-read the flags to confirm the change took effect. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    if (new_flags != evalue.data) {
        VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
                     "device %s failed", enable ? "enable" : "disable",
                     flag_name, netdev_name);
        return EOPNOTSUPP;
    }

    return 0;
}
4587 \f
4588 /* Utility functions. */
4589
/* Copies 'src' (the kernel's 32-bit rtnl_link_stats) into 'dst', performing
 * format conversion in the process.  This is a field-for-field copy; the two
 * structures share the same counter names. */
static void
netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
                                  const struct rtnl_link_stats *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
4617
/* Copies 'src' (the kernel's 64-bit rtnl_link_stats64) into 'dst', performing
 * format conversion in the process.  This is a field-for-field copy that
 * mirrors netdev_stats_from_rtnl_link_stats() above for the 64-bit layout. */
static void
netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
                                    const struct rtnl_link_stats64 *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
4645
4646 static int
4647 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
4648 {
4649 struct ofpbuf request;
4650 struct ofpbuf *reply;
4651 int error;
4652
4653 ofpbuf_init(&request, 0);
4654 nl_msg_put_nlmsghdr(&request,
4655 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
4656 RTM_GETLINK, NLM_F_REQUEST);
4657 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
4658 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
4659 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4660 ofpbuf_uninit(&request);
4661 if (error) {
4662 return error;
4663 }
4664
4665 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
4666 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
4667 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
4668 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
4669 error = 0;
4670 } else {
4671 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
4672 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
4673 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
4674 error = 0;
4675 } else {
4676 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4677 error = EPROTO;
4678 }
4679 }
4680 } else {
4681 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
4682 error = EPROTO;
4683 }
4684
4685
4686 ofpbuf_delete(reply);
4687 return error;
4688 }
4689
4690 static int
4691 get_flags(const struct netdev *dev, unsigned int *flags)
4692 {
4693 struct ifreq ifr;
4694 int error;
4695
4696 *flags = 0;
4697 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4698 if (!error) {
4699 *flags = ifr.ifr_flags;
4700 }
4701 return error;
4702 }
4703
4704 static int
4705 set_flags(const char *name, unsigned int flags)
4706 {
4707 struct ifreq ifr;
4708
4709 ifr.ifr_flags = flags;
4710 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4711 }
4712
4713 static int
4714 do_get_ifindex(const char *netdev_name)
4715 {
4716 struct ifreq ifr;
4717 int error;
4718
4719 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4720 COVERAGE_INC(netdev_get_ifindex);
4721
4722 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4723 if (error) {
4724 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4725 netdev_name, ovs_strerror(error));
4726 return -error;
4727 }
4728 return ifr.ifr_ifindex;
4729 }
4730
4731 static int
4732 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4733 {
4734 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4735
4736 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4737 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4738
4739 if (ifindex < 0) {
4740 netdev->get_ifindex_error = -ifindex;
4741 netdev->ifindex = 0;
4742 } else {
4743 netdev->get_ifindex_error = 0;
4744 netdev->ifindex = ifindex;
4745 }
4746 netdev->cache_valid |= VALID_IFINDEX;
4747 }
4748
4749 *ifindexp = netdev->ifindex;
4750 return netdev->get_ifindex_error;
4751 }
4752
/* Retrieves the Ethernet address of the device named 'netdev_name' into 'ea'
 * via SIOCGIFHWADDR.  Returns 0 if successful, otherwise a positive errno
 * value. */
static int
get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
{
    struct ifreq ifr;
    int hwaddr_family;
    int error;

    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
    COVERAGE_INC(netdev_get_hwaddr);
    error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
    if (error) {
        /* ENODEV probably means that a vif disappeared asynchronously and
         * hasn't been removed from the database yet, so reduce the log level
         * to INFO for that case. */
        VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
             "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
             netdev_name, ovs_strerror(error));
        return error;
    }
    /* Warn about, but do not reject, non-Ethernet address families; the
     * address bytes are copied out regardless. */
    hwaddr_family = ifr.ifr_hwaddr.sa_family;
    if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
        VLOG_WARN("%s device has unknown hardware address family %d",
                  netdev_name, hwaddr_family);
    }
    memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
    return 0;
}
4781
4782 static int
4783 set_etheraddr(const char *netdev_name,
4784 const uint8_t mac[ETH_ADDR_LEN])
4785 {
4786 struct ifreq ifr;
4787 int error;
4788
4789 memset(&ifr, 0, sizeof ifr);
4790 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4791 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4792 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4793 COVERAGE_INC(netdev_set_hwaddr);
4794 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4795 if (error) {
4796 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4797 netdev_name, ovs_strerror(error));
4798 }
4799 return error;
4800 }
4801
/* Issues ethtool command 'cmd' (whose name, for logging, is 'cmd_name') on
 * the device named 'name', using 'ecmd' as the command's in/out argument
 * block.  Returns 0 if successful, otherwise a positive errno value.
 * EOPNOTSUPP (device does not implement the command) is returned without
 * logging. */
static int
netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
                        int cmd, const char *cmd_name)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
    /* SIOCETHTOOL takes the ethtool argument block via ifr_data. */
    ifr.ifr_data = (caddr_t) ecmd;

    ecmd->cmd = cmd;
    error = af_inet_ioctl(SIOCETHTOOL, &ifr);
    if (error) {
        if (error != EOPNOTSUPP) {
            VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
                         "failed: %s", cmd_name, name, ovs_strerror(error));
        } else {
            /* The device doesn't support this operation.  That's pretty
             * common, so there's no point in logging anything. */
        }
    }
    return error;
}
4826
/* Retrieves an IPv4 address of 'netdev' into '*ip' by issuing ioctl 'cmd'
 * (whose name, for logging, is 'cmd_name'), e.g. SIOCGIFADDR.  Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * NOTE(review): 'ifr' is not zeroed here, unlike most other ifreq users in
 * this file; only sa_family is set before the ioctl.  Presumably the kernel
 * fills in the rest for these "get" commands -- confirm before reusing this
 * helper for anything else. */
static int
netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
                      int cmd, const char *cmd_name)
{
    struct ifreq ifr;
    int error;

    ifr.ifr_addr.sa_family = AF_INET;
    error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
    if (!error) {
        /* ALIGNED_CAST avoids an alignment warning when reinterpreting the
         * generic sockaddr as sockaddr_in. */
        const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
                                                     &ifr.ifr_addr);
        *ip = sin->sin_addr;
    }
    return error;
}
4843
/* Returns an AF_PACKET raw socket or a negative errno value.
 *
 * The socket is created once, on first call, and shared by all subsequent
 * callers (ovsthread_once makes the initialization thread-safe).  If
 * creation or set_nonblocking() fails, the negative errno is cached and
 * returned on every call. */
static int
af_packet_sock(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    /* Shared fd, or a negative errno value if initialization failed. */
    static int sock;

    if (ovsthread_once_start(&once)) {
        sock = socket(AF_PACKET, SOCK_RAW, 0);
        if (sock >= 0) {
            int error = set_nonblocking(sock);
            if (error) {
                close(sock);
                sock = -error;
            }
        } else {
            sock = -errno;
            VLOG_ERR("failed to create packet socket: %s",
                     ovs_strerror(errno));
        }
        ovsthread_once_done(&once);
    }

    return sock;
}