git.proxmox.com Git - mirror_ovs.git/blob - lib/netdev-linux.c
netdev-linux: Add new QoS type linux-noop.
[mirror_ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <arpa/inet.h>
24 #include <inttypes.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
41 #include <net/if.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
46 #include <poll.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <unistd.h>
50
51 #include "coverage.h"
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "openvswitch/dynamic-string.h"
56 #include "fatal-signal.h"
57 #include "hash.h"
58 #include "hmap.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
63 #include "netlink.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
67 #include "packets.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
70 #include "shash.h"
71 #include "socket-util.h"
72 #include "sset.h"
73 #include "timer.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
76
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
78
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
86
87 \f
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
89 * old headers. */
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
92 #endif
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
95 #endif
96
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
101 #endif
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
104 #endif
105
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
107 * headers. */
108 #ifndef TC_RTAB_SIZE
109 #define TC_RTAB_SIZE 1024
110 #endif
111
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
117 *
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
120 */
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
123 #endif
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
126 #endif
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
129 #endif
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata {
133 uint32_t tp_status;
134 uint32_t tp_len;
135 uint32_t tp_snaplen;
136 uint16_t tp_mac;
137 uint16_t tp_net;
138 uint16_t tp_vlan_tci;
139 uint16_t tp_vlan_tpid;
140 };
141
142 /* Linux 2.6.27 introduced ethtool_cmd_speed
143 *
144 * To avoid revisiting problems reported with using configure to detect
145 * compatibility (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
147 * unconditionally replace ethtool_cmd_speed. */
148 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Reconstructs the full 32-bit link speed (in Mbps) from the split
 * 'speed'/'speed_hi' fields of 'ep'.  Unconditional replacement for
 * ethtool_cmd_speed(), which pre-2.6.27 kernels do not provide. */
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    uint32_t hi = ep->speed_hi;

    return (hi << 16) | ep->speed;
}
153
154 /* Linux 2.6.30 introduced supported and advertised flags for
155 * 1G base KX, and 10G base KX4, KR and R. */
156 #ifndef SUPPORTED_1000baseKX_Full
157 #define SUPPORTED_1000baseKX_Full (1 << 17)
158 #define SUPPORTED_10000baseKX4_Full (1 << 18)
159 #define SUPPORTED_10000baseKR_Full (1 << 19)
160 #define SUPPORTED_10000baseR_FEC (1 << 20)
161 #define ADVERTISED_1000baseKX_Full (1 << 17)
162 #define ADVERTISED_10000baseKX4_Full (1 << 18)
163 #define ADVERTISED_10000baseKR_Full (1 << 19)
164 #define ADVERTISED_10000baseR_FEC (1 << 20)
165 #endif
166
167 /* Linux 3.5 introduced supported and advertised flags for
168 * 40G base KR4, CR4, SR4 and LR4. */
169 #ifndef SUPPORTED_40000baseKR4_Full
170 #define SUPPORTED_40000baseKR4_Full (1 << 23)
171 #define SUPPORTED_40000baseCR4_Full (1 << 24)
172 #define SUPPORTED_40000baseSR4_Full (1 << 25)
173 #define SUPPORTED_40000baseLR4_Full (1 << 26)
174 #define ADVERTISED_40000baseKR4_Full (1 << 23)
175 #define ADVERTISED_40000baseCR4_Full (1 << 24)
176 #define ADVERTISED_40000baseSR4_Full (1 << 25)
177 #define ADVERTISED_40000baseLR4_Full (1 << 26)
178 #endif
179
180 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
181 *
182 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
183 * 2.6.32-431.29.2.el6.x86_64 (see report at
184 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
185 * if_link.h is not self-contained on those kernels. It is easiest to
186 * unconditionally define a replacement. */
187 #ifndef IFLA_STATS64
188 #define IFLA_STATS64 23
189 #endif
190 #define rtnl_link_stats64 rpl_rtnl_link_stats64
191 struct rtnl_link_stats64 {
192 uint64_t rx_packets;
193 uint64_t tx_packets;
194 uint64_t rx_bytes;
195 uint64_t tx_bytes;
196 uint64_t rx_errors;
197 uint64_t tx_errors;
198 uint64_t rx_dropped;
199 uint64_t tx_dropped;
200 uint64_t multicast;
201 uint64_t collisions;
202
203 uint64_t rx_length_errors;
204 uint64_t rx_over_errors;
205 uint64_t rx_crc_errors;
206 uint64_t rx_frame_errors;
207 uint64_t rx_fifo_errors;
208 uint64_t rx_missed_errors;
209
210 uint64_t tx_aborted_errors;
211 uint64_t tx_carrier_errors;
212 uint64_t tx_fifo_errors;
213 uint64_t tx_heartbeat_errors;
214 uint64_t tx_window_errors;
215
216 uint64_t rx_compressed;
217 uint64_t tx_compressed;
218 };
219
/* Bits for netdev_linux's 'cache_valid' member below: each bit records that
 * the corresponding group of on-demand fields is currently up to date. */
enum {
    VALID_IFINDEX           = 1 << 0,  /* 'ifindex'. */
    VALID_ETHERADDR         = 1 << 1,  /* 'etheraddr'. */
    VALID_IN                = 1 << 2,  /* Cached IP addresses; clearing it
                                        * also flushes the global address
                                        * list (see netdev_linux_changed()). */
    VALID_MTU               = 1 << 3,  /* 'mtu'. */
    VALID_POLICING          = 1 << 4,  /* 'kbits_rate', 'kbits_burst'. */
    VALID_VPORT_STAT_ERROR  = 1 << 5,  /* 'vport_stats_error'. */
    VALID_DRVINFO           = 1 << 6,  /* 'drvinfo'. */
    VALID_FEATURES          = 1 << 7,  /* 'current', 'advertised',
                                        * 'supported'. */
};
230 \f
231 /* Traffic control. */
232
233 /* An instance of a traffic control class. Always associated with a particular
234 * network device.
235 *
236 * Each TC implementation subclasses this with whatever additional data it
237 * needs. */
238 struct tc {
239 const struct tc_ops *ops;
240 struct hmap queues; /* Contains "struct tc_queue"s.
241 * Read by generic TC layer.
242 * Written only by TC implementation. */
243 };
244
245 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
246
247 /* One traffic control queue.
248 *
249 * Each TC implementation subclasses this with whatever additional data it
250 * needs. */
251 struct tc_queue {
252 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
253 unsigned int queue_id; /* OpenFlow queue ID. */
254 long long int created; /* Time queue was created, in msecs. */
255 };
256
257 /* A particular kind of traffic control. Each implementation generally maps to
258 * one particular Linux qdisc class.
259 *
260 * The functions below return 0 if successful or a positive errno value on
261 * failure, except where otherwise noted. All of them must be provided, except
262 * where otherwise noted. */
263 struct tc_ops {
264 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
265 * This is null for tc_ops_default and tc_ops_other, for which there are no
266 * appropriate values. */
267 const char *linux_name;
268
269 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
270 const char *ovs_name;
271
272 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
273 * queues. The queues are numbered 0 through n_queues - 1. */
274 unsigned int n_queues;
275
276 /* Called to install this TC class on 'netdev'. The implementation should
277 * make the Netlink calls required to set up 'netdev' with the right qdisc
278 * and configure it according to 'details'. The implementation may assume
279 * that the current qdisc is the default; that is, there is no need for it
280 * to delete the current qdisc before installing itself.
281 *
282 * The contents of 'details' should be documented as valid for 'ovs_name'
283 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
284 * (which is built as ovs-vswitchd.conf.db(8)).
285 *
286 * This function must return 0 if and only if it sets 'netdev->tc' to an
287 * initialized 'struct tc'.
288 *
289 * (This function is null for tc_ops_other, which cannot be installed. For
290 * other TC classes it should always be nonnull.) */
291 int (*tc_install)(struct netdev *netdev, const struct smap *details);
292
293 /* Called when the netdev code determines (through a Netlink query) that
294 * this TC class's qdisc is installed on 'netdev', but we didn't install
295 * it ourselves and so don't know any of the details.
296 *
297 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
298 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
299 * implementation should parse the other attributes of 'nlmsg' as
300 * necessary to determine its configuration. If necessary it should also
301 * use Netlink queries to determine the configuration of queues on
302 * 'netdev'.
303 *
304 * This function must return 0 if and only if it sets 'netdev->tc' to an
305 * initialized 'struct tc'. */
306 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
307
308 /* Destroys the data structures allocated by the implementation as part of
309 * 'tc'. (This includes destroying 'tc->queues' by calling
310 * tc_destroy(tc).
311 *
312 * The implementation should not need to perform any Netlink calls. If
313 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
314 * (But it may not be desirable.)
315 *
316 * This function may be null if 'tc' is trivial. */
317 void (*tc_destroy)(struct tc *tc);
318
319 /* Retrieves details of 'netdev->tc' configuration into 'details'.
320 *
321 * The implementation should not need to perform any Netlink calls, because
322 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
323 * cached the configuration.
324 *
325 * The contents of 'details' should be documented as valid for 'ovs_name'
326 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
327 * (which is built as ovs-vswitchd.conf.db(8)).
328 *
329 * This function may be null if 'tc' is not configurable.
330 */
331 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
332
333 /* Reconfigures 'netdev->tc' according to 'details', performing any
334 * required Netlink calls to complete the reconfiguration.
335 *
336 * The contents of 'details' should be documented as valid for 'ovs_name'
337 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
338 * (which is built as ovs-vswitchd.conf.db(8)).
339 *
340 * This function may be null if 'tc' is not configurable.
341 */
342 int (*qdisc_set)(struct netdev *, const struct smap *details);
343
344 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
345 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
346 *
347 * The contents of 'details' should be documented as valid for 'ovs_name'
348 * in the "other_config" column in the "Queue" table in
349 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
350 *
351 * The implementation should not need to perform any Netlink calls, because
352 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
353 * cached the queue configuration.
354 *
355 * This function may be null if 'tc' does not have queues ('n_queues' is
356 * 0). */
357 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
358 struct smap *details);
359
360 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
361 * 'details', perfoming any required Netlink calls to complete the
362 * reconfiguration. The caller ensures that 'queue_id' is less than
363 * 'n_queues'.
364 *
365 * The contents of 'details' should be documented as valid for 'ovs_name'
366 * in the "other_config" column in the "Queue" table in
367 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
368 *
369 * This function may be null if 'tc' does not have queues or its queues are
370 * not configurable. */
371 int (*class_set)(struct netdev *, unsigned int queue_id,
372 const struct smap *details);
373
374 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
375 * tc_queue's within 'netdev->tc->queues'.
376 *
377 * This function may be null if 'tc' does not have queues or its queues
378 * cannot be deleted. */
379 int (*class_delete)(struct netdev *, struct tc_queue *queue);
380
381 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
382 * 'struct tc_queue's within 'netdev->tc->queues'.
383 *
384 * On success, initializes '*stats'.
385 *
386 * This function may be null if 'tc' does not have queues or if it cannot
387 * report queue statistics. */
388 int (*class_get_stats)(const struct netdev *netdev,
389 const struct tc_queue *queue,
390 struct netdev_queue_stats *stats);
391
392 /* Extracts queue stats from 'nlmsg', which is a response to a
393 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
394 *
395 * This function may be null if 'tc' does not have queues or if it cannot
396 * report queue statistics. */
397 int (*class_dump_stats)(const struct netdev *netdev,
398 const struct ofpbuf *nlmsg,
399 netdev_dump_queue_stats_cb *cb, void *aux);
400 };
401
402 static void
403 tc_init(struct tc *tc, const struct tc_ops *ops)
404 {
405 tc->ops = ops;
406 hmap_init(&tc->queues);
407 }
408
/* Releases the generic resources of 'tc' (the queue map).  The concrete TC
 * implementation is responsible for freeing the queue nodes themselves
 * before calling this (see the tc_ops 'tc_destroy' contract above). */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
414
415 static const struct tc_ops tc_ops_htb;
416 static const struct tc_ops tc_ops_hfsc;
417 static const struct tc_ops tc_ops_codel;
418 static const struct tc_ops tc_ops_fqcodel;
419 static const struct tc_ops tc_ops_sfq;
420 static const struct tc_ops tc_ops_default;
421 static const struct tc_ops tc_ops_noop;
422 static const struct tc_ops tc_ops_other;
423
424 static const struct tc_ops *const tcs[] = {
425 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
426 &tc_ops_hfsc, /* Hierarchical fair service curve. */
427 &tc_ops_codel, /* Controlled delay */
428 &tc_ops_fqcodel, /* Fair queue controlled delay */
429 &tc_ops_sfq, /* Stochastic fair queueing */
430 &tc_ops_noop, /* Non operating qos type. */
431 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
432 &tc_ops_other, /* Some other qdisc. */
433 NULL
434 };
435
436 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
437 static unsigned int tc_get_major(unsigned int handle);
438 static unsigned int tc_get_minor(unsigned int handle);
439
440 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
441 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
442 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
443
444 static struct tcmsg *tc_make_request(const struct netdev *, int type,
445 unsigned int flags, struct ofpbuf *);
446 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
447 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
448 static int tc_add_policer(struct netdev *,
449 uint32_t kbits_rate, uint32_t kbits_burst);
450
451 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
452 struct nlattr **options);
453 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
454 struct nlattr **options,
455 struct netdev_queue_stats *);
456 static int tc_query_class(const struct netdev *,
457 unsigned int handle, unsigned int parent,
458 struct ofpbuf **replyp);
459 static int tc_delete_class(const struct netdev *, unsigned int handle);
460
461 static int tc_del_qdisc(struct netdev *netdev);
462 static int tc_query_qdisc(const struct netdev *netdev);
463
464 static int tc_calc_cell_log(unsigned int mtu);
465 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
466 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
467 const struct tc_ratespec *rate);
468 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
469 \f
470 struct netdev_linux {
471 struct netdev up;
472
473 /* Protects all members below. */
474 struct ovs_mutex mutex;
475
476 unsigned int cache_valid;
477
478 bool miimon; /* Link status of last poll. */
479 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
480 struct timer miimon_timer;
481
482 /* The following are figured out "on demand" only. They are only valid
483 * when the corresponding VALID_* bit in 'cache_valid' is set. */
484 int ifindex;
485 struct eth_addr etheraddr;
486 int mtu;
487 unsigned int ifi_flags;
488 long long int carrier_resets;
489 uint32_t kbits_rate; /* Policing data. */
490 uint32_t kbits_burst;
491 int vport_stats_error; /* Cached error code from vport_get_stats().
492 0 or an errno value. */
493 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
494 int ether_addr_error; /* Cached error code from set/get etheraddr. */
495 int netdev_policing_error; /* Cached error code from set policing. */
496 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
497 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
498
499 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
500 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
501 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
502
503 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
504 struct tc *tc;
505
506 /* For devices of class netdev_tap_class only. */
507 int tap_fd;
508 };
509
/* Receive queue for a Linux netdev.  'fd' is either the device's shared tap
 * fd or a dedicated AF_PACKET socket, depending on 'is_tap' (see
 * netdev_linux_rxq_construct()). */
struct netdev_rxq_linux {
    struct netdev_rxq up;
    bool is_tap;                /* True if 'fd' is the device's tap fd. */
    int fd;
};
515
516 /* This is set pretty low because we probably won't learn anything from the
517 * additional log messages. */
518 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
519
520 /* Polling miimon status for all ports causes performance degradation when
521 * handling a large number of ports. If there are no devices using miimon, then
522 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
523 *
524 * Readers do not depend on this variable synchronizing with the related
525 * changes in the device miimon status, so we can use atomic_count. */
526 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
527
528 static void netdev_linux_run(void);
529
530 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
531 int cmd, const char *cmd_name);
532 static int get_flags(const struct netdev *, unsigned int *flags);
533 static int set_flags(const char *, unsigned int flags);
534 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
535 enum netdev_flags on, enum netdev_flags *old_flagsp)
536 OVS_REQUIRES(netdev->mutex);
537 static int do_get_ifindex(const char *netdev_name);
538 static int get_ifindex(const struct netdev *, int *ifindexp);
539 static int do_set_addr(struct netdev *netdev,
540 int ioctl_nr, const char *ioctl_name,
541 struct in_addr addr);
542 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
543 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
544 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
545 static int af_packet_sock(void);
546 static bool netdev_linux_miimon_enabled(void);
547 static void netdev_linux_miimon_run(void);
548 static void netdev_linux_miimon_wait(void);
549 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
550
/* Returns true if 'netdev_class' is one of the classes implemented by this
 * file, recognized by their shared netdev_linux_run() hook. */
static bool
is_netdev_linux_class(const struct netdev_class *netdev_class)
{
    return netdev_class->run == netdev_linux_run;
}
556
/* Returns true if 'netdev' is a tap device. */
static bool
is_tap_netdev(const struct netdev *netdev)
{
    return netdev_get_class(netdev) == &netdev_tap_class;
}
562
/* Downcasts 'netdev' to its containing netdev_linux.  Asserts that 'netdev'
 * really belongs to one of this file's classes. */
static struct netdev_linux *
netdev_linux_cast(const struct netdev *netdev)
{
    ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));

    return CONTAINER_OF(netdev, struct netdev_linux, up);
}
570
/* Downcasts 'rx' to its containing netdev_rxq_linux.  Asserts that the
 * queue's netdev belongs to one of this file's classes. */
static struct netdev_rxq_linux *
netdev_rxq_linux_cast(const struct netdev_rxq *rx)
{
    ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
    return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
}
577 \f
578 static void netdev_linux_update(struct netdev_linux *netdev,
579 const struct rtnetlink_change *)
580 OVS_REQUIRES(netdev->mutex);
581 static void netdev_linux_changed(struct netdev_linux *netdev,
582 unsigned int ifi_flags, unsigned int mask)
583 OVS_REQUIRES(netdev->mutex);
584
/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
 * RTNLGRP_IPV4_IFADDR, RTNLGRP_IPV6_IFADDR and RTNLGRP_IPV6_IFINFO
 * changes, or NULL if no such socket could be created.
 *
 * The socket is created once and shared by all callers; if creation or any
 * group join fails, NULL is cached and returned forever after. */
static struct nl_sock *
netdev_linux_notify_sock(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static struct nl_sock *sock;
    unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
                               RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};

    if (ovsthread_once_start(&once)) {
        int error;

        error = nl_sock_create(NETLINK_ROUTE, &sock);
        if (!error) {
            size_t i;

            for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
                error = nl_sock_join_mcgroup(sock, mcgroups[i]);
                if (error) {
                    /* All-or-nothing: if one group cannot be joined, destroy
                     * the socket rather than listen to a partial set. */
                    nl_sock_destroy(sock);
                    sock = NULL;
                    break;
                }
            }
        }
        ovsthread_once_done(&once);
    }

    return sock;
}
617
/* Returns true if at least one netdev has miimon polling enabled, so that
 * netdev_linux_run() and netdev_linux_wait() need to do miimon work. */
static bool
netdev_linux_miimon_enabled(void)
{
    return atomic_count_get(&miimon_cnt) > 0;
}
623
624 static void
625 netdev_linux_run(void)
626 {
627 struct nl_sock *sock;
628 int error;
629
630 if (netdev_linux_miimon_enabled()) {
631 netdev_linux_miimon_run();
632 }
633
634 sock = netdev_linux_notify_sock();
635 if (!sock) {
636 return;
637 }
638
639 do {
640 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
641 uint64_t buf_stub[4096 / 8];
642 struct ofpbuf buf;
643
644 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
645 error = nl_sock_recv(sock, &buf, false);
646 if (!error) {
647 struct rtnetlink_change change;
648
649 if (rtnetlink_parse(&buf, &change)) {
650 struct netdev *netdev_ = NULL;
651 char dev_name[IFNAMSIZ];
652
653 if (!change.ifname) {
654 change.ifname = if_indextoname(change.if_index, dev_name);
655 }
656
657 if (change.ifname) {
658 netdev_ = netdev_from_name(change.ifname);
659 }
660 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
661 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
662
663 ovs_mutex_lock(&netdev->mutex);
664 netdev_linux_update(netdev, &change);
665 ovs_mutex_unlock(&netdev->mutex);
666 }
667 netdev_close(netdev_);
668 }
669 } else if (error == ENOBUFS) {
670 struct shash device_shash;
671 struct shash_node *node;
672
673 nl_sock_drain(sock);
674
675 shash_init(&device_shash);
676 netdev_get_devices(&netdev_linux_class, &device_shash);
677 SHASH_FOR_EACH (node, &device_shash) {
678 struct netdev *netdev_ = node->data;
679 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
680 unsigned int flags;
681
682 ovs_mutex_lock(&netdev->mutex);
683 get_flags(netdev_, &flags);
684 netdev_linux_changed(netdev, flags, 0);
685 ovs_mutex_unlock(&netdev->mutex);
686
687 netdev_close(netdev_);
688 }
689 shash_destroy(&device_shash);
690 } else if (error != EAGAIN) {
691 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
692 ovs_strerror(error));
693 }
694 ofpbuf_uninit(&buf);
695 } while (!error);
696 }
697
698 static void
699 netdev_linux_wait(void)
700 {
701 struct nl_sock *sock;
702
703 if (netdev_linux_miimon_enabled()) {
704 netdev_linux_miimon_wait();
705 }
706 sock = netdev_linux_notify_sock();
707 if (sock) {
708 nl_sock_wait(sock, POLLIN);
709 }
710 }
711
/* Records that 'dev' changed: bumps its change sequence number, tracks
 * carrier resets, and invalidates cached state.
 *
 * 'mask' is the set of VALID_* bits to KEEP in 'cache_valid'; every bit not
 * in 'mask' is invalidated.  Pass 0 to invalidate everything.  If VALID_IN
 * is among the invalidated bits, the global address list is flushed too. */
static void
netdev_linux_changed(struct netdev_linux *dev,
                     unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(dev->mutex)
{
    netdev_change_seq_changed(&dev->up);

    /* Each toggle of IFF_RUNNING counts as one carrier reset. */
    if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
        dev->carrier_resets++;
    }
    dev->ifi_flags = ifi_flags;

    dev->cache_valid &= mask;
    if (!(mask & VALID_IN)) {
        netdev_get_addrs_list_flush();
    }
}
729
/* Applies the rtnetlink notification 'change' to 'dev', updating cached
 * fields carried in the message and invalidating the rest. */
static void
netdev_linux_update(struct netdev_linux *dev,
                    const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, and ip addresses. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
        } else {
            /* A link message other than RTM_NEWLINK (e.g. deletion):
             * invalidate all cached state. */
            netdev_linux_changed(dev, change->ifi_flags, 0);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        /* The notify socket only joins link and address groups, so no other
         * message type should reach this function. */
        OVS_NOT_REACHED();
    }
}
767
768 static struct netdev *
769 netdev_linux_alloc(void)
770 {
771 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
772 return &netdev->up;
773 }
774
/* Initialization shared by netdev_linux_construct() and
 * netdev_linux_construct_tap(). */
static void
netdev_linux_common_construct(struct netdev_linux *netdev)
{
    ovs_mutex_init(&netdev->mutex);
}
780
781 /* Creates system and internal devices. */
782 static int
783 netdev_linux_construct(struct netdev *netdev_)
784 {
785 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
786 int error;
787
788 netdev_linux_common_construct(netdev);
789
790 error = get_flags(&netdev->up, &netdev->ifi_flags);
791 if (error == ENODEV) {
792 if (netdev->up.netdev_class != &netdev_internal_class) {
793 /* The device does not exist, so don't allow it to be opened. */
794 return ENODEV;
795 } else {
796 /* "Internal" netdevs have to be created as netdev objects before
797 * they exist in the kernel, because creating them in the kernel
798 * happens by passing a netdev object to dpif_port_add().
799 * Therefore, ignore the error. */
800 }
801 }
802
803 return 0;
804 }
805
806 /* For most types of netdevs we open the device for each call of
807 * netdev_open(). However, this is not the case with tap devices,
808 * since it is only possible to open the device once. In this
809 * situation we share a single file descriptor, and consequently
810 * buffers, across all readers. Therefore once data is read it will
811 * be unavailable to other reads for tap devices. */
812 static int
813 netdev_linux_construct_tap(struct netdev *netdev_)
814 {
815 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
816 static const char tap_dev[] = "/dev/net/tun";
817 const char *name = netdev_->name;
818 struct ifreq ifr;
819 int error;
820
821 netdev_linux_common_construct(netdev);
822
823 /* Open tap device. */
824 netdev->tap_fd = open(tap_dev, O_RDWR);
825 if (netdev->tap_fd < 0) {
826 error = errno;
827 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
828 return error;
829 }
830
831 /* Create tap device. */
832 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
833 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
834 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
835 VLOG_WARN("%s: creating tap device failed: %s", name,
836 ovs_strerror(errno));
837 error = errno;
838 goto error_close;
839 }
840
841 /* Make non-blocking. */
842 error = set_nonblocking(netdev->tap_fd);
843 if (error) {
844 goto error_close;
845 }
846
847 return 0;
848
849 error_close:
850 close(netdev->tap_fd);
851 return error;
852 }
853
854 static void
855 netdev_linux_destruct(struct netdev *netdev_)
856 {
857 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
858
859 if (netdev->tc && netdev->tc->ops->tc_destroy) {
860 netdev->tc->ops->tc_destroy(netdev->tc);
861 }
862
863 if (netdev_get_class(netdev_) == &netdev_tap_class
864 && netdev->tap_fd >= 0)
865 {
866 close(netdev->tap_fd);
867 }
868
869 if (netdev->miimon_interval > 0) {
870 atomic_count_dec(&miimon_cnt);
871 }
872
873 ovs_mutex_destroy(&netdev->mutex);
874 }
875
/* netdev_class 'dealloc' callback: frees the wrapper allocated by
 * netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    free(netdev);
}
882
883 static struct netdev_rxq *
884 netdev_linux_rxq_alloc(void)
885 {
886 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
887 return &rx->up;
888 }
889
/* Sets up the receive path for 'rxq_'.  A tap device shares the netdev's
 * existing tap fd; any other device gets a dedicated AF_PACKET socket that
 * is bound to the interface and filtered to inbound packets only.
 * Returns 0 on success, a positive errno value on failure. */
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        /* The tap fd is owned by the netdev and shared by all its queues. */
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4     jt 2  jf 3 */
            { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor.  Protocol 0 delays packet delivery until
         * bind() below sets the protocol (see packet(7)). */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        /* Request tpacket_auxdata control messages, used later to recover
         * the VLAN tag of received packets. */
        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
977
978 static void
979 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
980 {
981 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
982
983 if (!rx->is_tap) {
984 close(rx->fd);
985 }
986 }
987
/* Frees the memory behind 'rxq_', counterpart of netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
995
996 static ovs_be16
997 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
998 {
999 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1000 return htons(aux->tp_vlan_tpid);
1001 } else {
1002 return htons(ETH_TYPE_VLAN);
1003 }
1004 }
1005
/* Returns true if the packet auxdata in 'aux' indicates that the kernel
 * stripped a VLAN tag from the frame: either the tag itself is nonzero or
 * the kernel explicitly flagged the (possibly zero) tag as valid. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    if (aux->tp_vlan_tci) {
        return true;
    }
    return (aux->tp_status & TP_STATUS_VLAN_VALID) != 0;
}
1011
/* Receives one frame from AF_PACKET socket 'fd' into 'buffer', re-inserting
 * any VLAN tag the kernel stripped (delivered out-of-band as PACKET_AUXDATA).
 * Returns 0 on success, EMSGSIZE if the frame did not fit in 'buffer',
 * otherwise a positive errno value. */
static int
netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
{
    size_t size;
    ssize_t retval;
    struct iovec iov;
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffer;
    struct msghdr msgh;

    /* Reserve headroom for a single VLAN tag, so that eth_push_vlan() below
     * cannot need to reallocate. */
    dp_packet_reserve(buffer, VLAN_HEADER_LEN);
    size = dp_packet_tailroom(buffer);

    iov.iov_base = dp_packet_data(buffer);
    iov.iov_len = size;
    msgh.msg_name = NULL;
    msgh.msg_namelen = 0;
    msgh.msg_iov = &iov;
    msgh.msg_iovlen = 1;
    msgh.msg_control = &cmsg_buffer;
    msgh.msg_controllen = sizeof cmsg_buffer;
    msgh.msg_flags = 0;

    /* MSG_TRUNC makes recvmsg() return the frame's full length even when it
     * is larger than the buffer, so truncation can be detected below. */
    do {
        retval = recvmsg(fd, &msgh, MSG_TRUNC);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    } else if (retval > size) {
        return EMSGSIZE;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);

    /* Walk the control messages looking for the kernel's auxdata, which may
     * carry a VLAN tag that was stripped from the received frame. */
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
        const struct tpacket_auxdata *aux;

        if (cmsg->cmsg_level != SOL_PACKET
            || cmsg->cmsg_type != PACKET_AUXDATA
            || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
            continue;
        }

        aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
        if (auxdata_has_vlan_tci(aux)) {
            /* A frame too short to hold an Ethernet header cannot take a
             * VLAN tag. */
            if (retval < ETH_HEADER_LEN) {
                return EINVAL;
            }

            /* Put the stripped tag back into the packet data. */
            eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
                          htons(aux->tp_vlan_tci));
            break;
        }
    }

    return 0;
}
1074
/* Reads one packet from tap file descriptor 'fd' into 'buffer'.  Returns 0
 * on success, otherwise a positive errno value. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t room = dp_packet_tailroom(buffer);
    ssize_t n;

    /* Restart the read if a signal interrupts it. */
    do {
        n = read(fd, dp_packet_data(buffer), room);
    } while (n < 0 && errno == EINTR);

    if (n < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n);
    return 0;
}
1092
1093 static int
1094 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1095 int *c)
1096 {
1097 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1098 struct netdev *netdev = rx->up.netdev;
1099 struct dp_packet *buffer;
1100 ssize_t retval;
1101 int mtu;
1102
1103 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1104 mtu = ETH_PAYLOAD_MAX;
1105 }
1106
1107 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1108 DP_NETDEV_HEADROOM);
1109 retval = (rx->is_tap
1110 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1111 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1112
1113 if (retval) {
1114 if (retval != EAGAIN && retval != EMSGSIZE) {
1115 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1116 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1117 }
1118 dp_packet_delete(buffer);
1119 } else {
1120 dp_packet_pad(buffer);
1121 packets[0] = buffer;
1122 *c = 1;
1123 }
1124
1125 return retval;
1126 }
1127
1128 static void
1129 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1130 {
1131 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1132 poll_fd_wait(rx->fd, POLLIN);
1133 }
1134
1135 static int
1136 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1137 {
1138 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1139 if (rx->is_tap) {
1140 struct ifreq ifr;
1141 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1142 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1143 if (error) {
1144 return error;
1145 }
1146 drain_fd(rx->fd, ifr.ifr_qlen);
1147 return 0;
1148 } else {
1149 return drain_rcvbuf(rx->fd);
1150 }
1151 }
1152
/* Sends the 'cnt' packets in 'pkts' on 'netdev_'.  Returns 0 if successful,
 * otherwise a positive errno value.  Returns EAGAIN without blocking if a
 * packet cannot be queued immediately.  Returns EMSGSIZE if a partial packet
 * was transmitted or if a packet is too big or too small to transmit on the
 * device.
 *
 * If 'may_steal' is true, ownership of every packet in 'pkts' passes to this
 * function, which frees them all (sent or not) before returning; otherwise
 * the caller retains ownership.
 *
 * The kernel maintains a packet transmission queue, so the caller is not
 * expected to do additional queuing of packets. */
static int
netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
                  struct dp_packet **pkts, int cnt, bool may_steal)
{
    int i;
    int error = 0;

    /* 'i' is incremented only if there's no error */
    for (i = 0; i < cnt;) {
        const void *data = dp_packet_data(pkts[i]);
        size_t size = dp_packet_size(pkts[i]);
        ssize_t retval;

        if (!is_tap_netdev(netdev_)) {
            /* Use our AF_PACKET socket to send to this device. */
            struct sockaddr_ll sll;
            struct msghdr msg;
            struct iovec iov;
            int ifindex;
            int sock;

            /* af_packet_sock() and netdev_get_ifindex() return negative
             * errno values on failure; negate them back before returning. */
            sock = af_packet_sock();
            if (sock < 0) {
                return -sock;
            }

            ifindex = netdev_get_ifindex(netdev_);
            if (ifindex < 0) {
                return -ifindex;
            }

            /* We don't bother setting most fields in sockaddr_ll because the
             * kernel ignores them for SOCK_RAW. */
            memset(&sll, 0, sizeof sll);
            sll.sll_family = AF_PACKET;
            sll.sll_ifindex = ifindex;

            iov.iov_base = CONST_CAST(void *, data);
            iov.iov_len = size;

            msg.msg_name = &sll;
            msg.msg_namelen = sizeof sll;
            msg.msg_iov = &iov;
            msg.msg_iovlen = 1;
            msg.msg_control = NULL;
            msg.msg_controllen = 0;
            msg.msg_flags = 0;

            retval = sendmsg(sock, &msg, 0);
        } else {
            /* Use the tap fd to send to this device. This is essential for
             * tap devices, because packets sent to a tap device with an
             * AF_PACKET socket will loop back to be *received* again on the
             * tap device. This doesn't occur on other interface types
             * because we attach a socket filter to the rx socket. */
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            retval = write(netdev->tap_fd, data, size);
        }

        if (retval < 0) {
            /* The Linux AF_PACKET implementation never blocks waiting for room
             * for packets, instead returning ENOBUFS. Translate this into
             * EAGAIN for the caller. */
            error = errno == ENOBUFS ? EAGAIN : errno;
            if (error == EINTR) {
                /* continue without incrementing 'i', i.e. retry this packet */
                continue;
            }
            /* Any other error aborts the batch; remaining packets are not
             * sent. */
            break;
        } else if (retval != size) {
            VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
                              " of %"PRIuSIZE") on %s", retval, size,
                         netdev_get_name(netdev_));
            error = EMSGSIZE;
            break;
        }

        /* Process the next packet in the batch */
        i++;
    }

    if (may_steal) {
        /* We own the packets: free all of them, including any that were not
         * transmitted because of an error above. */
        for (i = 0; i < cnt; i++) {
            dp_packet_delete(pkts[i]);
        }
    }

    if (error && error != EAGAIN) {
        VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
                     netdev_get_name(netdev_), ovs_strerror(error));
    }

    return error;

}
1258
1259 /* Registers with the poll loop to wake up from the next call to poll_block()
1260 * when the packet transmission queue has sufficient room to transmit a packet
1261 * with netdev_send().
1262 *
1263 * The kernel maintains a packet transmission queue, so the client is not
1264 * expected to do additional queuing of packets. Thus, this function is
1265 * unlikely to ever be used. It is included for completeness. */
1266 static void
1267 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1268 {
1269 if (is_tap_netdev(netdev)) {
1270 /* TAP device always accepts packets.*/
1271 poll_immediate_wake();
1272 }
1273 }
1274
/* Attempts to set 'netdev''s MAC address to 'mac'.  Returns 0 if successful,
 * otherwise a positive errno value. */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    /* If the cache already answers the question--either the last attempt
     * failed or the address is already 'mac'--skip the ioctl entirely. */
    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    /* ENODEV (device vanished) is cached as a persistent failure; other
     * errors leave the cache invalid so the next call retries. */
    if (!error || error == ENODEV) {
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            netdev->etheraddr = mac;
        }
    }

    /* Restore the tap device's up/down state if we brought it down above. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1315
1316 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1317 static int
1318 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1319 {
1320 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1321 int error;
1322
1323 ovs_mutex_lock(&netdev->mutex);
1324 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1325 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1326 &netdev->etheraddr);
1327 netdev->cache_valid |= VALID_ETHERADDR;
1328 }
1329
1330 error = netdev->ether_addr_error;
1331 if (!error) {
1332 *mac = netdev->etheraddr;
1333 }
1334 ovs_mutex_unlock(&netdev->mutex);
1335
1336 return error;
1337 }
1338
1339 static int
1340 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1341 {
1342 int error;
1343
1344 if (!(netdev->cache_valid & VALID_MTU)) {
1345 struct ifreq ifr;
1346
1347 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1348 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1349 netdev->mtu = ifr.ifr_mtu;
1350 netdev->cache_valid |= VALID_MTU;
1351 }
1352
1353 error = netdev->netdev_mtu_error;
1354 if (!error) {
1355 *mtup = netdev->mtu;
1356 }
1357
1358 return error;
1359 }
1360
1361 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1362 * in bytes, not including the hardware header; thus, this is typically 1500
1363 * bytes for Ethernet devices. */
1364 static int
1365 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1366 {
1367 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1368 int error;
1369
1370 ovs_mutex_lock(&netdev->mutex);
1371 error = netdev_linux_get_mtu__(netdev, mtup);
1372 ovs_mutex_unlock(&netdev->mutex);
1373
1374 return error;
1375 }
1376
/* Sets the maximum size of transmitted (MTU) for given device using linux
 * networking ioctl interface.
 */
static int
netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ifreq ifr;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    /* Skip the ioctl when the cache shows the MTU is already 'mtu' or that a
     * previous attempt failed persistently. */
    if (netdev->cache_valid & VALID_MTU) {
        error = netdev->netdev_mtu_error;
        if (error || netdev->mtu == mtu) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_MTU;
    }
    ifr.ifr_mtu = mtu;
    error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
                                SIOCSIFMTU, "SIOCSIFMTU");
    /* ENODEV (device vanished) is cached so later calls fail fast; any other
     * error leaves the cache invalid and the next call retries. */
    if (!error || error == ENODEV) {
        netdev->netdev_mtu_error = error;
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }
exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1407
1408 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1409 * On failure, returns a negative errno value. */
1410 static int
1411 netdev_linux_get_ifindex(const struct netdev *netdev_)
1412 {
1413 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1414 int ifindex, error;
1415
1416 ovs_mutex_lock(&netdev->mutex);
1417 error = get_ifindex(netdev_, &ifindex);
1418 ovs_mutex_unlock(&netdev->mutex);
1419
1420 return error ? -error : ifindex;
1421 }
1422
1423 static int
1424 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1425 {
1426 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1427
1428 ovs_mutex_lock(&netdev->mutex);
1429 if (netdev->miimon_interval > 0) {
1430 *carrier = netdev->miimon;
1431 } else {
1432 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1433 }
1434 ovs_mutex_unlock(&netdev->mutex);
1435
1436 return 0;
1437 }
1438
1439 static long long int
1440 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1441 {
1442 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1443 long long int carrier_resets;
1444
1445 ovs_mutex_lock(&netdev->mutex);
1446 carrier_resets = netdev->carrier_resets;
1447 ovs_mutex_unlock(&netdev->mutex);
1448
1449 return carrier_resets;
1450 }
1451
/* Issues MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name',
 * passing '*data' in and copying the kernel's result back out.  Returns 0 on
 * success, otherwise a positive errno value.
 *
 * Note the unusual marshalling: the kernel's MII ioctls store the
 * 'struct mii_ioctl_data' *inline* in the bytes that overlay 'ifr.ifr_data',
 * rather than treating ifr_data as a pointer, so the struct is memcpy'd over
 * that field in both directions. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
1466
/* Queries link status for device 'name', storing it in '*miimon'.  Tries the
 * MII registers first; if the device has no MII interface, falls back to the
 * ethtool ETHTOOL_GLINK query.  Returns 0 on success, otherwise a positive
 * errno value (with '*miimon' left false). */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            /* BMSR_LSTATUS is the basic-mode status "link up" bit. */
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        } else {
            VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
        }
    } else {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* ETHTOOL_GLINK replies with a 'struct ethtool_value' laid over
             * the start of the command buffer; extract it via memcpy. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
1510
1511 static int
1512 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1513 long long int interval)
1514 {
1515 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1516
1517 ovs_mutex_lock(&netdev->mutex);
1518 interval = interval > 0 ? MAX(interval, 100) : 0;
1519 if (netdev->miimon_interval != interval) {
1520 if (interval && !netdev->miimon_interval) {
1521 atomic_count_inc(&miimon_cnt);
1522 } else if (!interval && netdev->miimon_interval) {
1523 atomic_count_dec(&miimon_cnt);
1524 }
1525
1526 netdev->miimon_interval = interval;
1527 timer_set_expired(&netdev->miimon_timer);
1528 }
1529 ovs_mutex_unlock(&netdev->mutex);
1530
1531 return 0;
1532 }
1533
/* Polls link status via MII for every netdev-linux device whose miimon timer
 * has expired, notifying the netdev layer of any link-state change. */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                dev->miimon = miimon;
                /* Bump the change sequence so waiters notice the new link
                 * state. */
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            /* Re-arm for the next poll. */
            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* Drop the reference that netdev_get_devices() took on our behalf. */
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
1563
1564 static void
1565 netdev_linux_miimon_wait(void)
1566 {
1567 struct shash device_shash;
1568 struct shash_node *node;
1569
1570 shash_init(&device_shash);
1571 netdev_get_devices(&netdev_linux_class, &device_shash);
1572 SHASH_FOR_EACH (node, &device_shash) {
1573 struct netdev *netdev = node->data;
1574 struct netdev_linux *dev = netdev_linux_cast(netdev);
1575
1576 ovs_mutex_lock(&dev->mutex);
1577 if (dev->miimon_interval > 0) {
1578 timer_wait(&dev->miimon_timer);
1579 }
1580 ovs_mutex_unlock(&dev->mutex);
1581 netdev_close(netdev);
1582 }
1583 shash_destroy(&device_shash);
1584 }
1585
/* Exchanges the values stored at '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved = *a;

    *a = *b;
    *b = saved;
}
1593
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is allowed to be misaligned (hence the get_32aligned_u64() reads).
 * The vport layer tracks only the eight basic packet/byte/error/drop
 * counters; every other netdev_stats field is explicitly zeroed. */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct ovs_vport_stats *src)
{
    /* Counters the vport layer provides. */
    dst->rx_packets = get_32aligned_u64(&src->rx_packets);
    dst->tx_packets = get_32aligned_u64(&src->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&src->rx_errors);
    dst->tx_errors = get_32aligned_u64(&src->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
    /* Counters the vport layer does not track. */
    dst->multicast = 0;
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
}
1623
1624 static int
1625 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1626 {
1627 struct dpif_netlink_vport reply;
1628 struct ofpbuf *buf;
1629 int error;
1630
1631 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1632 if (error) {
1633 return error;
1634 } else if (!reply.stats) {
1635 ofpbuf_delete(buf);
1636 return EOPNOTSUPP;
1637 }
1638
1639 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1640
1641 ofpbuf_delete(buf);
1642
1643 return 0;
1644 }
1645
1646 static void
1647 get_stats_via_vport(const struct netdev *netdev_,
1648 struct netdev_stats *stats)
1649 {
1650 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1651
1652 if (!netdev->vport_stats_error ||
1653 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1654 int error;
1655
1656 error = get_stats_via_vport__(netdev_, stats);
1657 if (error && error != ENOENT && error != ENODEV) {
1658 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1659 "(%s)",
1660 netdev_get_name(netdev_), ovs_strerror(error));
1661 }
1662 netdev->vport_stats_error = error;
1663 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1664 }
1665 }
1666
/* Retrieves current device stats for 'netdev-linux'.
 *
 * Merges two sources: the datapath vport counters and the kernel netdev
 * counters.  When both are available, the kernel's packet/byte counts are
 * preferred (vport counters miss on-the-wire packet counts under GSO/TSO/GRO)
 * and the error counters are summed on top of the vport values. */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; the vport stats already in '*stats' (if any) are
         * still usable, so suppress the error in that case. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Use kernel netdev's packet and byte counts since vport's counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO are
         * enabled. */
        stats->rx_packets = dev_stats.rx_packets;
        stats->rx_bytes = dev_stats.rx_bytes;
        stats->tx_packets = dev_stats.tx_packets;
        stats->tx_bytes = dev_stats.tx_bytes;

        stats->rx_errors           += dev_stats.rx_errors;
        stats->tx_errors           += dev_stats.tx_errors;
        stats->rx_dropped          += dev_stats.rx_dropped;
        stats->tx_dropped          += dev_stats.tx_dropped;
        stats->multicast           += dev_stats.multicast;
        stats->collisions          += dev_stats.collisions;
        stats->rx_length_errors    += dev_stats.rx_length_errors;
        stats->rx_over_errors      += dev_stats.rx_over_errors;
        stats->rx_crc_errors       += dev_stats.rx_crc_errors;
        stats->rx_frame_errors     += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors      += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors    += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors   += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors   += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors      += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors    += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1717
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; fall back to whatever the vport layer provided. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer. For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        /* The direction-specific error breakdowns do not translate across
         * the swap, so they are cleared rather than misreported. */
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled.  Note the rx/tx swap, for the reason described in
         * the branch above. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped += dev_stats.tx_dropped;
        stats->tx_dropped += dev_stats.rx_dropped;

        stats->rx_errors += dev_stats.tx_errors;
        stats->tx_errors += dev_stats.rx_errors;

        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1779
1780 static int
1781 netdev_internal_get_stats(const struct netdev *netdev_,
1782 struct netdev_stats *stats)
1783 {
1784 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1785 int error;
1786
1787 ovs_mutex_lock(&netdev->mutex);
1788 get_stats_via_vport(netdev_, stats);
1789 error = netdev->vport_stats_error;
1790 ovs_mutex_unlock(&netdev->mutex);
1791
1792 return error;
1793 }
1794
/* Queries 'netdev''s link features via ethtool ETHTOOL_GSET and caches the
 * result (supported, advertised, and current feature bitmaps, plus any
 * error) in 'netdev', marking VALID_FEATURES so repeat calls are free.
 * Caller holds 'netdev->mutex'. */
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
    struct ethtool_cmd ecmd;
    uint32_t speed;
    int error;

    /* Serve from cache if a previous call already ran. */
    if (netdev->cache_valid & VALID_FEATURES) {
        return;
    }

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto out;
    }

    /* Supported features: map each kernel SUPPORTED_* bit onto the
     * corresponding OpenFlow NETDEV_F_* bit. */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    }
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Half) {
        netdev->supported |= NETDEV_F_100MB_HD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
        netdev->supported |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
        netdev->supported |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
        netdev->supported |= NETDEV_F_40GB_FD;
    }
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    }
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    }
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    }
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    }
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    }

    /* Advertised features: same mapping, from the ADVERTISED_* bits. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
        netdev->advertised |= NETDEV_F_40GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    }
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    }
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    }
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    }
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    }

    /* Current settings.  The 40G/100G/1T speeds are compared against numeric
     * literals, presumably because the SPEED_* macros for them are missing
     * from older kernel headers -- TODO confirm against minimum supported
     * kernel. */
    speed = ethtool_cmd_speed(&ecmd);
    if (speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    } else if (speed == 40000) {
        netdev->current = NETDEV_F_40GB_FD;
    } else if (speed == 100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (speed == 1000000) {
        netdev->current = NETDEV_F_1TB_FD;
    } else {
        netdev->current = 0;
    }

    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
    }

    if (ecmd.autoneg) {
        netdev->current |= NETDEV_F_AUTONEG;
    }

out:
    /* Cache even a failed probe; 'get_features_error' reports the outcome. */
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
}
1946
1947 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1948 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1949 * Returns 0 if successful, otherwise a positive errno value. */
1950 static int
1951 netdev_linux_get_features(const struct netdev *netdev_,
1952 enum netdev_features *current,
1953 enum netdev_features *advertised,
1954 enum netdev_features *supported,
1955 enum netdev_features *peer)
1956 {
1957 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1958 int error;
1959
1960 ovs_mutex_lock(&netdev->mutex);
1961 netdev_linux_read_features(netdev);
1962 if (!netdev->get_features_error) {
1963 *current = netdev->current;
1964 *advertised = netdev->advertised;
1965 *supported = netdev->supported;
1966 *peer = 0; /* XXX */
1967 }
1968 error = netdev->get_features_error;
1969 ovs_mutex_unlock(&netdev->mutex);
1970
1971 return error;
1972 }
1973
/* Set the features advertised by 'netdev' to 'advertise'.
 *
 * Implemented as a read-modify-write of the device's ethtool settings: fetch
 * the current ETHTOOL_GSET state, replace only the 'advertising' bitmap, and
 * write it back with ETHTOOL_SSET.
 *
 * NOTE(review): only bits up to ADVERTISED_10000baseT_Full are mapped here,
 * so NETDEV_F_40GB_FD and faster requests are silently dropped even though
 * netdev_linux_read_features() reports them -- confirm whether this gap is
 * intentional. */
static int
netdev_linux_set_advertisements(struct netdev *netdev_,
                                enum netdev_features advertise)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ethtool_cmd ecmd;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto exit;
    }

    /* Translate NETDEV_F_* bits into the kernel's ADVERTISED_* bits. */
    ecmd.advertising = 0;
    if (advertise & NETDEV_F_10MB_HD) {
        ecmd.advertising |= ADVERTISED_10baseT_Half;
    }
    if (advertise & NETDEV_F_10MB_FD) {
        ecmd.advertising |= ADVERTISED_10baseT_Full;
    }
    if (advertise & NETDEV_F_100MB_HD) {
        ecmd.advertising |= ADVERTISED_100baseT_Half;
    }
    if (advertise & NETDEV_F_100MB_FD) {
        ecmd.advertising |= ADVERTISED_100baseT_Full;
    }
    if (advertise & NETDEV_F_1GB_HD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Half;
    }
    if (advertise & NETDEV_F_1GB_FD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Full;
    }
    if (advertise & NETDEV_F_10GB_FD) {
        ecmd.advertising |= ADVERTISED_10000baseT_Full;
    }
    if (advertise & NETDEV_F_COPPER) {
        ecmd.advertising |= ADVERTISED_TP;
    }
    if (advertise & NETDEV_F_FIBER) {
        ecmd.advertising |= ADVERTISED_FIBRE;
    }
    if (advertise & NETDEV_F_AUTONEG) {
        ecmd.advertising |= ADVERTISED_Autoneg;
    }
    if (advertise & NETDEV_F_PAUSE) {
        ecmd.advertising |= ADVERTISED_Pause;
    }
    if (advertise & NETDEV_F_PAUSE_ASYM) {
        ecmd.advertising |= ADVERTISED_Asym_Pause;
    }
    COVERAGE_INC(netdev_set_ethtool);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_SSET, "ETHTOOL_SSET");

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2038
/* Attempts to set input rate limiting (policing) policy.  Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * The result (including certain failures, see below) is cached under
 * VALID_POLICING so that repeated calls with unchanged parameters do not
 * touch the kernel again. */
static int
netdev_linux_set_policing(struct netdev *netdev_,
                          uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int error;

    kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
                   : kbits_burst); /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev->cache_valid & VALID_POLICING) {
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kbits_burst == kbits_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        /* Parameters changed: invalidate the cache and reconfigure. */
        netdev->cache_valid &= ~VALID_POLICING;
    }

    COVERAGE_INC(netdev_set_policing);
    /* Remove any existing ingress qdisc. */
    error = tc_add_del_ingress_qdisc(netdev_, false);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate) {
        /* Policing is implemented as an ingress qdisc with a policer action
         * attached to it; both must be installed. */
        error = tc_add_del_ingress_qdisc(netdev_, true);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }
    }

    netdev->kbits_rate = kbits_rate;
    netdev->kbits_burst = kbits_burst;

out:
    /* Cache success, and also cache ENODEV: the device is gone, so retrying
     * the same request cannot succeed.  Other errors are not cached so a
     * later call retries from scratch. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2100
2101 static int
2102 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2103 struct sset *types)
2104 {
2105 const struct tc_ops *const *opsp;
2106 for (opsp = tcs; *opsp != NULL; opsp++) {
2107 const struct tc_ops *ops = *opsp;
2108 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2109 sset_add(types, ops->ovs_name);
2110 }
2111 }
2112 return 0;
2113 }
2114
2115 static const struct tc_ops *
2116 tc_lookup_ovs_name(const char *name)
2117 {
2118 const struct tc_ops *const *opsp;
2119
2120 for (opsp = tcs; *opsp != NULL; opsp++) {
2121 const struct tc_ops *ops = *opsp;
2122 if (!strcmp(name, ops->ovs_name)) {
2123 return ops;
2124 }
2125 }
2126 return NULL;
2127 }
2128
2129 static const struct tc_ops *
2130 tc_lookup_linux_name(const char *name)
2131 {
2132 const struct tc_ops *const *opsp;
2133
2134 for (opsp = tcs; *opsp != NULL; opsp++) {
2135 const struct tc_ops *ops = *opsp;
2136 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2137 return ops;
2138 }
2139 }
2140 return NULL;
2141 }
2142
2143 static struct tc_queue *
2144 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2145 size_t hash)
2146 {
2147 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2148 struct tc_queue *queue;
2149
2150 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2151 if (queue->queue_id == queue_id) {
2152 return queue;
2153 }
2154 }
2155 return NULL;
2156 }
2157
/* Convenience wrapper around tc_find_queue__() that computes the hash for
 * 'queue_id' itself. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
2163
2164 static int
2165 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2166 const char *type,
2167 struct netdev_qos_capabilities *caps)
2168 {
2169 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2170 if (!ops) {
2171 return EOPNOTSUPP;
2172 }
2173 caps->n_queues = ops->n_queues;
2174 return 0;
2175 }
2176
2177 static int
2178 netdev_linux_get_qos(const struct netdev *netdev_,
2179 const char **typep, struct smap *details)
2180 {
2181 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2182 int error;
2183
2184 ovs_mutex_lock(&netdev->mutex);
2185 error = tc_query_qdisc(netdev_);
2186 if (!error) {
2187 *typep = netdev->tc->ops->ovs_name;
2188 error = (netdev->tc->ops->qdisc_get
2189 ? netdev->tc->ops->qdisc_get(netdev_, details)
2190 : 0);
2191 }
2192 ovs_mutex_unlock(&netdev->mutex);
2193
2194 return error;
2195 }
2196
/* Installs QoS type 'type' (one of the ovs_names in 'tcs') on 'netdev_',
 * configured from 'details'.  Returns 0 if successful, otherwise a positive
 * errno value (EOPNOTSUPP if 'type' is unknown or not installable). */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    /* The "linux-noop" type never touches the kernel, so it can be installed
     * directly without querying or replacing the current qdisc. */
    if (new_ops == &tc_ops_noop) {
        return new_ops->tc_install(netdev_, details);
    }

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same discipline already installed: just update its parameters
         * (types without a qdisc_set hook treat this as a no-op success). */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            goto exit;
        }
        /* tc_del_qdisc() is expected to clear netdev->tc on success. */
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc.  A successful tc_install must leave netdev->tc
         * set; a failed one must leave it NULL. */
        error = new_ops->tc_install(netdev_, details);
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2239
2240 static int
2241 netdev_linux_get_queue(const struct netdev *netdev_,
2242 unsigned int queue_id, struct smap *details)
2243 {
2244 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2245 int error;
2246
2247 ovs_mutex_lock(&netdev->mutex);
2248 error = tc_query_qdisc(netdev_);
2249 if (!error) {
2250 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2251 error = (queue
2252 ? netdev->tc->ops->class_get(netdev_, queue, details)
2253 : ENOENT);
2254 }
2255 ovs_mutex_unlock(&netdev->mutex);
2256
2257 return error;
2258 }
2259
2260 static int
2261 netdev_linux_set_queue(struct netdev *netdev_,
2262 unsigned int queue_id, const struct smap *details)
2263 {
2264 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2265 int error;
2266
2267 ovs_mutex_lock(&netdev->mutex);
2268 error = tc_query_qdisc(netdev_);
2269 if (!error) {
2270 error = (queue_id < netdev->tc->ops->n_queues
2271 && netdev->tc->ops->class_set
2272 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2273 : EINVAL);
2274 }
2275 ovs_mutex_unlock(&netdev->mutex);
2276
2277 return error;
2278 }
2279
2280 static int
2281 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2282 {
2283 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2284 int error;
2285
2286 ovs_mutex_lock(&netdev->mutex);
2287 error = tc_query_qdisc(netdev_);
2288 if (!error) {
2289 if (netdev->tc->ops->class_delete) {
2290 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2291 error = (queue
2292 ? netdev->tc->ops->class_delete(netdev_, queue)
2293 : ENOENT);
2294 } else {
2295 error = EINVAL;
2296 }
2297 }
2298 ovs_mutex_unlock(&netdev->mutex);
2299
2300 return error;
2301 }
2302
2303 static int
2304 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2305 unsigned int queue_id,
2306 struct netdev_queue_stats *stats)
2307 {
2308 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2309 int error;
2310
2311 ovs_mutex_lock(&netdev->mutex);
2312 error = tc_query_qdisc(netdev_);
2313 if (!error) {
2314 if (netdev->tc->ops->class_get_stats) {
2315 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2316 if (queue) {
2317 stats->created = queue->created;
2318 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2319 stats);
2320 } else {
2321 error = ENOENT;
2322 }
2323 } else {
2324 error = EOPNOTSUPP;
2325 }
2326 }
2327 ovs_mutex_unlock(&netdev->mutex);
2328
2329 return error;
2330 }
2331
/* State for an in-progress rtnetlink dump of traffic classes (queues).
 * 'dump' tracks the netlink dump itself; 'buf' holds one batch of replies. */
struct queue_dump_state {
    struct nl_dump dump;
    struct ofpbuf buf;
};
2336
/* Begins a rtnetlink dump of all traffic classes on 'netdev', initializing
 * 'state'.  Returns true on success, false if the request could not be built
 * (tc_make_request() failed).  On success the caller must eventually call
 * finish_queue_dump() on 'state'. */
static bool
start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
    if (!tcmsg) {
        return false;
    }
    /* tcm_parent = 0: presumably requests classes under any parent -- confirm
     * against rtnetlink semantics. */
    tcmsg->tcm_parent = 0;
    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
    /* nl_dump_start() copies the request, so it can be freed immediately. */
    ofpbuf_uninit(&request);

    ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
    return true;
}
2354
2355 static int
2356 finish_queue_dump(struct queue_dump_state *state)
2357 {
2358 ofpbuf_uninit(&state->buf);
2359 return nl_dump_done(&state->dump);
2360 }
2361
/* Iterator state for netdev_linux_queue_dump_{start,next,done}().
 * 'queues' is a snapshot of the queue ids taken when the dump started;
 * 'cur_queue' is the index of the next id to visit, out of 'n_queues'. */
struct netdev_linux_queue_state {
    unsigned int *queues;
    size_t cur_queue;
    size_t n_queues;
};
2367
/* Begins a dump of the queues on 'netdev_'.  On success, stores iterator
 * state in '*statep' for use by netdev_linux_queue_dump_next() and returns
 * 0; otherwise returns a positive errno value (EOPNOTSUPP if the installed
 * QoS type cannot report queue configuration) and leaves '*statep'
 * untouched. */
static int
netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
{
    const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get) {
            struct netdev_linux_queue_state *state;
            struct tc_queue *queue;
            size_t i;

            *statep = state = xmalloc(sizeof *state);
            state->n_queues = hmap_count(&netdev->tc->queues);
            state->cur_queue = 0;
            /* Snapshot the queue ids now rather than walking the hmap across
             * dump_next() calls, since queues may be added or deleted in
             * between. */
            state->queues = xmalloc(state->n_queues * sizeof *state->queues);

            i = 0;
            HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
                state->queues[i++] = queue->queue_id;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2399
2400 static int
2401 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2402 unsigned int *queue_idp, struct smap *details)
2403 {
2404 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2405 struct netdev_linux_queue_state *state = state_;
2406 int error = EOF;
2407
2408 ovs_mutex_lock(&netdev->mutex);
2409 while (state->cur_queue < state->n_queues) {
2410 unsigned int queue_id = state->queues[state->cur_queue++];
2411 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2412
2413 if (queue) {
2414 *queue_idp = queue_id;
2415 error = netdev->tc->ops->class_get(netdev_, queue, details);
2416 break;
2417 }
2418 }
2419 ovs_mutex_unlock(&netdev->mutex);
2420
2421 return error;
2422 }
2423
2424 static int
2425 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2426 void *state_)
2427 {
2428 struct netdev_linux_queue_state *state = state_;
2429
2430 free(state->queues);
2431 free(state);
2432 return 0;
2433 }
2434
/* Invokes 'cb' (with 'aux') once for each queue of 'netdev_' with that
 * queue's statistics, obtained from a rtnetlink class dump.  If some queues
 * fail to parse, the dump continues and the last such error is returned.
 * Returns 0 if fully successful, otherwise a positive errno value. */
static int
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
                              netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct queue_dump_state state;

        if (!netdev->tc->ops->class_dump_stats) {
            error = EOPNOTSUPP;
        } else if (!start_queue_dump(netdev_, &state)) {
            error = ENODEV;
        } else {
            struct ofpbuf msg;
            int retval;

            while (nl_dump_next(&state.dump, &msg, &state.buf)) {
                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
                                                           cb, aux);
                if (retval) {
                    /* Remember the failure but keep dumping; later errors
                     * overwrite earlier ones. */
                    error = retval;
                }
            }

            /* The dump itself can fail even if every message parsed. */
            retval = finish_queue_dump(&state);
            if (retval) {
                error = retval;
            }
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2473
2474 static int
2475 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2476 struct in_addr netmask)
2477 {
2478 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2479 int error;
2480
2481 ovs_mutex_lock(&netdev->mutex);
2482 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2483 if (!error) {
2484 if (address.s_addr != INADDR_ANY) {
2485 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2486 "SIOCSIFNETMASK", netmask);
2487 }
2488 }
2489
2490 ovs_mutex_unlock(&netdev->mutex);
2491
2492 return error;
2493 }
2494
/* Retrieves the addresses assigned to 'netdev_' via netdev_get_addrs(),
 * storing the address and netmask arrays in '*addr' and '*mask' and their
 * count in '*n_cnt'.  Returns 0 if successful, otherwise a positive errno
 * value.  (Presumably the caller takes ownership of the returned arrays and
 * must free them -- confirm against netdev_get_addrs().) */
static int
netdev_linux_get_addr_list(const struct netdev *netdev_,
                          struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2511
/* Initializes 'sa' as an AF_INET socket address for 'addr' with port 0.  Any
 * bytes of the generic sockaddr beyond the sockaddr_in are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_port = 0,
        .sin_addr = addr,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2524
2525 static int
2526 do_set_addr(struct netdev *netdev,
2527 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2528 {
2529 struct ifreq ifr;
2530
2531 make_in4_sockaddr(&ifr.ifr_addr, addr);
2532 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2533 ioctl_name);
2534 }
2535
2536 /* Adds 'router' as a default IP gateway. */
2537 static int
2538 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2539 {
2540 struct in_addr any = { INADDR_ANY };
2541 struct rtentry rt;
2542 int error;
2543
2544 memset(&rt, 0, sizeof rt);
2545 make_in4_sockaddr(&rt.rt_dst, any);
2546 make_in4_sockaddr(&rt.rt_gateway, router);
2547 make_in4_sockaddr(&rt.rt_genmask, any);
2548 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2549 error = af_inet_ioctl(SIOCADDRT, &rt);
2550 if (error) {
2551 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2552 }
2553 return error;
2554 }
2555
/* Looks up the IPv4 next hop for 'host' by scanning the kernel routing table
 * exposed at /proc/net/route.  On success, returns 0, stores the gateway in
 * '*next_hop' (0 if 'host' is directly reachable), and stores the output
 * interface name in '*netdev_name' (malloc'd; the caller must free it).
 * Returns ENXIO if no route matches, or another positive errno value if the
 * table cannot be read. */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        if (++ln >= 2) {
            /* Line 1 is the column-header row; data starts at line 2. */
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                        fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* The output of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need any endian conversions
             * here.
             *
             * Note: the first matching route wins; /proc/net/route ordering
             * is relied on here rather than an explicit longest-prefix
             * comparison. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    fclose(stream);
    return ENXIO;
}
2615
/* Fills 'smap' with driver information for 'netdev_' ("driver_name",
 * "driver_version", "firmware_version"), obtained via ETHTOOL_GDRVINFO and
 * cached under VALID_DRVINFO.  Returns 0 if successful, otherwise a positive
 * errno value. */
static int
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* netdev_linux_do_ethtool() takes a 'struct ethtool_cmd *', so the
         * drvinfo buffer is funneled through that pointer type; the kernel
         * interprets the buffer as 'struct ethtool_drvinfo' based on the
         * ETHTOOL_GDRVINFO command. */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        cmd,
                                        ETHTOOL_GDRVINFO,
                                        "ETHTOOL_GDRVINFO");
        if (!error) {
            netdev->cache_valid |= VALID_DRVINFO;
        }
    }

    if (!error) {
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
        smap_add(smap, "driver_version", netdev->drvinfo.version);
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2646
/* get_status implementation for "internal" devices, which are provided by
 * Open vSwitch itself rather than a hardware driver, so a fixed driver name
 * is reported.  Always returns 0. */
static int
netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
                           struct smap *smap)
{
    smap_add(smap, "driver_name", "openvswitch");
    return 0;
}
2654
/* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
 * returns 0.  Otherwise, it returns a positive errno value; in particular,
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
static int
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, struct eth_addr *mac)
{
    struct arpreq r;
    struct sockaddr_in sin;
    int retval;

    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    sin.sin_port = 0;
    memcpy(&r.arp_pa, &sin, sizeof sin);
    r.arp_ha.sa_family = ARPHRD_ETHER;
    r.arp_flags = 0;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
    if (!retval) {
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO just means "no such entry", so only unexpected errors are
         * worth logging. */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
    }
    return retval;
}
2687
2688 static int
2689 nd_to_iff_flags(enum netdev_flags nd)
2690 {
2691 int iff = 0;
2692 if (nd & NETDEV_UP) {
2693 iff |= IFF_UP;
2694 }
2695 if (nd & NETDEV_PROMISC) {
2696 iff |= IFF_PROMISC;
2697 }
2698 if (nd & NETDEV_LOOPBACK) {
2699 iff |= IFF_LOOPBACK;
2700 }
2701 return iff;
2702 }
2703
2704 static int
2705 iff_to_nd_flags(int iff)
2706 {
2707 enum netdev_flags nd = 0;
2708 if (iff & IFF_UP) {
2709 nd |= NETDEV_UP;
2710 }
2711 if (iff & IFF_PROMISC) {
2712 nd |= NETDEV_PROMISC;
2713 }
2714 if (iff & IFF_LOOPBACK) {
2715 nd |= NETDEV_LOOPBACK;
2716 }
2717 return nd;
2718 }
2719
/* Clears the IFF_* equivalents of 'off' and sets those of 'on' in 'netdev''s
 * interface flags, storing the previous flags (as NETDEV_* bits) in
 * '*old_flagsp'.  Returns 0 if successful (or if no change was needed),
 * otherwise a positive errno value from set_flags(). */
static int
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
{
    int old_flags, new_flags;
    int error = 0;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* Re-read the flags even if set_flags() failed, so the cached value
         * reflects whatever the kernel actually holds; get_flags()'s own
         * return value is deliberately ignored here. */
        get_flags(&netdev->up, &netdev->ifi_flags);
    }

    return error;
}
2738
2739 static int
2740 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2741 enum netdev_flags on, enum netdev_flags *old_flagsp)
2742 {
2743 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2744 int error;
2745
2746 ovs_mutex_lock(&netdev->mutex);
2747 error = update_flags(netdev, off, on, old_flagsp);
2748 ovs_mutex_unlock(&netdev->mutex);
2749
2750 return error;
2751 }
2752
/* Expands to a 'struct netdev_class' initializer for a Linux-backed netdev
 * type named NAME.  The initializer is positional, so the entries below must
 * stay in exactly the order that the provider interface declares its
 * members; CONSTRUCT, GET_STATS, GET_FEATURES, and GET_STATUS are the only
 * hooks that differ among the "system", "tap", and "internal" classes
 * defined right after this macro. */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
    false,                      /* is_pmd */                    \
                                                                \
    NULL,                                                       \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    netdev_linux_alloc,                                         \
    CONSTRUCT,                                                  \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
    NULL,                       /* build header */              \
    NULL,                       /* push header */               \
    NULL,                       /* pop header */                \
    NULL,                       /* get_numa_id */               \
    NULL,                       /* set_tx_multiq */             \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_addr_list,                                 \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
    NULL,                       /* reconfigure */               \
                                                                \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \
}
2824
/* Ordinary kernel network devices. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_construct,
        netdev_linux_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);

/* Kernel TAP devices (see netdev_linux_construct_tap()). */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_construct_tap,
        netdev_tap_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);

/* "internal" devices: link features are not meaningful for them, so
 * get_features is NULL and status reports a fixed driver name. */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_construct,
        netdev_internal_get_stats,
        NULL,                  /* get_features */
        netdev_internal_get_status);
2848 \f
2849
/* CoDel traffic control class (classless: CODEL_N_QUEUES is 0). */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET 1
#define TCA_CODEL_LIMIT 2
#define TCA_CODEL_INTERVAL 3

/* Configuration of an installed CoDel qdisc. */
struct codel {
    struct tc tc;       /* Generic tc state; codel_get__() recovers this
                         * struct from 'netdev->tc' via CONTAINER_OF(). */
    uint32_t target;    /* "target" option; 5000 when left unset. */
    uint32_t limit;     /* "limit" option; 10240 when left unset. */
    uint32_t interval;  /* "interval" option; 100000 when left unset. */
};
2866
2867 static struct codel *
2868 codel_get__(const struct netdev *netdev_)
2869 {
2870 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2871 return CONTAINER_OF(netdev->tc, struct codel, tc);
2872 }
2873
2874 static void
2875 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2876 uint32_t interval)
2877 {
2878 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2879 struct codel *codel;
2880
2881 codel = xmalloc(sizeof *codel);
2882 tc_init(&codel->tc, &tc_ops_codel);
2883 codel->target = target;
2884 codel->limit = limit;
2885 codel->interval = interval;
2886
2887 netdev->tc = &codel->tc;
2888 }
2889
/* Replaces the root qdisc on 'netdev' with a CoDel qdisc configured with
 * 'target', 'limit', and 'interval' (0 selects the defaults: 5000, 10240,
 * and 100000 respectively).  Returns 0 if successful, otherwise a positive
 * errno value. */
static int
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                    uint32_t interval)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval;
    int error;

    /* Remove whatever qdisc is currently installed. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Substitute defaults for unset (zero) parameters. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;

    nl_msg_put_string(&request, TCA_KIND, "codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval,
                     error, ovs_strerror(error));
    }
    return error;
}
2931
2932 static void
2933 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2934 const struct smap *details, struct codel *codel)
2935 {
2936 const char *target_s;
2937 const char *limit_s;
2938 const char *interval_s;
2939
2940 target_s = smap_get(details, "target");
2941 limit_s = smap_get(details, "limit");
2942 interval_s = smap_get(details, "interval");
2943
2944 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2945 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2946 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2947
2948 if (!codel->target) {
2949 codel->target = 5000;
2950 }
2951 if (!codel->limit) {
2952 codel->limit = 10240;
2953 }
2954 if (!codel->interval) {
2955 codel->interval = 100000;
2956 }
2957 }
2958
2959 static int
2960 codel_tc_install(struct netdev *netdev, const struct smap *details)
2961 {
2962 int error;
2963 struct codel codel;
2964
2965 codel_parse_qdisc_details__(netdev, details, &codel);
2966 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2967 codel.interval);
2968 if (!error) {
2969 codel_install__(netdev, codel.target, codel.limit, codel.interval);
2970 }
2971 return error;
2972 }
2973
/* Parses the nested TCA_OPTIONS attributes of a CoDel qdisc message into
 * 'codel'.  Returns 0 if successful, otherwise EPROTO. */
static int
codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
{
    /* All three attributes are mandatory u32s. */
    static const struct nl_policy tca_codel_policy[] = {
        [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
        [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
        [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];

    if (!nl_parse_nested(nl_options, tca_codel_policy,
                         attrs, ARRAY_SIZE(tca_codel_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
        return EPROTO;
    }

    codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
    codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
    codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
    return 0;
}
2996
2997 static int
2998 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
2999 {
3000 struct nlattr *nlattr;
3001 const char * kind;
3002 int error;
3003 struct codel codel;
3004
3005 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3006 if (error != 0) {
3007 return error;
3008 }
3009
3010 error = codel_parse_tca_options__(nlattr, &codel);
3011 if (error != 0) {
3012 return error;
3013 }
3014
3015 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3016 return 0;
3017 }
3018
3019
3020 static void
3021 codel_tc_destroy(struct tc *tc)
3022 {
3023 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3024 tc_destroy(tc);
3025 free(codel);
3026 }
3027
3028 static int
3029 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3030 {
3031 const struct codel *codel = codel_get__(netdev);
3032 smap_add_format(details, "target", "%u", codel->target);
3033 smap_add_format(details, "limit", "%u", codel->limit);
3034 smap_add_format(details, "interval", "%u", codel->interval);
3035 return 0;
3036 }
3037
3038 static int
3039 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3040 {
3041 struct codel codel;
3042
3043 codel_parse_qdisc_details__(netdev, details, &codel);
3044 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3045 codel_get__(netdev)->target = codel.target;
3046 codel_get__(netdev)->limit = codel.limit;
3047 codel_get__(netdev)->interval = codel.interval;
3048 return 0;
3049 }
3050
static const struct tc_ops tc_ops_codel = {
    "codel",                      /* linux_name */
    "linux-codel",                /* ovs_name */
    CODEL_N_QUEUES,               /* n_queues */
    codel_tc_install,
    codel_tc_load,
    codel_tc_destroy,
    codel_qdisc_get,
    codel_qdisc_set,
    NULL,   /* CoDel is classless, so the remaining per-class hooks
             * (class_get/set/delete and the stats callbacks; see
             * 'struct tc_ops') are left unimplemented. */
    NULL,
    NULL,
    NULL,
    NULL
};
3066 \f
/* FQ-CoDel traffic control class (classless: FQCODEL_N_QUEUES is 0). */

#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET 1
#define TCA_FQ_CODEL_LIMIT 2
#define TCA_FQ_CODEL_INTERVAL 3
#define TCA_FQ_CODEL_ECN 4
#define TCA_FQ_CODEL_FLOWS 5
#define TCA_FQ_CODEL_QUANTUM 6

/* Configuration of an installed FQ-CoDel qdisc. */
struct fqcodel {
    struct tc tc;       /* Generic tc state; fqcodel_get__() recovers this
                         * struct from 'netdev->tc' via CONTAINER_OF(). */
    uint32_t target;    /* "target" option; 5000 when left unset. */
    uint32_t limit;     /* "limit" option; 10240 when left unset. */
    uint32_t interval;  /* "interval" option.  NOTE(review): the default is
                         * 100000 in fqcodel_setup_qdisc__() but 1000000 in
                         * fqcodel_parse_qdisc_details__(); these disagree --
                         * confirm which is intended. */
    uint32_t flows;     /* "flows" option; 1024 when left unset. */
    uint32_t quantum;   /* "quantum" option; 1514 when left unset. */
};
3090
3091 static struct fqcodel *
3092 fqcodel_get__(const struct netdev *netdev_)
3093 {
3094 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3095 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3096 }
3097
3098 static void
3099 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3100 uint32_t interval, uint32_t flows, uint32_t quantum)
3101 {
3102 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3103 struct fqcodel *fqcodel;
3104
3105 fqcodel = xmalloc(sizeof *fqcodel);
3106 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3107 fqcodel->target = target;
3108 fqcodel->limit = limit;
3109 fqcodel->interval = interval;
3110 fqcodel->flows = flows;
3111 fqcodel->quantum = quantum;
3112
3113 netdev->tc = &fqcodel->tc;
3114 }
3115
/* Replaces the root qdisc on 'netdev' with an FQ-CoDel qdisc configured with
 * the given parameters (0 selects the defaults: target 5000, limit 10240,
 * interval 100000, flows 1024, quantum 1514).  Returns 0 if successful,
 * otherwise a positive errno value. */
static int
fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                      uint32_t interval, uint32_t flows, uint32_t quantum)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval, oflows,  oquantum;
    int error;

    /* Remove whatever qdisc is currently installed. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Substitute defaults for unset (zero) parameters. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;
    oflows = flows ? flows : 1024;
    oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
                                            not mtu */

    nl_msg_put_string(&request, TCA_KIND, "fq_codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval, oflows, oquantum,
                     error, ovs_strerror(error));
    }
    return error;
}
3162
3163 static void
3164 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3165 const struct smap *details, struct fqcodel *fqcodel)
3166 {
3167 const char *target_s;
3168 const char *limit_s;
3169 const char *interval_s;
3170 const char *flows_s;
3171 const char *quantum_s;
3172
3173 target_s = smap_get(details, "target");
3174 limit_s = smap_get(details, "limit");
3175 interval_s = smap_get(details, "interval");
3176 flows_s = smap_get(details, "flows");
3177 quantum_s = smap_get(details, "quantum");
3178 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3179 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3180 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3181 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3182 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3183 if (!fqcodel->target) {
3184 fqcodel->target = 5000;
3185 }
3186 if (!fqcodel->limit) {
3187 fqcodel->limit = 10240;
3188 }
3189 if (!fqcodel->interval) {
3190 fqcodel->interval = 1000000;
3191 }
3192 if (!fqcodel->flows) {
3193 fqcodel->flows = 1024;
3194 }
3195 if (!fqcodel->quantum) {
3196 fqcodel->quantum = 1514;
3197 }
3198 }
3199
3200 static int
3201 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3202 {
3203 int error;
3204 struct fqcodel fqcodel;
3205
3206 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3207 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3208 fqcodel.interval, fqcodel.flows,
3209 fqcodel.quantum);
3210 if (!error) {
3211 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3212 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3213 }
3214 return error;
3215 }
3216
3217 static int
3218 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3219 {
3220 static const struct nl_policy tca_fqcodel_policy[] = {
3221 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3222 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3223 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3224 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3225 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3226 };
3227
3228 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3229
3230 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3231 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3232 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3233 return EPROTO;
3234 }
3235
3236 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3237 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3238 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3239 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3240 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3241 return 0;
3242 }
3243
3244 static int
3245 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3246 {
3247 struct nlattr *nlattr;
3248 const char * kind;
3249 int error;
3250 struct fqcodel fqcodel;
3251
3252 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3253 if (error != 0) {
3254 return error;
3255 }
3256
3257 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3258 if (error != 0) {
3259 return error;
3260 }
3261
3262 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3263 fqcodel.flows, fqcodel.quantum);
3264 return 0;
3265 }
3266
3267 static void
3268 fqcodel_tc_destroy(struct tc *tc)
3269 {
3270 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3271 tc_destroy(tc);
3272 free(fqcodel);
3273 }
3274
3275 static int
3276 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3277 {
3278 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3279 smap_add_format(details, "target", "%u", fqcodel->target);
3280 smap_add_format(details, "limit", "%u", fqcodel->limit);
3281 smap_add_format(details, "interval", "%u", fqcodel->interval);
3282 smap_add_format(details, "flows", "%u", fqcodel->flows);
3283 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3284 return 0;
3285 }
3286
3287 static int
3288 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3289 {
3290 struct fqcodel fqcodel;
3291
3292 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3293 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3294 fqcodel.flows, fqcodel.quantum);
3295 fqcodel_get__(netdev)->target = fqcodel.target;
3296 fqcodel_get__(netdev)->limit = fqcodel.limit;
3297 fqcodel_get__(netdev)->interval = fqcodel.interval;
3298 fqcodel_get__(netdev)->flows = fqcodel.flows;
3299 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3300 return 0;
3301 }
3302
/* tc_ops for the "linux-fq_codel" qdisc.  fq_codel is classless from OVS's
 * point of view, so all per-class operations are left NULL (slot names below
 * follow the fully-populated tc_ops_htb table). */
static const struct tc_ops tc_ops_fqcodel = {
    "fq_codel",                 /* linux_name */
    "linux-fq_codel",           /* ovs_name */
    FQCODEL_N_QUEUES,           /* n_queues */
    fqcodel_tc_install,
    fqcodel_tc_load,
    fqcodel_tc_destroy,
    fqcodel_qdisc_get,
    fqcodel_qdisc_set,
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
3318 \f
3319 /* SFQ traffic control class. */
3320
3321 #define SFQ_N_QUEUES 0x0000
3322
/* State for a "linux-sfq" root qdisc; mirrors the 'quantum' and
 * 'perturb_period' fields of the kernel's struct tc_sfq_qopt. */
struct sfq {
    struct tc tc;
    uint32_t quantum;           /* Bytes per round (defaults to the MTU). */
    uint32_t perturb;           /* Hash perturbation period (default 10). */
};
3328
3329 static struct sfq *
3330 sfq_get__(const struct netdev *netdev_)
3331 {
3332 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3333 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3334 }
3335
3336 static void
3337 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3338 {
3339 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3340 struct sfq *sfq;
3341
3342 sfq = xmalloc(sizeof *sfq);
3343 tc_init(&sfq->tc, &tc_ops_sfq);
3344 sfq->perturb = perturb;
3345 sfq->quantum = quantum;
3346
3347 netdev->tc = &sfq->tc;
3348 }
3349
3350 static int
3351 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3352 {
3353 struct tc_sfq_qopt opt;
3354 struct ofpbuf request;
3355 struct tcmsg *tcmsg;
3356 int mtu;
3357 int mtu_error, error;
3358 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3359
3360 tc_del_qdisc(netdev);
3361
3362 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3363 NLM_F_EXCL | NLM_F_CREATE, &request);
3364 if (!tcmsg) {
3365 return ENODEV;
3366 }
3367 tcmsg->tcm_handle = tc_make_handle(1, 0);
3368 tcmsg->tcm_parent = TC_H_ROOT;
3369
3370 memset(&opt, 0, sizeof opt);
3371 if (!quantum) {
3372 if (!mtu_error) {
3373 opt.quantum = mtu; /* if we cannot find mtu, use default */
3374 }
3375 } else {
3376 opt.quantum = quantum;
3377 }
3378
3379 if (!perturb) {
3380 opt.perturb_period = 10;
3381 } else {
3382 opt.perturb_period = perturb;
3383 }
3384
3385 nl_msg_put_string(&request, TCA_KIND, "sfq");
3386 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3387
3388 error = tc_transact(&request, NULL);
3389 if (error) {
3390 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3391 "quantum %u, perturb %u error %d(%s)",
3392 netdev_get_name(netdev),
3393 opt.quantum, opt.perturb_period,
3394 error, ovs_strerror(error));
3395 }
3396 return error;
3397 }
3398
3399 static void
3400 sfq_parse_qdisc_details__(struct netdev *netdev,
3401 const struct smap *details, struct sfq *sfq)
3402 {
3403 const char *perturb_s;
3404 const char *quantum_s;
3405 int mtu;
3406 int mtu_error;
3407
3408 perturb_s = smap_get(details, "perturb");
3409 quantum_s = smap_get(details, "quantum");
3410 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3411 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3412 if (!sfq->perturb) {
3413 sfq->perturb = 10;
3414 }
3415
3416 if (!sfq->quantum) {
3417 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3418 if (!mtu_error) {
3419 sfq->quantum = mtu;
3420 } else {
3421 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3422 "device without mtu");
3423 return;
3424 }
3425 }
3426 }
3427
3428 static int
3429 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3430 {
3431 int error;
3432 struct sfq sfq;
3433
3434 sfq_parse_qdisc_details__(netdev, details, &sfq);
3435 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3436 if (!error) {
3437 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3438 }
3439 return error;
3440 }
3441
3442 static int
3443 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3444 {
3445 const struct tc_sfq_qopt *sfq;
3446 struct nlattr *nlattr;
3447 const char * kind;
3448 int error;
3449
3450 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3451 if (error == 0) {
3452 sfq = nl_attr_get(nlattr);
3453 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
3454 return 0;
3455 }
3456
3457 return error;
3458 }
3459
3460 static void
3461 sfq_tc_destroy(struct tc *tc)
3462 {
3463 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3464 tc_destroy(tc);
3465 free(sfq);
3466 }
3467
3468 static int
3469 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3470 {
3471 const struct sfq *sfq = sfq_get__(netdev);
3472 smap_add_format(details, "quantum", "%u", sfq->quantum);
3473 smap_add_format(details, "perturb", "%u", sfq->perturb);
3474 return 0;
3475 }
3476
3477 static int
3478 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3479 {
3480 struct sfq sfq;
3481
3482 sfq_parse_qdisc_details__(netdev, details, &sfq);
3483 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3484 sfq_get__(netdev)->quantum = sfq.quantum;
3485 sfq_get__(netdev)->perturb = sfq.perturb;
3486 return 0;
3487 }
3488
/* tc_ops for the "linux-sfq" qdisc.  SFQ is classless from OVS's point of
 * view, so all per-class operations are left NULL (slot names below follow
 * the fully-populated tc_ops_htb table). */
static const struct tc_ops tc_ops_sfq = {
    "sfq",                      /* linux_name */
    "linux-sfq",                /* ovs_name */
    SFQ_N_QUEUES,               /* n_queues */
    sfq_tc_install,
    sfq_tc_load,
    sfq_tc_destroy,
    sfq_qdisc_get,
    sfq_qdisc_set,
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
3504 \f
3505 /* HTB traffic control class. */
3506
3507 #define HTB_N_QUEUES 0xf000
3508 #define HTB_RATE2QUANTUM 10
3509
/* State for a "linux-htb" root qdisc. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};
3514
/* One HTB class, i.e. one OVS queue.  Values are parsed from queue details
 * and clamped in htb_parse_class_details__(). */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
3522
3523 static struct htb *
3524 htb_get__(const struct netdev *netdev_)
3525 {
3526 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3527 return CONTAINER_OF(netdev->tc, struct htb, tc);
3528 }
3529
3530 static void
3531 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3532 {
3533 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3534 struct htb *htb;
3535
3536 htb = xmalloc(sizeof *htb);
3537 tc_init(&htb->tc, &tc_ops_htb);
3538 htb->max_rate = max_rate;
3539
3540 netdev->tc = &htb->tc;
3541 }
3542
3543 /* Create an HTB qdisc.
3544 *
3545 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3546 static int
3547 htb_setup_qdisc__(struct netdev *netdev)
3548 {
3549 size_t opt_offset;
3550 struct tc_htb_glob opt;
3551 struct ofpbuf request;
3552 struct tcmsg *tcmsg;
3553
3554 tc_del_qdisc(netdev);
3555
3556 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3557 NLM_F_EXCL | NLM_F_CREATE, &request);
3558 if (!tcmsg) {
3559 return ENODEV;
3560 }
3561 tcmsg->tcm_handle = tc_make_handle(1, 0);
3562 tcmsg->tcm_parent = TC_H_ROOT;
3563
3564 nl_msg_put_string(&request, TCA_KIND, "htb");
3565
3566 memset(&opt, 0, sizeof opt);
3567 opt.rate2quantum = HTB_RATE2QUANTUM;
3568 opt.version = 3;
3569 opt.defcls = 1;
3570
3571 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3572 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3573 nl_msg_end_nested(&request, opt_offset);
3574
3575 return tc_transact(&request, NULL);
3576 }
3577
3578 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3579 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3580 static int
3581 htb_setup_class__(struct netdev *netdev, unsigned int handle,
3582 unsigned int parent, struct htb_class *class)
3583 {
3584 size_t opt_offset;
3585 struct tc_htb_opt opt;
3586 struct ofpbuf request;
3587 struct tcmsg *tcmsg;
3588 int error;
3589 int mtu;
3590
3591 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3592 if (error) {
3593 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3594 netdev_get_name(netdev));
3595 return error;
3596 }
3597
3598 memset(&opt, 0, sizeof opt);
3599 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3600 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
3601 /* Makes sure the quantum is at least MTU. Setting quantum will
3602 * make htb ignore the r2q for this class. */
3603 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3604 opt.quantum = mtu;
3605 }
3606 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3607 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3608 opt.prio = class->priority;
3609
3610 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3611 if (!tcmsg) {
3612 return ENODEV;
3613 }
3614 tcmsg->tcm_handle = handle;
3615 tcmsg->tcm_parent = parent;
3616
3617 nl_msg_put_string(&request, TCA_KIND, "htb");
3618 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3619 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3620 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3621 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3622 nl_msg_end_nested(&request, opt_offset);
3623
3624 error = tc_transact(&request, NULL);
3625 if (error) {
3626 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3627 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3628 netdev_get_name(netdev),
3629 tc_get_major(handle), tc_get_minor(handle),
3630 tc_get_major(parent), tc_get_minor(parent),
3631 class->min_rate, class->max_rate,
3632 class->burst, class->priority, ovs_strerror(error));
3633 }
3634 return error;
3635 }
3636
3637 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3638 * description of them into 'details'. The description complies with the
3639 * specification given in the vswitch database documentation for linux-htb
3640 * queue details. */
3641 static int
3642 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3643 {
3644 static const struct nl_policy tca_htb_policy[] = {
3645 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3646 .min_len = sizeof(struct tc_htb_opt) },
3647 };
3648
3649 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3650 const struct tc_htb_opt *htb;
3651
3652 if (!nl_parse_nested(nl_options, tca_htb_policy,
3653 attrs, ARRAY_SIZE(tca_htb_policy))) {
3654 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3655 return EPROTO;
3656 }
3657
3658 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3659 class->min_rate = htb->rate.rate;
3660 class->max_rate = htb->ceil.rate;
3661 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3662 class->priority = htb->prio;
3663 return 0;
3664 }
3665
3666 static int
3667 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3668 struct htb_class *options,
3669 struct netdev_queue_stats *stats)
3670 {
3671 struct nlattr *nl_options;
3672 unsigned int handle;
3673 int error;
3674
3675 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3676 if (!error && queue_id) {
3677 unsigned int major = tc_get_major(handle);
3678 unsigned int minor = tc_get_minor(handle);
3679 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3680 *queue_id = minor - 1;
3681 } else {
3682 error = EPROTO;
3683 }
3684 }
3685 if (!error && options) {
3686 error = htb_parse_tca_options__(nl_options, options);
3687 }
3688 return error;
3689 }
3690
3691 static void
3692 htb_parse_qdisc_details__(struct netdev *netdev_,
3693 const struct smap *details, struct htb_class *hc)
3694 {
3695 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3696 const char *max_rate_s;
3697
3698 max_rate_s = smap_get(details, "max-rate");
3699 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3700 if (!hc->max_rate) {
3701 enum netdev_features current;
3702
3703 netdev_linux_read_features(netdev);
3704 current = !netdev->get_features_error ? netdev->current : 0;
3705 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3706 }
3707 hc->min_rate = hc->max_rate;
3708 hc->burst = 0;
3709 hc->priority = 0;
3710 }
3711
3712 static int
3713 htb_parse_class_details__(struct netdev *netdev,
3714 const struct smap *details, struct htb_class *hc)
3715 {
3716 const struct htb *htb = htb_get__(netdev);
3717 const char *min_rate_s = smap_get(details, "min-rate");
3718 const char *max_rate_s = smap_get(details, "max-rate");
3719 const char *burst_s = smap_get(details, "burst");
3720 const char *priority_s = smap_get(details, "priority");
3721 int mtu, error;
3722
3723 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3724 if (error) {
3725 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3726 netdev_get_name(netdev));
3727 return error;
3728 }
3729
3730 /* HTB requires at least an mtu sized min-rate to send any traffic even
3731 * on uncongested links. */
3732 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3733 hc->min_rate = MAX(hc->min_rate, mtu);
3734 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3735
3736 /* max-rate */
3737 hc->max_rate = (max_rate_s
3738 ? strtoull(max_rate_s, NULL, 10) / 8
3739 : htb->max_rate);
3740 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3741 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3742
3743 /* burst
3744 *
3745 * According to hints in the documentation that I've read, it is important
3746 * that 'burst' be at least as big as the largest frame that might be
3747 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3748 * but having it a bit too small is a problem. Since netdev_get_mtu()
3749 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3750 * the MTU. We actually add 64, instead of 14, as a guard against
3751 * additional headers get tacked on somewhere that we're not aware of. */
3752 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3753 hc->burst = MAX(hc->burst, mtu + 64);
3754
3755 /* priority */
3756 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
3757
3758 return 0;
3759 }
3760
3761 static int
3762 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3763 unsigned int parent, struct htb_class *options,
3764 struct netdev_queue_stats *stats)
3765 {
3766 struct ofpbuf *reply;
3767 int error;
3768
3769 error = tc_query_class(netdev, handle, parent, &reply);
3770 if (!error) {
3771 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3772 ofpbuf_delete(reply);
3773 }
3774 return error;
3775 }
3776
3777 static int
3778 htb_tc_install(struct netdev *netdev, const struct smap *details)
3779 {
3780 int error;
3781
3782 error = htb_setup_qdisc__(netdev);
3783 if (!error) {
3784 struct htb_class hc;
3785
3786 htb_parse_qdisc_details__(netdev, details, &hc);
3787 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3788 tc_make_handle(1, 0), &hc);
3789 if (!error) {
3790 htb_install__(netdev, hc.max_rate);
3791 }
3792 }
3793 return error;
3794 }
3795
3796 static struct htb_class *
3797 htb_class_cast__(const struct tc_queue *queue)
3798 {
3799 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3800 }
3801
3802 static void
3803 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3804 const struct htb_class *hc)
3805 {
3806 struct htb *htb = htb_get__(netdev);
3807 size_t hash = hash_int(queue_id, 0);
3808 struct tc_queue *queue;
3809 struct htb_class *hcp;
3810
3811 queue = tc_find_queue__(netdev, queue_id, hash);
3812 if (queue) {
3813 hcp = htb_class_cast__(queue);
3814 } else {
3815 hcp = xmalloc(sizeof *hcp);
3816 queue = &hcp->tc_queue;
3817 queue->queue_id = queue_id;
3818 queue->created = time_msec();
3819 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3820 }
3821
3822 hcp->min_rate = hc->min_rate;
3823 hcp->max_rate = hc->max_rate;
3824 hcp->burst = hc->burst;
3825 hcp->priority = hc->priority;
3826 }
3827
3828 static int
3829 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3830 {
3831 struct ofpbuf msg;
3832 struct queue_dump_state state;
3833 struct htb_class hc;
3834
3835 /* Get qdisc options. */
3836 hc.max_rate = 0;
3837 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3838 htb_install__(netdev, hc.max_rate);
3839
3840 /* Get queues. */
3841 if (!start_queue_dump(netdev, &state)) {
3842 return ENODEV;
3843 }
3844 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3845 unsigned int queue_id;
3846
3847 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3848 htb_update_queue__(netdev, queue_id, &hc);
3849 }
3850 }
3851 finish_queue_dump(&state);
3852
3853 return 0;
3854 }
3855
3856 static void
3857 htb_tc_destroy(struct tc *tc)
3858 {
3859 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3860 struct htb_class *hc;
3861
3862 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
3863 free(hc);
3864 }
3865 tc_destroy(tc);
3866 free(htb);
3867 }
3868
3869 static int
3870 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3871 {
3872 const struct htb *htb = htb_get__(netdev);
3873 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
3874 return 0;
3875 }
3876
3877 static int
3878 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3879 {
3880 struct htb_class hc;
3881 int error;
3882
3883 htb_parse_qdisc_details__(netdev, details, &hc);
3884 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3885 tc_make_handle(1, 0), &hc);
3886 if (!error) {
3887 htb_get__(netdev)->max_rate = hc.max_rate;
3888 }
3889 return error;
3890 }
3891
3892 static int
3893 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3894 const struct tc_queue *queue, struct smap *details)
3895 {
3896 const struct htb_class *hc = htb_class_cast__(queue);
3897
3898 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3899 if (hc->min_rate != hc->max_rate) {
3900 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3901 }
3902 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3903 if (hc->priority) {
3904 smap_add_format(details, "priority", "%u", hc->priority);
3905 }
3906 return 0;
3907 }
3908
3909 static int
3910 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3911 const struct smap *details)
3912 {
3913 struct htb_class hc;
3914 int error;
3915
3916 error = htb_parse_class_details__(netdev, details, &hc);
3917 if (error) {
3918 return error;
3919 }
3920
3921 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3922 tc_make_handle(1, 0xfffe), &hc);
3923 if (error) {
3924 return error;
3925 }
3926
3927 htb_update_queue__(netdev, queue_id, &hc);
3928 return 0;
3929 }
3930
3931 static int
3932 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3933 {
3934 struct htb_class *hc = htb_class_cast__(queue);
3935 struct htb *htb = htb_get__(netdev);
3936 int error;
3937
3938 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3939 if (!error) {
3940 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3941 free(hc);
3942 }
3943 return error;
3944 }
3945
3946 static int
3947 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3948 struct netdev_queue_stats *stats)
3949 {
3950 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3951 tc_make_handle(1, 0xfffe), NULL, stats);
3952 }
3953
3954 static int
3955 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3956 const struct ofpbuf *nlmsg,
3957 netdev_dump_queue_stats_cb *cb, void *aux)
3958 {
3959 struct netdev_queue_stats stats;
3960 unsigned int handle, major, minor;
3961 int error;
3962
3963 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3964 if (error) {
3965 return error;
3966 }
3967
3968 major = tc_get_major(handle);
3969 minor = tc_get_minor(handle);
3970 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3971 (*cb)(minor - 1, &stats, aux);
3972 }
3973 return 0;
3974 }
3975
/* tc_ops for the "linux-htb" qdisc.  HTB is classful, so every slot is
 * implemented. */
static const struct tc_ops tc_ops_htb = {
    "htb",                      /* linux_name */
    "linux-htb",                /* ovs_name */
    HTB_N_QUEUES,               /* n_queues */
    htb_tc_install,
    htb_tc_load,
    htb_tc_destroy,
    htb_qdisc_get,
    htb_qdisc_set,
    htb_class_get,
    htb_class_set,
    htb_class_delete,
    htb_class_get_stats,
    htb_class_dump_stats
};
3991 \f
3992 /* "linux-hfsc" traffic control class. */
3993
3994 #define HFSC_N_QUEUES 0xf000
3995
/* State for a "linux-hfsc" root qdisc. */
struct hfsc {
    struct tc tc;
    uint32_t max_rate;          /* In bytes/s. */
};
4000
/* One HFSC class, i.e. one OVS queue. */
struct hfsc_class {
    struct tc_queue tc_queue;
    uint32_t min_rate;          /* In bytes/s. */
    uint32_t max_rate;          /* In bytes/s. */
};
4006
4007 static struct hfsc *
4008 hfsc_get__(const struct netdev *netdev_)
4009 {
4010 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4011 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
4012 }
4013
4014 static struct hfsc_class *
4015 hfsc_class_cast__(const struct tc_queue *queue)
4016 {
4017 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4018 }
4019
4020 static void
4021 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4022 {
4023 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4024 struct hfsc *hfsc;
4025
4026 hfsc = xmalloc(sizeof *hfsc);
4027 tc_init(&hfsc->tc, &tc_ops_hfsc);
4028 hfsc->max_rate = max_rate;
4029 netdev->tc = &hfsc->tc;
4030 }
4031
4032 static void
4033 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4034 const struct hfsc_class *hc)
4035 {
4036 size_t hash;
4037 struct hfsc *hfsc;
4038 struct hfsc_class *hcp;
4039 struct tc_queue *queue;
4040
4041 hfsc = hfsc_get__(netdev);
4042 hash = hash_int(queue_id, 0);
4043
4044 queue = tc_find_queue__(netdev, queue_id, hash);
4045 if (queue) {
4046 hcp = hfsc_class_cast__(queue);
4047 } else {
4048 hcp = xmalloc(sizeof *hcp);
4049 queue = &hcp->tc_queue;
4050 queue->queue_id = queue_id;
4051 queue->created = time_msec();
4052 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4053 }
4054
4055 hcp->min_rate = hc->min_rate;
4056 hcp->max_rate = hc->max_rate;
4057 }
4058
4059 static int
4060 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4061 {
4062 const struct tc_service_curve *rsc, *fsc, *usc;
4063 static const struct nl_policy tca_hfsc_policy[] = {
4064 [TCA_HFSC_RSC] = {
4065 .type = NL_A_UNSPEC,
4066 .optional = false,
4067 .min_len = sizeof(struct tc_service_curve),
4068 },
4069 [TCA_HFSC_FSC] = {
4070 .type = NL_A_UNSPEC,
4071 .optional = false,
4072 .min_len = sizeof(struct tc_service_curve),
4073 },
4074 [TCA_HFSC_USC] = {
4075 .type = NL_A_UNSPEC,
4076 .optional = false,
4077 .min_len = sizeof(struct tc_service_curve),
4078 },
4079 };
4080 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4081
4082 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4083 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4084 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4085 return EPROTO;
4086 }
4087
4088 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4089 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4090 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4091
4092 if (rsc->m1 != 0 || rsc->d != 0 ||
4093 fsc->m1 != 0 || fsc->d != 0 ||
4094 usc->m1 != 0 || usc->d != 0) {
4095 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4096 "Non-linear service curves are not supported.");
4097 return EPROTO;
4098 }
4099
4100 if (rsc->m2 != fsc->m2) {
4101 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4102 "Real-time service curves are not supported ");
4103 return EPROTO;
4104 }
4105
4106 if (rsc->m2 > usc->m2) {
4107 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4108 "Min-rate service curve is greater than "
4109 "the max-rate service curve.");
4110 return EPROTO;
4111 }
4112
4113 class->min_rate = fsc->m2;
4114 class->max_rate = usc->m2;
4115 return 0;
4116 }
4117
4118 static int
4119 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4120 struct hfsc_class *options,
4121 struct netdev_queue_stats *stats)
4122 {
4123 int error;
4124 unsigned int handle;
4125 struct nlattr *nl_options;
4126
4127 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4128 if (error) {
4129 return error;
4130 }
4131
4132 if (queue_id) {
4133 unsigned int major, minor;
4134
4135 major = tc_get_major(handle);
4136 minor = tc_get_minor(handle);
4137 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4138 *queue_id = minor - 1;
4139 } else {
4140 return EPROTO;
4141 }
4142 }
4143
4144 if (options) {
4145 error = hfsc_parse_tca_options__(nl_options, options);
4146 }
4147
4148 return error;
4149 }
4150
4151 static int
4152 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4153 unsigned int parent, struct hfsc_class *options,
4154 struct netdev_queue_stats *stats)
4155 {
4156 int error;
4157 struct ofpbuf *reply;
4158
4159 error = tc_query_class(netdev, handle, parent, &reply);
4160 if (error) {
4161 return error;
4162 }
4163
4164 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4165 ofpbuf_delete(reply);
4166 return error;
4167 }
4168
4169 static void
4170 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4171 struct hfsc_class *class)
4172 {
4173 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4174 uint32_t max_rate;
4175 const char *max_rate_s;
4176
4177 max_rate_s = smap_get(details, "max-rate");
4178 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4179
4180 if (!max_rate) {
4181 enum netdev_features current;
4182
4183 netdev_linux_read_features(netdev);
4184 current = !netdev->get_features_error ? netdev->current : 0;
4185 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4186 }
4187
4188 class->min_rate = max_rate;
4189 class->max_rate = max_rate;
4190 }
4191
4192 static int
4193 hfsc_parse_class_details__(struct netdev *netdev,
4194 const struct smap *details,
4195 struct hfsc_class * class)
4196 {
4197 const struct hfsc *hfsc;
4198 uint32_t min_rate, max_rate;
4199 const char *min_rate_s, *max_rate_s;
4200
4201 hfsc = hfsc_get__(netdev);
4202 min_rate_s = smap_get(details, "min-rate");
4203 max_rate_s = smap_get(details, "max-rate");
4204
4205 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4206 min_rate = MAX(min_rate, 1);
4207 min_rate = MIN(min_rate, hfsc->max_rate);
4208
4209 max_rate = (max_rate_s
4210 ? strtoull(max_rate_s, NULL, 10) / 8
4211 : hfsc->max_rate);
4212 max_rate = MAX(max_rate, min_rate);
4213 max_rate = MIN(max_rate, hfsc->max_rate);
4214
4215 class->min_rate = min_rate;
4216 class->max_rate = max_rate;
4217
4218 return 0;
4219 }
4220
4221 /* Create an HFSC qdisc.
4222 *
4223 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4224 static int
4225 hfsc_setup_qdisc__(struct netdev * netdev)
4226 {
4227 struct tcmsg *tcmsg;
4228 struct ofpbuf request;
4229 struct tc_hfsc_qopt opt;
4230
4231 tc_del_qdisc(netdev);
4232
4233 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4234 NLM_F_EXCL | NLM_F_CREATE, &request);
4235
4236 if (!tcmsg) {
4237 return ENODEV;
4238 }
4239
4240 tcmsg->tcm_handle = tc_make_handle(1, 0);
4241 tcmsg->tcm_parent = TC_H_ROOT;
4242
4243 memset(&opt, 0, sizeof opt);
4244 opt.defcls = 1;
4245
4246 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4247 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4248
4249 return tc_transact(&request, NULL);
4250 }
4251
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>"
 *
 * Returns 0 if successful, otherwise a positive errno value (logging a
 * rate-limited warning on failure). */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Linear service curves: no initial burst slope ('m1') or delay ('d'),
     * just a steady-state rate 'm2' in bytes per second. */
    min.m1 = 0;
    min.d = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d = 0;
    max.m2 = class->max_rate;

    /* tc's "sc" keyword sets both the real-time (RSC) and link-sharing
     * (FSC) curves, hence 'min' is put twice; "ul" is the upper-limit
     * curve (USC). */
    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
4302
4303 static int
4304 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4305 {
4306 int error;
4307 struct hfsc_class class;
4308
4309 error = hfsc_setup_qdisc__(netdev);
4310
4311 if (error) {
4312 return error;
4313 }
4314
4315 hfsc_parse_qdisc_details__(netdev, details, &class);
4316 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4317 tc_make_handle(1, 0), &class);
4318
4319 if (error) {
4320 return error;
4321 }
4322
4323 hfsc_install__(netdev, class.max_rate);
4324 return 0;
4325 }
4326
/* Reconstructs OVS's view of an existing HFSC qdisc on 'netdev' from the
 * kernel.  Returns 0 on success, otherwise a positive errno value. */
static int
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct hfsc_class hc;

    /* The root class 1:0xfffe carries the qdisc-wide maximum rate (see
     * hfsc_tc_install()).  Errors are ignored; hc.max_rate then stays 0. */
    hc.max_rate = 0;
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    hfsc_install__(netdev, hc.max_rate);

    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }

    /* Walk the kernel's class dump, recording each class that parses as an
     * HFSC queue; unparseable classes are skipped silently. */
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            hfsc_update_queue__(netdev, queue_id, &hc);
        }
    }

    finish_queue_dump(&state);
    return 0;
}
4353
/* Frees the HFSC QoS state embedded in 'tc', including every queued class.
 * Does not touch the kernel qdisc itself. */
static void
hfsc_tc_destroy(struct tc *tc)
{
    struct hfsc *hfsc;
    struct hfsc_class *hc, *next;

    hfsc = CONTAINER_OF(tc, struct hfsc, tc);

    /* _SAFE variant: we remove nodes while iterating. */
    HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
        hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
        free(hc);
    }

    tc_destroy(tc);
    free(hfsc);
}
4370
4371 static int
4372 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4373 {
4374 const struct hfsc *hfsc;
4375 hfsc = hfsc_get__(netdev);
4376 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
4377 return 0;
4378 }
4379
4380 static int
4381 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4382 {
4383 int error;
4384 struct hfsc_class class;
4385
4386 hfsc_parse_qdisc_details__(netdev, details, &class);
4387 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4388 tc_make_handle(1, 0), &class);
4389
4390 if (!error) {
4391 hfsc_get__(netdev)->max_rate = class.max_rate;
4392 }
4393
4394 return error;
4395 }
4396
4397 static int
4398 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4399 const struct tc_queue *queue, struct smap *details)
4400 {
4401 const struct hfsc_class *hc;
4402
4403 hc = hfsc_class_cast__(queue);
4404 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4405 if (hc->min_rate != hc->max_rate) {
4406 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4407 }
4408 return 0;
4409 }
4410
4411 static int
4412 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4413 const struct smap *details)
4414 {
4415 int error;
4416 struct hfsc_class class;
4417
4418 error = hfsc_parse_class_details__(netdev, details, &class);
4419 if (error) {
4420 return error;
4421 }
4422
4423 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4424 tc_make_handle(1, 0xfffe), &class);
4425 if (error) {
4426 return error;
4427 }
4428
4429 hfsc_update_queue__(netdev, queue_id, &class);
4430 return 0;
4431 }
4432
4433 static int
4434 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4435 {
4436 int error;
4437 struct hfsc *hfsc;
4438 struct hfsc_class *hc;
4439
4440 hc = hfsc_class_cast__(queue);
4441 hfsc = hfsc_get__(netdev);
4442
4443 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4444 if (!error) {
4445 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4446 free(hc);
4447 }
4448 return error;
4449 }
4450
4451 static int
4452 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4453 struct netdev_queue_stats *stats)
4454 {
4455 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4456 tc_make_handle(1, 0xfffe), NULL, stats);
4457 }
4458
4459 static int
4460 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4461 const struct ofpbuf *nlmsg,
4462 netdev_dump_queue_stats_cb *cb, void *aux)
4463 {
4464 struct netdev_queue_stats stats;
4465 unsigned int handle, major, minor;
4466 int error;
4467
4468 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4469 if (error) {
4470 return error;
4471 }
4472
4473 major = tc_get_major(handle);
4474 minor = tc_get_minor(handle);
4475 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4476 (*cb)(minor - 1, &stats, aux);
4477 }
4478 return 0;
4479 }
4480
/* tc ops for the Linux HFSC (Hierarchical Fair Service Curve) qdisc,
 * exposed in the database as QoS type "linux-hfsc". */
static const struct tc_ops tc_ops_hfsc = {
    "hfsc",                     /* linux_name */
    "linux-hfsc",               /* ovs_name */
    HFSC_N_QUEUES,              /* n_queues */
    hfsc_tc_install,            /* tc_install */
    hfsc_tc_load,               /* tc_load */
    hfsc_tc_destroy,            /* tc_destroy */
    hfsc_qdisc_get,             /* qdisc_get */
    hfsc_qdisc_set,             /* qdisc_set */
    hfsc_class_get,             /* class_get */
    hfsc_class_set,             /* class_set */
    hfsc_class_delete,          /* class_delete */
    hfsc_class_get_stats,       /* class_get_stats */
    hfsc_class_dump_stats       /* class_dump_stats */
};
4496 \f
4497 /* "linux-noop" traffic control class. */
4498
/* Marks 'netdev_' as having QoS type "linux-noop": OVS records that QoS is
 * configured but never issues any netlink qdisc requests for it. */
static void
noop_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    /* Nothing but a tc class implementation may write to a tc, and this
     * class never does, so a shared const object is safe.
     *
     * NOTE(review): this initializes the object with 'tc_ops_default'
     * rather than 'tc_ops_noop', which looks like copy/paste from
     * default_install__() — confirm whether "linux-noop" should carry its
     * own ops so queries report the right type. */
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    netdev->tc = CONST_CAST(struct tc *, &tc);
}
4507
/* tc_install callback for "linux-noop": records the QoS type without
 * touching the kernel.  Always returns 0. */
static int
noop_tc_install(struct netdev *netdev,
                const struct smap *details OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
4515
/* tc_load callback for "linux-noop": nothing to read back from the kernel,
 * just reinstate the marker object.  Always returns 0. */
static int
noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
4522
/* tc ops for QoS type "linux-noop": OVS leaves the device's qdisc entirely
 * alone.  No Linux qdisc name, no queues, and no query/update callbacks. */
static const struct tc_ops tc_ops_noop = {
    NULL,                       /* linux_name */
    "linux-noop",               /* ovs_name */
    0,                          /* n_queues */
    noop_tc_install,            /* tc_install */
    noop_tc_load,               /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4538 \f
4539 /* "linux-default" traffic control class.
4540 *
4541 * This class represents the default, unnamed Linux qdisc. It corresponds to
4542 * the "" (empty string) QoS type in the OVS database. */
4543
/* Records that 'netdev_' uses the default, unnamed Linux qdisc (the ""
 * QoS type). */
static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
4554
/* tc_install callback for the default qdisc: only records the QoS type.
 * Always returns 0. */
static int
default_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
4562
/* tc_load callback for the default qdisc: nothing to parse from the
 * kernel.  Always returns 0. */
static int
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
4569
/* tc ops for the default, unnamed Linux qdisc, database QoS type "" (the
 * empty string).  Read-only: no queues and no configuration callbacks. */
static const struct tc_ops tc_ops_default = {
    NULL,                       /* linux_name */
    "",                         /* ovs_name */
    0,                          /* n_queues */
    default_tc_install,         /* tc_install */
    default_tc_load,            /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4585 \f
/* "linux-other" traffic control class.
 *
 * This class represents qdiscs that OVS does not recognize, e.g. one
 * configured by an administrator outside of OVS.  It merely records that
 * some other qdisc is installed; it exposes no configuration and no
 * queues. */
4589
/* tc_load callback for "linux-other": installed when the kernel reports a
 * qdisc whose name OVS does not recognize (see tc_query_qdisc()).  Always
 * returns 0. */
static int
other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
    return 0;
}
4601
/* tc ops for unrecognized qdiscs, database QoS type "linux-other".
 * Read-only: no install, no queues, no configuration callbacks. */
static const struct tc_ops tc_ops_other = {
    NULL,                       /* linux_name */
    "linux-other",              /* ovs_name */
    0,                          /* n_queues */
    NULL,                       /* tc_install */
    other_tc_load,              /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4617 \f
4618 /* Traffic control. */
4619
4620 /* Number of kernel "tc" ticks per second. */
4621 static double ticks_per_s;
4622
4623 /* Number of kernel "jiffies" per second. This is used for the purpose of
4624 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4625 * one jiffy's worth of data.
4626 *
4627 * There are two possibilities here:
4628 *
4629 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4630 * approximate range of 100 to 1024. That means that we really need to
4631 * make sure that the qdisc can buffer that much data.
4632 *
4633 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4634 * has finely granular timers and there's no need to fudge additional room
4635 * for buffers. (There's no extra effort needed to implement that: the
4636 * large 'buffer_hz' is used as a divisor, so practically any number will
4637 * come out as 0 in the division. Small integer results in the case of
4638 * really high dividends won't have any real effect anyhow.)
4639 */
4640 static unsigned int buffer_hz;
4641
/* Composes and returns the tc handle 'major':'minor' (major in the upper
 * 16 bits, minor in the lower 16). */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int handle = TC_H_MAKE(major << 16, minor);

    return handle;
}
4648
/* Extracts and returns the major number (upper 16 bits) of tc handle
 * 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major = TC_H_MAJ(handle) >> 16;

    return major;
}
4655
/* Extracts and returns the minor number (lower 16 bits) of tc handle
 * 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
4662
4663 static struct tcmsg *
4664 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4665 struct ofpbuf *request)
4666 {
4667 struct tcmsg *tcmsg;
4668 int ifindex;
4669 int error;
4670
4671 error = get_ifindex(netdev, &ifindex);
4672 if (error) {
4673 return NULL;
4674 }
4675
4676 ofpbuf_init(request, 512);
4677 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4678 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4679 tcmsg->tcm_family = AF_UNSPEC;
4680 tcmsg->tcm_ifindex = ifindex;
4681 /* Caller should fill in tcmsg->tcm_handle. */
4682 /* Caller should fill in tcmsg->tcm_parent. */
4683
4684 return tcmsg;
4685 }
4686
4687 static int
4688 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4689 {
4690 int error = nl_transact(NETLINK_ROUTE, request, replyp);
4691 ofpbuf_uninit(request);
4692 return error;
4693 }
4694
4695 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4696 * policing configuration.
4697 *
4698 * This function is equivalent to running the following when 'add' is true:
4699 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4700 *
4701 * This function is equivalent to running the following when 'add' is false:
4702 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4703 *
4704 * The configuration and stats may be seen with the following command:
4705 * /sbin/tc -s qdisc show dev <devname>
4706 *
4707 * Returns 0 if successful, otherwise a positive errno value.
4708 */
4709 static int
4710 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4711 {
4712 struct ofpbuf request;
4713 struct tcmsg *tcmsg;
4714 int error;
4715 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4716 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4717
4718 tcmsg = tc_make_request(netdev, type, flags, &request);
4719 if (!tcmsg) {
4720 return ENODEV;
4721 }
4722 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4723 tcmsg->tcm_parent = TC_H_INGRESS;
4724 nl_msg_put_string(&request, TCA_KIND, "ingress");
4725 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4726
4727 error = tc_transact(&request, NULL);
4728 if (error) {
4729 /* If we're deleting the qdisc, don't worry about some of the
4730 * error conditions. */
4731 if (!add && (error == ENOENT || error == EINVAL)) {
4732 return 0;
4733 }
4734 return error;
4735 }
4736
4737 return 0;
4738 }
4739
4740 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4741 * of 'kbits_burst'.
4742 *
4743 * This function is equivalent to running:
4744 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4745 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4746 * mtu 65535 drop
4747 *
4748 * The configuration and stats may be seen with the following command:
4749 * /sbin/tc -s filter show dev <devname> parent ffff:
4750 *
4751 * Returns 0 if successful, otherwise a positive errno value.
4752 */
4753 static int
4754 tc_add_policer(struct netdev *netdev,
4755 uint32_t kbits_rate, uint32_t kbits_burst)
4756 {
4757 struct tc_police tc_police;
4758 struct ofpbuf request;
4759 struct tcmsg *tcmsg;
4760 size_t basic_offset;
4761 size_t police_offset;
4762 int error;
4763 int mtu = 65535;
4764
4765 memset(&tc_police, 0, sizeof tc_police);
4766 tc_police.action = TC_POLICE_SHOT;
4767 tc_police.mtu = mtu;
4768 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4769
4770 /* The following appears wrong in one way: In networking a kilobit is
4771 * usually 1000 bits but this uses 1024 bits.
4772 *
4773 * However if you "fix" those problems then "tc filter show ..." shows
4774 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4775 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4776 * tc's point of view. Whatever. */
4777 tc_police.burst = tc_bytes_to_ticks(
4778 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
4779
4780 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4781 NLM_F_EXCL | NLM_F_CREATE, &request);
4782 if (!tcmsg) {
4783 return ENODEV;
4784 }
4785 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4786 tcmsg->tcm_info = tc_make_handle(49,
4787 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4788
4789 nl_msg_put_string(&request, TCA_KIND, "basic");
4790 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4791 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4792 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4793 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4794 nl_msg_end_nested(&request, police_offset);
4795 nl_msg_end_nested(&request, basic_offset);
4796
4797 error = tc_transact(&request, NULL);
4798 if (error) {
4799 return error;
4800 }
4801
4802 return 0;
4803 }
4804
/* Reads /proc/net/psched, once per process, to initialize 'ticks_per_s'
 * and 'buffer_hz', falling back to conservative defaults when the file is
 * missing or malformed. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *     (Before that, there are hints that it was 1000000000.)
     *
     *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *     above.
     *
     *                /proc/net/psched
     *     -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d ticks_per_s     buffer_hz
     *     ------- --------- ---------- ------------- ----------- -------------
     * [1] 819,200 1,000,000  1,000,000           100     819,200           100
     * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [4]   1,000     1,024  1,000,000           100     976,562           100
     * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
     * [6]   1,000        64  1,000,000           249  15,625,000           249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    if (!ovsthread_once_start(&once)) {
        return;
    }

    /* Defaults used when the file cannot be read or parsed. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    if (!a || !c) {
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
4887
4888 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4889 * rate of 'rate' bytes per second. */
4890 static unsigned int
4891 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4892 {
4893 read_psched();
4894 return (rate * ticks) / ticks_per_s;
4895 }
4896
/* Returns the number of ticks that it would take to transmit 'size' bytes at a
 * rate of 'rate' bytes per second.  Returns 0 for a zero 'rate' to avoid
 * division by zero. */
static unsigned int
tc_bytes_to_ticks(unsigned int rate, unsigned int size)
{
    read_psched();
    return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
}
4905
/* Returns the number of bytes that need to be reserved for qdisc buffering at
 * a transmission rate of 'rate' bytes per second.  (See the comment on
 * 'buffer_hz' above: a huge 'buffer_hz' makes this effectively zero.) */
static unsigned int
tc_buffer_per_jiffy(unsigned int rate)
{
    read_psched();
    return rate / buffer_hz;
}
4914
4915 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4916 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4917 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4918 * stores NULL into it if it is absent.
4919 *
4920 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4921 * 'msg'.
4922 *
4923 * Returns 0 if successful, otherwise a positive errno value. */
4924 static int
4925 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4926 struct nlattr **options)
4927 {
4928 static const struct nl_policy tca_policy[] = {
4929 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4930 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4931 };
4932 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4933
4934 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4935 tca_policy, ta, ARRAY_SIZE(ta))) {
4936 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4937 goto error;
4938 }
4939
4940 if (kind) {
4941 *kind = nl_attr_get_string(ta[TCA_KIND]);
4942 }
4943
4944 if (options) {
4945 *options = ta[TCA_OPTIONS];
4946 }
4947
4948 return 0;
4949
4950 error:
4951 if (kind) {
4952 *kind = NULL;
4953 }
4954 if (options) {
4955 *options = NULL;
4956 }
4957 return EPROTO;
4958 }
4959
/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
 * into '*options', and its queue statistics into '*stats'.  Any of the output
 * arguments may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value (in which case
 * '*options' is set to NULL and '*stats' is zeroed). */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        /* The full class handle lives in the fixed tcmsg header, not in an
         * attribute. */
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        /* Report queue drops as transmit errors. */
        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
5034
5035 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5036 * on 'netdev'. */
5037 static int
5038 tc_query_class(const struct netdev *netdev,
5039 unsigned int handle, unsigned int parent,
5040 struct ofpbuf **replyp)
5041 {
5042 struct ofpbuf request;
5043 struct tcmsg *tcmsg;
5044 int error;
5045
5046 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
5047 if (!tcmsg) {
5048 return ENODEV;
5049 }
5050 tcmsg->tcm_handle = handle;
5051 tcmsg->tcm_parent = parent;
5052
5053 error = tc_transact(&request, replyp);
5054 if (error) {
5055 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5056 netdev_get_name(netdev),
5057 tc_get_major(handle), tc_get_minor(handle),
5058 tc_get_major(parent), tc_get_minor(parent),
5059 ovs_strerror(error));
5060 }
5061 return error;
5062 }
5063
5064 /* Equivalent to "tc class del dev <name> handle <handle>". */
5065 static int
5066 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5067 {
5068 struct ofpbuf request;
5069 struct tcmsg *tcmsg;
5070 int error;
5071
5072 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5073 if (!tcmsg) {
5074 return ENODEV;
5075 }
5076 tcmsg->tcm_handle = handle;
5077 tcmsg->tcm_parent = 0;
5078
5079 error = tc_transact(&request, NULL);
5080 if (error) {
5081 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5082 netdev_get_name(netdev),
5083 tc_get_major(handle), tc_get_minor(handle),
5084 ovs_strerror(error));
5085 }
5086 return error;
5087 }
5088
/* Equivalent to "tc qdisc del dev <name> root".  On success also tears
 * down OVS's cached QoS state for 'netdev_'.  Returns 0 on success,
 * otherwise a positive errno value. */
static int
tc_del_qdisc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* Handle 1:0 is the root handle that OVS-created qdiscs use (see e.g.
     * hfsc_setup_qdisc__()). */
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    error = tc_transact(&request, NULL);
    if (error == EINVAL) {
        /* EINVAL probably means that the default qdisc was in use, in which
         * case we've accomplished our purpose. */
        error = 0;
    }
    if (!error && netdev->tc) {
        /* Discard the cached QoS state, letting the ops clean up first. */
        if (netdev->tc->ops->tc_destroy) {
            netdev->tc->ops->tc_destroy(netdev->tc);
        }
        netdev->tc = NULL;
    }
    return error;
}
5119
5120 static bool
5121 getqdisc_is_safe(void)
5122 {
5123 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5124 static bool safe = false;
5125
5126 if (ovsthread_once_start(&once)) {
5127 struct utsname utsname;
5128 int major, minor;
5129
5130 if (uname(&utsname) == -1) {
5131 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5132 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5133 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5134 } else if (major < 2 || (major == 2 && minor < 35)) {
5135 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5136 utsname.release);
5137 } else {
5138 safe = true;
5139 }
5140 ovsthread_once_done(&once);
5141 }
5142 return safe;
5143 }
5144
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value. */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    /* Already known; nothing to do. */
    if (netdev->tc) {
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it.  tc_load() must set netdev->tc exactly when it
     * succeeds. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
5223
5224 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5225 approximate the time to transmit packets of various lengths. For an MTU of
5226 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5227 represents two possible packet lengths; for a MTU of 513 through 1024, four
5228 possible lengths; and so on.
5229
5230 Returns, for the specified 'mtu', the number of bits that packet lengths
5231 need to be shifted right to fit within such a 256-entry table. */
5232 static int
5233 tc_calc_cell_log(unsigned int mtu)
5234 {
5235 int cell_log;
5236
5237 if (!mtu) {
5238 mtu = ETH_PAYLOAD_MAX;
5239 }
5240 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5241
5242 for (cell_log = 0; mtu >= 256; cell_log++) {
5243 mtu >>= 1;
5244 }
5245
5246 return cell_log;
5247 }
5248
/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'.  All other tc_ratespec fields are zeroed. */
static void
tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
{
    memset(rate, 0, sizeof *rate);
    rate->cell_log = tc_calc_cell_log(mtu);
    /* rate->overhead = 0; */           /* New in 2.6.24, not yet in some */
    /* rate->cell_align = 0; */         /* distro headers. */
    rate->mpu = ETH_TOTAL_MIN;          /* Smallest packet size billed. */
    rate->rate = Bps;
}
5261
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *ticks;
    unsigned int i;

    ticks = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *ticks; i++) {
        /* Entry i covers packets up to (i + 1) << cell_log bytes, but never
         * less than the minimum packet unit 'mpu'. */
        unsigned int len = (i + 1) << rate->cell_log;
        ticks[i] = tc_bytes_to_ticks(rate->rate, MAX(len, rate->mpu));
    }
}
5281
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Never allow a burst smaller than what one jiffy can transmit plus a
     * full MTU, or the qdisc could stall. */
    unsigned int min_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > min_bytes ? burst_bytes : min_bytes;

    return tc_bytes_to_ticks(Bps, burst);
}
5292 \f
5293 /* Linux-only functions declared in netdev-linux.h */
5294
5295 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5296 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5297 int
5298 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5299 const char *flag_name, bool enable)
5300 {
5301 const char *netdev_name = netdev_get_name(netdev);
5302 struct ethtool_value evalue;
5303 uint32_t new_flags;
5304 int error;
5305
5306 COVERAGE_INC(netdev_get_ethtool);
5307 memset(&evalue, 0, sizeof evalue);
5308 error = netdev_linux_do_ethtool(netdev_name,
5309 (struct ethtool_cmd *)&evalue,
5310 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5311 if (error) {
5312 return error;
5313 }
5314
5315 COVERAGE_INC(netdev_set_ethtool);
5316 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5317 if (new_flags == evalue.data) {
5318 return 0;
5319 }
5320 evalue.data = new_flags;
5321 error = netdev_linux_do_ethtool(netdev_name,
5322 (struct ethtool_cmd *)&evalue,
5323 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5324 if (error) {
5325 return error;
5326 }
5327
5328 COVERAGE_INC(netdev_get_ethtool);
5329 memset(&evalue, 0, sizeof evalue);
5330 error = netdev_linux_do_ethtool(netdev_name,
5331 (struct ethtool_cmd *)&evalue,
5332 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5333 if (error) {
5334 return error;
5335 }
5336
5337 if (new_flags != evalue.data) {
5338 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5339 "device %s failed", enable ? "enable" : "disable",
5340 flag_name, netdev_name);
5341 return EOPNOTSUPP;
5342 }
5343
5344 return 0;
5345 }
5346 \f
5347 /* Utility functions. */
5348
5349 /* Copies 'src' into 'dst', performing format conversion in the process. */
5350 static void
5351 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5352 const struct rtnl_link_stats *src)
5353 {
5354 dst->rx_packets = src->rx_packets;
5355 dst->tx_packets = src->tx_packets;
5356 dst->rx_bytes = src->rx_bytes;
5357 dst->tx_bytes = src->tx_bytes;
5358 dst->rx_errors = src->rx_errors;
5359 dst->tx_errors = src->tx_errors;
5360 dst->rx_dropped = src->rx_dropped;
5361 dst->tx_dropped = src->tx_dropped;
5362 dst->multicast = src->multicast;
5363 dst->collisions = src->collisions;
5364 dst->rx_length_errors = src->rx_length_errors;
5365 dst->rx_over_errors = src->rx_over_errors;
5366 dst->rx_crc_errors = src->rx_crc_errors;
5367 dst->rx_frame_errors = src->rx_frame_errors;
5368 dst->rx_fifo_errors = src->rx_fifo_errors;
5369 dst->rx_missed_errors = src->rx_missed_errors;
5370 dst->tx_aborted_errors = src->tx_aborted_errors;
5371 dst->tx_carrier_errors = src->tx_carrier_errors;
5372 dst->tx_fifo_errors = src->tx_fifo_errors;
5373 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5374 dst->tx_window_errors = src->tx_window_errors;
5375 }
5376
5377 /* Copies 'src' into 'dst', performing format conversion in the process. */
5378 static void
5379 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5380 const struct rtnl_link_stats64 *src)
5381 {
5382 dst->rx_packets = src->rx_packets;
5383 dst->tx_packets = src->tx_packets;
5384 dst->rx_bytes = src->rx_bytes;
5385 dst->tx_bytes = src->tx_bytes;
5386 dst->rx_errors = src->rx_errors;
5387 dst->tx_errors = src->tx_errors;
5388 dst->rx_dropped = src->rx_dropped;
5389 dst->tx_dropped = src->tx_dropped;
5390 dst->multicast = src->multicast;
5391 dst->collisions = src->collisions;
5392 dst->rx_length_errors = src->rx_length_errors;
5393 dst->rx_over_errors = src->rx_over_errors;
5394 dst->rx_crc_errors = src->rx_crc_errors;
5395 dst->rx_frame_errors = src->rx_frame_errors;
5396 dst->rx_fifo_errors = src->rx_fifo_errors;
5397 dst->rx_missed_errors = src->rx_missed_errors;
5398 dst->tx_aborted_errors = src->tx_aborted_errors;
5399 dst->tx_carrier_errors = src->tx_carrier_errors;
5400 dst->tx_fifo_errors = src->tx_fifo_errors;
5401 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5402 dst->tx_window_errors = src->tx_window_errors;
5403 }
5404
5405 static int
5406 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5407 {
5408 struct ofpbuf request;
5409 struct ofpbuf *reply;
5410 int error;
5411
5412 /* Filtering all counters by default */
5413 memset(stats, 0xFF, sizeof(struct netdev_stats));
5414
5415 ofpbuf_init(&request, 0);
5416 nl_msg_put_nlmsghdr(&request,
5417 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5418 RTM_GETLINK, NLM_F_REQUEST);
5419 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5420 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5421 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5422 ofpbuf_uninit(&request);
5423 if (error) {
5424 return error;
5425 }
5426
5427 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5428 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5429 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5430 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5431 error = 0;
5432 } else {
5433 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5434 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5435 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5436 error = 0;
5437 } else {
5438 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5439 error = EPROTO;
5440 }
5441 }
5442 } else {
5443 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5444 error = EPROTO;
5445 }
5446
5447
5448 ofpbuf_delete(reply);
5449 return error;
5450 }
5451
5452 static int
5453 get_flags(const struct netdev *dev, unsigned int *flags)
5454 {
5455 struct ifreq ifr;
5456 int error;
5457
5458 *flags = 0;
5459 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5460 if (!error) {
5461 *flags = ifr.ifr_flags;
5462 }
5463 return error;
5464 }
5465
5466 static int
5467 set_flags(const char *name, unsigned int flags)
5468 {
5469 struct ifreq ifr;
5470
5471 ifr.ifr_flags = flags;
5472 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5473 }
5474
5475 static int
5476 do_get_ifindex(const char *netdev_name)
5477 {
5478 struct ifreq ifr;
5479 int error;
5480
5481 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5482 COVERAGE_INC(netdev_get_ifindex);
5483
5484 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5485 if (error) {
5486 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5487 netdev_name, ovs_strerror(error));
5488 return -error;
5489 }
5490 return ifr.ifr_ifindex;
5491 }
5492
5493 static int
5494 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5495 {
5496 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5497
5498 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5499 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
5500
5501 if (ifindex < 0) {
5502 netdev->get_ifindex_error = -ifindex;
5503 netdev->ifindex = 0;
5504 } else {
5505 netdev->get_ifindex_error = 0;
5506 netdev->ifindex = ifindex;
5507 }
5508 netdev->cache_valid |= VALID_IFINDEX;
5509 }
5510
5511 *ifindexp = netdev->ifindex;
5512 return netdev->get_ifindex_error;
5513 }
5514
5515 static int
5516 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5517 {
5518 struct ifreq ifr;
5519 int hwaddr_family;
5520 int error;
5521
5522 memset(&ifr, 0, sizeof ifr);
5523 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5524 COVERAGE_INC(netdev_get_hwaddr);
5525 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5526 if (error) {
5527 /* ENODEV probably means that a vif disappeared asynchronously and
5528 * hasn't been removed from the database yet, so reduce the log level
5529 * to INFO for that case. */
5530 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5531 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5532 netdev_name, ovs_strerror(error));
5533 return error;
5534 }
5535 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5536 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5537 VLOG_INFO("%s device has unknown hardware address family %d",
5538 netdev_name, hwaddr_family);
5539 return EINVAL;
5540 }
5541 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5542 return 0;
5543 }
5544
5545 static int
5546 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5547 {
5548 struct ifreq ifr;
5549 int error;
5550
5551 memset(&ifr, 0, sizeof ifr);
5552 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5553 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5554 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5555 COVERAGE_INC(netdev_set_hwaddr);
5556 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5557 if (error) {
5558 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5559 netdev_name, ovs_strerror(error));
5560 }
5561 return error;
5562 }
5563
5564 static int
5565 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5566 int cmd, const char *cmd_name)
5567 {
5568 struct ifreq ifr;
5569 int error;
5570
5571 memset(&ifr, 0, sizeof ifr);
5572 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5573 ifr.ifr_data = (caddr_t) ecmd;
5574
5575 ecmd->cmd = cmd;
5576 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5577 if (error) {
5578 if (error != EOPNOTSUPP) {
5579 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5580 "failed: %s", cmd_name, name, ovs_strerror(error));
5581 } else {
5582 /* The device doesn't support this operation. That's pretty
5583 * common, so there's no point in logging anything. */
5584 }
5585 }
5586 return error;
5587 }
5588
5589 /* Returns an AF_PACKET raw socket or a negative errno value. */
5590 static int
5591 af_packet_sock(void)
5592 {
5593 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5594 static int sock;
5595
5596 if (ovsthread_once_start(&once)) {
5597 sock = socket(AF_PACKET, SOCK_RAW, 0);
5598 if (sock >= 0) {
5599 int error = set_nonblocking(sock);
5600 if (error) {
5601 close(sock);
5602 sock = -error;
5603 }
5604 } else {
5605 sock = -errno;
5606 VLOG_ERR("failed to create packet socket: %s",
5607 ovs_strerror(errno));
5608 }
5609 ovsthread_once_done(&once);
5610 }
5611
5612 return sock;
5613 }