]> git.proxmox.com Git - ovs.git/blob - lib/netdev-linux.c
netdev-linux: Fix warning message.
[ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <arpa/inet.h>
24 #include <inttypes.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
41 #include <net/if.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
46 #include <poll.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <unistd.h>
50
51 #include "coverage.h"
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "dynamic-string.h"
56 #include "fatal-signal.h"
57 #include "hash.h"
58 #include "hmap.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
63 #include "netlink.h"
64 #include "ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
67 #include "packets.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
70 #include "shash.h"
71 #include "socket-util.h"
72 #include "sset.h"
73 #include "timer.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
76
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
78
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
86
87 \f
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
89 * old headers. */
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
92 #endif
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
95 #endif
96
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
101 #endif
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
104 #endif
105
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
107 * headers. */
108 #ifndef TC_RTAB_SIZE
109 #define TC_RTAB_SIZE 1024
110 #endif
111
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
117 *
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
120 */
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
123 #endif
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
126 #endif
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
129 #endif
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata {
133 uint32_t tp_status;
134 uint32_t tp_len;
135 uint32_t tp_snaplen;
136 uint16_t tp_mac;
137 uint16_t tp_net;
138 uint16_t tp_vlan_tci;
139 uint16_t tp_vlan_tpid;
140 };
141
142 /* Linux 2.6.27 introduced ethtool_cmd_speed
143 *
144 * To avoid revisiting problems reported with using configure to detect
145 * compatibility (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
147 * unconditionally replace ethtool_cmd_speed. */
148 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
149 static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
150 {
151 return ep->speed | (ep->speed_hi << 16);
152 }
153
154 /* Linux 2.6.30 introduced supported and advertised flags for
155 * 1G base KX, and 10G base KX4, KR and R. */
156 #ifndef SUPPORTED_1000baseKX_Full
157 #define SUPPORTED_1000baseKX_Full (1 << 17)
158 #define SUPPORTED_10000baseKX4_Full (1 << 18)
159 #define SUPPORTED_10000baseKR_Full (1 << 19)
160 #define SUPPORTED_10000baseR_FEC (1 << 20)
161 #define ADVERTISED_1000baseKX_Full (1 << 17)
162 #define ADVERTISED_10000baseKX4_Full (1 << 18)
163 #define ADVERTISED_10000baseKR_Full (1 << 19)
164 #define ADVERTISED_10000baseR_FEC (1 << 20)
165 #endif
166
167 /* Linux 3.5 introduced supported and advertised flags for
168 * 40G base KR4, CR4, SR4 and LR4. */
169 #ifndef SUPPORTED_40000baseKR4_Full
170 #define SUPPORTED_40000baseKR4_Full (1 << 23)
171 #define SUPPORTED_40000baseCR4_Full (1 << 24)
172 #define SUPPORTED_40000baseSR4_Full (1 << 25)
173 #define SUPPORTED_40000baseLR4_Full (1 << 26)
174 #define ADVERTISED_40000baseKR4_Full (1 << 23)
175 #define ADVERTISED_40000baseCR4_Full (1 << 24)
176 #define ADVERTISED_40000baseSR4_Full (1 << 25)
177 #define ADVERTISED_40000baseLR4_Full (1 << 26)
178 #endif
179
180 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
181 *
182 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
183 * 2.6.32-431.29.2.el6.x86_64 (see report at
184 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
185 * if_link.h is not self-contained on those kernels. It is easiest to
186 * unconditionally define a replacement. */
187 #ifndef IFLA_STATS64
188 #define IFLA_STATS64 23
189 #endif
190 #define rtnl_link_stats64 rpl_rtnl_link_stats64
191 struct rtnl_link_stats64 {
192 uint64_t rx_packets;
193 uint64_t tx_packets;
194 uint64_t rx_bytes;
195 uint64_t tx_bytes;
196 uint64_t rx_errors;
197 uint64_t tx_errors;
198 uint64_t rx_dropped;
199 uint64_t tx_dropped;
200 uint64_t multicast;
201 uint64_t collisions;
202
203 uint64_t rx_length_errors;
204 uint64_t rx_over_errors;
205 uint64_t rx_crc_errors;
206 uint64_t rx_frame_errors;
207 uint64_t rx_fifo_errors;
208 uint64_t rx_missed_errors;
209
210 uint64_t tx_aborted_errors;
211 uint64_t tx_carrier_errors;
212 uint64_t tx_fifo_errors;
213 uint64_t tx_heartbeat_errors;
214 uint64_t tx_window_errors;
215
216 uint64_t rx_compressed;
217 uint64_t tx_compressed;
218 };
219
220 enum {
221 VALID_IFINDEX = 1 << 0,
222 VALID_ETHERADDR = 1 << 1,
223 VALID_IN4 = 1 << 2,
224 VALID_IN6 = 1 << 3,
225 VALID_MTU = 1 << 4,
226 VALID_POLICING = 1 << 5,
227 VALID_VPORT_STAT_ERROR = 1 << 6,
228 VALID_DRVINFO = 1 << 7,
229 VALID_FEATURES = 1 << 8,
230 };
231 \f
232 /* Traffic control. */
233
234 /* An instance of a traffic control class. Always associated with a particular
235 * network device.
236 *
237 * Each TC implementation subclasses this with whatever additional data it
238 * needs. */
239 struct tc {
240 const struct tc_ops *ops;
241 struct hmap queues; /* Contains "struct tc_queue"s.
242 * Read by generic TC layer.
243 * Written only by TC implementation. */
244 };
245
246 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
247
248 /* One traffic control queue.
249 *
250 * Each TC implementation subclasses this with whatever additional data it
251 * needs. */
252 struct tc_queue {
253 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
254 unsigned int queue_id; /* OpenFlow queue ID. */
255 long long int created; /* Time queue was created, in msecs. */
256 };
257
258 /* A particular kind of traffic control. Each implementation generally maps to
259 * one particular Linux qdisc class.
260 *
261 * The functions below return 0 if successful or a positive errno value on
262 * failure, except where otherwise noted. All of them must be provided, except
263 * where otherwise noted. */
264 struct tc_ops {
265 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
266 * This is null for tc_ops_default and tc_ops_other, for which there are no
267 * appropriate values. */
268 const char *linux_name;
269
270 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
271 const char *ovs_name;
272
273 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
274 * queues. The queues are numbered 0 through n_queues - 1. */
275 unsigned int n_queues;
276
277 /* Called to install this TC class on 'netdev'. The implementation should
278 * make the Netlink calls required to set up 'netdev' with the right qdisc
279 * and configure it according to 'details'. The implementation may assume
280 * that the current qdisc is the default; that is, there is no need for it
281 * to delete the current qdisc before installing itself.
282 *
283 * The contents of 'details' should be documented as valid for 'ovs_name'
284 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
285 * (which is built as ovs-vswitchd.conf.db(8)).
286 *
287 * This function must return 0 if and only if it sets 'netdev->tc' to an
288 * initialized 'struct tc'.
289 *
290 * (This function is null for tc_ops_other, which cannot be installed. For
291 * other TC classes it should always be nonnull.) */
292 int (*tc_install)(struct netdev *netdev, const struct smap *details);
293
294 /* Called when the netdev code determines (through a Netlink query) that
295 * this TC class's qdisc is installed on 'netdev', but we didn't install
296 * it ourselves and so don't know any of the details.
297 *
298 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
299 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
300 * implementation should parse the other attributes of 'nlmsg' as
301 * necessary to determine its configuration. If necessary it should also
302 * use Netlink queries to determine the configuration of queues on
303 * 'netdev'.
304 *
305 * This function must return 0 if and only if it sets 'netdev->tc' to an
306 * initialized 'struct tc'. */
307 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
308
309 /* Destroys the data structures allocated by the implementation as part of
310 * 'tc'. (This includes destroying 'tc->queues' by calling
311 * tc_destroy(tc).
312 *
313 * The implementation should not need to perform any Netlink calls. If
314 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
315 * (But it may not be desirable.)
316 *
317 * This function may be null if 'tc' is trivial. */
318 void (*tc_destroy)(struct tc *tc);
319
320 /* Retrieves details of 'netdev->tc' configuration into 'details'.
321 *
322 * The implementation should not need to perform any Netlink calls, because
323 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
324 * cached the configuration.
325 *
326 * The contents of 'details' should be documented as valid for 'ovs_name'
327 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
328 * (which is built as ovs-vswitchd.conf.db(8)).
329 *
330 * This function may be null if 'tc' is not configurable.
331 */
332 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
333
334 /* Reconfigures 'netdev->tc' according to 'details', performing any
335 * required Netlink calls to complete the reconfiguration.
336 *
337 * The contents of 'details' should be documented as valid for 'ovs_name'
338 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
339 * (which is built as ovs-vswitchd.conf.db(8)).
340 *
341 * This function may be null if 'tc' is not configurable.
342 */
343 int (*qdisc_set)(struct netdev *, const struct smap *details);
344
345 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
346 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
347 *
348 * The contents of 'details' should be documented as valid for 'ovs_name'
349 * in the "other_config" column in the "Queue" table in
350 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
351 *
352 * The implementation should not need to perform any Netlink calls, because
353 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
354 * cached the queue configuration.
355 *
356 * This function may be null if 'tc' does not have queues ('n_queues' is
357 * 0). */
358 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
359 struct smap *details);
360
361 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
362 * 'details', perfoming any required Netlink calls to complete the
363 * reconfiguration. The caller ensures that 'queue_id' is less than
364 * 'n_queues'.
365 *
366 * The contents of 'details' should be documented as valid for 'ovs_name'
367 * in the "other_config" column in the "Queue" table in
368 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
369 *
370 * This function may be null if 'tc' does not have queues or its queues are
371 * not configurable. */
372 int (*class_set)(struct netdev *, unsigned int queue_id,
373 const struct smap *details);
374
375 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
376 * tc_queue's within 'netdev->tc->queues'.
377 *
378 * This function may be null if 'tc' does not have queues or its queues
379 * cannot be deleted. */
380 int (*class_delete)(struct netdev *, struct tc_queue *queue);
381
382 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
383 * 'struct tc_queue's within 'netdev->tc->queues'.
384 *
385 * On success, initializes '*stats'.
386 *
387 * This function may be null if 'tc' does not have queues or if it cannot
388 * report queue statistics. */
389 int (*class_get_stats)(const struct netdev *netdev,
390 const struct tc_queue *queue,
391 struct netdev_queue_stats *stats);
392
393 /* Extracts queue stats from 'nlmsg', which is a response to a
394 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
395 *
396 * This function may be null if 'tc' does not have queues or if it cannot
397 * report queue statistics. */
398 int (*class_dump_stats)(const struct netdev *netdev,
399 const struct ofpbuf *nlmsg,
400 netdev_dump_queue_stats_cb *cb, void *aux);
401 };
402
403 static void
404 tc_init(struct tc *tc, const struct tc_ops *ops)
405 {
406 tc->ops = ops;
407 hmap_init(&tc->queues);
408 }
409
/* Releases the resources held directly by 'tc' (its queue hmap).  Does not
 * free 'tc' itself; TC implementations call this from their 'tc_destroy'
 * callback before freeing their own data. */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
415
416 static const struct tc_ops tc_ops_htb;
417 static const struct tc_ops tc_ops_hfsc;
418 static const struct tc_ops tc_ops_codel;
419 static const struct tc_ops tc_ops_fqcodel;
420 static const struct tc_ops tc_ops_sfq;
421 static const struct tc_ops tc_ops_default;
422 static const struct tc_ops tc_ops_other;
423
424 static const struct tc_ops *const tcs[] = {
425 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
426 &tc_ops_hfsc, /* Hierarchical fair service curve. */
427 &tc_ops_codel, /* Controlled delay */
428 &tc_ops_fqcodel, /* Fair queue controlled delay */
429 &tc_ops_sfq, /* Stochastic fair queueing */
430 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
431 &tc_ops_other, /* Some other qdisc. */
432 NULL
433 };
434
435 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
436 static unsigned int tc_get_major(unsigned int handle);
437 static unsigned int tc_get_minor(unsigned int handle);
438
439 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
440 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
441 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
442
443 static struct tcmsg *tc_make_request(const struct netdev *, int type,
444 unsigned int flags, struct ofpbuf *);
445 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
446 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
447 static int tc_add_policer(struct netdev *,
448 uint32_t kbits_rate, uint32_t kbits_burst);
449
450 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
451 struct nlattr **options);
452 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
453 struct nlattr **options,
454 struct netdev_queue_stats *);
455 static int tc_query_class(const struct netdev *,
456 unsigned int handle, unsigned int parent,
457 struct ofpbuf **replyp);
458 static int tc_delete_class(const struct netdev *, unsigned int handle);
459
460 static int tc_del_qdisc(struct netdev *netdev);
461 static int tc_query_qdisc(const struct netdev *netdev);
462
463 static int tc_calc_cell_log(unsigned int mtu);
464 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
465 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
466 const struct tc_ratespec *rate);
467 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
468 \f
469 struct netdev_linux {
470 struct netdev up;
471
472 /* Protects all members below. */
473 struct ovs_mutex mutex;
474
475 unsigned int cache_valid;
476
477 bool miimon; /* Link status of last poll. */
478 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
479 struct timer miimon_timer;
480
481 /* The following are figured out "on demand" only. They are only valid
482 * when the corresponding VALID_* bit in 'cache_valid' is set. */
483 int ifindex;
484 struct eth_addr etheraddr;
485 struct in_addr address, netmask;
486 struct in6_addr in6;
487 int mtu;
488 unsigned int ifi_flags;
489 long long int carrier_resets;
490 uint32_t kbits_rate; /* Policing data. */
491 uint32_t kbits_burst;
492 int vport_stats_error; /* Cached error code from vport_get_stats().
493 0 or an errno value. */
494 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
495 int ether_addr_error; /* Cached error code from set/get etheraddr. */
496 int netdev_policing_error; /* Cached error code from set policing. */
497 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
498 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
499 int in4_error; /* Cached error code from reading in4 addr. */
500 int in6_error; /* Cached error code from reading in6 addr. */
501
502 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
503 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
504 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
505
506 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
507 struct tc *tc;
508
509 /* For devices of class netdev_tap_class only. */
510 int tap_fd;
511 };
512
513 struct netdev_rxq_linux {
514 struct netdev_rxq up;
515 bool is_tap;
516 int fd;
517 };
518
519 /* This is set pretty low because we probably won't learn anything from the
520 * additional log messages. */
521 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
522
523 /* Polling miimon status for all ports causes performance degradation when
524 * handling a large number of ports. If there are no devices using miimon, then
525 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
526 *
527 * Readers do not depend on this variable synchronizing with the related
528 * changes in the device miimon status, so we can use atomic_count. */
529 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
530
531 static void netdev_linux_run(void);
532
533 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
534 int cmd, const char *cmd_name);
535 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
536 int cmd, const char *cmd_name);
537 static int get_flags(const struct netdev *, unsigned int *flags);
538 static int set_flags(const char *, unsigned int flags);
539 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
540 enum netdev_flags on, enum netdev_flags *old_flagsp)
541 OVS_REQUIRES(netdev->mutex);
542 static int do_get_ifindex(const char *netdev_name);
543 static int get_ifindex(const struct netdev *, int *ifindexp);
544 static int do_set_addr(struct netdev *netdev,
545 int ioctl_nr, const char *ioctl_name,
546 struct in_addr addr);
547 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
548 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
549 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
550 static int af_packet_sock(void);
551 static bool netdev_linux_miimon_enabled(void);
552 static void netdev_linux_miimon_run(void);
553 static void netdev_linux_miimon_wait(void);
554 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
555
556 static bool
557 is_netdev_linux_class(const struct netdev_class *netdev_class)
558 {
559 return netdev_class->run == netdev_linux_run;
560 }
561
562 static bool
563 is_tap_netdev(const struct netdev *netdev)
564 {
565 return netdev_get_class(netdev) == &netdev_tap_class;
566 }
567
568 static struct netdev_linux *
569 netdev_linux_cast(const struct netdev *netdev)
570 {
571 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
572
573 return CONTAINER_OF(netdev, struct netdev_linux, up);
574 }
575
576 static struct netdev_rxq_linux *
577 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
578 {
579 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
580 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
581 }
582 \f
583 static void netdev_linux_update(struct netdev_linux *netdev,
584 const struct rtnetlink_change *)
585 OVS_REQUIRES(netdev->mutex);
586 static void netdev_linux_changed(struct netdev_linux *netdev,
587 unsigned int ifi_flags, unsigned int mask)
588 OVS_REQUIRES(netdev->mutex);
589
590 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
591 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
592 * if no such socket could be created. */
static struct nl_sock *
netdev_linux_notify_sock(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static struct nl_sock *sock;
    unsigned int mcgroups[3] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
                                RTNLGRP_IPV6_IFADDR};

    /* Create the socket exactly once, on first call; later calls (from any
     * thread) just return the cached result, which may be NULL on failure. */
    if (ovsthread_once_start(&once)) {
        int error;

        error = nl_sock_create(NETLINK_ROUTE, &sock);
        if (!error) {
            size_t i;

            /* Join every group, all-or-nothing: if any join fails, destroy
             * the socket so callers never see a partially-subscribed one. */
            for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
                error = nl_sock_join_mcgroup(sock, mcgroups[i]);
                if (error) {
                    nl_sock_destroy(sock);
                    sock = NULL;
                    break;
                }
            }
        }
        ovsthread_once_done(&once);
    }

    return sock;
}
622
623 static bool
624 netdev_linux_miimon_enabled(void)
625 {
626 return atomic_count_get(&miimon_cnt) > 0;
627 }
628
/* The "run" callback shared by all Linux netdev classes: polls miimon link
 * status if any device uses it, then drains pending rtnetlink notifications
 * and applies each one to the corresponding netdev's cached state. */
static void
netdev_linux_run(void)
{
    struct nl_sock *sock;
    int error;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_run();
    }

    sock = netdev_linux_notify_sock();
    if (!sock) {
        return;
    }

    /* Loop until nl_sock_recv() reports an error (EAGAIN when drained). */
    do {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
        uint64_t buf_stub[4096 / 8];
        struct ofpbuf buf;

        ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
        error = nl_sock_recv(sock, &buf, false);
        if (!error) {
            struct rtnetlink_change change;

            if (rtnetlink_parse(&buf, &change)) {
                /* Apply the change only to devices we manage. */
                struct netdev *netdev_ = netdev_from_name(change.ifname);
                if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
                    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

                    ovs_mutex_lock(&netdev->mutex);
                    netdev_linux_update(netdev, &change);
                    ovs_mutex_unlock(&netdev->mutex);
                }
                netdev_close(netdev_);
            }
        } else if (error == ENOBUFS) {
            /* The kernel dropped notifications because our receive buffer
             * overflowed.  Since we do not know what we missed, refresh the
             * flags of every Linux netdev and invalidate all cached state
             * (mask 0 in netdev_linux_changed()). */
            struct shash device_shash;
            struct shash_node *node;

            nl_sock_drain(sock);

            shash_init(&device_shash);
            netdev_get_devices(&netdev_linux_class, &device_shash);
            SHASH_FOR_EACH (node, &device_shash) {
                struct netdev *netdev_ = node->data;
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
                unsigned int flags;

                ovs_mutex_lock(&netdev->mutex);
                get_flags(netdev_, &flags);
                netdev_linux_changed(netdev, flags, 0);
                ovs_mutex_unlock(&netdev->mutex);

                netdev_close(netdev_);
            }
            shash_destroy(&device_shash);
        } else if (error != EAGAIN) {
            VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
                         ovs_strerror(error));
        }
        ofpbuf_uninit(&buf);
    } while (!error);
}
693
694 static void
695 netdev_linux_wait(void)
696 {
697 struct nl_sock *sock;
698
699 if (netdev_linux_miimon_enabled()) {
700 netdev_linux_miimon_wait();
701 }
702 sock = netdev_linux_notify_sock();
703 if (sock) {
704 nl_sock_wait(sock, POLLIN);
705 }
706 }
707
708 static void
709 netdev_linux_changed(struct netdev_linux *dev,
710 unsigned int ifi_flags, unsigned int mask)
711 OVS_REQUIRES(dev->mutex)
712 {
713 netdev_change_seq_changed(&dev->up);
714
715 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
716 dev->carrier_resets++;
717 }
718 dev->ifi_flags = ifi_flags;
719
720 dev->cache_valid &= mask;
721 }
722
/* Applies the rtnetlink notification 'change' to 'dev'.  Link messages
 * update the interface flags and (for RTM_NEWLINK) refresh the cached MTU,
 * Ethernet address, and ifindex; address messages just invalidate the
 * cached IPv4/IPv6 addresses. */
static void
netdev_linux_update(struct netdev_linux *dev,
                    const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, in4, in6. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN4 | VALID_IN6);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            /* An all-zeros MAC in the message means "not reported". */
            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
        } else {
            /* Non-NEWLINK link message (e.g. link deleted): invalidate all
             * cached state. */
            netdev_linux_changed(dev, change->ifi_flags, 0);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags,
                             ~(VALID_IN4 | VALID_IN6));
    } else {
        OVS_NOT_REACHED();
    }
}
761
762 static struct netdev *
763 netdev_linux_alloc(void)
764 {
765 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
766 return &netdev->up;
767 }
768
/* Initialization shared by all Linux netdev flavors: currently just sets up
 * the mutex that protects 'netdev's mutable state. */
static void
netdev_linux_common_construct(struct netdev_linux *netdev)
{
    ovs_mutex_init(&netdev->mutex);
}
774
775 /* Creates system and internal devices. */
776 static int
777 netdev_linux_construct(struct netdev *netdev_)
778 {
779 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
780 int error;
781
782 netdev_linux_common_construct(netdev);
783
784 error = get_flags(&netdev->up, &netdev->ifi_flags);
785 if (error == ENODEV) {
786 if (netdev->up.netdev_class != &netdev_internal_class) {
787 /* The device does not exist, so don't allow it to be opened. */
788 return ENODEV;
789 } else {
790 /* "Internal" netdevs have to be created as netdev objects before
791 * they exist in the kernel, because creating them in the kernel
792 * happens by passing a netdev object to dpif_port_add().
793 * Therefore, ignore the error. */
794 }
795 }
796
797 return 0;
798 }
799
800 /* For most types of netdevs we open the device for each call of
801 * netdev_open(). However, this is not the case with tap devices,
802 * since it is only possible to open the device once. In this
803 * situation we share a single file descriptor, and consequently
804 * buffers, across all readers. Therefore once data is read it will
805 * be unavailable to other reads for tap devices. */
806 static int
807 netdev_linux_construct_tap(struct netdev *netdev_)
808 {
809 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
810 static const char tap_dev[] = "/dev/net/tun";
811 const char *name = netdev_->name;
812 struct ifreq ifr;
813 int error;
814
815 netdev_linux_common_construct(netdev);
816
817 /* Open tap device. */
818 netdev->tap_fd = open(tap_dev, O_RDWR);
819 if (netdev->tap_fd < 0) {
820 error = errno;
821 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
822 return error;
823 }
824
825 /* Create tap device. */
826 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
827 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
828 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
829 VLOG_WARN("%s: creating tap device failed: %s", name,
830 ovs_strerror(errno));
831 error = errno;
832 goto error_close;
833 }
834
835 /* Make non-blocking. */
836 error = set_nonblocking(netdev->tap_fd);
837 if (error) {
838 goto error_close;
839 }
840
841 return 0;
842
843 error_close:
844 close(netdev->tap_fd);
845 return error;
846 }
847
848 static void
849 netdev_linux_destruct(struct netdev *netdev_)
850 {
851 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
852
853 if (netdev->tc && netdev->tc->ops->tc_destroy) {
854 netdev->tc->ops->tc_destroy(netdev->tc);
855 }
856
857 if (netdev_get_class(netdev_) == &netdev_tap_class
858 && netdev->tap_fd >= 0)
859 {
860 close(netdev->tap_fd);
861 }
862
863 if (netdev->miimon_interval > 0) {
864 atomic_count_dec(&miimon_cnt);
865 }
866
867 ovs_mutex_destroy(&netdev->mutex);
868 }
869
/* Frees the storage allocated by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
876
877 static struct netdev_rxq *
878 netdev_linux_rxq_alloc(void)
879 {
880 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
881 return &rx->up;
882 }
883
/* Sets up the receive path for 'rxq_'.  For tap devices this reuses the fd
 * opened at device construction; for everything else it opens an AF_PACKET
 * socket bound to the device, non-blocking, with auxdata enabled and a BPF
 * filter that accepts only inbound packets. */
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        /* Tap devices share the single fd owned by the netdev itself. */
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
            { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        /* Request tpacket_auxdata control messages on received packets
         * (used elsewhere to recover kernel-stripped VLAN tags). */
        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    /* rx->fd may be negative if socket() itself failed. */
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
971
972 static void
973 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
974 {
975 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
976
977 if (!rx->is_tap) {
978 close(rx->fd);
979 }
980 }
981
/* Frees the storage allocated by netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
989
990 static ovs_be16
991 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
992 {
993 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
994 return htons(aux->tp_vlan_tpid);
995 } else {
996 return htons(ETH_TYPE_VLAN);
997 }
998 }
999
1000 static bool
1001 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
1002 {
1003 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
1004 }
1005
/* Receives one packet from AF_PACKET socket 'fd' into 'buffer'.
 *
 * Because the kernel strips the VLAN header from received frames and
 * reports it out of band via PACKET_AUXDATA, this function scans the
 * control messages and pushes the VLAN header back into the packet data
 * so that callers see the frame as it appeared on the wire.
 *
 * Returns 0 on success, EMSGSIZE if the packet was larger than the
 * buffer's tailroom, or another positive errno value on error. */
static int
netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
{
    size_t size;
    ssize_t retval;
    struct iovec iov;
    struct cmsghdr *cmsg;
    /* Union guarantees alignment suitable for a cmsghdr while providing
     * enough room for one tpacket_auxdata control message. */
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffer;
    struct msghdr msgh;

    /* Reserve headroom for a single VLAN tag */
    dp_packet_reserve(buffer, VLAN_HEADER_LEN);
    size = dp_packet_tailroom(buffer);

    iov.iov_base = dp_packet_data(buffer);
    iov.iov_len = size;
    msgh.msg_name = NULL;
    msgh.msg_namelen = 0;
    msgh.msg_iov = &iov;
    msgh.msg_iovlen = 1;
    msgh.msg_control = &cmsg_buffer;
    msgh.msg_controllen = sizeof cmsg_buffer;
    msgh.msg_flags = 0;

    /* MSG_TRUNC makes recvmsg() report the full packet length even if it
     * did not fit, so oversize packets can be detected below.  Retry on
     * signal interruption. */
    do {
        retval = recvmsg(fd, &msgh, MSG_TRUNC);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    } else if (retval > size) {
        return EMSGSIZE;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);

    /* Look for the kernel-supplied VLAN auxdata and, if present, splice the
     * VLAN header back into the packet. */
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
        const struct tpacket_auxdata *aux;

        if (cmsg->cmsg_level != SOL_PACKET
            || cmsg->cmsg_type != PACKET_AUXDATA
            || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
            continue;
        }

        aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
        if (auxdata_has_vlan_tci(aux)) {
            /* A VLAN tag only makes sense after a complete Ethernet
             * header. */
            if (retval < ETH_HEADER_LEN) {
                return EINVAL;
            }

            eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
                          htons(aux->tp_vlan_tci));
            break;
        }
    }

    return 0;
}
1068
/* Receives one packet from tap device fd 'fd' into 'buffer'.
 *
 * Returns 0 on success, otherwise a positive errno value.  Unlike the
 * AF_PACKET path, the tap fd delivers full frames with VLAN tags intact,
 * so no auxdata handling is needed here. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t avail = dp_packet_tailroom(buffer);
    ssize_t n;

    /* Restart the read if a signal interrupts it. */
    do {
        n = read(fd, dp_packet_data(buffer), avail);
    } while (n < 0 && errno == EINTR);

    if (n < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n);
    return 0;
}
1086
1087 static int
1088 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1089 int *c)
1090 {
1091 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1092 struct netdev *netdev = rx->up.netdev;
1093 struct dp_packet *buffer;
1094 ssize_t retval;
1095 int mtu;
1096
1097 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1098 mtu = ETH_PAYLOAD_MAX;
1099 }
1100
1101 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1102 DP_NETDEV_HEADROOM);
1103 retval = (rx->is_tap
1104 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1105 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1106
1107 if (retval) {
1108 if (retval != EAGAIN && retval != EMSGSIZE) {
1109 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1110 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1111 }
1112 dp_packet_delete(buffer);
1113 } else {
1114 dp_packet_pad(buffer);
1115 dp_packet_rss_invalidate(buffer);
1116 packets[0] = buffer;
1117 *c = 1;
1118 }
1119
1120 return retval;
1121 }
1122
1123 static void
1124 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1125 {
1126 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1127 poll_fd_wait(rx->fd, POLLIN);
1128 }
1129
1130 static int
1131 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1132 {
1133 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1134 if (rx->is_tap) {
1135 struct ifreq ifr;
1136 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1137 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1138 if (error) {
1139 return error;
1140 }
1141 drain_fd(rx->fd, ifr.ifr_qlen);
1142 return 0;
1143 } else {
1144 return drain_rcvbuf(rx->fd);
1145 }
1146 }
1147
1148 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1149 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1150 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1151 * the packet is too big or too small to transmit on the device.
1152 *
1153 * The caller retains ownership of 'buffer' in all cases.
1154 *
1155 * The kernel maintains a packet transmission queue, so the caller is not
1156 * expected to do additional queuing of packets. */
1157 static int
1158 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1159 struct dp_packet **pkts, int cnt, bool may_steal)
1160 {
1161 int i;
1162 int error = 0;
1163
1164 /* 'i' is incremented only if there's no error */
1165 for (i = 0; i < cnt;) {
1166 const void *data = dp_packet_data(pkts[i]);
1167 size_t size = dp_packet_size(pkts[i]);
1168 ssize_t retval;
1169
1170 if (!is_tap_netdev(netdev_)) {
1171 /* Use our AF_PACKET socket to send to this device. */
1172 struct sockaddr_ll sll;
1173 struct msghdr msg;
1174 struct iovec iov;
1175 int ifindex;
1176 int sock;
1177
1178 sock = af_packet_sock();
1179 if (sock < 0) {
1180 return -sock;
1181 }
1182
1183 ifindex = netdev_get_ifindex(netdev_);
1184 if (ifindex < 0) {
1185 return -ifindex;
1186 }
1187
1188 /* We don't bother setting most fields in sockaddr_ll because the
1189 * kernel ignores them for SOCK_RAW. */
1190 memset(&sll, 0, sizeof sll);
1191 sll.sll_family = AF_PACKET;
1192 sll.sll_ifindex = ifindex;
1193
1194 iov.iov_base = CONST_CAST(void *, data);
1195 iov.iov_len = size;
1196
1197 msg.msg_name = &sll;
1198 msg.msg_namelen = sizeof sll;
1199 msg.msg_iov = &iov;
1200 msg.msg_iovlen = 1;
1201 msg.msg_control = NULL;
1202 msg.msg_controllen = 0;
1203 msg.msg_flags = 0;
1204
1205 retval = sendmsg(sock, &msg, 0);
1206 } else {
1207 /* Use the tap fd to send to this device. This is essential for
1208 * tap devices, because packets sent to a tap device with an
1209 * AF_PACKET socket will loop back to be *received* again on the
1210 * tap device. This doesn't occur on other interface types
1211 * because we attach a socket filter to the rx socket. */
1212 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1213
1214 retval = write(netdev->tap_fd, data, size);
1215 }
1216
1217 if (retval < 0) {
1218 /* The Linux AF_PACKET implementation never blocks waiting for room
1219 * for packets, instead returning ENOBUFS. Translate this into
1220 * EAGAIN for the caller. */
1221 error = errno == ENOBUFS ? EAGAIN : errno;
1222 if (error == EINTR) {
1223 /* continue without incrementing 'i', i.e. retry this packet */
1224 continue;
1225 }
1226 break;
1227 } else if (retval != size) {
1228 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1229 " of %"PRIuSIZE") on %s", retval, size,
1230 netdev_get_name(netdev_));
1231 error = EMSGSIZE;
1232 break;
1233 }
1234
1235 /* Process the next packet in the batch */
1236 i++;
1237 }
1238
1239 if (may_steal) {
1240 for (i = 0; i < cnt; i++) {
1241 dp_packet_delete(pkts[i]);
1242 }
1243 }
1244
1245 if (error && error != EAGAIN) {
1246 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1247 netdev_get_name(netdev_), ovs_strerror(error));
1248 }
1249
1250 return error;
1251
1252 }
1253
1254 /* Registers with the poll loop to wake up from the next call to poll_block()
1255 * when the packet transmission queue has sufficient room to transmit a packet
1256 * with netdev_send().
1257 *
1258 * The kernel maintains a packet transmission queue, so the client is not
1259 * expected to do additional queuing of packets. Thus, this function is
1260 * unlikely to ever be used. It is included for completeness. */
1261 static void
1262 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1263 {
1264 if (is_tap_netdev(netdev)) {
1265 /* TAP device always accepts packets.*/
1266 poll_immediate_wake();
1267 }
1268 }
1269
/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
 * otherwise a positive errno value.
 *
 * Uses the cached address to skip the ioctl when the address is already
 * 'mac'.  Tap devices are briefly brought down around the change (the
 * kernel refuses to change a tap's address while it is up) and restored
 * afterward. */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            /* Either the last attempt failed (return the same error) or the
             * address is already set; nothing to do. */
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    if (!error || error == ENODEV) {
        /* Cache the result, including ENODEV: a vanished device will not
         * come back with a different answer. */
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            netdev->etheraddr = mac;
        }
    }

    /* Restore NETDEV_UP on the tap device if it was up before. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1310
/* Copies 'netdev''s MAC address into '*mac'.  Returns 0 if successful,
 * otherwise a positive errno value.
 *
 * The address (or the error from fetching it) is cached; the ioctl runs
 * only the first time or after the cache has been invalidated. */
static int
netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
        /* Cache both the address and any error so repeated failures don't
         * re-issue the ioctl. */
        netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
                                                 &netdev->etheraddr);
        netdev->cache_valid |= VALID_ETHERADDR;
    }

    error = netdev->ether_addr_error;
    if (!error) {
        *mac = netdev->etheraddr;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1333
1334 static int
1335 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1336 {
1337 int error;
1338
1339 if (!(netdev->cache_valid & VALID_MTU)) {
1340 struct ifreq ifr;
1341
1342 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1343 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1344 netdev->mtu = ifr.ifr_mtu;
1345 netdev->cache_valid |= VALID_MTU;
1346 }
1347
1348 error = netdev->netdev_mtu_error;
1349 if (!error) {
1350 *mtup = netdev->mtu;
1351 }
1352
1353 return error;
1354 }
1355
1356 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1357 * in bytes, not including the hardware header; thus, this is typically 1500
1358 * bytes for Ethernet devices. */
1359 static int
1360 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1361 {
1362 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1363 int error;
1364
1365 ovs_mutex_lock(&netdev->mutex);
1366 error = netdev_linux_get_mtu__(netdev, mtup);
1367 ovs_mutex_unlock(&netdev->mutex);
1368
1369 return error;
1370 }
1371
/* Sets the maximum size of transmitted (MTU) for given device using linux
 * networking ioctl interface.  Returns 0 if successful, otherwise a
 * positive errno value.
 *
 * Skips the ioctl when the cached MTU already equals 'mtu'; the cache is
 * updated (including a cached error, even ENODEV) after the attempt. */
static int
netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ifreq ifr;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev->cache_valid & VALID_MTU) {
        error = netdev->netdev_mtu_error;
        if (error || netdev->mtu == mtu) {
            /* Either the last MTU operation failed (report the same error)
             * or the MTU is already 'mtu'. */
            goto exit;
        }
        netdev->cache_valid &= ~VALID_MTU;
    }
    ifr.ifr_mtu = mtu;
    error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
                                SIOCSIFMTU, "SIOCSIFMTU");
    if (!error || error == ENODEV) {
        /* Cache success and ENODEV; transient errors stay uncached so a
         * later attempt can retry. */
        netdev->netdev_mtu_error = error;
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }
exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1402
1403 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1404 * On failure, returns a negative errno value. */
1405 static int
1406 netdev_linux_get_ifindex(const struct netdev *netdev_)
1407 {
1408 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1409 int ifindex, error;
1410
1411 ovs_mutex_lock(&netdev->mutex);
1412 error = get_ifindex(netdev_, &ifindex);
1413 ovs_mutex_unlock(&netdev->mutex);
1414
1415 return error ? -error : ifindex;
1416 }
1417
1418 static int
1419 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1420 {
1421 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1422
1423 ovs_mutex_lock(&netdev->mutex);
1424 if (netdev->miimon_interval > 0) {
1425 *carrier = netdev->miimon;
1426 } else {
1427 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1428 }
1429 ovs_mutex_unlock(&netdev->mutex);
1430
1431 return 0;
1432 }
1433
1434 static long long int
1435 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1436 {
1437 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1438 long long int carrier_resets;
1439
1440 ovs_mutex_lock(&netdev->mutex);
1441 carrier_resets = netdev->carrier_resets;
1442 ovs_mutex_unlock(&netdev->mutex);
1443
1444 return carrier_resets;
1445 }
1446
/* Issues MII ioctl 'cmd' ('cmd_name' is its name for logging) on device
 * 'name', copying 'data' in and the kernel's answer back out.  Returns 0
 * if successful, otherwise a positive errno value.
 *
 * The mii_ioctl_data structure is copied into the ifreq's union in place
 * of the ifr_data pointer: the MII ioctls interpret that union area as the
 * MII data itself.  (Assumes sizeof(struct mii_ioctl_data) fits in the
 * ifreq union, which holds on Linux.) */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
1461
/* Queries the link state of device 'name', storing true in '*miimon' if the
 * link is up.  Returns 0 if successful, otherwise a positive errno value.
 *
 * Tries the MII registers first (SIOCGMIIPHY to find the PHY, then
 * SIOCGMIIREG to read the basic-mode status register); if the device has no
 * MII support, falls back to the ethtool ETHTOOL_GLINK operation. */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            /* Link is up iff the link-status bit of the BMSR is set. */
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        } else {
            VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
        }
    } else {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* ETHTOOL_GLINK fills an ethtool_value overlaid on 'ecmd'. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
1505
/* Sets the MII monitoring interval for 'netdev_' to 'interval' milliseconds,
 * clamped to at least 100 ms; an interval <= 0 disables monitoring.
 * Always returns 0.
 *
 * Maintains the global 'miimon_cnt' count of devices with monitoring
 * enabled (presumably used elsewhere to decide whether the miimon run/wait
 * loops have any work; the counter's consumers are outside this chunk). */
static int
netdev_linux_set_miimon_interval(struct netdev *netdev_,
                                 long long int interval)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    ovs_mutex_lock(&netdev->mutex);
    interval = interval > 0 ? MAX(interval, 100) : 0;
    if (netdev->miimon_interval != interval) {
        /* Keep the global count in sync with the enabled/disabled
         * transition of this device. */
        if (interval && !netdev->miimon_interval) {
            atomic_count_inc(&miimon_cnt);
        } else if (!interval && netdev->miimon_interval) {
            atomic_count_dec(&miimon_cnt);
        }

        netdev->miimon_interval = interval;
        /* Expire the timer so the next miimon run polls immediately. */
        timer_set_expired(&netdev->miimon_timer);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;
}
1528
/* Polls MII link status for every netdev-linux device whose miimon timer
 * has expired, recording changes and notifying the netdev layer via
 * netdev_linux_changed(). */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                /* Link state changed: record it and wake up waiters. */
                dev->miimon = miimon;
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            /* Re-arm the timer for the next poll. */
            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* netdev_get_devices() took a reference on each device; drop it. */
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
1558
/* Registers with the poll loop to wake up when the next miimon timer, for
 * any netdev-linux device with monitoring enabled, will expire. */
static void
netdev_linux_miimon_wait(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0) {
            timer_wait(&dev->miimon_timer);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* Drop the reference taken by netdev_get_devices(). */
        netdev_close(netdev);
    }
    shash_destroy(&device_shash);
}
1580
/* Exchanges the values stored in '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved_a = *a;

    *a = *b;
    *b = saved_a;
}
1588
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is allowed to be misaligned (hence the get_32aligned_u64()
 * accessors).  Fields that ovs_vport_stats does not carry are explicitly
 * zeroed so 'dst' is fully defined on return. */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct ovs_vport_stats *src)
{
    dst->rx_packets = get_32aligned_u64(&src->rx_packets);
    dst->tx_packets = get_32aligned_u64(&src->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&src->rx_errors);
    dst->tx_errors = get_32aligned_u64(&src->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
    /* Not provided by the vport layer. */
    dst->multicast = 0;
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
}
1618
1619 static int
1620 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1621 {
1622 struct dpif_netlink_vport reply;
1623 struct ofpbuf *buf;
1624 int error;
1625
1626 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1627 if (error) {
1628 return error;
1629 } else if (!reply.stats) {
1630 ofpbuf_delete(buf);
1631 return EOPNOTSUPP;
1632 }
1633
1634 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1635
1636 ofpbuf_delete(buf);
1637
1638 return 0;
1639 }
1640
/* Fills 'stats' from the vport layer, caching the result in
 * 'netdev->vport_stats_error'.  Once a failure has been recorded it is not
 * retried until the cache is invalidated; a device that succeeded before is
 * always re-queried for fresh counters. */
static void
get_stats_via_vport(const struct netdev *netdev_,
                    struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (!netdev->vport_stats_error ||
        !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
        int error;

        error = get_stats_via_vport__(netdev_, stats);
        if (error && error != ENOENT && error != ENODEV) {
            /* ENOENT/ENODEV just mean the device is not a datapath vport;
             * that is normal and not worth logging. */
            VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
                         "(%s)",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
        netdev->vport_stats_error = error;
        netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
    }
}
1661
/* Retrieves current device stats for 'netdev-linux'.
 *
 * Combines two sources: vport-layer counters (from the OVS datapath) and
 * the kernel netdev counters (via netlink).  When both are available, the
 * kernel's packet/byte counts replace the vport's (which undercount with
 * GSO/TSO/GRO) and the kernel's error counters are added on top. */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; if the vport stats succeeded, 'stats' is already
         * filled and we can report success. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Use kernel netdev's packet and byte counts since vport's counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO are
         * enabled. */
        stats->rx_packets = dev_stats.rx_packets;
        stats->rx_bytes = dev_stats.rx_bytes;
        stats->tx_packets = dev_stats.tx_packets;
        stats->tx_bytes = dev_stats.tx_bytes;

        stats->rx_errors += dev_stats.rx_errors;
        stats->tx_errors += dev_stats.tx_errors;
        stats->rx_dropped += dev_stats.rx_dropped;
        stats->tx_dropped += dev_stats.tx_dropped;
        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
        stats->rx_length_errors += dev_stats.rx_length_errors;
        stats->rx_over_errors += dev_stats.rx_over_errors;
        stats->rx_crc_errors += dev_stats.rx_crc_errors;
        stats->rx_frame_errors += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1712
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal.
 *
 * Like netdev_linux_get_stats(), but rx/tx are swapped when reading kernel
 * counters: for a tap/internal device the kernel's "transmit" direction is
 * what the switch considers "receive", and vice versa. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; fall back to vport stats if those succeeded. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer. For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        /* Direction-specific error details don't translate across the
         * swap, so zero them. */
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped += dev_stats.tx_dropped;
        stats->tx_dropped += dev_stats.rx_dropped;

        stats->rx_errors += dev_stats.tx_errors;
        stats->tx_errors += dev_stats.rx_errors;

        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1774
1775 static int
1776 netdev_internal_get_stats(const struct netdev *netdev_,
1777 struct netdev_stats *stats)
1778 {
1779 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1780 int error;
1781
1782 ovs_mutex_lock(&netdev->mutex);
1783 get_stats_via_vport(netdev_, stats);
1784 error = netdev->vport_stats_error;
1785 ovs_mutex_unlock(&netdev->mutex);
1786
1787 return error;
1788 }
1789
/* Queries 'netdev''s link features via the ETHTOOL_GSET ioctl and caches
 * the result in 'netdev->current', 'netdev->advertised' and
 * 'netdev->supported' as NETDEV_F_* bitmaps, with any error in
 * 'netdev->get_features_error'.  A no-op if the features cache is already
 * valid.  Caller must hold 'netdev->mutex' (all callers in this file do). */
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
    struct ethtool_cmd ecmd;
    uint32_t speed;
    int error;

    if (netdev->cache_valid & VALID_FEATURES) {
        return;
    }

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto out;
    }

    /* Supported features. */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    }
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Half) {
        netdev->supported |= NETDEV_F_100MB_HD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
        netdev->supported |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
        netdev->supported |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
        netdev->supported |= NETDEV_F_40GB_FD;
    }
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    }
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    }
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    }
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    }
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    }

    /* Advertised features. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
        netdev->advertised |= NETDEV_F_40GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    }
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    }
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    }
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    }
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    }

    /* Current settings. */
    speed = ethtool_cmd_speed(&ecmd);
    if (speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    } else if (speed == 40000) {
        /* Literal speeds here because the SPEED_40000 and higher constants
         * are not available in all kernel headers this file builds
         * against. */
        netdev->current = NETDEV_F_40GB_FD;
    } else if (speed == 100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (speed == 1000000) {
        netdev->current = NETDEV_F_1TB_FD;
    } else {
        netdev->current = 0;
    }

    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
    }

    if (ecmd.autoneg) {
        netdev->current |= NETDEV_F_AUTONEG;
    }

out:
    /* Cache the outcome either way; 'get_features_error' tells readers
     * whether the cached bitmaps are meaningful. */
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
}
1941
1942 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1943 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1944 * Returns 0 if successful, otherwise a positive errno value. */
1945 static int
1946 netdev_linux_get_features(const struct netdev *netdev_,
1947 enum netdev_features *current,
1948 enum netdev_features *advertised,
1949 enum netdev_features *supported,
1950 enum netdev_features *peer)
1951 {
1952 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1953 int error;
1954
1955 ovs_mutex_lock(&netdev->mutex);
1956 netdev_linux_read_features(netdev);
1957 if (!netdev->get_features_error) {
1958 *current = netdev->current;
1959 *advertised = netdev->advertised;
1960 *supported = netdev->supported;
1961 *peer = 0; /* XXX */
1962 }
1963 error = netdev->get_features_error;
1964 ovs_mutex_unlock(&netdev->mutex);
1965
1966 return error;
1967 }
1968
/* Set the features advertised by 'netdev' to 'advertise'.  Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * Implemented as a read-modify-write of the ethtool settings: ETHTOOL_GSET
 * fetches the current settings, the advertising mask is rebuilt from the
 * NETDEV_F_* bits, and ETHTOOL_SSET writes it back. */
static int
netdev_linux_set_advertisements(struct netdev *netdev_,
                                enum netdev_features advertise)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ethtool_cmd ecmd;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto exit;
    }

    /* Translate NETDEV_F_* bits into the kernel's ADVERTISED_* bits. */
    ecmd.advertising = 0;
    if (advertise & NETDEV_F_10MB_HD) {
        ecmd.advertising |= ADVERTISED_10baseT_Half;
    }
    if (advertise & NETDEV_F_10MB_FD) {
        ecmd.advertising |= ADVERTISED_10baseT_Full;
    }
    if (advertise & NETDEV_F_100MB_HD) {
        ecmd.advertising |= ADVERTISED_100baseT_Half;
    }
    if (advertise & NETDEV_F_100MB_FD) {
        ecmd.advertising |= ADVERTISED_100baseT_Full;
    }
    if (advertise & NETDEV_F_1GB_HD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Half;
    }
    if (advertise & NETDEV_F_1GB_FD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Full;
    }
    if (advertise & NETDEV_F_10GB_FD) {
        ecmd.advertising |= ADVERTISED_10000baseT_Full;
    }
    if (advertise & NETDEV_F_COPPER) {
        ecmd.advertising |= ADVERTISED_TP;
    }
    if (advertise & NETDEV_F_FIBER) {
        ecmd.advertising |= ADVERTISED_FIBRE;
    }
    if (advertise & NETDEV_F_AUTONEG) {
        ecmd.advertising |= ADVERTISED_Autoneg;
    }
    if (advertise & NETDEV_F_PAUSE) {
        ecmd.advertising |= ADVERTISED_Pause;
    }
    if (advertise & NETDEV_F_PAUSE_ASYM) {
        ecmd.advertising |= ADVERTISED_Asym_Pause;
    }
    COVERAGE_INC(netdev_set_ethtool);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_SSET, "ETHTOOL_SSET");

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2033
/* Attempts to set input rate limiting (policing) policy.  Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * 'kbits_rate' is the maximum ingress rate in kbits/s; 0 disables policing.
 * 'kbits_burst' is the burst size in kbits; 0 selects a default of 1000. */
static int
netdev_linux_set_policing(struct netdev *netdev_,
                          uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int error;

    /* Normalize the burst before the cache comparison below so that
     * equivalent requests are recognized as cache hits. */
    kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
                   : kbits_burst); /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev->cache_valid & VALID_POLICING) {
        /* A cached error, or identical cached settings, means nothing to do. */
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kbits_burst == kbits_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        netdev->cache_valid &= ~VALID_POLICING;
    }

    COVERAGE_INC(netdev_set_policing);
    /* Remove any existing ingress qdisc. */
    error = tc_add_del_ingress_qdisc(netdev_, false);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate) {
        /* Re-add the ingress qdisc and attach a policer to it. */
        error = tc_add_del_ingress_qdisc(netdev_, true);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }
    }

    netdev->kbits_rate = kbits_rate;
    netdev->kbits_burst = kbits_burst;

out:
    /* Only success and ENODEV (device gone) are worth caching; other errors
     * may be transient, so leave the cache invalid and retry next time. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2095
2096 static int
2097 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2098 struct sset *types)
2099 {
2100 const struct tc_ops *const *opsp;
2101
2102 for (opsp = tcs; *opsp != NULL; opsp++) {
2103 const struct tc_ops *ops = *opsp;
2104 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2105 sset_add(types, ops->ovs_name);
2106 }
2107 }
2108 return 0;
2109 }
2110
2111 static const struct tc_ops *
2112 tc_lookup_ovs_name(const char *name)
2113 {
2114 const struct tc_ops *const *opsp;
2115
2116 for (opsp = tcs; *opsp != NULL; opsp++) {
2117 const struct tc_ops *ops = *opsp;
2118 if (!strcmp(name, ops->ovs_name)) {
2119 return ops;
2120 }
2121 }
2122 return NULL;
2123 }
2124
2125 static const struct tc_ops *
2126 tc_lookup_linux_name(const char *name)
2127 {
2128 const struct tc_ops *const *opsp;
2129
2130 for (opsp = tcs; *opsp != NULL; opsp++) {
2131 const struct tc_ops *ops = *opsp;
2132 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2133 return ops;
2134 }
2135 }
2136 return NULL;
2137 }
2138
/* Searches the hash bucket for 'hash' in 'netdev_''s queue table for a queue
 * with the given 'queue_id'.  Returns it, or NULL if there is none.
 *
 * 'hash' must be the hash of 'queue_id' as computed by tc_find_queue(). */
static struct tc_queue *
tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
                size_t hash)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct tc_queue *queue;

    HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
        if (queue->queue_id == queue_id) {
            return queue;
        }
    }
    return NULL;
}
2153
2154 static struct tc_queue *
2155 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2156 {
2157 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
2158 }
2159
2160 static int
2161 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2162 const char *type,
2163 struct netdev_qos_capabilities *caps)
2164 {
2165 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2166 if (!ops) {
2167 return EOPNOTSUPP;
2168 }
2169 caps->n_queues = ops->n_queues;
2170 return 0;
2171 }
2172
2173 static int
2174 netdev_linux_get_qos(const struct netdev *netdev_,
2175 const char **typep, struct smap *details)
2176 {
2177 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2178 int error;
2179
2180 ovs_mutex_lock(&netdev->mutex);
2181 error = tc_query_qdisc(netdev_);
2182 if (!error) {
2183 *typep = netdev->tc->ops->ovs_name;
2184 error = (netdev->tc->ops->qdisc_get
2185 ? netdev->tc->ops->qdisc_get(netdev_, details)
2186 : 0);
2187 }
2188 ovs_mutex_unlock(&netdev->mutex);
2189
2190 return error;
2191 }
2192
/* Installs or reconfigures QoS of type 'type' on 'netdev_', configured from
 * 'details'.  Returns 0 if successful, otherwise a positive errno value;
 * EOPNOTSUPP if 'type' is unknown or cannot be installed. */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    ovs_mutex_lock(&netdev->mutex);
    /* Learn which qdisc is currently installed before deciding whether to
     * reconfigure it in place or replace it. */
    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same type as what is installed: reconfigure in place (a no-op if
         * the type has no qdisc_set operation). */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            goto exit;
        }
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc. */
        error = new_ops->tc_install(netdev_, details);
        /* tc_install must set netdev->tc exactly when it succeeds. */
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2231
2232 static int
2233 netdev_linux_get_queue(const struct netdev *netdev_,
2234 unsigned int queue_id, struct smap *details)
2235 {
2236 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2237 int error;
2238
2239 ovs_mutex_lock(&netdev->mutex);
2240 error = tc_query_qdisc(netdev_);
2241 if (!error) {
2242 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2243 error = (queue
2244 ? netdev->tc->ops->class_get(netdev_, queue, details)
2245 : ENOENT);
2246 }
2247 ovs_mutex_unlock(&netdev->mutex);
2248
2249 return error;
2250 }
2251
2252 static int
2253 netdev_linux_set_queue(struct netdev *netdev_,
2254 unsigned int queue_id, const struct smap *details)
2255 {
2256 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2257 int error;
2258
2259 ovs_mutex_lock(&netdev->mutex);
2260 error = tc_query_qdisc(netdev_);
2261 if (!error) {
2262 error = (queue_id < netdev->tc->ops->n_queues
2263 && netdev->tc->ops->class_set
2264 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2265 : EINVAL);
2266 }
2267 ovs_mutex_unlock(&netdev->mutex);
2268
2269 return error;
2270 }
2271
2272 static int
2273 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2274 {
2275 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2276 int error;
2277
2278 ovs_mutex_lock(&netdev->mutex);
2279 error = tc_query_qdisc(netdev_);
2280 if (!error) {
2281 if (netdev->tc->ops->class_delete) {
2282 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2283 error = (queue
2284 ? netdev->tc->ops->class_delete(netdev_, queue)
2285 : ENOENT);
2286 } else {
2287 error = EINVAL;
2288 }
2289 }
2290 ovs_mutex_unlock(&netdev->mutex);
2291
2292 return error;
2293 }
2294
2295 static int
2296 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2297 unsigned int queue_id,
2298 struct netdev_queue_stats *stats)
2299 {
2300 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2301 int error;
2302
2303 ovs_mutex_lock(&netdev->mutex);
2304 error = tc_query_qdisc(netdev_);
2305 if (!error) {
2306 if (netdev->tc->ops->class_get_stats) {
2307 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2308 if (queue) {
2309 stats->created = queue->created;
2310 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2311 stats);
2312 } else {
2313 error = ENOENT;
2314 }
2315 } else {
2316 error = EOPNOTSUPP;
2317 }
2318 }
2319 ovs_mutex_unlock(&netdev->mutex);
2320
2321 return error;
2322 }
2323
/* State for iterating over the traffic classes of a qdisc with a netlink
 * dump; see start_queue_dump() and finish_queue_dump(). */
struct queue_dump_state {
    struct nl_dump dump;    /* In-progress RTM_GETTCLASS dump. */
    struct ofpbuf buf;      /* Reusable receive buffer for nl_dump_next(). */
};
2328
/* Begins a netlink dump of the traffic classes on 'netdev', initializing
 * '*state' for use with nl_dump_next() and finish_queue_dump().  Returns
 * true on success, false if the dump request could not be built (e.g. the
 * device no longer exists). */
static bool
start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
    if (!tcmsg) {
        return false;
    }
    tcmsg->tcm_parent = 0;
    /* nl_dump_start() copies what it needs from 'request', so the local
     * buffer can be released immediately afterward. */
    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
    ofpbuf_uninit(&request);

    ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
    return true;
}
2346
2347 static int
2348 finish_queue_dump(struct queue_dump_state *state)
2349 {
2350 ofpbuf_uninit(&state->buf);
2351 return nl_dump_done(&state->dump);
2352 }
2353
/* Iteration state for netdev_linux_queue_dump_{start,next,done}(). */
struct netdev_linux_queue_state {
    unsigned int *queues;   /* Snapshot of queue IDs, taken at dump start. */
    size_t cur_queue;       /* Index of the next queue ID to visit. */
    size_t n_queues;        /* Number of elements in 'queues'. */
};
2359
2360 static int
2361 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2362 {
2363 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2364 int error;
2365
2366 ovs_mutex_lock(&netdev->mutex);
2367 error = tc_query_qdisc(netdev_);
2368 if (!error) {
2369 if (netdev->tc->ops->class_get) {
2370 struct netdev_linux_queue_state *state;
2371 struct tc_queue *queue;
2372 size_t i;
2373
2374 *statep = state = xmalloc(sizeof *state);
2375 state->n_queues = hmap_count(&netdev->tc->queues);
2376 state->cur_queue = 0;
2377 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2378
2379 i = 0;
2380 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2381 state->queues[i++] = queue->queue_id;
2382 }
2383 } else {
2384 error = EOPNOTSUPP;
2385 }
2386 }
2387 ovs_mutex_unlock(&netdev->mutex);
2388
2389 return error;
2390 }
2391
2392 static int
2393 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2394 unsigned int *queue_idp, struct smap *details)
2395 {
2396 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2397 struct netdev_linux_queue_state *state = state_;
2398 int error = EOF;
2399
2400 ovs_mutex_lock(&netdev->mutex);
2401 while (state->cur_queue < state->n_queues) {
2402 unsigned int queue_id = state->queues[state->cur_queue++];
2403 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2404
2405 if (queue) {
2406 *queue_idp = queue_id;
2407 error = netdev->tc->ops->class_get(netdev_, queue, details);
2408 break;
2409 }
2410 }
2411 ovs_mutex_unlock(&netdev->mutex);
2412
2413 return error;
2414 }
2415
/* Releases the iteration state allocated by netdev_linux_queue_dump_start().
 * Always returns 0. */
static int
netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
                             void *state_)
{
    struct netdev_linux_queue_state *state = state_;

    free(state->queues);
    free(state);
    return 0;
}
2426
/* Invokes 'cb' with 'aux' for the statistics of each queue on 'netdev_'.
 * Returns 0 if successful, otherwise a positive errno value.  If multiple
 * errors occur, the last one wins, but all queues that can be dumped still
 * are. */
static int
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
                              netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct queue_dump_state state;

        if (!netdev->tc->ops->class_dump_stats) {
            error = EOPNOTSUPP;
        } else if (!start_queue_dump(netdev_, &state)) {
            error = ENODEV;
        } else {
            struct ofpbuf msg;
            int retval;

            /* Dump every class; remember the last per-class failure but
             * keep going so one bad class does not hide the rest. */
            while (nl_dump_next(&state.dump, &msg, &state.buf)) {
                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
                                                           cb, aux);
                if (retval) {
                    error = retval;
                }
            }

            retval = finish_queue_dump(&state);
            if (retval) {
                error = retval;
            }
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2465
/* Retrieves the IPv4 address and netmask assigned to 'netdev_' into
 * '*address' and '*netmask'.  Returns 0 if successful, EADDRNOTAVAIL if no
 * address is assigned, or another positive errno value.  Results are cached
 * under VALID_IN4 until invalidated. */
static int
netdev_linux_get_in4(const struct netdev *netdev_,
                     struct in_addr *address, struct in_addr *netmask)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_IN4)) {
        /* Cache miss: query the kernel for both address and netmask. */
        error = netdev_linux_get_ipv4(netdev_, &netdev->address,
                                      SIOCGIFADDR, "SIOCGIFADDR");
        if (!error) {
            error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
                                          SIOCGIFNETMASK, "SIOCGIFNETMASK");
        }
        /* Cache the error too, so repeated failures don't re-query. */
        netdev->in4_error = error;
        netdev->cache_valid |= VALID_IN4;
    } else {
        error = netdev->in4_error;
    }

    if (!error) {
        /* An all-zeros address means "no address assigned". */
        if (netdev->address.s_addr != INADDR_ANY) {
            *address = netdev->address;
            *netmask = netdev->netmask;
        } else {
            error = EADDRNOTAVAIL;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2499
/* Assigns 'address' and 'netmask' to 'netdev_'.  Returns 0 if successful,
 * otherwise a positive errno value.  If 'address' is INADDR_ANY the netmask
 * ioctl is skipped (the kernel has no address to mask). */
static int
netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
                     struct in_addr netmask)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
    if (!error) {
        /* Record what we set so the VALID_IN4 cache stays consistent. */
        netdev->address = address;
        netdev->netmask = netmask;
        if (address.s_addr != INADDR_ANY) {
            error = do_set_addr(netdev_, SIOCSIFNETMASK,
                                "SIOCSIFNETMASK", netmask);
        }
    }

    if (!error) {
        netdev->cache_valid |= VALID_IN4;
        netdev->in4_error = 0;
    } else {
        /* On failure the kernel state is unknown; force a re-query. */
        netdev->cache_valid &= ~VALID_IN4;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2528
/* Parses one line of /proc/net/if_inet6, whose fields are: 32 hex digits of
 * IPv6 address, then ifindex, prefix length, scope, and flags (all ignored
 * here), then the device name.  On success stores the address in '*in6' and
 * the null-terminated device name in 'ifname' and returns true; returns
 * false if the line does not match. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
#define X8 "%2"SCNx8
    return ovs_scan(line,
                    " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                    "%*x %*x %*x %*x %16s\n",
                    &s6[0], &s6[1], &s6[2], &s6[3],
                    &s6[4], &s6[5], &s6[6], &s6[7],
                    &s6[8], &s6[9], &s6[10], &s6[11],
                    &s6[12], &s6[13], &s6[14], &s6[15],
                    ifname);
/* Undefine the helper macro so it does not leak into the rest of the
 * translation unit. */
#undef X8
}
2544
/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
 * error.  Results are cached under VALID_IN6. */
static int
netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_IN6)) {
        FILE *file;
        char line[128];

        /* Defaults in case no matching line is found below. */
        netdev->in6 = in6addr_any;
        netdev->in6_error = EADDRNOTAVAIL;

        file = fopen("/proc/net/if_inet6", "r");
        if (file != NULL) {
            const char *name = netdev_get_name(netdev_);
            /* Scan for the first line whose device name matches ours. */
            while (fgets(line, sizeof line, file)) {
                struct in6_addr in6_tmp;
                char ifname[16 + 1];
                if (parse_if_inet6_line(line, &in6_tmp, ifname)
                    && !strcmp(name, ifname))
                {
                    netdev->in6 = in6_tmp;
                    netdev->in6_error = 0;
                    break;
                }
            }
            fclose(file);
        } else {
            /* No /proc support (or no IPv6 support) on this system. */
            netdev->in6_error = EOPNOTSUPP;
        }
        netdev->cache_valid |= VALID_IN6;
    }
    *in6 = netdev->in6;
    error = netdev->in6_error;
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2588
/* Fills '*sa' with an AF_INET sockaddr holding 'addr' and port 0, zeroing
 * any trailing bytes of '*sa' beyond the sockaddr_in. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2601
2602 static int
2603 do_set_addr(struct netdev *netdev,
2604 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2605 {
2606 struct ifreq ifr;
2607
2608 make_in4_sockaddr(&ifr.ifr_addr, addr);
2609 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2610 ioctl_name);
2611 }
2612
2613 /* Adds 'router' as a default IP gateway. */
2614 static int
2615 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2616 {
2617 struct in_addr any = { INADDR_ANY };
2618 struct rtentry rt;
2619 int error;
2620
2621 memset(&rt, 0, sizeof rt);
2622 make_in4_sockaddr(&rt.rt_dst, any);
2623 make_in4_sockaddr(&rt.rt_gateway, router);
2624 make_in4_sockaddr(&rt.rt_genmask, any);
2625 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2626 error = af_inet_ioctl(SIOCADDRT, &rt);
2627 if (error) {
2628 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2629 }
2630 return error;
2631 }
2632
/* Looks up the route to 'host' in the kernel routing table.  On success,
 * stores the next-hop gateway in '*next_hop' (0 if the host is directly
 * reachable), stores a malloc'd copy of the outgoing device name in
 * '*netdev_name' (caller must free), and returns 0.  Returns ENXIO if no
 * route matches, or another positive errno value on error. */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        /* Skip the first line, which is the column-header row. */
        if (++ln >= 2) {
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                             fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* The output of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need any endian
             * conversions here. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    /* No matching route found. */
    fclose(stream);
    return ENXIO;
}
2692
/* Adds driver name, driver version, and firmware version key-value pairs for
 * 'netdev_' to 'smap'.  Returns 0 if successful, otherwise a positive errno
 * value.  The ethtool driver info is cached under VALID_DRVINFO. */
static int
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* netdev_linux_do_ethtool() takes a struct ethtool_cmd *, but for
         * ETHTOOL_GDRVINFO the kernel actually reads/writes a struct
         * ethtool_drvinfo, hence the cast. */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        cmd,
                                        ETHTOOL_GDRVINFO,
                                        "ETHTOOL_GDRVINFO");
        if (!error) {
            netdev->cache_valid |= VALID_DRVINFO;
        }
    }

    if (!error) {
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
        smap_add(smap, "driver_version", netdev->drvinfo.version);
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2723
/* get_status implementation for "internal" devices, which have no hardware
 * driver: reports a fixed driver name.  Always returns 0. */
static int
netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
                           struct smap *smap)
{
    smap_add(smap, "driver_name", "openvswitch");
    return 0;
}
2731
/* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
 * returns 0.  Otherwise, it returns a positive errno value; in particular,
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
static int
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, struct eth_addr *mac)
{
    struct arpreq r;
    struct sockaddr_in sin;
    int retval;

    /* Build the SIOCGARP request: protocol address, hardware type, and the
     * device name to scope the lookup. */
    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    sin.sin_port = 0;
    memcpy(&r.arp_pa, &sin, sizeof sin);
    r.arp_ha.sa_family = ARPHRD_ETHER;
    r.arp_flags = 0;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
    if (!retval) {
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO simply means "no entry", which is not worth logging. */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
    }
    return retval;
}
2764
2765 static int
2766 nd_to_iff_flags(enum netdev_flags nd)
2767 {
2768 int iff = 0;
2769 if (nd & NETDEV_UP) {
2770 iff |= IFF_UP;
2771 }
2772 if (nd & NETDEV_PROMISC) {
2773 iff |= IFF_PROMISC;
2774 }
2775 if (nd & NETDEV_LOOPBACK) {
2776 iff |= IFF_LOOPBACK;
2777 }
2778 return iff;
2779 }
2780
2781 static int
2782 iff_to_nd_flags(int iff)
2783 {
2784 enum netdev_flags nd = 0;
2785 if (iff & IFF_UP) {
2786 nd |= NETDEV_UP;
2787 }
2788 if (iff & IFF_PROMISC) {
2789 nd |= NETDEV_PROMISC;
2790 }
2791 if (iff & IFF_LOOPBACK) {
2792 nd |= NETDEV_LOOPBACK;
2793 }
2794 return nd;
2795 }
2796
/* Clears the IFF_* flags corresponding to 'off' and sets those corresponding
 * to 'on' for 'netdev', storing the previous flags in '*old_flagsp'.
 * Returns 0 if successful, otherwise a positive errno value.  Caller must
 * hold netdev->mutex. */
static int
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
{
    int old_flags, new_flags;
    int error = 0;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* Re-read the flags to learn what the kernel actually applied.
         * NOTE(review): the return value of get_flags() is ignored here, so
         * ifi_flags may be stale if the re-read fails — confirm this is
         * intentional. */
        get_flags(&netdev->up, &netdev->ifi_flags);
    }

    return error;
}
2815
/* netdev_class update_flags implementation: locking wrapper around
 * update_flags().  Returns 0 if successful, otherwise a positive errno
 * value. */
static int
netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
                          enum netdev_flags on, enum netdev_flags *old_flagsp)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = update_flags(netdev, off, on, old_flagsp);
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2829
/* Expands to a struct netdev_class initializer for a Linux-backed netdev
 * class named NAME.  CONSTRUCT, GET_STATS, GET_FEATURES, and GET_STATUS
 * supply the per-class construct, get_stats, get_features, and get_status
 * members (NULL where unsupported); every other member is shared by the
 * "system", "tap", and "internal" classes defined below. */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
                                                                \
    NULL,                                                       \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    netdev_linux_alloc,                                         \
    CONSTRUCT,                                                  \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
    NULL,                       /* build header */              \
    NULL,                       /* push header */               \
    NULL,                       /* pop header */                \
    NULL,                       /* get_numa_id */               \
    NULL,                       /* set_multiq */                \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \
}
2900
/* Ordinary Linux network devices. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_construct,
        netdev_linux_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);

/* Linux tap devices, created on demand. */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_construct_tap,
        netdev_tap_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);

/* OVS "internal" devices; no link features, fixed status. */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_construct,
        netdev_internal_get_stats,
        NULL,                  /* get_features */
        netdev_internal_get_status);
2924 \f
2925
/* CoDel has no notion of configurable queues. */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET 1
#define TCA_CODEL_LIMIT 2
#define TCA_CODEL_INTERVAL 3

/* Userspace bookkeeping for an installed "codel" qdisc. */
struct codel {
    struct tc tc;           /* Base class; must be first. */
    uint32_t target;        /* Target queue delay (us). */
    uint32_t limit;         /* Hard queue length limit (packets). */
    uint32_t interval;      /* Sliding window width (us). */
};
2942
2943 static struct codel *
2944 codel_get__(const struct netdev *netdev_)
2945 {
2946 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2947 return CONTAINER_OF(netdev->tc, struct codel, tc);
2948 }
2949
2950 static void
2951 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2952 uint32_t interval)
2953 {
2954 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2955 struct codel *codel;
2956
2957 codel = xmalloc(sizeof *codel);
2958 tc_init(&codel->tc, &tc_ops_codel);
2959 codel->target = target;
2960 codel->limit = limit;
2961 codel->interval = interval;
2962
2963 netdev->tc = &codel->tc;
2964 }
2965
/* Replaces 'netdev''s root qdisc with a "codel" qdisc configured with the
 * given parameters (0 selects a default: target 5000, limit 10240, interval
 * 100000).  Returns 0 if successful, otherwise a positive errno value. */
static int
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                    uint32_t interval)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval;
    int error;

    /* Remove whatever qdisc is currently installed. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Substitute defaults for unspecified (zero) parameters. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;

    nl_msg_put_string(&request, TCA_KIND, "codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval,
                     error, ovs_strerror(error));
    }
    return error;
}
3007
3008 static void
3009 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3010 const struct smap *details, struct codel *codel)
3011 {
3012 const char *target_s;
3013 const char *limit_s;
3014 const char *interval_s;
3015
3016 target_s = smap_get(details, "target");
3017 limit_s = smap_get(details, "limit");
3018 interval_s = smap_get(details, "interval");
3019
3020 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3021 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3022 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3023
3024 if (!codel->target) {
3025 codel->target = 5000;
3026 }
3027 if (!codel->limit) {
3028 codel->limit = 10240;
3029 }
3030 if (!codel->interval) {
3031 codel->interval = 100000;
3032 }
3033 }
3034
3035 static int
3036 codel_tc_install(struct netdev *netdev, const struct smap *details)
3037 {
3038 int error;
3039 struct codel codel;
3040
3041 codel_parse_qdisc_details__(netdev, details, &codel);
3042 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3043 codel.interval);
3044 if (!error) {
3045 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3046 }
3047 return error;
3048 }
3049
/* Parses the TCA_OPTIONS attributes of a kernel codel qdisc reply in
 * 'nl_options' into '*codel'.  Returns 0 if successful, otherwise EPROTO. */
static int
codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
{
    /* All three attributes are required u32s. */
    static const struct nl_policy tca_codel_policy[] = {
        [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
        [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
        [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];

    if (!nl_parse_nested(nl_options, tca_codel_policy,
                         attrs, ARRAY_SIZE(tca_codel_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
        return EPROTO;
    }

    codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
    codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
    codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
    return 0;
}
3072
3073 static int
3074 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3075 {
3076 struct nlattr *nlattr;
3077 const char * kind;
3078 int error;
3079 struct codel codel;
3080
3081 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3082 if (error != 0) {
3083 return error;
3084 }
3085
3086 error = codel_parse_tca_options__(nlattr, &codel);
3087 if (error != 0) {
3088 return error;
3089 }
3090
3091 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3092 return 0;
3093 }
3094
3095
3096 static void
3097 codel_tc_destroy(struct tc *tc)
3098 {
3099 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3100 tc_destroy(tc);
3101 free(codel);
3102 }
3103
3104 static int
3105 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3106 {
3107 const struct codel *codel = codel_get__(netdev);
3108 smap_add_format(details, "target", "%u", codel->target);
3109 smap_add_format(details, "limit", "%u", codel->limit);
3110 smap_add_format(details, "interval", "%u", codel->interval);
3111 return 0;
3112 }
3113
3114 static int
3115 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3116 {
3117 struct codel codel;
3118
3119 codel_parse_qdisc_details__(netdev, details, &codel);
3120 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3121 codel_get__(netdev)->target = codel.target;
3122 codel_get__(netdev)->limit = codel.limit;
3123 codel_get__(netdev)->interval = codel.interval;
3124 return 0;
3125 }
3126
/* tc_ops for the "codel" qdisc.  The trailing NULL members are the class
 * (per-queue) operations, which codel does not support since it exposes no
 * configurable queues (CODEL_N_QUEUES is 0). */
static const struct tc_ops tc_ops_codel = {
    "codel",                      /* linux_name */
    "linux-codel",                /* ovs_name */
    CODEL_N_QUEUES,               /* n_queues */
    codel_tc_install,
    codel_tc_load,
    codel_tc_destroy,
    codel_qdisc_get,
    codel_qdisc_set,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3142 \f
/* FQ-CoDel traffic control class. */

/* FQ-CoDel has no notion of configurable queues. */
#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET 1
#define TCA_FQ_CODEL_LIMIT 2
#define TCA_FQ_CODEL_INTERVAL 3
#define TCA_FQ_CODEL_ECN 4
#define TCA_FQ_CODEL_FLOWS 5
#define TCA_FQ_CODEL_QUANTUM 6

/* Userspace bookkeeping for an installed "fq_codel" qdisc. */
struct fqcodel {
    struct tc tc;           /* Base class; must be first. */
    uint32_t target;        /* Target queue delay (us). */
    uint32_t limit;         /* Hard queue length limit (packets). */
    uint32_t interval;      /* Sliding window width (us). */
    uint32_t flows;         /* Number of flow queues. */
    uint32_t quantum;       /* Per-round byte quantum. */
};
3166
3167 static struct fqcodel *
3168 fqcodel_get__(const struct netdev *netdev_)
3169 {
3170 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3171 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3172 }
3173
3174 static void
3175 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3176 uint32_t interval, uint32_t flows, uint32_t quantum)
3177 {
3178 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3179 struct fqcodel *fqcodel;
3180
3181 fqcodel = xmalloc(sizeof *fqcodel);
3182 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3183 fqcodel->target = target;
3184 fqcodel->limit = limit;
3185 fqcodel->interval = interval;
3186 fqcodel->flows = flows;
3187 fqcodel->quantum = quantum;
3188
3189 netdev->tc = &fqcodel->tc;
3190 }
3191
/* Creates an fq_codel root qdisc on 'netdev', roughly equivalent to
 * "tc qdisc replace dev <dev> root fq_codel ...".  A zero value for any
 * parameter selects a default (target 5000, limit 10240, interval 100000,
 * flows 1024, quantum 1514).  Returns 0 if successful, otherwise a positive
 * errno value. */
static int
fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                      uint32_t interval, uint32_t flows, uint32_t quantum)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval, oflows, oquantum;
    int error;

    /* Delete any existing root qdisc first: the request below uses
     * NLM_F_EXCL, so it would fail if one were already installed. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Substitute defaults for any zero parameter. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;
    oflows = flows ? flows : 1024;
    oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
                                            not mtu */

    nl_msg_put_string(&request, TCA_KIND, "fq_codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval, oflows, oquantum,
                     error, ovs_strerror(error));
    }
    return error;
}
3238
3239 static void
3240 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3241 const struct smap *details, struct fqcodel *fqcodel)
3242 {
3243 const char *target_s;
3244 const char *limit_s;
3245 const char *interval_s;
3246 const char *flows_s;
3247 const char *quantum_s;
3248
3249 target_s = smap_get(details, "target");
3250 limit_s = smap_get(details, "limit");
3251 interval_s = smap_get(details, "interval");
3252 flows_s = smap_get(details, "flows");
3253 quantum_s = smap_get(details, "quantum");
3254 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3255 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3256 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3257 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3258 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3259 if (!fqcodel->target) {
3260 fqcodel->target = 5000;
3261 }
3262 if (!fqcodel->limit) {
3263 fqcodel->limit = 10240;
3264 }
3265 if (!fqcodel->interval) {
3266 fqcodel->interval = 1000000;
3267 }
3268 if (!fqcodel->flows) {
3269 fqcodel->flows = 1024;
3270 }
3271 if (!fqcodel->quantum) {
3272 fqcodel->quantum = 1514;
3273 }
3274 }
3275
3276 static int
3277 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3278 {
3279 int error;
3280 struct fqcodel fqcodel;
3281
3282 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3283 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3284 fqcodel.interval, fqcodel.flows,
3285 fqcodel.quantum);
3286 if (!error) {
3287 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3288 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3289 }
3290 return error;
3291 }
3292
3293 static int
3294 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3295 {
3296 static const struct nl_policy tca_fqcodel_policy[] = {
3297 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3298 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3299 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3300 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3301 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3302 };
3303
3304 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3305
3306 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3307 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3308 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3309 return EPROTO;
3310 }
3311
3312 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3313 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3314 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3315 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3316 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3317 return 0;
3318 }
3319
3320 static int
3321 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3322 {
3323 struct nlattr *nlattr;
3324 const char * kind;
3325 int error;
3326 struct fqcodel fqcodel;
3327
3328 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3329 if (error != 0) {
3330 return error;
3331 }
3332
3333 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3334 if (error != 0) {
3335 return error;
3336 }
3337
3338 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3339 fqcodel.flows, fqcodel.quantum);
3340 return 0;
3341 }
3342
3343 static void
3344 fqcodel_tc_destroy(struct tc *tc)
3345 {
3346 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3347 tc_destroy(tc);
3348 free(fqcodel);
3349 }
3350
3351 static int
3352 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3353 {
3354 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3355 smap_add_format(details, "target", "%u", fqcodel->target);
3356 smap_add_format(details, "limit", "%u", fqcodel->limit);
3357 smap_add_format(details, "interval", "%u", fqcodel->interval);
3358 smap_add_format(details, "flows", "%u", fqcodel->flows);
3359 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3360 return 0;
3361 }
3362
3363 static int
3364 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3365 {
3366 struct fqcodel fqcodel;
3367
3368 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3369 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3370 fqcodel.flows, fqcodel.quantum);
3371 fqcodel_get__(netdev)->target = fqcodel.target;
3372 fqcodel_get__(netdev)->limit = fqcodel.limit;
3373 fqcodel_get__(netdev)->interval = fqcodel.interval;
3374 fqcodel_get__(netdev)->flows = fqcodel.flows;
3375 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3376 return 0;
3377 }
3378
/* tc_ops implementation for the "linux-fq_codel" QoS type.  fq_codel is a
 * classless qdisc, so all per-queue (class) operations are null. */
static const struct tc_ops tc_ops_fqcodel = {
    "fq_codel",                   /* linux_name */
    "linux-fq_codel",             /* ovs_name */
    FQCODEL_N_QUEUES,             /* n_queues */
    fqcodel_tc_install,
    fqcodel_tc_load,
    fqcodel_tc_destroy,
    fqcodel_qdisc_get,
    fqcodel_qdisc_set,
    NULL,                         /* class_get */
    NULL,                         /* class_set */
    NULL,                         /* class_delete */
    NULL,                         /* class_get_stats */
    NULL                          /* class_dump_stats */
};
3394 \f
/* SFQ traffic control class. */

#define SFQ_N_QUEUES 0x0000

/* SFQ qdisc state cached in netdev->tc for a netdev configured with the
 * "linux-sfq" QoS type.  Semantics follow the kernel's sfq qdisc (see
 * tc-sfq(8)): 'quantum' is bytes dequeued per round (defaults to the MTU
 * here), 'perturb' the hash perturbation period. */
struct sfq {
    struct tc tc;
    uint32_t quantum;
    uint32_t perturb;
};

/* Returns the SFQ state of 'netdev_', which must currently have an SFQ
 * qdisc installed (netdev->tc embedded in a "struct sfq"). */
static struct sfq *
sfq_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct sfq, tc);
}
3411
3412 static void
3413 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3414 {
3415 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3416 struct sfq *sfq;
3417
3418 sfq = xmalloc(sizeof *sfq);
3419 tc_init(&sfq->tc, &tc_ops_sfq);
3420 sfq->perturb = perturb;
3421 sfq->quantum = quantum;
3422
3423 netdev->tc = &sfq->tc;
3424 }
3425
/* Creates an SFQ root qdisc on 'netdev', roughly equivalent to "tc qdisc
 * replace dev <dev> root sfq quantum <quantum> perturb <perturb>".  A zero
 * 'quantum' falls back to the device MTU (or the kernel default when the
 * MTU cannot be determined); a zero 'perturb' falls back to 10.  Returns 0
 * if successful, otherwise a positive errno value. */
static int
sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
{
    struct tc_sfq_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int mtu;
    int mtu_error, error;
    mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);

    /* Delete any existing root qdisc first: the request below uses
     * NLM_F_EXCL, so it would fail if one were already installed. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    if (!quantum) {
        if (!mtu_error) {
            opt.quantum = mtu; /* if we cannot find mtu, use default */
        }
    } else {
        opt.quantum = quantum;
    }

    if (!perturb) {
        opt.perturb_period = 10;
    } else {
        opt.perturb_period = perturb;
    }

    /* SFQ takes its options as a flat struct tc_sfq_qopt rather than as
     * nested netlink attributes. */
    nl_msg_put_string(&request, TCA_KIND, "sfq");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "quantum %u, perturb %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.quantum, opt.perturb_period,
                     error, ovs_strerror(error));
    }
    return error;
}
3474
3475 static void
3476 sfq_parse_qdisc_details__(struct netdev *netdev,
3477 const struct smap *details, struct sfq *sfq)
3478 {
3479 const char *perturb_s;
3480 const char *quantum_s;
3481 int mtu;
3482 int mtu_error;
3483
3484 perturb_s = smap_get(details, "perturb");
3485 quantum_s = smap_get(details, "quantum");
3486 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3487 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3488 if (!sfq->perturb) {
3489 sfq->perturb = 10;
3490 }
3491
3492 if (!sfq->quantum) {
3493 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3494 if (!mtu_error) {
3495 sfq->quantum = mtu;
3496 } else {
3497 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3498 "device without mtu");
3499 return;
3500 }
3501 }
3502 }
3503
3504 static int
3505 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3506 {
3507 int error;
3508 struct sfq sfq;
3509
3510 sfq_parse_qdisc_details__(netdev, details, &sfq);
3511 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3512 if (!error) {
3513 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3514 }
3515 return error;
3516 }
3517
3518 static int
3519 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3520 {
3521 const struct tc_sfq_qopt *sfq;
3522 struct nlattr *nlattr;
3523 const char * kind;
3524 int error;
3525
3526 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3527 if (error == 0) {
3528 sfq = nl_attr_get(nlattr);
3529 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
3530 return 0;
3531 }
3532
3533 return error;
3534 }
3535
3536 static void
3537 sfq_tc_destroy(struct tc *tc)
3538 {
3539 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3540 tc_destroy(tc);
3541 free(sfq);
3542 }
3543
3544 static int
3545 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3546 {
3547 const struct sfq *sfq = sfq_get__(netdev);
3548 smap_add_format(details, "quantum", "%u", sfq->quantum);
3549 smap_add_format(details, "perturb", "%u", sfq->perturb);
3550 return 0;
3551 }
3552
3553 static int
3554 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3555 {
3556 struct sfq sfq;
3557
3558 sfq_parse_qdisc_details__(netdev, details, &sfq);
3559 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3560 sfq_get__(netdev)->quantum = sfq.quantum;
3561 sfq_get__(netdev)->perturb = sfq.perturb;
3562 return 0;
3563 }
3564
/* tc_ops implementation for the "linux-sfq" QoS type.  SFQ is a classless
 * qdisc, so all per-queue (class) operations are null. */
static const struct tc_ops tc_ops_sfq = {
    "sfq",                        /* linux_name */
    "linux-sfq",                  /* ovs_name */
    SFQ_N_QUEUES,                 /* n_queues */
    sfq_tc_install,
    sfq_tc_load,
    sfq_tc_destroy,
    sfq_qdisc_get,
    sfq_qdisc_set,
    NULL,                         /* class_get */
    NULL,                         /* class_set */
    NULL,                         /* class_delete */
    NULL,                         /* class_get_stats */
    NULL                          /* class_dump_stats */
};
3580 \f
/* HTB traffic control class. */

#define HTB_N_QUEUES 0xf000
#define HTB_RATE2QUANTUM 10     /* r2q divisor passed to the kernel qdisc. */

/* Qdisc-level HTB state cached in netdev->tc for a netdev configured with
 * the "linux-htb" QoS type. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};

/* Per-queue (HTB class) state; embedded in struct tc's queue hmap via
 * 'tc_queue'. */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};

/* Returns the HTB state of 'netdev_', which must currently have an HTB
 * qdisc installed (netdev->tc embedded in a "struct htb"). */
static struct htb *
htb_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct htb, tc);
}
3605
3606 static void
3607 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3608 {
3609 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3610 struct htb *htb;
3611
3612 htb = xmalloc(sizeof *htb);
3613 tc_init(&htb->tc, &tc_ops_htb);
3614 htb->max_rate = max_rate;
3615
3616 netdev->tc = &htb->tc;
3617 }
3618
/* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_setup_qdisc__(struct netdev *netdev)
{
    size_t opt_offset;
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    /* Delete any existing root qdisc first: the request below uses
     * NLM_F_EXCL, so it would fail if one were already installed. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = HTB_RATE2QUANTUM;
    opt.version = 3;
    opt.defcls = 1;             /* Unclassified traffic goes to class 1:1. */

    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    return tc_transact(&request, NULL);
}
3653
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * Fails with a warning (and the MTU lookup's errno) when the device has no
 * MTU, since the rate tables depend on it.  Returns 0 if successful,
 * otherwise a positive errno value. */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
        opt.quantum = mtu;
    }
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    /* Rate tables for 'rate' and 'ceil' must accompany the parameters. */
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
3712
/* Parses Netlink attributes in 'options' for HTB parameters and stores a
 * description of them into 'details'.  The description complies with the
 * specification given in the vswitch database documentation for linux-htb
 * queue details.  Returns 0 if successful, EPROTO on a malformed message. */
static int
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
{
    static const struct nl_policy tca_htb_policy[] = {
        [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
                            .min_len = sizeof(struct tc_htb_opt) },
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
    const struct tc_htb_opt *htb;

    if (!nl_parse_nested(nl_options, tca_htb_policy,
                         attrs, ARRAY_SIZE(tca_htb_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HTB class options");
        return EPROTO;
    }

    htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
    class->min_rate = htb->rate.rate;
    class->max_rate = htb->ceil.rate;
    /* 'buffer' is in ticks; convert back to bytes at the class's rate. */
    class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
    class->priority = htb->prio;
    return 0;
}
3741
3742 static int
3743 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3744 struct htb_class *options,
3745 struct netdev_queue_stats *stats)
3746 {
3747 struct nlattr *nl_options;
3748 unsigned int handle;
3749 int error;
3750
3751 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3752 if (!error && queue_id) {
3753 unsigned int major = tc_get_major(handle);
3754 unsigned int minor = tc_get_minor(handle);
3755 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3756 *queue_id = minor - 1;
3757 } else {
3758 error = EPROTO;
3759 }
3760 }
3761 if (!error && options) {
3762 error = htb_parse_tca_options__(nl_options, options);
3763 }
3764 return error;
3765 }
3766
/* Fills 'hc' with the qdisc-level HTB configuration in 'details'.  Only
 * "max-rate" (bits/s in the database, bytes/s internally) is meaningful at
 * qdisc level; when unset or zero it falls back to the link's current
 * speed, defaulting to 100 Mbps when that is unknown. */
static void
htb_parse_qdisc_details__(struct netdev *netdev_,
                          const struct smap *details, struct htb_class *hc)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *max_rate_s;

    max_rate_s = smap_get(details, "max-rate");
    hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
    if (!hc->max_rate) {
        enum netdev_features current;

        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }
    /* The default class gets the full link rate. */
    hc->min_rate = hc->max_rate;
    hc->burst = 0;
    hc->priority = 0;
}
3787
/* Fills 'hc' from the per-queue keys in 'details' ("min-rate", "max-rate",
 * "burst", "priority"), clamping rates against the qdisc's configured
 * maximum.  Rates are bits/s in the database, bytes/s internally.  Returns
 * 0 if successful, otherwise a positive errno value (fails when the device
 * has no MTU, which the clamping below requires). */
static int
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
{
    const struct htb *htb = htb_get__(netdev);
    const char *min_rate_s = smap_get(details, "min-rate");
    const char *max_rate_s = smap_get(details, "max-rate");
    const char *burst_s = smap_get(details, "burst");
    const char *priority_s = smap_get(details, "priority");
    int mtu, error;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate */
    hc->max_rate = (max_rate_s
                    ? strtoull(max_rate_s, NULL, 10) / 8
                    : htb->max_rate);
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority */
    hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;

    return 0;
}
3836
3837 static int
3838 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3839 unsigned int parent, struct htb_class *options,
3840 struct netdev_queue_stats *stats)
3841 {
3842 struct ofpbuf *reply;
3843 int error;
3844
3845 error = tc_query_class(netdev, handle, parent, &reply);
3846 if (!error) {
3847 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3848 ofpbuf_delete(reply);
3849 }
3850 return error;
3851 }
3852
3853 static int
3854 htb_tc_install(struct netdev *netdev, const struct smap *details)
3855 {
3856 int error;
3857
3858 error = htb_setup_qdisc__(netdev);
3859 if (!error) {
3860 struct htb_class hc;
3861
3862 htb_parse_qdisc_details__(netdev, details, &hc);
3863 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3864 tc_make_handle(1, 0), &hc);
3865 if (!error) {
3866 htb_install__(netdev, hc.max_rate);
3867 }
3868 }
3869 return error;
3870 }
3871
3872 static struct htb_class *
3873 htb_class_cast__(const struct tc_queue *queue)
3874 {
3875 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3876 }
3877
3878 static void
3879 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3880 const struct htb_class *hc)
3881 {
3882 struct htb *htb = htb_get__(netdev);
3883 size_t hash = hash_int(queue_id, 0);
3884 struct tc_queue *queue;
3885 struct htb_class *hcp;
3886
3887 queue = tc_find_queue__(netdev, queue_id, hash);
3888 if (queue) {
3889 hcp = htb_class_cast__(queue);
3890 } else {
3891 hcp = xmalloc(sizeof *hcp);
3892 queue = &hcp->tc_queue;
3893 queue->queue_id = queue_id;
3894 queue->created = time_msec();
3895 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3896 }
3897
3898 hcp->min_rate = hc->min_rate;
3899 hcp->max_rate = hc->max_rate;
3900 hcp->burst = hc->burst;
3901 hcp->priority = hc->priority;
3902 }
3903
/* tc->ops->tc_load implementation for "linux-htb": rebuilds the cached HTB
 * state from the kernel.  The qdisc reply in 'nlmsg' is unused; instead the
 * qdisc rate is re-queried from the default class 1:0xfffe and the queues
 * are discovered via a class dump. */
static int
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct htb_class hc;

    /* Get qdisc options. */
    hc.max_rate = 0;
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    htb_install__(netdev, hc.max_rate);

    /* Get queues. */
    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Classes that do not parse as OVS-managed queues are skipped. */
        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            htb_update_queue__(netdev, queue_id, &hc);
        }
    }
    finish_queue_dump(&state);

    return 0;
}
3931
/* tc->ops->tc_destroy implementation for "linux-htb": frees every cached
 * queue and then the "struct htb" itself. */
static void
htb_tc_destroy(struct tc *tc)
{
    struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
    struct htb_class *hc, *next;

    /* _SAFE variant: each node is removed and freed inside the loop. */
    HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
        hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
        free(hc);
    }
    tc_destroy(tc);
    free(htb);
}
3945
3946 static int
3947 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3948 {
3949 const struct htb *htb = htb_get__(netdev);
3950 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
3951 return 0;
3952 }
3953
3954 static int
3955 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3956 {
3957 struct htb_class hc;
3958 int error;
3959
3960 htb_parse_qdisc_details__(netdev, details, &hc);
3961 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3962 tc_make_handle(1, 0), &hc);
3963 if (!error) {
3964 htb_get__(netdev)->max_rate = hc.max_rate;
3965 }
3966 return error;
3967 }
3968
3969 static int
3970 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3971 const struct tc_queue *queue, struct smap *details)
3972 {
3973 const struct htb_class *hc = htb_class_cast__(queue);
3974
3975 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3976 if (hc->min_rate != hc->max_rate) {
3977 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3978 }
3979 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3980 if (hc->priority) {
3981 smap_add_format(details, "priority", "%u", hc->priority);
3982 }
3983 return 0;
3984 }
3985
3986 static int
3987 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3988 const struct smap *details)
3989 {
3990 struct htb_class hc;
3991 int error;
3992
3993 error = htb_parse_class_details__(netdev, details, &hc);
3994 if (error) {
3995 return error;
3996 }
3997
3998 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3999 tc_make_handle(1, 0xfffe), &hc);
4000 if (error) {
4001 return error;
4002 }
4003
4004 htb_update_queue__(netdev, queue_id, &hc);
4005 return 0;
4006 }
4007
4008 static int
4009 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
4010 {
4011 struct htb_class *hc = htb_class_cast__(queue);
4012 struct htb *htb = htb_get__(netdev);
4013 int error;
4014
4015 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4016 if (!error) {
4017 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
4018 free(hc);
4019 }
4020 return error;
4021 }
4022
4023 static int
4024 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4025 struct netdev_queue_stats *stats)
4026 {
4027 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4028 tc_make_handle(1, 0xfffe), NULL, stats);
4029 }
4030
4031 static int
4032 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4033 const struct ofpbuf *nlmsg,
4034 netdev_dump_queue_stats_cb *cb, void *aux)
4035 {
4036 struct netdev_queue_stats stats;
4037 unsigned int handle, major, minor;
4038 int error;
4039
4040 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4041 if (error) {
4042 return error;
4043 }
4044
4045 major = tc_get_major(handle);
4046 minor = tc_get_minor(handle);
4047 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4048 (*cb)(minor - 1, &stats, aux);
4049 }
4050 return 0;
4051 }
4052
/* tc_ops implementation for the "linux-htb" QoS type.  HTB is classful, so
 * every per-queue (class) operation is provided. */
static const struct tc_ops tc_ops_htb = {
    "htb",                      /* linux_name */
    "linux-htb",                /* ovs_name */
    HTB_N_QUEUES,               /* n_queues */
    htb_tc_install,
    htb_tc_load,
    htb_tc_destroy,
    htb_qdisc_get,
    htb_qdisc_set,
    htb_class_get,
    htb_class_set,
    htb_class_delete,
    htb_class_get_stats,
    htb_class_dump_stats
};
4068 \f
/* "linux-hfsc" traffic control class. */

#define HFSC_N_QUEUES 0xf000

/* Qdisc-level HFSC state cached in netdev->tc.  Rates are in bytes/s
 * (see hfsc_parse_qdisc_details__(), which divides bits/s by 8). */
struct hfsc {
    struct tc tc;
    uint32_t max_rate;
};

/* Per-queue (HFSC class) state; embedded in struct tc's queue hmap via
 * 'tc_queue'.  Rates in bytes/s. */
struct hfsc_class {
    struct tc_queue tc_queue;
    uint32_t min_rate;
    uint32_t max_rate;
};

/* Returns the HFSC state of 'netdev_', which must currently have an HFSC
 * qdisc installed (netdev->tc embedded in a "struct hfsc"). */
static struct hfsc *
hfsc_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct hfsc, tc);
}

/* Converts the generic 'queue' back into its containing HFSC class. */
static struct hfsc_class *
hfsc_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
}
4096
4097 static void
4098 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4099 {
4100 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4101 struct hfsc *hfsc;
4102
4103 hfsc = xmalloc(sizeof *hfsc);
4104 tc_init(&hfsc->tc, &tc_ops_hfsc);
4105 hfsc->max_rate = max_rate;
4106 netdev->tc = &hfsc->tc;
4107 }
4108
4109 static void
4110 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4111 const struct hfsc_class *hc)
4112 {
4113 size_t hash;
4114 struct hfsc *hfsc;
4115 struct hfsc_class *hcp;
4116 struct tc_queue *queue;
4117
4118 hfsc = hfsc_get__(netdev);
4119 hash = hash_int(queue_id, 0);
4120
4121 queue = tc_find_queue__(netdev, queue_id, hash);
4122 if (queue) {
4123 hcp = hfsc_class_cast__(queue);
4124 } else {
4125 hcp = xmalloc(sizeof *hcp);
4126 queue = &hcp->tc_queue;
4127 queue->queue_id = queue_id;
4128 queue->created = time_msec();
4129 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4130 }
4131
4132 hcp->min_rate = hc->min_rate;
4133 hcp->max_rate = hc->max_rate;
4134 }
4135
/* Parses the nested TCA_OPTIONS of an HFSC class into 'class'.  OVS only
 * supports the restricted configurations it generates itself: linear
 * service curves (m1 == 0, d == 0) with matching real-time and link-share
 * curves; anything else fails with EPROTO. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    /* rsc/fsc/usc: real-time, link-share, and upper-limit service curves. */
    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    /* A nonzero m1 or d would make the curve two-sloped (non-linear). */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
4194
4195 static int
4196 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4197 struct hfsc_class *options,
4198 struct netdev_queue_stats *stats)
4199 {
4200 int error;
4201 unsigned int handle;
4202 struct nlattr *nl_options;
4203
4204 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4205 if (error) {
4206 return error;
4207 }
4208
4209 if (queue_id) {
4210 unsigned int major, minor;
4211
4212 major = tc_get_major(handle);
4213 minor = tc_get_minor(handle);
4214 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4215 *queue_id = minor - 1;
4216 } else {
4217 return EPROTO;
4218 }
4219 }
4220
4221 if (options) {
4222 error = hfsc_parse_tca_options__(nl_options, options);
4223 }
4224
4225 return error;
4226 }
4227
4228 static int
4229 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4230 unsigned int parent, struct hfsc_class *options,
4231 struct netdev_queue_stats *stats)
4232 {
4233 int error;
4234 struct ofpbuf *reply;
4235
4236 error = tc_query_class(netdev, handle, parent, &reply);
4237 if (error) {
4238 return error;
4239 }
4240
4241 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4242 ofpbuf_delete(reply);
4243 return error;
4244 }
4245
4246 static void
4247 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4248 struct hfsc_class *class)
4249 {
4250 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4251 uint32_t max_rate;
4252 const char *max_rate_s;
4253
4254 max_rate_s = smap_get(details, "max-rate");
4255 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4256
4257 if (!max_rate) {
4258 enum netdev_features current;
4259
4260 netdev_linux_read_features(netdev);
4261 current = !netdev->get_features_error ? netdev->current : 0;
4262 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4263 }
4264
4265 class->min_rate = max_rate;
4266 class->max_rate = max_rate;
4267 }
4268
4269 static int
4270 hfsc_parse_class_details__(struct netdev *netdev,
4271 const struct smap *details,
4272 struct hfsc_class * class)
4273 {
4274 const struct hfsc *hfsc;
4275 uint32_t min_rate, max_rate;
4276 const char *min_rate_s, *max_rate_s;
4277
4278 hfsc = hfsc_get__(netdev);
4279 min_rate_s = smap_get(details, "min-rate");
4280 max_rate_s = smap_get(details, "max-rate");
4281
4282 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4283 min_rate = MAX(min_rate, 1);
4284 min_rate = MIN(min_rate, hfsc->max_rate);
4285
4286 max_rate = (max_rate_s
4287 ? strtoull(max_rate_s, NULL, 10) / 8
4288 : hfsc->max_rate);
4289 max_rate = MAX(max_rate, min_rate);
4290 max_rate = MIN(max_rate, hfsc->max_rate);
4291
4292 class->min_rate = min_rate;
4293 class->max_rate = max_rate;
4294
4295 return 0;
4296 }
4297
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
static int
hfsc_setup_qdisc__(struct netdev * netdev)
{
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    /* Remove any existing root qdisc first.  The return value is ignored:
     * there may simply have been nothing to delete. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);

    if (!tcmsg) {
        /* tc_make_request() fails only when the ifindex lookup fails. */
        return ENODEV;
    }

    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* "default 1": packets not otherwise classified go to class 1:1. */
    memset(&opt, 0, sizeof opt);
    opt.defcls = 1;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    /* tc_transact() also releases 'request'. */
    return tc_transact(&request, NULL);
}
4328
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>" */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);

    if (!tcmsg) {
        /* tc_make_request() fails only when the ifindex lookup fails. */
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Only linear service curves are used, so the initial slope 'm1' and
     * inflection point 'd' are zero and 'm2' carries the steady-state rate.
     * (hfsc_parse_tca_options__() makes the same assumption when reading
     * classes back.) */
    min.m1 = 0;
    min.d = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d = 0;
    max.m2 = class->max_rate;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    /* "sc rate <min_rate>" sets both the real-time (RSC) and link-share
     * (FSC) curves to 'min' -- passing 'min' for TCA_HFSC_FSC is
     * intentional, not a typo. */
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    /* "ul rate <max_rate>" is the upper-limit curve. */
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
4379
4380 static int
4381 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4382 {
4383 int error;
4384 struct hfsc_class class;
4385
4386 error = hfsc_setup_qdisc__(netdev);
4387
4388 if (error) {
4389 return error;
4390 }
4391
4392 hfsc_parse_qdisc_details__(netdev, details, &class);
4393 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4394 tc_make_handle(1, 0), &class);
4395
4396 if (error) {
4397 return error;
4398 }
4399
4400 hfsc_install__(netdev, class.max_rate);
4401 return 0;
4402 }
4403
/* Reconstructs OVS's view of an HFSC qdisc already present on 'netdev' by
 * querying the kernel.  Returns 0 on success, otherwise a positive errno
 * value. */
static int
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct hfsc_class hc;

    /* Query class 1:0xfffe (the class hfsc_tc_install() creates to hold the
     * qdisc-wide max-rate).  The return value is deliberately ignored: on
     * failure 'hc.max_rate' keeps the fallback value 0. */
    hc.max_rate = 0;
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    hfsc_install__(netdev, hc.max_rate);

    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }

    /* Re-create OVS queue state from each HFSC class the kernel reports,
     * skipping any class that does not parse as one of ours. */
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            hfsc_update_queue__(netdev, queue_id, &hc);
        }
    }

    finish_queue_dump(&state);
    return 0;
}
4430
4431 static void
4432 hfsc_tc_destroy(struct tc *tc)
4433 {
4434 struct hfsc *hfsc;
4435 struct hfsc_class *hc, *next;
4436
4437 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4438
4439 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4440 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4441 free(hc);
4442 }
4443
4444 tc_destroy(tc);
4445 free(hfsc);
4446 }
4447
4448 static int
4449 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4450 {
4451 const struct hfsc *hfsc;
4452 hfsc = hfsc_get__(netdev);
4453 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
4454 return 0;
4455 }
4456
4457 static int
4458 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4459 {
4460 int error;
4461 struct hfsc_class class;
4462
4463 hfsc_parse_qdisc_details__(netdev, details, &class);
4464 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4465 tc_make_handle(1, 0), &class);
4466
4467 if (!error) {
4468 hfsc_get__(netdev)->max_rate = class.max_rate;
4469 }
4470
4471 return error;
4472 }
4473
4474 static int
4475 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4476 const struct tc_queue *queue, struct smap *details)
4477 {
4478 const struct hfsc_class *hc;
4479
4480 hc = hfsc_class_cast__(queue);
4481 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4482 if (hc->min_rate != hc->max_rate) {
4483 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4484 }
4485 return 0;
4486 }
4487
4488 static int
4489 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4490 const struct smap *details)
4491 {
4492 int error;
4493 struct hfsc_class class;
4494
4495 error = hfsc_parse_class_details__(netdev, details, &class);
4496 if (error) {
4497 return error;
4498 }
4499
4500 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4501 tc_make_handle(1, 0xfffe), &class);
4502 if (error) {
4503 return error;
4504 }
4505
4506 hfsc_update_queue__(netdev, queue_id, &class);
4507 return 0;
4508 }
4509
4510 static int
4511 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4512 {
4513 int error;
4514 struct hfsc *hfsc;
4515 struct hfsc_class *hc;
4516
4517 hc = hfsc_class_cast__(queue);
4518 hfsc = hfsc_get__(netdev);
4519
4520 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4521 if (!error) {
4522 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4523 free(hc);
4524 }
4525 return error;
4526 }
4527
4528 static int
4529 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4530 struct netdev_queue_stats *stats)
4531 {
4532 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4533 tc_make_handle(1, 0xfffe), NULL, stats);
4534 }
4535
4536 static int
4537 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4538 const struct ofpbuf *nlmsg,
4539 netdev_dump_queue_stats_cb *cb, void *aux)
4540 {
4541 struct netdev_queue_stats stats;
4542 unsigned int handle, major, minor;
4543 int error;
4544
4545 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4546 if (error) {
4547 return error;
4548 }
4549
4550 major = tc_get_major(handle);
4551 minor = tc_get_minor(handle);
4552 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4553 (*cb)(minor - 1, &stats, aux);
4554 }
4555 return 0;
4556 }
4557
/* tc operations vtable for the "linux-hfsc" QoS type; see "struct tc_ops"
 * for the contract of each member. */
static const struct tc_ops tc_ops_hfsc = {
    "hfsc",                     /* linux_name */
    "linux-hfsc",               /* ovs_name */
    HFSC_N_QUEUES,              /* n_queues */
    hfsc_tc_install,            /* tc_install */
    hfsc_tc_load,               /* tc_load */
    hfsc_tc_destroy,            /* tc_destroy */
    hfsc_qdisc_get,             /* qdisc_get */
    hfsc_qdisc_set,             /* qdisc_set */
    hfsc_class_get,             /* class_get */
    hfsc_class_set,             /* class_set */
    hfsc_class_delete,          /* class_delete */
    hfsc_class_get_stats,       /* class_get_stats */
    hfsc_class_dump_stats       /* class_dump_stats */
};
4573 \f
4574 /* "linux-default" traffic control class.
4575 *
4576 * This class represents the default, unnamed Linux qdisc. It corresponds to
4577 * the "" (empty string) QoS type in the OVS database. */
4578
4579 static void
4580 default_install__(struct netdev *netdev_)
4581 {
4582 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4583 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4584
4585 /* Nothing but a tc class implementation is allowed to write to a tc. This
4586 * class never does that, so we can legitimately use a const tc object. */
4587 netdev->tc = CONST_CAST(struct tc *, &tc);
4588 }
4589
4590 static int
4591 default_tc_install(struct netdev *netdev,
4592 const struct smap *details OVS_UNUSED)
4593 {
4594 default_install__(netdev);
4595 return 0;
4596 }
4597
4598 static int
4599 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4600 {
4601 default_install__(netdev);
4602 return 0;
4603 }
4604
/* tc operations vtable for the built-in Linux qdisc, which maps to the ""
 * (empty string) QoS type.  Only install and load do anything; the default
 * qdisc exposes no queues, so all per-queue operations are absent. */
static const struct tc_ops tc_ops_default = {
    NULL,                       /* linux_name */
    "",                         /* ovs_name */
    0,                          /* n_queues */
    default_tc_install,
    default_tc_load,
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4620 \f
/* "linux-other" traffic control class.
 *
 * This class stands in for any qdisc that OVS does not recognize or manage.
 * It cannot be installed explicitly; it is only loaded when an unknown qdisc
 * is discovered on a device (see tc_query_qdisc()). */
4624
4625 static int
4626 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4627 {
4628 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4629 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4630
4631 /* Nothing but a tc class implementation is allowed to write to a tc. This
4632 * class never does that, so we can legitimately use a const tc object. */
4633 netdev->tc = CONST_CAST(struct tc *, &tc);
4634 return 0;
4635 }
4636
/* tc operations vtable for "linux-other", the catch-all for qdiscs OVS does
 * not manage.  It can only be loaded (when an unknown qdisc is discovered),
 * never installed, and exposes no queues or configuration. */
static const struct tc_ops tc_ops_other = {
    NULL,                       /* linux_name */
    "linux-other",              /* ovs_name */
    0,                          /* n_queues */
    NULL,                       /* tc_install */
    other_tc_load,
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4652 \f
4653 /* Traffic control. */
4654
/* Number of kernel "tc" ticks per second.  Initialized lazily by
 * read_psched(); call that function before reading this. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * Initialized lazily by read_psched(), like 'ticks_per_s' above.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
4676
/* Composes and returns the tc handle 'major':'minor' (the kernel packs the
 * major number into the upper 16 bits and the minor into the lower 16). */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
4683
/* Returns the major number (upper 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
4690
/* Returns the minor number (lower 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
4697
/* Composes in 'request' an rtnetlink message of the given 'type' (e.g.
 * RTM_NEWQDISC) for 'netdev', with 'flags' OR'd into the standard
 * NLM_F_REQUEST flag.  Returns the embedded tcmsg body, partially filled in,
 * or NULL if 'netdev''s ifindex cannot be obtained (in which case 'request'
 * is left uninitialized).
 *
 * On success the caller owns 'request' and must fill in tcm_handle and
 * tcm_parent before sending it (typically via tc_transact(), which also
 * releases 'request'). */
static struct tcmsg *
tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
                struct ofpbuf *request)
{
    struct tcmsg *tcmsg;
    int ifindex;
    int error;

    error = get_ifindex(netdev, &ifindex);
    if (error) {
        return NULL;
    }

    /* 512 bytes is just an initial allocation hint; the ofpbuf grows on
     * demand. */
    ofpbuf_init(request, 512);
    nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
    tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
    tcmsg->tcm_family = AF_UNSPEC;
    tcmsg->tcm_ifindex = ifindex;
    /* Caller should fill in tcmsg->tcm_handle. */
    /* Caller should fill in tcmsg->tcm_parent. */

    return tcmsg;
}
4721
/* Sends 'request' on the rtnetlink socket, storing the reply (if 'replyp' is
 * nonnull) in '*replyp', and releases 'request' in every case.  Returns 0 on
 * success, otherwise a positive errno value. */
static int
tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
{
    int status;

    status = nl_transact(NETLINK_ROUTE, request, replyp);
    ofpbuf_uninit(request);
    return status;
}
4729
/* Adds or deletes a root ingress qdisc on 'netdev'.  We use this for
 * policing configuration.
 *
 * This function is equivalent to running the following when 'add' is true:
 *     /sbin/tc qdisc add dev <devname> handle ffff: ingress
 *
 * This function is equivalent to running the following when 'add' is false:
 *     /sbin/tc qdisc del dev <devname> handle ffff: ingress
 *
 * The configuration and stats may be seen with the following command:
 *     /sbin/tc -s qdisc show dev <devname>
 *
 * Returns 0 if successful, otherwise a positive errno value.
 */
static int
tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
    int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;

    tcmsg = tc_make_request(netdev, type, flags, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* ffff: is the conventional handle for the ingress qdisc. */
    tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
    tcmsg->tcm_parent = TC_H_INGRESS;
    nl_msg_put_string(&request, TCA_KIND, "ingress");
    nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);

    error = tc_transact(&request, NULL);
    if (error) {
        /* If we're deleting the qdisc, don't worry about some of the
         * error conditions. */
        if (!add && (error == ENOENT || error == EINVAL)) {
            return 0;
        }
        return error;
    }

    return 0;
}
4774
/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
 * of 'kbits_burst'.
 *
 * This function is equivalent to running:
 *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
 *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
 *              mtu 65535 drop
 *
 * The configuration and stats may be seen with the following command:
 *     /sbin/tc -s filter show dev <devname> parent ffff:
 *
 * Returns 0 if successful, otherwise a positive errno value.
 */
static int
tc_add_policer(struct netdev *netdev,
               uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct tc_police tc_police;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    size_t basic_offset;
    size_t police_offset;
    int error;
    int mtu = 65535;

    memset(&tc_police, 0, sizeof tc_police);
    tc_police.action = TC_POLICE_SHOT;  /* Drop packets that exceed the rate. */
    tc_police.mtu = mtu;
    /* 'kbits_rate' is in kilobits/s; tc_fill_rate() takes bytes/s. */
    tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);

    /* The following appears wrong in two ways:
     *
     * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
     *   arguments (or at least consistently "bytes" as both or "bits" as
     *   both), but this supplies bytes for the first argument and bits for the
     *   second.
     *
     * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
     *
     * However if you "fix" those problems then "tc filter show ..." shows
     * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
     * 1,000,000 bits, whereas this actually ends up doing the right thing from
     * tc's point of view.  Whatever. */
    tc_police.burst = tc_bytes_to_ticks(
        tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);

    tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* Attach the filter to the ingress qdisc created by
     * tc_add_del_ingress_qdisc(). */
    tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
    /* tcm_info packs the filter priority (49) with the protocol (all). */
    tcmsg->tcm_info = tc_make_handle(49,
                                     (OVS_FORCE uint16_t) htons(ETH_P_ALL));

    nl_msg_put_string(&request, TCA_KIND, "basic");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
    nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
    tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
    nl_msg_end_nested(&request, police_offset);
    nl_msg_end_nested(&request, basic_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        return error;
    }

    return 0;
}
4845
/* Initializes 'ticks_per_s' and 'buffer_hz' from /proc/net/psched, exactly
 * once per process.  On any failure the conservative defaults ticks_per_s=1.0
 * and buffer_hz=100 remain in effect. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *     (Before that, there are hints that it was 1000000000.)
     *
     *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *     above.
     *
     *                        /proc/net/psched
     *     -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d ticks_per_s     buffer_hz
     *     ------- --------- ---------- ------------- ----------- -------------
     * [1] 819,200 1,000,000  1,000,000           100     819,200           100
     * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [4]   1,000     1,024  1,000,000           100     976,562           100
     * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
     * [6]   1,000        64  1,000,000           249  15,625,000           249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    if (!ovsthread_once_start(&once)) {
        return;
    }

    /* Fallback values, kept if anything below fails. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    /* Guard the divisions below against zero parameters. */
    if (!a || !c) {
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
4928
4929 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4930 * rate of 'rate' bytes per second. */
4931 static unsigned int
4932 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4933 {
4934 read_psched();
4935 return (rate * ticks) / ticks_per_s;
4936 }
4937
4938 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4939 * rate of 'rate' bytes per second. */
4940 static unsigned int
4941 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4942 {
4943 read_psched();
4944 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4945 }
4946
4947 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4948 * a transmission rate of 'rate' bytes per second. */
4949 static unsigned int
4950 tc_buffer_per_jiffy(unsigned int rate)
4951 {
4952 read_psched();
4953 return rate / buffer_hz;
4954 }
4955
4956 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4957 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4958 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4959 * stores NULL into it if it is absent.
4960 *
4961 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4962 * 'msg'.
4963 *
4964 * Returns 0 if successful, otherwise a positive errno value. */
4965 static int
4966 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4967 struct nlattr **options)
4968 {
4969 static const struct nl_policy tca_policy[] = {
4970 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4971 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4972 };
4973 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4974
4975 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4976 tca_policy, ta, ARRAY_SIZE(ta))) {
4977 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4978 goto error;
4979 }
4980
4981 if (kind) {
4982 *kind = nl_attr_get_string(ta[TCA_KIND]);
4983 }
4984
4985 if (options) {
4986 *options = ta[TCA_OPTIONS];
4987 }
4988
4989 return 0;
4990
4991 error:
4992 if (kind) {
4993 *kind = NULL;
4994 }
4995 if (options) {
4996 *options = NULL;
4997 }
4998 return EPROTO;
4999 }
5000
/* Given Netlink 'msg' that describes a class, extracts the class handle (the
 * full tcm_handle, major and minor) into '*handlep', its TCA_OPTIONS
 * attribute into '*options', and its queue statistics into '*stats'.  Any of
 * the output arguments may be null to skip that output; on failure the
 * non-null outputs are zeroed/NULLed.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        /* Queue drops are the closest thing the kernel reports to transmit
         * errors here. */
        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
5075
5076 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5077 * on 'netdev'. */
5078 static int
5079 tc_query_class(const struct netdev *netdev,
5080 unsigned int handle, unsigned int parent,
5081 struct ofpbuf **replyp)
5082 {
5083 struct ofpbuf request;
5084 struct tcmsg *tcmsg;
5085 int error;
5086
5087 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
5088 if (!tcmsg) {
5089 return ENODEV;
5090 }
5091 tcmsg->tcm_handle = handle;
5092 tcmsg->tcm_parent = parent;
5093
5094 error = tc_transact(&request, replyp);
5095 if (error) {
5096 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5097 netdev_get_name(netdev),
5098 tc_get_major(handle), tc_get_minor(handle),
5099 tc_get_major(parent), tc_get_minor(parent),
5100 ovs_strerror(error));
5101 }
5102 return error;
5103 }
5104
5105 /* Equivalent to "tc class del dev <name> handle <handle>". */
5106 static int
5107 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5108 {
5109 struct ofpbuf request;
5110 struct tcmsg *tcmsg;
5111 int error;
5112
5113 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5114 if (!tcmsg) {
5115 return ENODEV;
5116 }
5117 tcmsg->tcm_handle = handle;
5118 tcmsg->tcm_parent = 0;
5119
5120 error = tc_transact(&request, NULL);
5121 if (error) {
5122 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5123 netdev_get_name(netdev),
5124 tc_get_major(handle), tc_get_minor(handle),
5125 ovs_strerror(error));
5126 }
5127 return error;
5128 }
5129
/* Equivalent to "tc qdisc del dev <name> root".  Also destroys OVS's cached
 * tc state for 'netdev_' when the deletion (effectively) succeeds.  Returns 0
 * if successful, otherwise a positive errno value. */
static int
tc_del_qdisc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* 1:0 is the root handle that OVS-installed qdiscs use. */
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    error = tc_transact(&request, NULL);
    if (error == EINVAL) {
        /* EINVAL probably means that the default qdisc was in use, in which
         * case we've accomplished our purpose. */
        error = 0;
    }
    if (!error && netdev->tc) {
        /* Drop the cached tc state; the next tc_query_qdisc() will rebuild
         * it from the kernel. */
        if (netdev->tc->ops->tc_destroy) {
            netdev->tc->ops->tc_destroy(netdev->tc);
        }
        netdev->tc = NULL;
    }
    return error;
}
5160
/* Returns true if it is safe to issue an RTM_GETQDISC request that may dump a
 * built-in qdisc, false otherwise.  Kernels before 2.6.35 can OOPS on such a
 * request (see the comment in tc_query_qdisc()).  The kernel version is
 * probed once via uname() and the result cached; probe failures
 * conservatively report "unsafe". */
static bool
getqdisc_is_safe(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static bool safe = false;

    if (ovsthread_once_start(&once)) {
        struct utsname utsname;
        int major, minor;

        if (uname(&utsname) == -1) {
            VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
        } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
            VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
        } else if (major < 2 || (major == 2 && minor < 35)) {
            VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
                      utsname.release);
        } else {
            safe = true;
        }
        ovsthread_once_done(&once);
    }
    return safe;
}
5185
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are, instantiating the matching tc class
 * ("linux-htb", "linux-hfsc", "", or "linux-other") on 'netdev'.  Returns 0
 * if successful, otherwise a positive errno value. */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    /* Already known; nothing to do. */
    if (netdev->tc) {
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it.  tc_load sets netdev->tc exactly when it succeeds. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
5264
5265 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5266 approximate the time to transmit packets of various lengths. For an MTU of
5267 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5268 represents two possible packet lengths; for a MTU of 513 through 1024, four
5269 possible lengths; and so on.
5270
5271 Returns, for the specified 'mtu', the number of bits that packet lengths
5272 need to be shifted right to fit within such a 256-entry table. */
5273 static int
5274 tc_calc_cell_log(unsigned int mtu)
5275 {
5276 int cell_log;
5277
5278 if (!mtu) {
5279 mtu = ETH_PAYLOAD_MAX;
5280 }
5281 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5282
5283 for (cell_log = 0; mtu >= 256; cell_log++) {
5284 mtu >>= 1;
5285 }
5286
5287 return cell_log;
5288 }
5289
/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'. */
static void
tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
{
    memset(rate, 0, sizeof *rate);
    /* Scale factor so packet lengths index a 256-entry rtab table; see
     * tc_calc_cell_log() above. */
    rate->cell_log = tc_calc_cell_log(mtu);
    /* rate->overhead = 0; */           /* New in 2.6.24, not yet in some */
    /* rate->cell_align = 0; */         /* distro headers. */
    rate->mpu = ETH_TOTAL_MIN;          /* "Minimum packet unit": bill short
                                         * packets as minimum-size frames. */
    /* NOTE(review): struct tc_ratespec's 'rate' member is 32 bits wide, so a
     * 'Bps' above ~4 GB/s would be truncated here -- confirm callers cap it. */
    rate->rate = Bps;
}
5302
5303 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5304 * attribute of the specified "type".
5305 *
5306 * See tc_calc_cell_log() above for a description of "rtab"s. */
5307 static void
5308 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5309 {
5310 uint32_t *rtab;
5311 unsigned int i;
5312
5313 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5314 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5315 unsigned packet_size = (i + 1) << rate->cell_log;
5316 if (packet_size < rate->mpu) {
5317 packet_size = rate->mpu;
5318 }
5319 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5320 }
5321 }
5322
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Floor for the burst: one jiffy's worth of bytes at 'Bps' (per
     * tc_buffer_per_jiffy()) plus one MTU, so the qdisc can always send at
     * least a full packet per scheduler tick. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    /* Convert the larger of the requested and minimum bursts to tc ticks. */
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
5333 \f
5334 /* Linux-only functions declared in netdev-linux.h */
5335
5336 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5337 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5338 int
5339 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5340 const char *flag_name, bool enable)
5341 {
5342 const char *netdev_name = netdev_get_name(netdev);
5343 struct ethtool_value evalue;
5344 uint32_t new_flags;
5345 int error;
5346
5347 COVERAGE_INC(netdev_get_ethtool);
5348 memset(&evalue, 0, sizeof evalue);
5349 error = netdev_linux_do_ethtool(netdev_name,
5350 (struct ethtool_cmd *)&evalue,
5351 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5352 if (error) {
5353 return error;
5354 }
5355
5356 COVERAGE_INC(netdev_set_ethtool);
5357 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5358 if (new_flags == evalue.data) {
5359 return 0;
5360 }
5361 evalue.data = new_flags;
5362 error = netdev_linux_do_ethtool(netdev_name,
5363 (struct ethtool_cmd *)&evalue,
5364 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5365 if (error) {
5366 return error;
5367 }
5368
5369 COVERAGE_INC(netdev_get_ethtool);
5370 memset(&evalue, 0, sizeof evalue);
5371 error = netdev_linux_do_ethtool(netdev_name,
5372 (struct ethtool_cmd *)&evalue,
5373 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5374 if (error) {
5375 return error;
5376 }
5377
5378 if (new_flags != evalue.data) {
5379 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5380 "device %s failed", enable ? "enable" : "disable",
5381 flag_name, netdev_name);
5382 return EOPNOTSUPP;
5383 }
5384
5385 return 0;
5386 }
5387 \f
5388 /* Utility functions. */
5389
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * Straight field-for-field copy from the kernel's struct rtnl_link_stats
 * into OVS's struct netdev_stats; no arithmetic is done, so any counter
 * wrap-around in the kernel values is passed through unchanged. */
static void
netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
                                  const struct rtnl_link_stats *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
5417
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * Same field-for-field mapping as netdev_stats_from_rtnl_link_stats() but
 * for the kernel's 64-bit statistics structure, which does not wrap as
 * quickly on busy interfaces.  Preferred when the kernel supplies it; see
 * get_stats_via_netlink(). */
static void
netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
                                    const struct rtnl_link_stats64 *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
5445
5446 static int
5447 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5448 {
5449 struct ofpbuf request;
5450 struct ofpbuf *reply;
5451 int error;
5452
5453 ofpbuf_init(&request, 0);
5454 nl_msg_put_nlmsghdr(&request,
5455 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5456 RTM_GETLINK, NLM_F_REQUEST);
5457 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5458 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5459 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5460 ofpbuf_uninit(&request);
5461 if (error) {
5462 return error;
5463 }
5464
5465 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5466 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5467 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5468 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5469 error = 0;
5470 } else {
5471 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5472 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5473 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5474 error = 0;
5475 } else {
5476 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5477 error = EPROTO;
5478 }
5479 }
5480 } else {
5481 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5482 error = EPROTO;
5483 }
5484
5485
5486 ofpbuf_delete(reply);
5487 return error;
5488 }
5489
5490 static int
5491 get_flags(const struct netdev *dev, unsigned int *flags)
5492 {
5493 struct ifreq ifr;
5494 int error;
5495
5496 *flags = 0;
5497 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5498 if (!error) {
5499 *flags = ifr.ifr_flags;
5500 }
5501 return error;
5502 }
5503
/* Sets the interface flags of the device named 'name' to 'flags' via the
 * SIOCSIFFLAGS ioctl.  Returns 0 if successful, otherwise a positive errno
 * value.
 *
 * NOTE(review): only 'ifr_flags' is initialized here; presumably
 * af_inet_ifreq_ioctl() fills in 'ifr_name' from 'name' (get_flags() above
 * relies on the same behavior) -- confirm against its definition. */
static int
set_flags(const char *name, unsigned int flags)
{
    struct ifreq ifr;

    ifr.ifr_flags = flags;
    return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
}
5512
5513 static int
5514 do_get_ifindex(const char *netdev_name)
5515 {
5516 struct ifreq ifr;
5517 int error;
5518
5519 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5520 COVERAGE_INC(netdev_get_ifindex);
5521
5522 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5523 if (error) {
5524 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5525 netdev_name, ovs_strerror(error));
5526 return -error;
5527 }
5528 return ifr.ifr_ifindex;
5529 }
5530
5531 static int
5532 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5533 {
5534 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5535
5536 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5537 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
5538
5539 if (ifindex < 0) {
5540 netdev->get_ifindex_error = -ifindex;
5541 netdev->ifindex = 0;
5542 } else {
5543 netdev->get_ifindex_error = 0;
5544 netdev->ifindex = ifindex;
5545 }
5546 netdev->cache_valid |= VALID_IFINDEX;
5547 }
5548
5549 *ifindexp = netdev->ifindex;
5550 return netdev->get_ifindex_error;
5551 }
5552
5553 static int
5554 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5555 {
5556 struct ifreq ifr;
5557 int hwaddr_family;
5558 int error;
5559
5560 memset(&ifr, 0, sizeof ifr);
5561 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5562 COVERAGE_INC(netdev_get_hwaddr);
5563 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5564 if (error) {
5565 /* ENODEV probably means that a vif disappeared asynchronously and
5566 * hasn't been removed from the database yet, so reduce the log level
5567 * to INFO for that case. */
5568 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5569 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5570 netdev_name, ovs_strerror(error));
5571 return error;
5572 }
5573 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5574 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5575 VLOG_INFO("%s device has unknown hardware address family %d",
5576 netdev_name, hwaddr_family);
5577 return EINVAL;
5578 }
5579 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5580 return 0;
5581 }
5582
/* Sets the Ethernet hardware address of the device named 'netdev_name' to
 * 'mac' via the SIOCSIFHWADDR ioctl.  Returns 0 if successful, otherwise a
 * positive errno value. */
static int
set_etheraddr(const char *netdev_name, const struct eth_addr mac)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
    /* The kernel requires the address family to be filled in alongside the
     * address bytes themselves. */
    ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
    memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
    COVERAGE_INC(netdev_set_hwaddr);
    error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
    if (error) {
        VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
                 netdev_name, ovs_strerror(error));
    }
    return error;
}
5601
/* Issues the ethtool command 'cmd' (e.g. ETHTOOL_GFLAGS) on the device named
 * 'name', exchanging command data through 'ecmd'.  'cmd_name' is used only in
 * log messages.  Returns 0 if successful, otherwise a positive errno value.
 *
 * NOTE(review): callers (e.g. netdev_linux_ethtool_set_flag()) pass smaller
 * ethtool structures such as struct ethtool_value cast to struct ethtool_cmd;
 * only the leading 'cmd' member is written here, so that appears safe --
 * confirm against the kernel's per-command structure sizes. */
static int
netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
                        int cmd, const char *cmd_name)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
    /* The kernel reads and writes the ethtool structure through this
     * pointer. */
    ifr.ifr_data = (caddr_t) ecmd;

    ecmd->cmd = cmd;
    error = af_inet_ioctl(SIOCETHTOOL, &ifr);
    if (error) {
        if (error != EOPNOTSUPP) {
            VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
                         "failed: %s", cmd_name, name, ovs_strerror(error));
        } else {
            /* The device doesn't support this operation.  That's pretty
             * common, so there's no point in logging anything. */
        }
    }
    return error;
}
5626
5627 static int
5628 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
5629 int cmd, const char *cmd_name)
5630 {
5631 struct ifreq ifr;
5632 int error;
5633
5634 ifr.ifr_addr.sa_family = AF_INET;
5635 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
5636 if (!error) {
5637 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
5638 &ifr.ifr_addr);
5639 *ip = sin->sin_addr;
5640 }
5641 return error;
5642 }
5643
/* Returns an AF_PACKET raw socket or a negative errno value.
 *
 * The socket is created once on first call and shared by all subsequent
 * callers (guarded by 'once'); on failure the negative errno is cached and
 * returned forever after. */
static int
af_packet_sock(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static int sock;  /* fd on success, -errno on failure; set exactly once. */

    if (ovsthread_once_start(&once)) {
        sock = socket(AF_PACKET, SOCK_RAW, 0);
        if (sock >= 0) {
            int error = set_nonblocking(sock);
            if (error) {
                close(sock);
                /* Cache the failure as a negative errno value. */
                sock = -error;
            }
        } else {
            sock = -errno;
            VLOG_ERR("failed to create packet socket: %s",
                     ovs_strerror(errno));
        }
        ovsthread_once_done(&once);
    }

    return sock;
}