]> git.proxmox.com Git - mirror_ovs.git/blob - lib/netdev-linux.c
netdev-afxdp: add new netdev type for AF_XDP.
[mirror_ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20 #include "netdev-linux-private.h"
21
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <sys/types.h>
25 #include <netinet/in.h>
26 #include <arpa/inet.h>
27 #include <inttypes.h>
28 #include <math.h>
29 #include <linux/filter.h>
30 #include <linux/gen_stats.h>
31 #include <linux/if_ether.h>
32 #include <linux/if_tun.h>
33 #include <linux/types.h>
34 #include <linux/ethtool.h>
35 #include <linux/mii.h>
36 #include <linux/rtnetlink.h>
37 #include <linux/sockios.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <sys/utsname.h>
41 #include <netpacket/packet.h>
42 #include <net/if.h>
43 #include <net/if_arp.h>
44 #include <net/route.h>
45 #include <poll.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <unistd.h>
49
50 #include "coverage.h"
51 #include "dp-packet.h"
52 #include "dpif-netlink.h"
53 #include "dpif-netdev.h"
54 #include "openvswitch/dynamic-string.h"
55 #include "fatal-signal.h"
56 #include "hash.h"
57 #include "openvswitch/hmap.h"
58 #include "netdev-afxdp.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
63 #include "netlink.h"
64 #include "netnsid.h"
65 #include "openvswitch/ofpbuf.h"
66 #include "openflow/openflow.h"
67 #include "ovs-atomic.h"
68 #include "packets.h"
69 #include "openvswitch/poll-loop.h"
70 #include "rtnetlink.h"
71 #include "openvswitch/shash.h"
72 #include "socket-util.h"
73 #include "sset.h"
74 #include "tc.h"
75 #include "timer.h"
76 #include "unaligned.h"
77 #include "openvswitch/vlog.h"
78 #include "util.h"
79
80 VLOG_DEFINE_THIS_MODULE(netdev_linux);
81
82 COVERAGE_DEFINE(netdev_set_policing);
83 COVERAGE_DEFINE(netdev_arp_lookup);
84 COVERAGE_DEFINE(netdev_get_ifindex);
85 COVERAGE_DEFINE(netdev_get_hwaddr);
86 COVERAGE_DEFINE(netdev_set_hwaddr);
87 COVERAGE_DEFINE(netdev_get_ethtool);
88 COVERAGE_DEFINE(netdev_set_ethtool);
89
90 \f
91 #ifndef IFLA_IF_NETNSID
92 #define IFLA_IF_NETNSID 0x45
93 #endif
94 /* These were introduced in Linux 2.6.14, so they might be missing if we have
95 * old headers. */
96 #ifndef ADVERTISED_Pause
97 #define ADVERTISED_Pause (1 << 13)
98 #endif
99 #ifndef ADVERTISED_Asym_Pause
100 #define ADVERTISED_Asym_Pause (1 << 14)
101 #endif
102
103 /* These were introduced in Linux 2.6.24, so they might be missing if we
104 * have old headers. */
105 #ifndef ETHTOOL_GFLAGS
106 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
107 #endif
108 #ifndef ETHTOOL_SFLAGS
109 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
110 #endif
111
112 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
113 * headers. */
114 #ifndef TC_RTAB_SIZE
115 #define TC_RTAB_SIZE 1024
116 #endif
117
118 #ifndef TCM_IFINDEX_MAGIC_BLOCK
119 #define TCM_IFINDEX_MAGIC_BLOCK (0xFFFFFFFFU)
120 #endif
121
122 /* Linux 2.6.21 introduced struct tpacket_auxdata.
123 * Linux 2.6.27 added the tp_vlan_tci member.
124 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
125 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
126 * TP_STATUS_VLAN_TPID_VALID.
127 *
128 * With all this churn it's easiest to unconditionally define a replacement
129 * structure that has everything we want.
130 */
131 #ifndef PACKET_AUXDATA
132 #define PACKET_AUXDATA 8
133 #endif
134 #ifndef TP_STATUS_VLAN_VALID
135 #define TP_STATUS_VLAN_VALID (1 << 4)
136 #endif
137 #ifndef TP_STATUS_VLAN_TPID_VALID
138 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
139 #endif
140 #undef tpacket_auxdata
141 #define tpacket_auxdata rpl_tpacket_auxdata
142 struct tpacket_auxdata {
143 uint32_t tp_status;
144 uint32_t tp_len;
145 uint32_t tp_snaplen;
146 uint16_t tp_mac;
147 uint16_t tp_net;
148 uint16_t tp_vlan_tci;
149 uint16_t tp_vlan_tpid;
150 };
151
152 /* Linux 2.6.27 introduced ethtool_cmd_speed
153 *
154 * To avoid revisiting problems reported with using configure to detect
155 * compatibility (see report at
156 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
157 * unconditionally replace ethtool_cmd_speed. */
158 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Returns the full 32-bit link speed encoded in 'ep', combining the low
 * 16 bits ('speed') with the high bits ('speed_hi') that newer kernels
 * report separately. */
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    uint32_t hi = ep->speed_hi;
    uint32_t lo = ep->speed;

    return (hi << 16) | lo;
}
163
164 /* Linux 2.6.30 introduced supported and advertised flags for
165 * 1G base KX, and 10G base KX4, KR and R. */
166 #ifndef SUPPORTED_1000baseKX_Full
167 #define SUPPORTED_1000baseKX_Full (1 << 17)
168 #define SUPPORTED_10000baseKX4_Full (1 << 18)
169 #define SUPPORTED_10000baseKR_Full (1 << 19)
170 #define SUPPORTED_10000baseR_FEC (1 << 20)
171 #define ADVERTISED_1000baseKX_Full (1 << 17)
172 #define ADVERTISED_10000baseKX4_Full (1 << 18)
173 #define ADVERTISED_10000baseKR_Full (1 << 19)
174 #define ADVERTISED_10000baseR_FEC (1 << 20)
175 #endif
176
177 /* Linux 3.5 introduced supported and advertised flags for
178 * 40G base KR4, CR4, SR4 and LR4. */
179 #ifndef SUPPORTED_40000baseKR4_Full
180 #define SUPPORTED_40000baseKR4_Full (1 << 23)
181 #define SUPPORTED_40000baseCR4_Full (1 << 24)
182 #define SUPPORTED_40000baseSR4_Full (1 << 25)
183 #define SUPPORTED_40000baseLR4_Full (1 << 26)
184 #define ADVERTISED_40000baseKR4_Full (1 << 23)
185 #define ADVERTISED_40000baseCR4_Full (1 << 24)
186 #define ADVERTISED_40000baseSR4_Full (1 << 25)
187 #define ADVERTISED_40000baseLR4_Full (1 << 26)
188 #endif
189
190 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
191 *
192 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
193 * 2.6.32-431.29.2.el6.x86_64 (see report at
194 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
195 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
196 * unconditionally define a replacement. */
197 #ifndef IFLA_STATS64
198 #define IFLA_STATS64 23
199 #endif
200 #define rtnl_link_stats64 rpl_rtnl_link_stats64
201 struct rtnl_link_stats64 {
202 uint64_t rx_packets;
203 uint64_t tx_packets;
204 uint64_t rx_bytes;
205 uint64_t tx_bytes;
206 uint64_t rx_errors;
207 uint64_t tx_errors;
208 uint64_t rx_dropped;
209 uint64_t tx_dropped;
210 uint64_t multicast;
211 uint64_t collisions;
212
213 uint64_t rx_length_errors;
214 uint64_t rx_over_errors;
215 uint64_t rx_crc_errors;
216 uint64_t rx_frame_errors;
217 uint64_t rx_fifo_errors;
218 uint64_t rx_missed_errors;
219
220 uint64_t tx_aborted_errors;
221 uint64_t tx_carrier_errors;
222 uint64_t tx_fifo_errors;
223 uint64_t tx_heartbeat_errors;
224 uint64_t tx_window_errors;
225
226 uint64_t rx_compressed;
227 uint64_t tx_compressed;
228 };
229
230 enum {
231 VALID_IFINDEX = 1 << 0,
232 VALID_ETHERADDR = 1 << 1,
233 VALID_IN = 1 << 2,
234 VALID_MTU = 1 << 3,
235 VALID_POLICING = 1 << 4,
236 VALID_VPORT_STAT_ERROR = 1 << 5,
237 VALID_DRVINFO = 1 << 6,
238 VALID_FEATURES = 1 << 7,
239 };
240 \f
241 struct linux_lag_slave {
242 uint32_t block_id;
243 struct shash_node *node;
244 };
245
246 /* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
247 static struct ovs_mutex lag_mutex = OVS_MUTEX_INITIALIZER;
248
249 /* All slaves whose LAG masters are network devices in OvS. */
250 static struct shash lag_shash OVS_GUARDED_BY(lag_mutex)
251 = SHASH_INITIALIZER(&lag_shash);
252
253 /* Traffic control. */
254
255 /* An instance of a traffic control class. Always associated with a particular
256 * network device.
257 *
258 * Each TC implementation subclasses this with whatever additional data it
259 * needs. */
260 struct tc {
261 const struct tc_ops *ops;
262 struct hmap queues; /* Contains "struct tc_queue"s.
263 * Read by generic TC layer.
264 * Written only by TC implementation. */
265 };
266
267 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
268
269 /* One traffic control queue.
270 *
271 * Each TC implementation subclasses this with whatever additional data it
272 * needs. */
273 struct tc_queue {
274 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
275 unsigned int queue_id; /* OpenFlow queue ID. */
276 long long int created; /* Time queue was created, in msecs. */
277 };
278
279 /* A particular kind of traffic control. Each implementation generally maps to
280 * one particular Linux qdisc class.
281 *
282 * The functions below return 0 if successful or a positive errno value on
283 * failure, except where otherwise noted. All of them must be provided, except
284 * where otherwise noted. */
285 struct tc_ops {
286 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
287 * This is null for tc_ops_default and tc_ops_other, for which there are no
288 * appropriate values. */
289 const char *linux_name;
290
291 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
292 const char *ovs_name;
293
294 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
295 * queues. The queues are numbered 0 through n_queues - 1. */
296 unsigned int n_queues;
297
298 /* Called to install this TC class on 'netdev'. The implementation should
299 * make the Netlink calls required to set up 'netdev' with the right qdisc
300 * and configure it according to 'details'. The implementation may assume
301 * that the current qdisc is the default; that is, there is no need for it
302 * to delete the current qdisc before installing itself.
303 *
304 * The contents of 'details' should be documented as valid for 'ovs_name'
305 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
306 * (which is built as ovs-vswitchd.conf.db(8)).
307 *
308 * This function must return 0 if and only if it sets 'netdev->tc' to an
309 * initialized 'struct tc'.
310 *
311 * (This function is null for tc_ops_other, which cannot be installed. For
312 * other TC classes it should always be nonnull.) */
313 int (*tc_install)(struct netdev *netdev, const struct smap *details);
314
315 /* Called when the netdev code determines (through a Netlink query) that
316 * this TC class's qdisc is installed on 'netdev', but we didn't install
317 * it ourselves and so don't know any of the details.
318 *
319 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
320 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
321 * implementation should parse the other attributes of 'nlmsg' as
322 * necessary to determine its configuration. If necessary it should also
323 * use Netlink queries to determine the configuration of queues on
324 * 'netdev'.
325 *
326 * This function must return 0 if and only if it sets 'netdev->tc' to an
327 * initialized 'struct tc'. */
328 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
329
330 /* Destroys the data structures allocated by the implementation as part of
331 * 'tc'. (This includes destroying 'tc->queues' by calling
332      * tc_destroy(tc).)
333 *
334 * The implementation should not need to perform any Netlink calls. If
335 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
336 * (But it may not be desirable.)
337 *
338 * This function may be null if 'tc' is trivial. */
339 void (*tc_destroy)(struct tc *tc);
340
341 /* Retrieves details of 'netdev->tc' configuration into 'details'.
342 *
343 * The implementation should not need to perform any Netlink calls, because
344 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
345 * cached the configuration.
346 *
347 * The contents of 'details' should be documented as valid for 'ovs_name'
348 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
349 * (which is built as ovs-vswitchd.conf.db(8)).
350 *
351 * This function may be null if 'tc' is not configurable.
352 */
353 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
354
355 /* Reconfigures 'netdev->tc' according to 'details', performing any
356 * required Netlink calls to complete the reconfiguration.
357 *
358 * The contents of 'details' should be documented as valid for 'ovs_name'
359 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
360 * (which is built as ovs-vswitchd.conf.db(8)).
361 *
362 * This function may be null if 'tc' is not configurable.
363 */
364 int (*qdisc_set)(struct netdev *, const struct smap *details);
365
366 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
367 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
368 *
369 * The contents of 'details' should be documented as valid for 'ovs_name'
370 * in the "other_config" column in the "Queue" table in
371 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
372 *
373 * The implementation should not need to perform any Netlink calls, because
374 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
375 * cached the queue configuration.
376 *
377 * This function may be null if 'tc' does not have queues ('n_queues' is
378 * 0). */
379 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
380 struct smap *details);
381
382 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
383      * 'details', performing any required Netlink calls to complete the
384 * reconfiguration. The caller ensures that 'queue_id' is less than
385 * 'n_queues'.
386 *
387 * The contents of 'details' should be documented as valid for 'ovs_name'
388 * in the "other_config" column in the "Queue" table in
389 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
390 *
391 * This function may be null if 'tc' does not have queues or its queues are
392 * not configurable. */
393 int (*class_set)(struct netdev *, unsigned int queue_id,
394 const struct smap *details);
395
396 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
397 * tc_queue's within 'netdev->tc->queues'.
398 *
399 * This function may be null if 'tc' does not have queues or its queues
400 * cannot be deleted. */
401 int (*class_delete)(struct netdev *, struct tc_queue *queue);
402
403 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
404 * 'struct tc_queue's within 'netdev->tc->queues'.
405 *
406 * On success, initializes '*stats'.
407 *
408 * This function may be null if 'tc' does not have queues or if it cannot
409 * report queue statistics. */
410 int (*class_get_stats)(const struct netdev *netdev,
411 const struct tc_queue *queue,
412 struct netdev_queue_stats *stats);
413
414 /* Extracts queue stats from 'nlmsg', which is a response to a
415 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
416 *
417 * This function may be null if 'tc' does not have queues or if it cannot
418 * report queue statistics. */
419 int (*class_dump_stats)(const struct netdev *netdev,
420 const struct ofpbuf *nlmsg,
421 netdev_dump_queue_stats_cb *cb, void *aux);
422 };
423
424 static void
425 tc_init(struct tc *tc, const struct tc_ops *ops)
426 {
427 tc->ops = ops;
428 hmap_init(&tc->queues);
429 }
430
/* Releases the resources held directly by 'tc' (its queue map).  Does not
 * free 'tc' itself and makes no Netlink calls; any kernel-side qdisc
 * teardown is the caller's responsibility. */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
436
437 static const struct tc_ops tc_ops_htb;
438 static const struct tc_ops tc_ops_hfsc;
439 static const struct tc_ops tc_ops_codel;
440 static const struct tc_ops tc_ops_fqcodel;
441 static const struct tc_ops tc_ops_sfq;
442 static const struct tc_ops tc_ops_netem;
443 static const struct tc_ops tc_ops_default;
444 static const struct tc_ops tc_ops_noop;
445 static const struct tc_ops tc_ops_other;
446
447 static const struct tc_ops *const tcs[] = {
448 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
449 &tc_ops_hfsc, /* Hierarchical fair service curve. */
450 &tc_ops_codel, /* Controlled delay */
451 &tc_ops_fqcodel, /* Fair queue controlled delay */
452 &tc_ops_sfq, /* Stochastic fair queueing */
453 &tc_ops_netem, /* Network Emulator */
454 &tc_ops_noop, /* Non operating qos type. */
455 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
456 &tc_ops_other, /* Some other qdisc. */
457 NULL
458 };
459
460 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
461 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
462 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
463 static uint32_t tc_time_to_ticks(uint32_t time);
464
465 static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
466 int type,
467 unsigned int flags,
468 struct ofpbuf *);
469 static int tc_add_policer(struct netdev *,
470 uint32_t kbits_rate, uint32_t kbits_burst);
471
472 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
473 struct nlattr **options);
474 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
475 struct nlattr **options,
476 struct netdev_queue_stats *);
477 static int tc_query_class(const struct netdev *,
478 unsigned int handle, unsigned int parent,
479 struct ofpbuf **replyp);
480 static int tc_delete_class(const struct netdev *, unsigned int handle);
481
482 static int tc_del_qdisc(struct netdev *netdev);
483 static int tc_query_qdisc(const struct netdev *netdev);
484
485 void
486 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate);
487 static int tc_calc_cell_log(unsigned int mtu);
488 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
489 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
490 \f
491
492 /* This is set pretty low because we probably won't learn anything from the
493 * additional log messages. */
494 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
495
496 /* Polling miimon status for all ports causes performance degradation when
497 * handling a large number of ports. If there are no devices using miimon, then
498 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
499 *
500 * Readers do not depend on this variable synchronizing with the related
501 * changes in the device miimon status, so we can use atomic_count. */
502 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
503
504 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
505 int cmd, const char *cmd_name);
506 static int get_flags(const struct netdev *, unsigned int *flags);
507 static int set_flags(const char *, unsigned int flags);
508 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
509 enum netdev_flags on, enum netdev_flags *old_flagsp)
510 OVS_REQUIRES(netdev->mutex);
511 static int get_ifindex(const struct netdev *, int *ifindexp);
512 static int do_set_addr(struct netdev *netdev,
513 int ioctl_nr, const char *ioctl_name,
514 struct in_addr addr);
515 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
516 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
517 static int af_packet_sock(void);
518 static bool netdev_linux_miimon_enabled(void);
519 static void netdev_linux_miimon_run(void);
520 static void netdev_linux_miimon_wait(void);
521 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
522
/* Returns true if 'netdev' is a tap device (belongs to netdev_tap_class),
 * false otherwise. */
static bool
is_tap_netdev(const struct netdev *netdev)
{
    return netdev_get_class(netdev) == &netdev_tap_class;
}
528 \f
/* Queries the datapath for the network namespace ID of the vport backing
 * 'netdev' and caches the result in 'netdev->netnsid'.
 *
 * Returns 0 on success, a positive errno value otherwise.  On ENOENT (no
 * datapath API available), the device is assumed to be in the local
 * namespace; on any other error the cached netnsid is left unset. */
static int
netdev_linux_netnsid_update__(struct netdev_linux *netdev)
{
    struct dpif_netlink_vport reply;
    struct ofpbuf *buf;
    int error;

    error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
    if (error) {
        if (error == ENOENT) {
            /* Assume it is local if there is no API (e.g. if the openvswitch
             * kernel module is not loaded). */
            netnsid_set_local(&netdev->netnsid);
        } else {
            netnsid_unset(&netdev->netnsid);
        }
        return error;
    }

    netnsid_set(&netdev->netnsid, reply.netnsid);
    /* 'buf' owns the storage backing 'reply'; free it after copying the
     * netnsid out. */
    ofpbuf_delete(buf);
    return 0;
}
552
553 static int
554 netdev_linux_netnsid_update(struct netdev_linux *netdev)
555 {
556 if (netnsid_is_unset(netdev->netnsid)) {
557 if (netdev_get_class(&netdev->up) == &netdev_tap_class) {
558 netnsid_set_local(&netdev->netnsid);
559 } else {
560 return netdev_linux_netnsid_update__(netdev);
561 }
562 }
563
564 return 0;
565 }
566
/* Returns true if 'netdev' resides in the network namespace identified by
 * 'nsid', refreshing the cached namespace ID first if necessary. */
static bool
netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_eq(netdev->netnsid, nsid);
}
573
/* Returns true if 'netdev' resides in a network namespace other than the
 * local one, refreshing the cached namespace ID first if necessary. */
static bool
netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_is_remote(netdev->netnsid);
}
580
581 static int netdev_linux_update_via_netlink(struct netdev_linux *);
582 static void netdev_linux_update(struct netdev_linux *netdev, int,
583 const struct rtnetlink_change *)
584 OVS_REQUIRES(netdev->mutex);
585 static void netdev_linux_changed(struct netdev_linux *netdev,
586 unsigned int ifi_flags, unsigned int mask)
587 OVS_REQUIRES(netdev->mutex);
588
589 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
590 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
591 * if no such socket could be created. */
592 static struct nl_sock *
593 netdev_linux_notify_sock(void)
594 {
595 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
596 static struct nl_sock *sock;
597 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
598 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
599
600 if (ovsthread_once_start(&once)) {
601 int error;
602
603 error = nl_sock_create(NETLINK_ROUTE, &sock);
604 if (!error) {
605 size_t i;
606
607 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
608 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
609 if (error) {
610 nl_sock_destroy(sock);
611 sock = NULL;
612 break;
613 }
614 }
615 }
616 nl_sock_listen_all_nsid(sock, true);
617 ovsthread_once_done(&once);
618 }
619
620 return sock;
621 }
622
/* Returns true if at least one netdev currently uses miimon polling.
 * Used to skip miimon work entirely when no device needs it, since polling
 * every port degrades performance with large port counts. */
static bool
netdev_linux_miimon_enabled(void)
{
    return atomic_count_get(&miimon_cnt) > 0;
}
628
/* Returns true if the rtnetlink device kind 'kind' names a Linux link
 * aggregation master ("bond" or "team"), false otherwise. */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    return !strcmp(kind, "bond") || !strcmp(kind, "team");
}
638
/* Keeps 'lag_shash' and the kernel ingress qdiscs in sync with LAG
 * (bond/team) membership changes reported by rtnetlink.
 *
 * If 'change' reports that a device became a slave of a LAG master that is a
 * Linux netdev in OVS, binds the slave's ingress qdisc to the master's TC
 * block and starts tracking the slave.  If 'change' reports that a tracked
 * slave was released from its master, removes the binding and forgets the
 * slave.  Caller must hold 'lag_mutex'. */
static void
netdev_linux_update_lag(struct rtnetlink_change *change)
    OVS_REQUIRES(lag_mutex)
{
    struct linux_lag_slave *lag;

    /* Only link (RTNLGRP_LINK) messages carry master/slave information. */
    if (!rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
        return;
    }

    if (change->slave && netdev_linux_kind_is_lag(change->slave)) {
        lag = shash_find_data(&lag_shash, change->ifname);

        if (!lag) {
            struct netdev *master_netdev;
            char master_name[IFNAMSIZ];
            uint32_t block_id;
            int error = 0;

            if_indextoname(change->master_ifindex, master_name);
            master_netdev = netdev_from_name(master_name);
            if (!master_netdev) {
                /* The LAG master is not an OVS netdev; nothing to track. */
                return;
            }

            if (is_netdev_linux_class(master_netdev->netdev_class)) {
                block_id = netdev_get_block_id(master_netdev);
                if (!block_id) {
                    /* Master has no TC block to share with slaves. */
                    netdev_close(master_netdev);
                    return;
                }

                lag = xmalloc(sizeof *lag);
                lag->block_id = block_id;
                lag->node = shash_add(&lag_shash, change->ifname, lag);

                /* delete ingress block in case it exists */
                tc_add_del_qdisc(change->if_index, false, 0, TC_INGRESS);
                /* LAG master is linux netdev so add slave to same block. */
                error = tc_add_del_qdisc(change->if_index, true, block_id,
                                         TC_INGRESS);
                if (error) {
                    /* Roll back the tracking entry added above. */
                    VLOG_WARN("failed to bind LAG slave %s to master's block",
                              change->ifname);
                    shash_delete(&lag_shash, lag->node);
                    free(lag);
                }
            }

            netdev_close(master_netdev);
        }
    } else if (change->master_ifindex == 0) {
        /* Check if this was a lag slave that has been freed. */
        lag = shash_find_data(&lag_shash, change->ifname);

        if (lag) {
            tc_add_del_qdisc(change->if_index, false, lag->block_id,
                             TC_INGRESS);
            shash_delete(&lag_shash, lag->node);
            free(lag);
        }
    }
}
702
/* Performs periodic work for the Linux netdev classes: runs miimon if
 * enabled, then drains the shared rtnetlink notification socket, dispatching
 * each change to the affected netdev, or to LAG tracking when the changed
 * device itself is not an OVS netdev.  On receive-buffer overrun (ENOBUFS),
 * notifications may have been lost, so the flags of every Linux netdev are
 * re-read to resynchronize. */
void
netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
{
    struct nl_sock *sock;
    int error;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_run();
    }

    sock = netdev_linux_notify_sock();
    if (!sock) {
        /* Notification socket could not be created; nothing to poll. */
        return;
    }

    /* Process notifications until the socket is drained (EAGAIN) or an
     * error occurs. */
    do {
        uint64_t buf_stub[4096 / 8];
        int nsid;
        struct ofpbuf buf;

        ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
        error = nl_sock_recv(sock, &buf, &nsid, false);
        if (!error) {
            struct rtnetlink_change change;

            if (rtnetlink_parse(&buf, &change)) {
                struct netdev *netdev_ = NULL;
                char dev_name[IFNAMSIZ];

                /* Some messages carry only an ifindex; resolve the name. */
                if (!change.ifname) {
                    change.ifname = if_indextoname(change.if_index, dev_name);
                }

                if (change.ifname) {
                    netdev_ = netdev_from_name(change.ifname);
                }
                if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
                    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

                    ovs_mutex_lock(&netdev->mutex);
                    netdev_linux_update(netdev, nsid, &change);
                    ovs_mutex_unlock(&netdev->mutex);
                }
                else if (!netdev_ && change.ifname) {
                    /* Netdev is not present in OvS but its master could be. */
                    ovs_mutex_lock(&lag_mutex);
                    netdev_linux_update_lag(&change);
                    ovs_mutex_unlock(&lag_mutex);
                }
                netdev_close(netdev_);
            }
        } else if (error == ENOBUFS) {
            /* The kernel dropped notifications because our receive buffer
             * overflowed, so we do not know what changed.  Refresh every
             * Linux netdev's flags to resynchronize. */
            struct shash device_shash;
            struct shash_node *node;

            nl_sock_drain(sock);

            shash_init(&device_shash);
            netdev_get_devices(&netdev_linux_class, &device_shash);
            SHASH_FOR_EACH (node, &device_shash) {
                struct netdev *netdev_ = node->data;
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
                unsigned int flags;

                ovs_mutex_lock(&netdev->mutex);
                get_flags(netdev_, &flags);
                netdev_linux_changed(netdev, flags, 0);
                ovs_mutex_unlock(&netdev->mutex);

                netdev_close(netdev_);
            }
            shash_destroy(&device_shash);
        } else if (error != EAGAIN) {
            static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
            VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
                         ovs_strerror(error));
        }
        ofpbuf_uninit(&buf);
    } while (!error);
}
783
784 static void
785 netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
786 {
787 struct nl_sock *sock;
788
789 if (netdev_linux_miimon_enabled()) {
790 netdev_linux_miimon_wait();
791 }
792 sock = netdev_linux_notify_sock();
793 if (sock) {
794 nl_sock_wait(sock, POLLIN);
795 }
796 }
797
/* Records that 'dev' has changed: bumps its change sequence number, counts a
 * carrier reset if IFF_RUNNING toggled, stores the new interface flags, and
 * invalidates every cached field whose VALID_* bit is not set in 'mask'.
 * When the cached IP addresses (VALID_IN) are invalidated, also flushes the
 * global address list.  Caller must hold 'dev->mutex'. */
static void
netdev_linux_changed(struct netdev_linux *dev,
                     unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(dev->mutex)
{
    netdev_change_seq_changed(&dev->up);

    if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
        /* Link running state toggled: count it as a carrier reset. */
        dev->carrier_resets++;
    }
    dev->ifi_flags = ifi_flags;

    dev->cache_valid &= mask;
    if (!(mask & VALID_IN)) {
        netdev_get_addrs_list_flush();
    }
}
815
/* Applies a parsed rtnetlink 'change' to 'dev'.
 *
 * For RTM_NEWLINK, refreshes the cached MTU, MAC address, ifindex and
 * LAG-master status from the message.  For other link messages (e.g.
 * RTM_DELLINK), marks the device as absent.  Address-group messages only
 * invalidate the cached IP addresses.  Caller must hold 'dev->mutex'. */
static void
netdev_linux_update__(struct netdev_linux *dev,
                      const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, and ip addresses. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;

                /* The mac addr has been changed, report it now. */
                rtnetlink_report_link();
            }

            if (change->master && netdev_linux_kind_is_lag(change->master)) {
                dev->is_lag_master = true;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
            dev->present = true;
        } else {
            /* FIXME */
            netdev_linux_changed(dev, change->ifi_flags, 0);
            dev->present = false;
            netnsid_unset(&dev->netnsid);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        OVS_NOT_REACHED();
    }
}
864
/* Applies 'change' to 'dev', but only if the notification originated in the
 * same network namespace ('nsid') that 'dev' resides in; notifications for
 * an identically-named device in another namespace are ignored.  Caller
 * must hold 'dev->mutex'. */
static void
netdev_linux_update(struct netdev_linux *dev, int nsid,
                    const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (netdev_linux_netnsid_is_eq(dev, nsid)) {
        netdev_linux_update__(dev, change);
    }
}
874
/* netdev_class 'alloc' implementation: allocates a zero-initialized
 * netdev_linux and returns its embedded 'struct netdev'.  Freed by
 * netdev_linux_dealloc(). */
static struct netdev *
netdev_linux_alloc(void)
{
    struct netdev_linux *netdev = xzalloc(sizeof *netdev);
    return &netdev->up;
}
881
/* Construction steps shared by all Linux netdev flavors (system, internal,
 * tap): rejects reserved device names, marks the device's network namespace
 * as unknown, and initializes the per-device mutex.  Returns 0 on success
 * or EINVAL for a forbidden name. */
static int
netdev_linux_common_construct(struct netdev *netdev_)
{
    /* Prevent any attempt to create (or open) a network device named "default"
     * or "all".  These device names are effectively reserved on Linux because
     * /proc/sys/net/ipv4/conf/ always contains directories by these names.  By
     * itself this wouldn't call for any special treatment, but in practice if
     * a program tries to create devices with these names, it causes the kernel
     * to fire a "new device" notification event even though creation failed,
     * and in turn that causes OVS to wake up and try to create them again,
     * which ends up as a 100% CPU loop. */
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *name = netdev_->name;
    if (!strcmp(name, "default") || !strcmp(name, "all")) {
        static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
        VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
                     name);
        return EINVAL;
    }

    /* The device could be in the same network namespace or in another one. */
    netnsid_unset(&netdev->netnsid);
    ovs_mutex_init(&netdev->mutex);
    return 0;
}
907
/* Creates system and internal devices.
 *
 * Probes the device by reading its interface flags.  A nonexistent device
 * (ENODEV) is an error for system devices, but is tolerated for "internal"
 * devices, which are created in the kernel only later. */
static int
netdev_linux_construct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = netdev_linux_common_construct(netdev_);
    if (error) {
        return error;
    }

    /* Reading the flags both probes for existence and caches them. */
    error = get_flags(&netdev->up, &netdev->ifi_flags);
    if (error == ENODEV) {
        if (netdev->up.netdev_class != &netdev_internal_class) {
            /* The device does not exist, so don't allow it to be opened. */
            return ENODEV;
        } else {
            /* "Internal" netdevs have to be created as netdev objects before
             * they exist in the kernel, because creating them in the kernel
             * happens by passing a netdev object to dpif_port_add().
             * Therefore, ignore the error. */
        }
    }

    return 0;
}
933
934 /* For most types of netdevs we open the device for each call of
935 * netdev_open(). However, this is not the case with tap devices,
936 * since it is only possible to open the device once. In this
937 * situation we share a single file descriptor, and consequently
938 * buffers, across all readers. Therefore once data is read it will
939 * be unavailable to other reads for tap devices. */
940 static int
941 netdev_linux_construct_tap(struct netdev *netdev_)
942 {
943 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
944 static const char tap_dev[] = "/dev/net/tun";
945 const char *name = netdev_->name;
946 struct ifreq ifr;
947
948 int error = netdev_linux_common_construct(netdev_);
949 if (error) {
950 return error;
951 }
952
953 /* Open tap device. */
954 netdev->tap_fd = open(tap_dev, O_RDWR);
955 if (netdev->tap_fd < 0) {
956 error = errno;
957 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
958 return error;
959 }
960
961 /* Create tap device. */
962 get_flags(&netdev->up, &netdev->ifi_flags);
963 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
964 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
965 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
966 VLOG_WARN("%s: creating tap device failed: %s", name,
967 ovs_strerror(errno));
968 error = errno;
969 goto error_close;
970 }
971
972 /* Make non-blocking. */
973 error = set_nonblocking(netdev->tap_fd);
974 if (error) {
975 goto error_close;
976 }
977
978 if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
979 VLOG_WARN("%s: creating tap device failed (persist): %s", name,
980 ovs_strerror(errno));
981 error = errno;
982 goto error_close;
983 }
984
985 netdev->present = true;
986 return 0;
987
988 error_close:
989 close(netdev->tap_fd);
990 return error;
991 }
992
993 static void
994 netdev_linux_destruct(struct netdev *netdev_)
995 {
996 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
997
998 if (netdev->tc && netdev->tc->ops->tc_destroy) {
999 netdev->tc->ops->tc_destroy(netdev->tc);
1000 }
1001
1002 if (netdev_get_class(netdev_) == &netdev_tap_class
1003 && netdev->tap_fd >= 0)
1004 {
1005 ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
1006 close(netdev->tap_fd);
1007 }
1008
1009 if (netdev->miimon_interval > 0) {
1010 atomic_count_dec(&miimon_cnt);
1011 }
1012
1013 ovs_mutex_destroy(&netdev->mutex);
1014 }
1015
/* Frees the storage allocated for 'netdev_'. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
1022
1023 static struct netdev_rxq *
1024 netdev_linux_rxq_alloc(void)
1025 {
1026 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
1027 return &rx->up;
1028 }
1029
/* Sets up the receive side of one rx queue.  A tap device simply shares the
 * netdev's single tap fd; any other device gets a dedicated AF_PACKET raw
 * socket bound to it.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        /* Share the device's single tap fd (see the comment above
         * netdev_linux_construct_tap()). */
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
            { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        /* Request per-packet auxiliary data, which carries the VLAN tag
         * that the kernel strips from the payload (consumed by
         * netdev_linux_rxq_recv_sock()). */
        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets, so packets we ourselves send on
         * this device do not loop back into the rx path. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1117
1118 static void
1119 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
1120 {
1121 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1122
1123 if (!rx->is_tap) {
1124 close(rx->fd);
1125 }
1126 }
1127
/* Frees the storage allocated for 'rxq_'. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
1135
1136 static ovs_be16
1137 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
1138 {
1139 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1140 return htons(aux->tp_vlan_tpid);
1141 } else if (double_tagged) {
1142 return htons(ETH_TYPE_VLAN_8021AD);
1143 } else {
1144 return htons(ETH_TYPE_VLAN_8021Q);
1145 }
1146 }
1147
/* Returns true if the auxdata in 'aux' carries a VLAN tag: either a nonzero
 * TCI, or an explicit valid flag (which covers a legitimate all-zero TCI). */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    if (aux->tp_vlan_tci) {
        return true;
    }
    return (aux->tp_status & TP_STATUS_VLAN_VALID) != 0;
}
1153
/* Receives one packet from the AF_PACKET socket 'fd' into 'buffer'.  The
 * kernel strips any VLAN tag from the payload and reports it via
 * PACKET_AUXDATA; this function pushes the tag back into the packet so the
 * rest of OVS sees the frame as it appeared on the wire.
 *
 * Returns 0 if successful, EMSGSIZE if the packet was longer than the
 * buffer's tailroom, or another positive errno value. */
static int
netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
{
    size_t size;
    ssize_t retval;
    struct iovec iov;
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffer;
    struct msghdr msgh;

    /* Reserve headroom for a single VLAN tag */
    dp_packet_reserve(buffer, VLAN_HEADER_LEN);
    size = dp_packet_tailroom(buffer);

    iov.iov_base = dp_packet_data(buffer);
    iov.iov_len = size;
    msgh.msg_name = NULL;
    msgh.msg_namelen = 0;
    msgh.msg_iov = &iov;
    msgh.msg_iovlen = 1;
    msgh.msg_control = &cmsg_buffer;
    msgh.msg_controllen = sizeof cmsg_buffer;
    msgh.msg_flags = 0;

    /* MSG_TRUNC makes recvmsg() report the full packet length even if it
     * did not fit, so truncation can be detected below. */
    do {
        retval = recvmsg(fd, &msgh, MSG_TRUNC);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    } else if (retval > size) {
        return EMSGSIZE;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);

    /* Walk the control messages looking for the packet auxdata. */
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
        const struct tpacket_auxdata *aux;

        if (cmsg->cmsg_level != SOL_PACKET
            || cmsg->cmsg_type != PACKET_AUXDATA
            || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
            continue;
        }

        aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
        if (auxdata_has_vlan_tci(aux)) {
            struct eth_header *eth;
            bool double_tagged;

            if (retval < ETH_HEADER_LEN) {
                return EINVAL;
            }

            /* If the payload still starts with an 802.1Q tag, the tag in
             * auxdata makes the frame double-tagged. */
            eth = dp_packet_data(buffer);
            double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);

            /* Re-insert the stripped VLAN tag into the reserved headroom. */
            eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
                          htons(aux->tp_vlan_tci));
            break;
        }
    }

    return 0;
}
1222
/* Receives one packet from tap fd 'fd' into the tailroom of 'buffer'.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t tailroom = dp_packet_tailroom(buffer);
    ssize_t n;

    do {
        n = read(fd, dp_packet_data(buffer), tailroom);
    } while (n < 0 && errno == EINTR);

    if (n < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n);
    return 0;
}
1240
1241 static int
1242 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
1243 int *qfill)
1244 {
1245 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1246 struct netdev *netdev = rx->up.netdev;
1247 struct dp_packet *buffer;
1248 ssize_t retval;
1249 int mtu;
1250
1251 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1252 mtu = ETH_PAYLOAD_MAX;
1253 }
1254
1255 /* Assume Ethernet port. No need to set packet_type. */
1256 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1257 DP_NETDEV_HEADROOM);
1258 retval = (rx->is_tap
1259 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1260 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1261
1262 if (retval) {
1263 if (retval != EAGAIN && retval != EMSGSIZE) {
1264 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1265 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1266 }
1267 dp_packet_delete(buffer);
1268 } else {
1269 dp_packet_batch_init_packet(batch, buffer);
1270 }
1271
1272 if (qfill) {
1273 *qfill = -ENOTSUP;
1274 }
1275
1276 return retval;
1277 }
1278
1279 static void
1280 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1281 {
1282 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1283 poll_fd_wait(rx->fd, POLLIN);
1284 }
1285
1286 static int
1287 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1288 {
1289 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1290 if (rx->is_tap) {
1291 struct ifreq ifr;
1292 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1293 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1294 if (error) {
1295 return error;
1296 }
1297 drain_fd(rx->fd, ifr.ifr_qlen);
1298 return 0;
1299 } else {
1300 return drain_rcvbuf(rx->fd);
1301 }
1302 }
1303
/* Sends every packet in 'batch' on AF_PACKET socket 'sock', which must be
 * bound or addressed to device 'ifindex', using a single sendmmsg() call
 * where possible.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
netdev_linux_sock_batch_send(int sock, int ifindex,
                             struct dp_packet_batch *batch)
{
    const size_t size = dp_packet_batch_size(batch);
    /* We don't bother setting most fields in sockaddr_ll because the
     * kernel ignores them for SOCK_RAW. */
    struct sockaddr_ll sll = { .sll_family = AF_PACKET,
                               .sll_ifindex = ifindex };

    /* One msghdr per packet, each with a single iovec covering the whole
     * packet. */
    struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
    struct iovec *iov = xmalloc(sizeof(*iov) * size);

    struct dp_packet *packet;
    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        iov[i].iov_base = dp_packet_data(packet);
        iov[i].iov_len = dp_packet_size(packet);
        mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
                                            .msg_namelen = sizeof sll,
                                            .msg_iov = &iov[i],
                                            .msg_iovlen = 1 };
    }

    /* sendmmsg() may send fewer messages than requested; resume from 'ofs'
     * until the whole batch is sent or an error occurs. */
    int error = 0;
    for (uint32_t ofs = 0; ofs < size; ) {
        ssize_t retval;
        do {
            retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);
        if (error) {
            break;
        }
        ofs += retval;
    }

    free(mmsg);
    free(iov);
    return error;
}
1344
1345 /* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1346 * essential, because packets sent to a tap device with an AF_PACKET socket
1347 * will loop back to be *received* again on the tap device. This doesn't occur
1348 * on other interface types because we attach a socket filter to the rx
1349 * socket. */
1350 static int
1351 netdev_linux_tap_batch_send(struct netdev *netdev_,
1352 struct dp_packet_batch *batch)
1353 {
1354 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1355 struct dp_packet *packet;
1356
1357 /* The Linux tap driver returns EIO if the device is not up,
1358 * so if the device is not up, don't waste time sending it.
1359 * However, if the device is in another network namespace
1360 * then OVS can't retrieve the state. In that case, send the
1361 * packets anyway. */
1362 if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
1363 netdev->tx_dropped += dp_packet_batch_size(batch);
1364 return 0;
1365 }
1366
1367 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
1368 size_t size = dp_packet_size(packet);
1369 ssize_t retval;
1370 int error;
1371
1372 do {
1373 retval = write(netdev->tap_fd, dp_packet_data(packet), size);
1374 error = retval < 0 ? errno : 0;
1375 } while (error == EINTR);
1376
1377 if (error) {
1378 /* The Linux tap driver returns EIO if the device is not up. From
1379 * the OVS side this is not an error, so we ignore it; otherwise,
1380 * return the erro. */
1381 if (error != EIO) {
1382 return error;
1383 }
1384 } else if (retval != size) {
1385 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
1386 "bytes of %"PRIuSIZE") on %s",
1387 retval, size, netdev_get_name(netdev_));
1388 return EMSGSIZE;
1389 }
1390 }
1391 return 0;
1392 }
1393
/* Sends 'batch' on 'netdev'.  Returns 0 if successful, otherwise a positive
 * errno value.  Returns EAGAIN without blocking if the packet cannot be queued
 * immediately.  Returns EMSGSIZE if a partial packet was transmitted or if
 * the packet is too big or too small to transmit on the device.
 *
 * The kernel maintains a packet transmission queue, so the caller is not
 * expected to do additional queuing of packets.
 *
 * 'batch' is always consumed (freed) before returning, whether the send
 * succeeded or not. */
static int
netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
                  struct dp_packet_batch *batch,
                  bool concurrent_txq OVS_UNUSED)
{
    int error = 0;
    int sock = 0;

    if (!is_tap_netdev(netdev_)) {
        /* Sending through the shared AF_PACKET socket requires the device
         * to be in the local network namespace. */
        if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_))) {
            error = EOPNOTSUPP;
            goto free_batch;
        }

        /* af_packet_sock() returns a negative errno value on failure. */
        sock = af_packet_sock();
        if (sock < 0) {
            error = -sock;
            goto free_batch;
        }

        /* netdev_get_ifindex() returns a negative errno value on failure. */
        int ifindex = netdev_get_ifindex(netdev_);
        if (ifindex < 0) {
            error = -ifindex;
            goto free_batch;
        }

        error = netdev_linux_sock_batch_send(sock, ifindex, batch);
    } else {
        error = netdev_linux_tap_batch_send(netdev_, batch);
    }
    if (error) {
        if (error == ENOBUFS) {
            /* The Linux AF_PACKET implementation never blocks waiting
             * for room for packets, instead returning ENOBUFS.
             * Translate this into EAGAIN for the caller. */
            error = EAGAIN;
        } else {
            VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
    }

free_batch:
    dp_packet_delete_batch(batch, true);
    return error;
}
1447
1448 /* Registers with the poll loop to wake up from the next call to poll_block()
1449 * when the packet transmission queue has sufficient room to transmit a packet
1450 * with netdev_send().
1451 *
1452 * The kernel maintains a packet transmission queue, so the client is not
1453 * expected to do additional queuing of packets. Thus, this function is
1454 * unlikely to ever be used. It is included for completeness. */
1455 static void
1456 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1457 {
1458 if (is_tap_netdev(netdev)) {
1459 /* TAP device always accepts packets.*/
1460 poll_immediate_wake();
1461 }
1462 }
1463
/* Attempts to set 'netdev''s MAC address to 'mac'.  Returns 0 if successful,
 * otherwise a positive errno value.  Returns EOPNOTSUPP for a device in a
 * remote network namespace.  The result (including an ENODEV failure) is
 * cached so repeated calls with the same address are cheap. */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* If the cached address already matches (or a cached error is pending),
     * skip the system call; otherwise invalidate the cache entry. */
    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    if (!error || error == ENODEV) {
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            netdev->etheraddr = mac;
        }
    }

    /* Restore the tap device's up state if we took it down above. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1508
/* Copies 'netdev''s MAC address to 'mac' which is passed as param.
 * Tries netlink first, falling back to an ioctl; the result (or the
 * failure) is cached under VALID_ETHERADDR.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
        netdev_linux_update_via_netlink(netdev);
    }

    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
        /* Fall back to ioctl if netlink fails */
        netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
                                                 &netdev->etheraddr);
        netdev->cache_valid |= VALID_ETHERADDR;
    }

    error = netdev->ether_addr_error;
    if (!error) {
        *mac = netdev->etheraddr;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1536
1537 static int
1538 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1539 {
1540 int error;
1541
1542 if (!(netdev->cache_valid & VALID_MTU)) {
1543 netdev_linux_update_via_netlink(netdev);
1544 }
1545
1546 if (!(netdev->cache_valid & VALID_MTU)) {
1547 /* Fall back to ioctl if netlink fails */
1548 struct ifreq ifr;
1549
1550 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1551 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1552 netdev->mtu = ifr.ifr_mtu;
1553 netdev->cache_valid |= VALID_MTU;
1554 }
1555
1556 error = netdev->netdev_mtu_error;
1557 if (!error) {
1558 *mtup = netdev->mtu;
1559 }
1560
1561 return error;
1562 }
1563
1564 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1565 * in bytes, not including the hardware header; thus, this is typically 1500
1566 * bytes for Ethernet devices. */
1567 static int
1568 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1569 {
1570 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1571 int error;
1572
1573 ovs_mutex_lock(&netdev->mutex);
1574 error = netdev_linux_get_mtu__(netdev, mtup);
1575 ovs_mutex_unlock(&netdev->mutex);
1576
1577 return error;
1578 }
1579
/* Sets the maximum size of transmitted (MTU) for given device using linux
 * networking ioctl interface.
 *
 * Returns 0 if successful, otherwise a positive errno value.  Returns
 * EOPNOTSUPP for a device in a remote network namespace.  Successful and
 * ENODEV outcomes are cached under VALID_MTU.
 */
static int
netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ifreq ifr;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Skip the ioctl if the cached MTU already matches (or a cached error
     * is pending); otherwise invalidate the cache entry. */
    if (netdev->cache_valid & VALID_MTU) {
        error = netdev->netdev_mtu_error;
        if (error || netdev->mtu == mtu) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_MTU;
    }
    ifr.ifr_mtu = mtu;
    error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
                                SIOCSIFMTU, "SIOCSIFMTU");
    if (!error || error == ENODEV) {
        netdev->netdev_mtu_error = error;
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }
exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1615
/* Returns the ifindex of 'netdev', if successful, as a positive number.
 * On failure, returns a negative errno value (-EOPNOTSUPP for a device in
 * a remote network namespace). */
static int
netdev_linux_get_ifindex(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int ifindex, error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }
    error = get_ifindex(netdev_, &ifindex);

exit:
    ovs_mutex_unlock(&netdev->mutex);
    /* 'ifindex' is only read when error == 0, in which case get_ifindex()
     * has set it. */
    return error ? -error : ifindex;
}
1635
1636 static int
1637 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1638 {
1639 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1640
1641 ovs_mutex_lock(&netdev->mutex);
1642 if (netdev->miimon_interval > 0) {
1643 *carrier = netdev->miimon;
1644 } else {
1645 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1646 }
1647 ovs_mutex_unlock(&netdev->mutex);
1648
1649 return 0;
1650 }
1651
1652 static long long int
1653 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1654 {
1655 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1656 long long int carrier_resets;
1657
1658 ovs_mutex_lock(&netdev->mutex);
1659 carrier_resets = netdev->carrier_resets;
1660 ovs_mutex_unlock(&netdev->mutex);
1661
1662 return carrier_resets;
1663 }
1664
/* Issues MII ioctl 'cmd' (whose name, for logging, is 'cmd_name') on device
 * 'name', copying 'data' in and out.
 *
 * The mii_ioctl_data block is copied *over* the ifr_data union member (it
 * occupies the ifr_ifru storage directly rather than being pointed to);
 * memcpy() is used instead of a pointer cast to avoid aliasing issues.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    /* Copy the (possibly kernel-updated) data back out for the caller. */
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
1679
/* Queries link status for device 'name' into '*miimon'.  First tries the MII
 * registers (BMSR link-status bit); if that fails, falls back to ethtool's
 * ETHTOOL_GLINK.
 *
 * Returns 0 if successful, otherwise a positive errno value; '*miimon' is
 * false on entry and only set true when a probe reports link up. */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        }
    }
    if (error) {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* ETHTOOL_GLINK fills in an ethtool_value; extract it from the
             * ethtool_cmd storage used for the call. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
1722
/* Enables (interval > 0, clamped to at least 100 ms) or disables
 * (interval <= 0) MII link monitoring for 'netdev_', keeping the global
 * count of monitored devices up to date.  Always returns 0. */
static int
netdev_linux_set_miimon_interval(struct netdev *netdev_,
                                 long long int interval)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    ovs_mutex_lock(&netdev->mutex);
    interval = interval > 0 ? MAX(interval, 100) : 0;
    if (netdev->miimon_interval != interval) {
        /* Track how many devices have miimon enabled (enable/disable
         * transitions only). */
        if (interval && !netdev->miimon_interval) {
            atomic_count_inc(&miimon_cnt);
        } else if (!interval && netdev->miimon_interval) {
            atomic_count_dec(&miimon_cnt);
        }

        netdev->miimon_interval = interval;
        /* Force an immediate poll on the next miimon run. */
        timer_set_expired(&netdev->miimon_timer);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;
}
1745
/* Polls link status for every linux netdev whose miimon timer has expired,
 * recording changes and notifying interested parties via
 * netdev_linux_changed(). */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                dev->miimon = miimon;
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            /* Re-arm the timer for the next poll. */
            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* netdev_get_devices() took a reference on each device. */
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
1775
1776 static void
1777 netdev_linux_miimon_wait(void)
1778 {
1779 struct shash device_shash;
1780 struct shash_node *node;
1781
1782 shash_init(&device_shash);
1783 netdev_get_devices(&netdev_linux_class, &device_shash);
1784 SHASH_FOR_EACH (node, &device_shash) {
1785 struct netdev *netdev = node->data;
1786 struct netdev_linux *dev = netdev_linux_cast(netdev);
1787
1788 ovs_mutex_lock(&dev->mutex);
1789 if (dev->miimon_interval > 0) {
1790 timer_wait(&dev->miimon_timer);
1791 }
1792 ovs_mutex_unlock(&dev->mutex);
1793 netdev_close(netdev);
1794 }
1795 shash_destroy(&device_shash);
1796 }
1797
/* Exchanges the values stored at '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved = *a;

    *a = *b;
    *b = saved;
}
1805
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is allowed to be misaligned (hence the get_32aligned_u64() reads).
 * Counters that the vport layer does not track are explicitly zeroed; any
 * further fields of 'dst' are left untouched. */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct ovs_vport_stats *src)
{
    dst->rx_packets = get_32aligned_u64(&src->rx_packets);
    dst->tx_packets = get_32aligned_u64(&src->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&src->rx_errors);
    dst->tx_errors = get_32aligned_u64(&src->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
    dst->multicast = 0;
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
}
1835
1836 static int
1837 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1838 {
1839 struct dpif_netlink_vport reply;
1840 struct ofpbuf *buf;
1841 int error;
1842
1843 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1844 if (error) {
1845 return error;
1846 } else if (!reply.stats) {
1847 ofpbuf_delete(buf);
1848 return EOPNOTSUPP;
1849 }
1850
1851 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1852
1853 ofpbuf_delete(buf);
1854
1855 return 0;
1856 }
1857
/* Fills 'stats' from the vport layer, recording the outcome in
 * 'netdev->vport_stats_error'.  A previous failure is cached: the lookup is
 * retried only if the last attempt succeeded or no attempt has been recorded
 * yet (VALID_VPORT_STAT_ERROR unset).  Caller must hold 'netdev->mutex'. */
static void
get_stats_via_vport(const struct netdev *netdev_,
                    struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (!netdev->vport_stats_error ||
        !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
        int error;

        error = get_stats_via_vport__(netdev_, stats);
        if (error && error != ENOENT && error != ENODEV) {
            /* ENOENT/ENODEV just mean the device is not a vport; anything
             * else is worth logging. */
            VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
                         "(%s)",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
        netdev->vport_stats_error = error;
        netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
    }
}
1878
/* Retrieves current device stats for 'netdev-linux'.  Combines vport-layer
 * statistics with the kernel netdev's own counters: packet/byte counts come
 * from the kernel (accurate with GSO/TSO/GRO), while error and drop counters
 * are summed from both sources.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; the vport stats already in 'stats' are still
         * usable if that lookup succeeded. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Use kernel netdev's packet and byte counts since vport's counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO are
         * enabled. */
        stats->rx_packets = dev_stats.rx_packets;
        stats->rx_bytes = dev_stats.rx_bytes;
        stats->tx_packets = dev_stats.tx_packets;
        stats->tx_bytes = dev_stats.tx_bytes;

        stats->rx_errors += dev_stats.rx_errors;
        stats->tx_errors += dev_stats.tx_errors;
        stats->rx_dropped += dev_stats.rx_dropped;
        stats->tx_dropped += dev_stats.tx_dropped;
        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
        stats->rx_length_errors += dev_stats.rx_length_errors;
        stats->rx_over_errors += dev_stats.rx_over_errors;
        stats->rx_crc_errors += dev_stats.rx_crc_errors;
        stats->rx_frame_errors += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1929
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal.
 *
 * Returns 0 if successful, otherwise a positive errno value.  Packets
 * dropped locally by netdev_linux_tap_batch_send() are added to
 * 'stats->tx_dropped' in every case. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; the vport stats already in 'stats' are still
         * usable if that lookup succeeded. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer.  For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled.  Note the rx/tx swap, as above. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped += dev_stats.tx_dropped;
        stats->tx_dropped += dev_stats.rx_dropped;

        stats->rx_errors += dev_stats.tx_errors;
        stats->tx_errors += dev_stats.rx_errors;

        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
    }
    stats->tx_dropped += netdev->tx_dropped;
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1992
1993 static int
1994 netdev_internal_get_stats(const struct netdev *netdev_,
1995 struct netdev_stats *stats)
1996 {
1997 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1998 int error;
1999
2000 ovs_mutex_lock(&netdev->mutex);
2001 get_stats_via_vport(netdev_, stats);
2002 error = netdev->vport_stats_error;
2003 ovs_mutex_unlock(&netdev->mutex);
2004
2005 return error;
2006 }
2007
/* Queries the link features of 'netdev' via ethtool (ETHTOOL_GSET) and
 * caches them as NETDEV_F_* bitmaps in netdev->supported,
 * netdev->advertised and netdev->current.  The result -- or the ethtool
 * error -- is memoized under VALID_FEATURES and exposed through
 * netdev->get_features_error.  Caller must hold netdev->mutex. */
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
    struct ethtool_cmd ecmd;
    uint32_t speed;
    int error;

    /* A previous query (successful or not) is cached; nothing to do. */
    if (netdev->cache_valid & VALID_FEATURES) {
        return;
    }

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto out;
    }

    /* Supported features: map the kernel's SUPPORTED_* bits onto the
     * coarser NETDEV_F_* bits (several kernel bits fold into one). */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    }
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Half) {
        netdev->supported |= NETDEV_F_100MB_HD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
        netdev->supported |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
        netdev->supported |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
        netdev->supported |= NETDEV_F_40GB_FD;
    }
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    }
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    }
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    }
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    }
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    }

    /* Advertised features: same translation for the ADVERTISED_* bits. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
        netdev->advertised |= NETDEV_F_40GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    }
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    }
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    }
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    }
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    }

    /* Current settings: derive the single active NETDEV_F_* speed bit from
     * the negotiated speed and duplex.  Raw literals are used for 40G and
     * above, presumably because the matching SPEED_* macros are missing
     * from older kernel headers -- TODO confirm before replacing them. */
    speed = ethtool_cmd_speed(&ecmd);
    if (speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    } else if (speed == 40000) {
        netdev->current = NETDEV_F_40GB_FD;
    } else if (speed == 100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (speed == 1000000) {
        netdev->current = NETDEV_F_1TB_FD;
    } else {
        netdev->current = 0;
    }

    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
    }

    if (ecmd.autoneg) {
        netdev->current |= NETDEV_F_AUTONEG;
    }

out:
    /* Cache the outcome either way so repeated failures don't re-query. */
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
}
2159
/* Stores the features supported by 'netdev' into '*current', '*advertised',
 * '*supported', and '*peer'.  Each value is a bitmap of NETDEV_* bits.
 * Returns 0 if successful, otherwise a positive errno value. */
static int
netdev_linux_get_features(const struct netdev *netdev_,
                          enum netdev_features *current,
                          enum netdev_features *advertised,
                          enum netdev_features *supported,
                          enum netdev_features *peer)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        /* ethtool cannot reach devices in another network namespace. */
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Populates (or reuses) the cached feature bitmaps. */
    netdev_linux_read_features(netdev);
    if (!netdev->get_features_error) {
        *current = netdev->current;
        *advertised = netdev->advertised;
        *supported = netdev->supported;
        *peer = 0; /* XXX: peer features are not retrieved. */
    }
    error = netdev->get_features_error;

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2192
/* Set the features advertised by 'netdev' to 'advertise'.  Reads the current
 * ethtool settings, rewrites only the advertising mask from the NETDEV_F_*
 * bits in 'advertise', and writes the settings back.  Returns 0 on success,
 * otherwise a positive errno value. */
static int
netdev_linux_set_advertisements(struct netdev *netdev_,
                                enum netdev_features advertise)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ethtool_cmd ecmd;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    COVERAGE_INC(netdev_get_ethtool);

    if (netdev_linux_netnsid_is_remote(netdev)) {
        /* ethtool cannot reach devices in another network namespace. */
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Read-modify-write: fetch the full settings so everything except the
     * advertising mask is preserved by the subsequent ETHTOOL_SSET. */
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto exit;
    }

    /* Translate NETDEV_F_* bits to the kernel's ADVERTISED_* bits. */
    ecmd.advertising = 0;
    if (advertise & NETDEV_F_10MB_HD) {
        ecmd.advertising |= ADVERTISED_10baseT_Half;
    }
    if (advertise & NETDEV_F_10MB_FD) {
        ecmd.advertising |= ADVERTISED_10baseT_Full;
    }
    if (advertise & NETDEV_F_100MB_HD) {
        ecmd.advertising |= ADVERTISED_100baseT_Half;
    }
    if (advertise & NETDEV_F_100MB_FD) {
        ecmd.advertising |= ADVERTISED_100baseT_Full;
    }
    if (advertise & NETDEV_F_1GB_HD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Half;
    }
    if (advertise & NETDEV_F_1GB_FD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Full;
    }
    if (advertise & NETDEV_F_10GB_FD) {
        ecmd.advertising |= ADVERTISED_10000baseT_Full;
    }
    if (advertise & NETDEV_F_COPPER) {
        ecmd.advertising |= ADVERTISED_TP;
    }
    if (advertise & NETDEV_F_FIBER) {
        ecmd.advertising |= ADVERTISED_FIBRE;
    }
    if (advertise & NETDEV_F_AUTONEG) {
        ecmd.advertising |= ADVERTISED_Autoneg;
    }
    if (advertise & NETDEV_F_PAUSE) {
        ecmd.advertising |= ADVERTISED_Pause;
    }
    if (advertise & NETDEV_F_PAUSE_ASYM) {
        ecmd.advertising |= ADVERTISED_Asym_Pause;
    }
    COVERAGE_INC(netdev_set_ethtool);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_SSET, "ETHTOOL_SSET");

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2263
2264 static struct tc_police
2265 tc_matchall_fill_police(uint32_t kbits_rate, uint32_t kbits_burst)
2266 {
2267 unsigned int bsize = MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 64;
2268 unsigned int bps = ((uint64_t) kbits_rate * 1000) / 8;
2269 struct tc_police police;
2270 struct tc_ratespec rate;
2271 int mtu = 65535;
2272
2273 memset(&rate, 0, sizeof rate);
2274 rate.rate = bps;
2275 rate.cell_log = tc_calc_cell_log(mtu);
2276 rate.mpu = ETH_TOTAL_MIN;
2277
2278 memset(&police, 0, sizeof police);
2279 police.burst = tc_bytes_to_ticks(bps, bsize);
2280 police.action = TC_POLICE_SHOT;
2281 police.rate = rate;
2282 police.mtu = mtu;
2283
2284 return police;
2285 }
2286
/* Appends a "police" action to the netlink message in 'request', nested
 * inside TCA_ACT_OPTIONS.  The attribute order (TBF parameters, rate table,
 * result) mirrors what the kernel's act_police expects. */
static void
nl_msg_put_act_police(struct ofpbuf *request, struct tc_police police)
{
    size_t offset;

    nl_msg_put_string(request, TCA_ACT_KIND, "police");
    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
    nl_msg_put_unspec(request, TCA_POLICE_TBF, &police, sizeof police);
    tc_put_rtab(request, TCA_POLICE_RATE, &police.rate);
    /* Conforming packets continue through the remaining classifiers. */
    nl_msg_put_u32(request, TCA_POLICE_RESULT, TC_ACT_UNSPEC);
    nl_msg_end_nested(request, offset);
}
2299
/* Installs a matchall classifier with a police action on the ingress qdisc
 * of 'netdev', limiting traffic to 'kbits_rate'/'kbits_burst'.  Returns 0 if
 * successful, otherwise a positive errno value. */
static int
tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate,
                        uint32_t kbits_burst)
{
    uint16_t eth_type = (OVS_FORCE uint16_t) htons(ETH_P_ALL);
    size_t basic_offset, action_offset, inner_offset;
    uint16_t prio = TC_RESERVED_PRIORITY_POLICE;
    int ifindex, index, err = 0;
    struct tc_police pol_act;
    uint32_t block_id = 0;
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct tcmsg *tcmsg;
    uint32_t handle = 1;

    err = get_ifindex(netdev, &ifindex);
    if (err) {
        return err;
    }

    /* 'block_id' is always 0 here, so the filter attaches directly to the
     * device's ingress qdisc rather than to a shared tc block. */
    index = block_id ? TCM_IFINDEX_MAGIC_BLOCK : ifindex;
    tcmsg = tc_make_request(index, RTM_NEWTFILTER, NLM_F_CREATE | NLM_F_ECHO,
                            &request);
    tcmsg->tcm_parent = block_id ? : TC_INGRESS_PARENT;
    tcmsg->tcm_info = tc_make_handle(prio, eth_type);
    tcmsg->tcm_handle = handle;

    /* matchall carries the police action nested as action #1 inside
     * TCA_MATCHALL_ACT. */
    pol_act = tc_matchall_fill_police(kbits_rate, kbits_burst);
    nl_msg_put_string(&request, TCA_KIND, "matchall");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    action_offset = nl_msg_start_nested(&request, TCA_MATCHALL_ACT);
    inner_offset = nl_msg_start_nested(&request, 1);
    nl_msg_put_act_police(&request, pol_act);
    nl_msg_end_nested(&request, inner_offset);
    nl_msg_end_nested(&request, action_offset);
    nl_msg_end_nested(&request, basic_offset);

    err = tc_transact(&request, &reply);
    if (!err) {
        /* Only validate that the echoed reply is well formed; its contents
         * are not otherwise needed. */
        struct tcmsg *tc =
            ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc);
        ofpbuf_delete(reply);
    }

    return err;
}
2346
2347 static int
2348 tc_del_matchall_policer(struct netdev *netdev)
2349 {
2350 uint32_t block_id = 0;
2351 int ifindex;
2352 int err;
2353
2354 err = get_ifindex(netdev, &ifindex);
2355 if (err) {
2356 return err;
2357 }
2358
2359 err = tc_del_filter(ifindex, TC_RESERVED_PRIORITY_POLICE, 1, block_id,
2360 TC_INGRESS);
2361 if (err) {
2362 return err;
2363 }
2364
2365 return 0;
2366 }
2367
/* Attempts to set input rate limiting (policing) policy.  Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * 'kbits_rate' of 0 disables policing.  The result (or error) is cached
 * under VALID_POLICING so identical re-configuration is a no-op. */
static int
netdev_linux_set_policing(struct netdev *netdev_,
                          uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int ifindex;
    int error;

    kbits_burst = (!kbits_rate ? 0       /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
                   : kbits_burst);       /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto out;
    }

    if (netdev->cache_valid & VALID_POLICING) {
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kbits_burst == kbits_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        netdev->cache_valid &= ~VALID_POLICING;
    }

    COVERAGE_INC(netdev_set_policing);

    /* Use matchall for policing when offloading ovs with tc-flower. */
    if (netdev_is_flow_api_enabled()) {
        /* Delete first so reconfiguration replaces any existing policer;
         * note this path returns without updating the policing cache. */
        error = tc_del_matchall_policer(netdev_);
        if (kbits_rate) {
            error = tc_add_matchall_policer(netdev_, kbits_rate, kbits_burst);
        }
        ovs_mutex_unlock(&netdev->mutex);
        return error;
    }

    error = get_ifindex(netdev_, &ifindex);
    if (error) {
        goto out;
    }

    /* Remove any existing ingress qdisc. */
    error = tc_add_del_qdisc(ifindex, false, 0, TC_INGRESS);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate) {
        /* Recreate the ingress qdisc, then attach the policer to it. */
        error = tc_add_del_qdisc(ifindex, true, 0, TC_INGRESS);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }
    }

    netdev->kbits_rate = kbits_rate;
    netdev->kbits_burst = kbits_burst;

out:
    /* Cache success and ENODEV (device gone) but retry other errors. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2451
2452 static int
2453 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2454 struct sset *types)
2455 {
2456 const struct tc_ops *const *opsp;
2457 for (opsp = tcs; *opsp != NULL; opsp++) {
2458 const struct tc_ops *ops = *opsp;
2459 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2460 sset_add(types, ops->ovs_name);
2461 }
2462 }
2463 return 0;
2464 }
2465
2466 static const struct tc_ops *
2467 tc_lookup_ovs_name(const char *name)
2468 {
2469 const struct tc_ops *const *opsp;
2470
2471 for (opsp = tcs; *opsp != NULL; opsp++) {
2472 const struct tc_ops *ops = *opsp;
2473 if (!strcmp(name, ops->ovs_name)) {
2474 return ops;
2475 }
2476 }
2477 return NULL;
2478 }
2479
2480 static const struct tc_ops *
2481 tc_lookup_linux_name(const char *name)
2482 {
2483 const struct tc_ops *const *opsp;
2484
2485 for (opsp = tcs; *opsp != NULL; opsp++) {
2486 const struct tc_ops *ops = *opsp;
2487 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2488 return ops;
2489 }
2490 }
2491 return NULL;
2492 }
2493
/* Looks up the queue with 'queue_id' in 'netdev_'s current tc queue map,
 * using the caller-computed 'hash' of the id.  Returns the queue or NULL if
 * it does not exist.  Caller must hold netdev->mutex. */
static struct tc_queue *
tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
                size_t hash)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct tc_queue *queue;

    HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
        if (queue->queue_id == queue_id) {
            return queue;
        }
    }
    return NULL;
}
2508
/* Convenience wrapper around tc_find_queue__() that hashes 'queue_id'
 * itself.  Returns the queue or NULL if it does not exist. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2514
2515 static int
2516 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2517 const char *type,
2518 struct netdev_qos_capabilities *caps)
2519 {
2520 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2521 if (!ops) {
2522 return EOPNOTSUPP;
2523 }
2524 caps->n_queues = ops->n_queues;
2525 return 0;
2526 }
2527
/* Retrieves the QoS type currently installed on 'netdev_' into '*typep' and
 * its configuration into 'details'.  Returns 0 on success, otherwise a
 * positive errno value. */
static int
netdev_linux_get_qos(const struct netdev *netdev_,
                     const char **typep, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Ensures netdev->tc reflects the kernel's current qdisc. */
    error = tc_query_qdisc(netdev_);
    if (!error) {
        *typep = netdev->tc->ops->ovs_name;
        /* Disciplines without per-qdisc parameters have no qdisc_get. */
        error = (netdev->tc->ops->qdisc_get
                 ? netdev->tc->ops->qdisc_get(netdev_, details)
                 : 0);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2553
/* Installs QoS discipline 'type' configured from 'details' on 'netdev_',
 * replacing any existing qdisc of a different type.  Returns 0 on success,
 * otherwise a positive errno value (EOPNOTSUPP for unknown types). */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    /* The no-op discipline touches no kernel state, so it needs neither the
     * mutex nor a qdisc query. */
    if (new_ops == &tc_ops_noop) {
        return new_ops->tc_install(netdev_, details);
    }

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same discipline: reconfigure in place if supported. */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            goto exit;
        }
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc. */
        error = new_ops->tc_install(netdev_, details);
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2601
/* Retrieves into 'details' the configuration of queue 'queue_id' on
 * 'netdev_'.  Returns 0 on success, ENOENT if the queue does not exist,
 * otherwise a positive errno value. */
static int
netdev_linux_get_queue(const struct netdev *netdev_,
                       unsigned int queue_id, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
        error = (queue
                ? netdev->tc->ops->class_get(netdev_, queue, details)
                : ENOENT);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2627
/* Configures queue 'queue_id' on 'netdev_' from 'details'.  Returns 0 on
 * success, EINVAL if the id is out of range or the discipline does not
 * support per-queue configuration, otherwise a positive errno value. */
static int
netdev_linux_set_queue(struct netdev *netdev_,
                       unsigned int queue_id, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        error = (queue_id < netdev->tc->ops->n_queues
                 && netdev->tc->ops->class_set
                 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
                 : EINVAL);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2653
/* Deletes queue 'queue_id' from 'netdev_'.  Returns 0 on success, ENOENT if
 * the queue does not exist, EINVAL if the discipline does not support
 * deletion, otherwise a positive errno value. */
static int
netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_delete) {
            struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
            error = (queue
                     ? netdev->tc->ops->class_delete(netdev_, queue)
                     : ENOENT);
        } else {
            error = EINVAL;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2682
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into 'stats'.
 * Returns 0 on success, ENOENT if the queue does not exist, EOPNOTSUPP if
 * the discipline cannot report per-class stats, otherwise a positive errno
 * value. */
static int
netdev_linux_get_queue_stats(const struct netdev *netdev_,
                             unsigned int queue_id,
                             struct netdev_queue_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get_stats) {
            const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
            if (queue) {
                /* Creation time is tracked by OVS, not the kernel, so it is
                 * filled in here rather than by the class callback. */
                stats->created = queue->created;
                error = netdev->tc->ops->class_get_stats(netdev_, queue,
                                                         stats);
            } else {
                error = ENOENT;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2717
/* State carried across a netlink dump of a device's traffic classes
 * (RTM_GETTCLASS); see start_queue_dump() and finish_queue_dump(). */
struct queue_dump_state {
    struct nl_dump dump;    /* In-progress netlink dump handle. */
    struct ofpbuf buf;      /* Receive buffer reused across replies. */
};
2722
/* Begins an RTM_GETTCLASS dump of all traffic classes on 'netdev',
 * initializing '*state' for use with nl_dump_next().  Returns true on
 * success, false if the request could not be built (e.g. unknown device). */
static bool
start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
    if (!tcmsg) {
        return false;
    }
    /* Parent 0 asks the kernel for classes under every qdisc. */
    tcmsg->tcm_parent = 0;
    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
    ofpbuf_uninit(&request);

    ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
    return true;
}
2740
/* Releases the resources in '*state' and completes the netlink dump.
 * Returns 0 on success, otherwise a positive errno value. */
static int
finish_queue_dump(struct queue_dump_state *state)
{
    ofpbuf_uninit(&state->buf);
    return nl_dump_done(&state->dump);
}
2747
/* Iterator state for netdev_linux_queue_dump_{start,next,done}():
 * a snapshot of the queue ids taken under the device mutex. */
struct netdev_linux_queue_state {
    unsigned int *queues;   /* Array of queue ids captured at start time. */
    size_t cur_queue;       /* Index of the next id to visit. */
    size_t n_queues;        /* Number of entries in 'queues'. */
};
2753
/* Begins iteration over 'netdev_'s queues, storing the iterator in
 * '*statep'.  Snapshots the queue ids up front so iteration does not depend
 * on the hmap staying stable.  Returns 0 on success, otherwise a positive
 * errno value. */
static int
netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get) {
            struct netdev_linux_queue_state *state;
            struct tc_queue *queue;
            size_t i;

            *statep = state = xmalloc(sizeof *state);
            state->n_queues = hmap_count(&netdev->tc->queues);
            state->cur_queue = 0;
            state->queues = xmalloc(state->n_queues * sizeof *state->queues);

            i = 0;
            HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
                state->queues[i++] = queue->queue_id;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2791
/* Advances the iterator in 'state_' to the next queue that still exists,
 * storing its id in '*queue_idp' and its configuration in 'details'.
 * Returns 0 on success, EOF when the snapshot is exhausted, otherwise a
 * positive errno value.  Queues deleted since the snapshot are skipped. */
static int
netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
                             unsigned int *queue_idp, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_linux_queue_state *state = state_;
    int error = EOF;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    while (state->cur_queue < state->n_queues) {
        unsigned int queue_id = state->queues[state->cur_queue++];
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);

        if (queue) {
            *queue_idp = queue_id;
            error = netdev->tc->ops->class_get(netdev_, queue, details);
            break;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2821
2822 static int
2823 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2824 void *state_)
2825 {
2826 struct netdev_linux_queue_state *state = state_;
2827
2828 free(state->queues);
2829 free(state);
2830 return 0;
2831 }
2832
/* Invokes 'cb' with 'aux' for the statistics of every queue on 'netdev_',
 * using a kernel RTM_GETTCLASS dump.  Returns 0 on success, otherwise a
 * positive errno value (the last failure seen, since the dump continues
 * past per-class errors). */
static int
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
                              netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct queue_dump_state state;

        if (!netdev->tc->ops->class_dump_stats) {
            error = EOPNOTSUPP;
        } else if (!start_queue_dump(netdev_, &state)) {
            error = ENODEV;
        } else {
            struct ofpbuf msg;
            int retval;

            while (nl_dump_next(&state.dump, &msg, &state.buf)) {
                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
                                                           cb, aux);
                if (retval) {
                    error = retval;
                }
            }

            retval = finish_queue_dump(&state);
            if (retval) {
                error = retval;
            }
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2877
/* Assigns IPv4 'address' with 'netmask' to 'netdev_' via SIOCSIFADDR /
 * SIOCSIFNETMASK.  Passing INADDR_ANY clears the address (and the netmask
 * is then deliberately left untouched).  Returns 0 on success, otherwise a
 * positive errno value. */
static int
netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
                     struct in_addr netmask)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
    if (!error) {
        if (address.s_addr != INADDR_ANY) {
            error = do_set_addr(netdev_, SIOCSIFNETMASK,
                                "SIOCSIFNETMASK", netmask);
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2903
/* Retrieves all IPv4/IPv6 addresses and masks assigned to 'netdev_' into
 * malloc'd arrays '*addr' and '*mask', with the count in '*n_cnt'; the
 * caller owns and must free both arrays.  Returns 0 on success, otherwise a
 * positive errno value (EOPNOTSUPP for devices in a remote netns). */
static int
netdev_linux_get_addr_list(const struct netdev *netdev_,
                          struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2926
/* Fills '*sa' with an AF_INET sockaddr carrying 'addr' and port 0; any
 * trailing bytes of '*sa' beyond the sockaddr_in are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2939
/* Issues address-setting ioctl 'ioctl_nr' (named 'ioctl_name' for logging)
 * on 'netdev' with 'addr' as the ifreq address.  The device name is filled
 * in by af_inet_ifreq_ioctl().  Returns 0 on success, otherwise a positive
 * errno value. */
static int
do_set_addr(struct netdev *netdev,
            int ioctl_nr, const char *ioctl_name, struct in_addr addr)
{
    struct ifreq ifr;

    make_in4_sockaddr(&ifr.ifr_addr, addr);
    return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
                               ioctl_name);
}
2950
/* Adds 'router' as a default IP gateway (a 0.0.0.0/0 route) via SIOCADDRT.
 * Returns 0 on success, otherwise a positive errno value. */
static int
netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
{
    struct in_addr any = { INADDR_ANY };
    struct rtentry rt;
    int error;

    memset(&rt, 0, sizeof rt);
    /* Destination and mask of all-zeros make this the default route. */
    make_in4_sockaddr(&rt.rt_dst, any);
    make_in4_sockaddr(&rt.rt_gateway, router);
    make_in4_sockaddr(&rt.rt_genmask, any);
    rt.rt_flags = RTF_UP | RTF_GATEWAY;
    error = af_inet_ioctl(SIOCADDRT, &rt);
    if (error) {
        VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
    }
    return error;
}
2970
/* Finds the route toward 'host' by scanning /proc/net/route, storing the
 * gateway in '*next_hop' (0 if directly reachable) and the malloc'd output
 * device name in '*netdev_name' (caller frees).  Returns 0 on success,
 * ENXIO if no route matches, otherwise a positive errno value. */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        /* Line 1 is the column-header row; data starts at line 2. */
        if (++ln >= 2) {
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                        fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* The output of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need any endian
             * conversions here. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    fclose(stream);
    return ENXIO;
}
3030
/* Fills 'smap' with driver name/version and firmware version obtained via
 * ETHTOOL_GDRVINFO, caching the result under VALID_DRVINFO.  Returns 0 on
 * success, otherwise a positive errno value. */
static int
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* The drvinfo struct is passed through the generic ethtool_cmd
         * interface, hence the cast. */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        cmd,
                                        ETHTOOL_GDRVINFO,
                                        "ETHTOOL_GDRVINFO");
        if (!error) {
            netdev->cache_valid |= VALID_DRVINFO;
        }
    }

    if (!error) {
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
        smap_add(smap, "driver_version", netdev->drvinfo.version);
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
3061
/* Status for internal devices is just the fixed driver name; there is no
 * hardware to query.  Always returns 0. */
static int
netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
                           struct smap *smap)
{
    smap_add(smap, "driver_name", "openvswitch");
    return 0;
}
3069
/* Returns the tc shared-block id to use for 'netdev_': its ifindex when the
 * device is a LAG master, otherwise 0 (no block). */
static uint32_t
netdev_linux_get_block_id(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    uint32_t block_id = 0;

    ovs_mutex_lock(&netdev->mutex);
    /* Ensure the linux netdev has had its fields populated. */
    if (!(netdev->cache_valid & VALID_IFINDEX)) {
        netdev_linux_update_via_netlink(netdev);
    }

    /* Only assigning block ids to linux netdevs that are LAG masters. */
    if (netdev->is_lag_master) {
        block_id = netdev->ifindex;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return block_id;
}
3090
/* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
 * returns 0.  Otherwise, it returns a positive errno value; in particular,
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
static int
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, struct eth_addr *mac)
{
    struct arpreq r;
    struct sockaddr_in sin;
    int retval;

    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    sin.sin_port = 0;
    /* arp_pa is a generic sockaddr; copy the sockaddr_in into it. */
    memcpy(&r.arp_pa, &sin, sizeof sin);
    r.arp_ha.sa_family = ARPHRD_ETHER;
    r.arp_flags = 0;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
    if (!retval) {
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO ("no such entry") is an expected outcome; log the rest. */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
    }
    return retval;
}
3123
3124 static unsigned int
3125 nd_to_iff_flags(enum netdev_flags nd)
3126 {
3127 unsigned int iff = 0;
3128 if (nd & NETDEV_UP) {
3129 iff |= IFF_UP;
3130 }
3131 if (nd & NETDEV_PROMISC) {
3132 iff |= IFF_PROMISC;
3133 }
3134 if (nd & NETDEV_LOOPBACK) {
3135 iff |= IFF_LOOPBACK;
3136 }
3137 return iff;
3138 }
3139
3140 static int
3141 iff_to_nd_flags(unsigned int iff)
3142 {
3143 enum netdev_flags nd = 0;
3144 if (iff & IFF_UP) {
3145 nd |= NETDEV_UP;
3146 }
3147 if (iff & IFF_PROMISC) {
3148 nd |= NETDEV_PROMISC;
3149 }
3150 if (iff & IFF_LOOPBACK) {
3151 nd |= NETDEV_LOOPBACK;
3152 }
3153 return nd;
3154 }
3155
/* Turns off the NETDEV_* flags in 'off' and turns on those in 'on' for
 * 'netdev' via ioctl, reporting the previous flags in '*old_flagsp'.
 * Returns 0 on success, otherwise a positive errno value. */
static int
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
{
    unsigned int old_flags, new_flags;
    int error = 0;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* Best-effort refresh of the cached flags; a failure here leaves
         * the previous cache in place, so its return is ignored. */
        get_flags(&netdev->up, &netdev->ifi_flags);
    }

    return error;
}
3174
/* Implements the netdev 'update_flags' operation: changes (when 'on' or 'off'
 * is nonzero) or merely queries the enabled flags of 'netdev_', storing the
 * previous flags in '*old_flagsp'.  Returns 0 on success, a positive errno
 * value on failure. */
static int
netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
                          enum netdev_flags on, enum netdev_flags *old_flagsp)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (on || off) {
        /* Changing flags over netlink isn't supported yet. */
        if (netdev_linux_netnsid_is_remote(netdev)) {
            error = EOPNOTSUPP;
            goto exit;
        }
        error = update_flags(netdev, off, on, old_flagsp);
    } else {
        /* Try reading flags over netlink, or fall back to ioctl. */
        if (!netdev_linux_update_via_netlink(netdev)) {
            *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
        } else {
            /* 'on' and 'off' are both zero here, so update_flags() only
             * reports the cached flags without modifying the device. */
            error = update_flags(netdev, off, on, old_flagsp);
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3203
/* netdev_class members shared by all of the Linux-backed classes defined
 * below ("system", "tap", "internal", and "afxdp").  Each class expands this
 * macro and then overrides or adds the members that differ. */
#define NETDEV_LINUX_CLASS_COMMON                               \
    .run = netdev_linux_run,                                    \
    .wait = netdev_linux_wait,                                  \
    .alloc = netdev_linux_alloc,                                \
    .dealloc = netdev_linux_dealloc,                            \
    .send_wait = netdev_linux_send_wait,                        \
    .set_etheraddr = netdev_linux_set_etheraddr,                \
    .get_etheraddr = netdev_linux_get_etheraddr,                \
    .get_mtu = netdev_linux_get_mtu,                            \
    .set_mtu = netdev_linux_set_mtu,                            \
    .get_ifindex = netdev_linux_get_ifindex,                    \
    .get_carrier = netdev_linux_get_carrier,                    \
    .get_carrier_resets = netdev_linux_get_carrier_resets,      \
    .set_miimon_interval = netdev_linux_set_miimon_interval,    \
    .set_advertisements = netdev_linux_set_advertisements,      \
    .set_policing = netdev_linux_set_policing,                  \
    .get_qos_types = netdev_linux_get_qos_types,                \
    .get_qos_capabilities = netdev_linux_get_qos_capabilities,  \
    .get_qos = netdev_linux_get_qos,                            \
    .set_qos = netdev_linux_set_qos,                            \
    .get_queue = netdev_linux_get_queue,                        \
    .set_queue = netdev_linux_set_queue,                        \
    .delete_queue = netdev_linux_delete_queue,                  \
    .get_queue_stats = netdev_linux_get_queue_stats,            \
    .queue_dump_start = netdev_linux_queue_dump_start,          \
    .queue_dump_next = netdev_linux_queue_dump_next,            \
    .queue_dump_done = netdev_linux_queue_dump_done,            \
    .dump_queue_stats = netdev_linux_dump_queue_stats,          \
    .set_in4 = netdev_linux_set_in4,                            \
    .get_addr_list = netdev_linux_get_addr_list,                \
    .add_router = netdev_linux_add_router,                      \
    .get_next_hop = netdev_linux_get_next_hop,                  \
    .arp_lookup = netdev_linux_arp_lookup,                      \
    .update_flags = netdev_linux_update_flags,                  \
    .rxq_alloc = netdev_linux_rxq_alloc,                        \
    .rxq_dealloc = netdev_linux_rxq_dealloc,                    \
    .rxq_wait = netdev_linux_rxq_wait,                          \
    .rxq_drain = netdev_linux_rxq_drain
3242
/* The "system" netdev class: ordinary kernel network devices. */
const struct netdev_class netdev_linux_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "system",
    .is_pmd = false,
    .construct = netdev_linux_construct,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_linux_get_stats,
    .get_features = netdev_linux_get_features,
    .get_status = netdev_linux_get_status,
    .get_block_id = netdev_linux_get_block_id,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
3258
/* The "tap" netdev class.  Differs from "system" in its construction
 * (netdev_linux_construct_tap) and in taking stats via
 * netdev_tap_get_stats; it also has no get_block_id member. */
const struct netdev_class netdev_tap_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "tap",
    .is_pmd = false,
    .construct = netdev_linux_construct_tap,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_tap_get_stats,
    .get_features = netdev_linux_get_features,
    .get_status = netdev_linux_get_status,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
3273
/* The "internal" netdev class.  Uses internal-specific stats and status
 * callbacks and, unlike "system", provides no get_features member. */
const struct netdev_class netdev_internal_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "internal",
    .is_pmd = false,
    .construct = netdev_linux_construct,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_internal_get_stats,
    .get_status = netdev_internal_get_status,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
3287
#ifdef HAVE_AF_XDP
/* The "afxdp" netdev class, available only when OVS is built with AF_XDP
 * support.  It runs in PMD mode (.is_pmd = true) and delegates the datapath
 * operations to the netdev_afxdp_* implementations. */
const struct netdev_class netdev_afxdp_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "afxdp",
    .is_pmd = true,
    .construct = netdev_linux_construct,
    .destruct = netdev_afxdp_destruct,
    .get_stats = netdev_afxdp_get_stats,
    .get_status = netdev_linux_get_status,
    .set_config = netdev_afxdp_set_config,
    .get_config = netdev_afxdp_get_config,
    .reconfigure = netdev_afxdp_reconfigure,
    .get_numa_id = netdev_afxdp_get_numa_id,
    .send = netdev_afxdp_batch_send,
    .rxq_construct = netdev_afxdp_rxq_construct,
    .rxq_destruct = netdev_afxdp_rxq_destruct,
    .rxq_recv = netdev_afxdp_rxq_recv,
};
#endif
3307 \f
3308
/* CoDel traffic control class.  The qdisc is classless, so it exposes no
 * configurable queues. */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET 1
#define TCA_CODEL_LIMIT 2
#define TCA_CODEL_INTERVAL 3

/* Userspace mirror of a codel qdisc's configuration. */
struct codel {
    struct tc tc;
    uint32_t target;    /* Target queue delay, in microseconds. */
    uint32_t limit;     /* Hard queue limit, in packets. */
    uint32_t interval;  /* Sliding-window width, in microseconds. */
};
3325
3326 static struct codel *
3327 codel_get__(const struct netdev *netdev_)
3328 {
3329 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3330 return CONTAINER_OF(netdev->tc, struct codel, tc);
3331 }
3332
3333 static void
3334 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3335 uint32_t interval)
3336 {
3337 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3338 struct codel *codel;
3339
3340 codel = xmalloc(sizeof *codel);
3341 tc_init(&codel->tc, &tc_ops_codel);
3342 codel->target = target;
3343 codel->limit = limit;
3344 codel->interval = interval;
3345
3346 netdev->tc = &codel->tc;
3347 }
3348
/* Installs a codel root qdisc on 'netdev', first deleting any existing root
 * qdisc.  Roughly "tc qdisc replace dev <dev> root handle 1: codel ...".
 * Zero-valued parameters fall back to the codel defaults.  Returns 0 on
 * success, a positive errno value otherwise. */
static int
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                    uint32_t interval)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval;
    int error;

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Defaults: 5 ms target, 10240-packet limit, 100 ms interval. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;

    nl_msg_put_string(&request, TCA_KIND, "codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval,
                     error, ovs_strerror(error));
    }
    return error;
}
3390
3391 static void
3392 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3393 const struct smap *details, struct codel *codel)
3394 {
3395 codel->target = smap_get_ullong(details, "target", 0);
3396 codel->limit = smap_get_ullong(details, "limit", 0);
3397 codel->interval = smap_get_ullong(details, "interval", 0);
3398
3399 if (!codel->target) {
3400 codel->target = 5000;
3401 }
3402 if (!codel->limit) {
3403 codel->limit = 10240;
3404 }
3405 if (!codel->interval) {
3406 codel->interval = 100000;
3407 }
3408 }
3409
3410 static int
3411 codel_tc_install(struct netdev *netdev, const struct smap *details)
3412 {
3413 int error;
3414 struct codel codel;
3415
3416 codel_parse_qdisc_details__(netdev, details, &codel);
3417 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3418 codel.interval);
3419 if (!error) {
3420 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3421 }
3422 return error;
3423 }
3424
/* Parses the nested TCA_OPTIONS attribute 'nl_options' of a codel qdisc into
 * 'codel'.  Returns 0 on success, EPROTO if the attributes are malformed. */
static int
codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
{
    static const struct nl_policy tca_codel_policy[] = {
        [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
        [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
        [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];

    if (!nl_parse_nested(nl_options, tca_codel_policy,
                         attrs, ARRAY_SIZE(tca_codel_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
        return EPROTO;
    }

    codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
    codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
    codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
    return 0;
}
3447
3448 static int
3449 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3450 {
3451 struct nlattr *nlattr;
3452 const char * kind;
3453 int error;
3454 struct codel codel;
3455
3456 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3457 if (error != 0) {
3458 return error;
3459 }
3460
3461 error = codel_parse_tca_options__(nlattr, &codel);
3462 if (error != 0) {
3463 return error;
3464 }
3465
3466 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3467 return 0;
3468 }
3469
3470
/* tc_destroy callback for codel.  tc_destroy() must run before free() since
 * 'tc' is embedded inside the codel structure being released. */
static void
codel_tc_destroy(struct tc *tc)
{
    struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
    tc_destroy(tc);
    free(codel);
}
3478
3479 static int
3480 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3481 {
3482 const struct codel *codel = codel_get__(netdev);
3483 smap_add_format(details, "target", "%u", codel->target);
3484 smap_add_format(details, "limit", "%u", codel->limit);
3485 smap_add_format(details, "interval", "%u", codel->interval);
3486 return 0;
3487 }
3488
3489 static int
3490 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3491 {
3492 struct codel codel;
3493
3494 codel_parse_qdisc_details__(netdev, details, &codel);
3495 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3496 codel_get__(netdev)->target = codel.target;
3497 codel_get__(netdev)->limit = codel.limit;
3498 codel_get__(netdev)->interval = codel.interval;
3499 return 0;
3500 }
3501
/* tc operations for the OVS "linux-codel" QoS type, backed by the kernel
 * "codel" qdisc.  n_queues is zero because codel has no per-queue config. */
static const struct tc_ops tc_ops_codel = {
    .linux_name = "codel",
    .ovs_name = "linux-codel",
    .n_queues = CODEL_N_QUEUES,
    .tc_install = codel_tc_install,
    .tc_load = codel_tc_load,
    .tc_destroy = codel_tc_destroy,
    .qdisc_get = codel_qdisc_get,
    .qdisc_set = codel_qdisc_set,
};
3512 \f
3513 /* FQ-CoDel traffic control class. */
3514
/* fq_codel is classless from OVS's point of view: no configurable queues. */
#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET 1
#define TCA_FQ_CODEL_LIMIT 2
#define TCA_FQ_CODEL_INTERVAL 3
#define TCA_FQ_CODEL_ECN 4      /* Defined for completeness; not used here. */
#define TCA_FQ_CODEL_FLOWS 5
#define TCA_FQ_CODEL_QUANTUM 6

/* Userspace mirror of an fq_codel qdisc's configuration. */
struct fqcodel {
    struct tc tc;
    uint32_t target;    /* Target queue delay, in microseconds. */
    uint32_t limit;     /* Hard queue limit, in packets. */
    uint32_t interval;  /* Sliding-window width, in microseconds. */
    uint32_t flows;     /* Number of flow queues. */
    uint32_t quantum;   /* Bytes dequeued per flow per round. */
};
3536
3537 static struct fqcodel *
3538 fqcodel_get__(const struct netdev *netdev_)
3539 {
3540 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3541 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3542 }
3543
3544 static void
3545 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3546 uint32_t interval, uint32_t flows, uint32_t quantum)
3547 {
3548 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3549 struct fqcodel *fqcodel;
3550
3551 fqcodel = xmalloc(sizeof *fqcodel);
3552 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3553 fqcodel->target = target;
3554 fqcodel->limit = limit;
3555 fqcodel->interval = interval;
3556 fqcodel->flows = flows;
3557 fqcodel->quantum = quantum;
3558
3559 netdev->tc = &fqcodel->tc;
3560 }
3561
/* Installs an fq_codel root qdisc on 'netdev', first deleting any existing
 * root qdisc.  Zero-valued parameters fall back to the fq_codel defaults.
 * Returns 0 on success, a positive errno value otherwise. */
static int
fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                      uint32_t interval, uint32_t flows, uint32_t quantum)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval, oflows, oquantum;
    int error;

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;
    oflows = flows ? flows : 1024;
    oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
                                            not mtu */

    nl_msg_put_string(&request, TCA_KIND, "fq_codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval, oflows, oquantum,
                     error, ovs_strerror(error));
    }
    return error;
}
3608
3609 static void
3610 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3611 const struct smap *details, struct fqcodel *fqcodel)
3612 {
3613 fqcodel->target = smap_get_ullong(details, "target", 0);
3614 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3615 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3616 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3617 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3618
3619 if (!fqcodel->target) {
3620 fqcodel->target = 5000;
3621 }
3622 if (!fqcodel->limit) {
3623 fqcodel->limit = 10240;
3624 }
3625 if (!fqcodel->interval) {
3626 fqcodel->interval = 1000000;
3627 }
3628 if (!fqcodel->flows) {
3629 fqcodel->flows = 1024;
3630 }
3631 if (!fqcodel->quantum) {
3632 fqcodel->quantum = 1514;
3633 }
3634 }
3635
3636 static int
3637 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3638 {
3639 int error;
3640 struct fqcodel fqcodel;
3641
3642 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3643 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3644 fqcodel.interval, fqcodel.flows,
3645 fqcodel.quantum);
3646 if (!error) {
3647 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3648 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3649 }
3650 return error;
3651 }
3652
/* Parses the nested TCA_OPTIONS attribute 'nl_options' of an fq_codel qdisc
 * into 'fqcodel'.  Returns 0 on success, EPROTO if malformed. */
static int
fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
{
    static const struct nl_policy tca_fqcodel_policy[] = {
        [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
        [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
        [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
        [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
        [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];

    if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
                         attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
        return EPROTO;
    }

    fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
    fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
    fqcodel->interval = nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
    fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
    fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
    return 0;
}
3679
3680 static int
3681 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3682 {
3683 struct nlattr *nlattr;
3684 const char * kind;
3685 int error;
3686 struct fqcodel fqcodel;
3687
3688 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3689 if (error != 0) {
3690 return error;
3691 }
3692
3693 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3694 if (error != 0) {
3695 return error;
3696 }
3697
3698 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3699 fqcodel.flows, fqcodel.quantum);
3700 return 0;
3701 }
3702
/* tc_destroy callback for fq_codel.  tc_destroy() must run before free()
 * since 'tc' is embedded inside the fqcodel structure being released. */
static void
fqcodel_tc_destroy(struct tc *tc)
{
    struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
    tc_destroy(tc);
    free(fqcodel);
}
3710
3711 static int
3712 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3713 {
3714 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3715 smap_add_format(details, "target", "%u", fqcodel->target);
3716 smap_add_format(details, "limit", "%u", fqcodel->limit);
3717 smap_add_format(details, "interval", "%u", fqcodel->interval);
3718 smap_add_format(details, "flows", "%u", fqcodel->flows);
3719 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3720 return 0;
3721 }
3722
3723 static int
3724 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3725 {
3726 struct fqcodel fqcodel;
3727
3728 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3729 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3730 fqcodel.flows, fqcodel.quantum);
3731 fqcodel_get__(netdev)->target = fqcodel.target;
3732 fqcodel_get__(netdev)->limit = fqcodel.limit;
3733 fqcodel_get__(netdev)->interval = fqcodel.interval;
3734 fqcodel_get__(netdev)->flows = fqcodel.flows;
3735 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3736 return 0;
3737 }
3738
/* tc operations for the OVS "linux-fq_codel" QoS type, backed by the kernel
 * "fq_codel" qdisc.  n_queues is zero: no per-queue configuration. */
static const struct tc_ops tc_ops_fqcodel = {
    .linux_name = "fq_codel",
    .ovs_name = "linux-fq_codel",
    .n_queues = FQCODEL_N_QUEUES,
    .tc_install = fqcodel_tc_install,
    .tc_load = fqcodel_tc_load,
    .tc_destroy = fqcodel_tc_destroy,
    .qdisc_get = fqcodel_qdisc_get,
    .qdisc_set = fqcodel_qdisc_set,
};
3749 \f
3750 /* SFQ traffic control class. */
3751
/* SFQ is classless from OVS's point of view: no configurable queues. */
#define SFQ_N_QUEUES 0x0000

/* Userspace mirror of an SFQ qdisc's configuration. */
struct sfq {
    struct tc tc;
    uint32_t quantum;   /* Bytes per round; defaults to the device MTU. */
    uint32_t perturb;   /* Hash perturbation period, in seconds. */
};
3759
3760 static struct sfq *
3761 sfq_get__(const struct netdev *netdev_)
3762 {
3763 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3764 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3765 }
3766
3767 static void
3768 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3769 {
3770 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3771 struct sfq *sfq;
3772
3773 sfq = xmalloc(sizeof *sfq);
3774 tc_init(&sfq->tc, &tc_ops_sfq);
3775 sfq->perturb = perturb;
3776 sfq->quantum = quantum;
3777
3778 netdev->tc = &sfq->tc;
3779 }
3780
/* Installs an SFQ root qdisc on 'netdev', first deleting any existing root
 * qdisc.  A zero 'quantum' falls back to the device MTU (or the kernel
 * default if the MTU is unavailable); a zero 'perturb' falls back to 10
 * seconds.  Returns 0 on success, a positive errno value otherwise. */
static int
sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
{
    struct tc_sfq_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int mtu;
    int mtu_error, error;
    /* Look up the MTU first; the result is only needed when 'quantum' is
     * zero. */
    mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    if (!quantum) {
        if (!mtu_error) {
            opt.quantum = mtu; /* if we cannot find mtu, use default */
        }
    } else {
        opt.quantum = quantum;
    }

    if (!perturb) {
        opt.perturb_period = 10;
    } else {
        opt.perturb_period = perturb;
    }

    nl_msg_put_string(&request, TCA_KIND, "sfq");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "quantum %u, perturb %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.quantum, opt.perturb_period,
                     error, ovs_strerror(error));
    }
    return error;
}
3829
3830 static void
3831 sfq_parse_qdisc_details__(struct netdev *netdev,
3832 const struct smap *details, struct sfq *sfq)
3833 {
3834 sfq->perturb = smap_get_ullong(details, "perturb", 0);
3835 sfq->quantum = smap_get_ullong(details, "quantum", 0);
3836
3837 if (!sfq->perturb) {
3838 sfq->perturb = 10;
3839 }
3840
3841 if (!sfq->quantum) {
3842 int mtu;
3843 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
3844 sfq->quantum = mtu;
3845 } else {
3846 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3847 "device without mtu");
3848 }
3849 }
3850 }
3851
3852 static int
3853 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3854 {
3855 int error;
3856 struct sfq sfq;
3857
3858 sfq_parse_qdisc_details__(netdev, details, &sfq);
3859 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3860 if (!error) {
3861 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3862 }
3863 return error;
3864 }
3865
3866 static int
3867 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3868 {
3869 const struct tc_sfq_qopt *sfq;
3870 struct nlattr *nlattr;
3871 const char * kind;
3872 int error;
3873
3874 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3875 if (error == 0) {
3876 sfq = nl_attr_get(nlattr);
3877 sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
3878 return 0;
3879 }
3880
3881 return error;
3882 }
3883
/* tc_destroy callback for SFQ.  tc_destroy() must run before free() since
 * 'tc' is embedded inside the sfq structure being released. */
static void
sfq_tc_destroy(struct tc *tc)
{
    struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
    tc_destroy(tc);
    free(sfq);
}
3891
3892 static int
3893 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3894 {
3895 const struct sfq *sfq = sfq_get__(netdev);
3896 smap_add_format(details, "quantum", "%u", sfq->quantum);
3897 smap_add_format(details, "perturb", "%u", sfq->perturb);
3898 return 0;
3899 }
3900
3901 static int
3902 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3903 {
3904 struct sfq sfq;
3905
3906 sfq_parse_qdisc_details__(netdev, details, &sfq);
3907 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3908 sfq_get__(netdev)->quantum = sfq.quantum;
3909 sfq_get__(netdev)->perturb = sfq.perturb;
3910 return 0;
3911 }
3912
/* tc operations for the OVS "linux-sfq" QoS type, backed by the kernel
 * "sfq" qdisc.  n_queues is zero: no per-queue configuration. */
static const struct tc_ops tc_ops_sfq = {
    .linux_name = "sfq",
    .ovs_name = "linux-sfq",
    .n_queues = SFQ_N_QUEUES,
    .tc_install = sfq_tc_install,
    .tc_load = sfq_tc_load,
    .tc_destroy = sfq_tc_destroy,
    .qdisc_get = sfq_qdisc_get,
    .qdisc_set = sfq_qdisc_set,
};
3923 \f
3924 /* netem traffic control class. */
3925
/* Userspace mirror of a netem qdisc's configuration. */
struct netem {
    struct tc tc;
    uint32_t latency;   /* Added delay; converted via tc_time_to_ticks(). */
    uint32_t limit;     /* Queue limit, in packets. */
    uint32_t loss;      /* Packet loss percentage, 0-100. */
};
3932
3933 static struct netem *
3934 netem_get__(const struct netdev *netdev_)
3935 {
3936 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3937 return CONTAINER_OF(netdev->tc, struct netem, tc);
3938 }
3939
3940 static void
3941 netem_install__(struct netdev *netdev_, uint32_t latency,
3942 uint32_t limit, uint32_t loss)
3943 {
3944 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3945 struct netem *netem;
3946
3947 netem = xmalloc(sizeof *netem);
3948 tc_init(&netem->tc, &tc_ops_netem);
3949 netem->latency = latency;
3950 netem->limit = limit;
3951 netem->loss = loss;
3952
3953 netdev->tc = &netem->tc;
3954 }
3955
3956 static int
3957 netem_setup_qdisc__(struct netdev *netdev, uint32_t latency,
3958 uint32_t limit, uint32_t loss)
3959 {
3960 struct tc_netem_qopt opt;
3961 struct ofpbuf request;
3962 struct tcmsg *tcmsg;
3963 int error;
3964
3965 tc_del_qdisc(netdev);
3966
3967 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3968 NLM_F_EXCL | NLM_F_CREATE, &request);
3969 if (!tcmsg) {
3970 return ENODEV;
3971 }
3972 tcmsg->tcm_handle = tc_make_handle(1, 0);
3973 tcmsg->tcm_parent = TC_H_ROOT;
3974
3975 memset(&opt, 0, sizeof opt);
3976
3977 if (!limit) {
3978 opt.limit = 1000;
3979 } else {
3980 opt.limit = limit;
3981 }
3982
3983 if (loss) {
3984 if (loss > 100) {
3985 VLOG_WARN_RL(&rl,
3986 "loss should be a percentage value between 0 to 100, "
3987 "loss was %u", loss);
3988 return EINVAL;
3989 }
3990 opt.loss = floor(UINT32_MAX * (loss / 100.0));
3991 }
3992
3993 opt.latency = tc_time_to_ticks(latency);
3994
3995 nl_msg_put_string(&request, TCA_KIND, "netem");
3996 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3997
3998 error = tc_transact(&request, NULL);
3999 if (error) {
4000 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
4001 "latency %u, limit %u, loss %u error %d(%s)",
4002 netdev_get_name(netdev),
4003 opt.latency, opt.limit, opt.loss,
4004 error, ovs_strerror(error));
4005 }
4006 return error;
4007 }
4008
4009 static void
4010 netem_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
4011 const struct smap *details, struct netem *netem)
4012 {
4013 netem->latency = smap_get_ullong(details, "latency", 0);
4014 netem->limit = smap_get_ullong(details, "limit", 0);
4015 netem->loss = smap_get_ullong(details, "loss", 0);
4016
4017 if (!netem->limit) {
4018 netem->limit = 1000;
4019 }
4020 }
4021
4022 static int
4023 netem_tc_install(struct netdev *netdev, const struct smap *details)
4024 {
4025 int error;
4026 struct netem netem;
4027
4028 netem_parse_qdisc_details__(netdev, details, &netem);
4029 error = netem_setup_qdisc__(netdev, netem.latency,
4030 netem.limit, netem.loss);
4031 if (!error) {
4032 netem_install__(netdev, netem.latency, netem.limit, netem.loss);
4033 }
4034 return error;
4035 }
4036
4037 static int
4038 netem_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4039 {
4040 const struct tc_netem_qopt *netem;
4041 struct nlattr *nlattr;
4042 const char *kind;
4043 int error;
4044
4045 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4046 if (error == 0) {
4047 netem = nl_attr_get(nlattr);
4048 netem_install__(netdev, netem->latency, netem->limit, netem->loss);
4049 return 0;
4050 }
4051
4052 return error;
4053 }
4054
/* tc_destroy callback for netem.  tc_destroy() must run before free() since
 * 'tc' is embedded inside the netem structure being released. */
static void
netem_tc_destroy(struct tc *tc)
{
    struct netem *netem = CONTAINER_OF(tc, struct netem, tc);
    tc_destroy(tc);
    free(netem);
}
4062
4063 static int
4064 netem_qdisc_get(const struct netdev *netdev, struct smap *details)
4065 {
4066 const struct netem *netem = netem_get__(netdev);
4067 smap_add_format(details, "latency", "%u", netem->latency);
4068 smap_add_format(details, "limit", "%u", netem->limit);
4069 smap_add_format(details, "loss", "%u", netem->loss);
4070 return 0;
4071 }
4072
4073 static int
4074 netem_qdisc_set(struct netdev *netdev, const struct smap *details)
4075 {
4076 struct netem netem;
4077
4078 netem_parse_qdisc_details__(netdev, details, &netem);
4079 netem_install__(netdev, netem.latency, netem.limit, netem.loss);
4080 netem_get__(netdev)->latency = netem.latency;
4081 netem_get__(netdev)->limit = netem.limit;
4082 netem_get__(netdev)->loss = netem.loss;
4083 return 0;
4084 }
4085
/* tc operations for the OVS "linux-netem" QoS type, backed by the kernel
 * "netem" qdisc.  n_queues is zero: no per-queue configuration. */
static const struct tc_ops tc_ops_netem = {
    .linux_name = "netem",
    .ovs_name = "linux-netem",
    .n_queues = 0,
    .tc_install = netem_tc_install,
    .tc_load = netem_tc_load,
    .tc_destroy = netem_tc_destroy,
    .qdisc_get = netem_qdisc_get,
    .qdisc_set = netem_qdisc_set,
};
4096 \f
4097 /* HTB traffic control class. */
4098
#define HTB_N_QUEUES 0xf000         /* Maximum number of HTB queues. */
#define HTB_RATE2QUANTUM 10         /* HTB rate-to-quantum ("r2q") divisor,
                                     * passed to the kernel in
                                     * htb_setup_qdisc__(). */

/* Userspace mirror of an HTB root qdisc's configuration. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};

/* Userspace mirror of one HTB class (one OVS queue). */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
4114
4115 static struct htb *
4116 htb_get__(const struct netdev *netdev_)
4117 {
4118 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4119 return CONTAINER_OF(netdev->tc, struct htb, tc);
4120 }
4121
4122 static void
4123 htb_install__(struct netdev *netdev_, uint64_t max_rate)
4124 {
4125 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4126 struct htb *htb;
4127
4128 htb = xmalloc(sizeof *htb);
4129 tc_init(&htb->tc, &tc_ops_htb);
4130 htb->max_rate = max_rate;
4131
4132 netdev->tc = &htb->tc;
4133 }
4134
/* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 * Returns 0 on success, a positive errno value otherwise. */
static int
htb_setup_qdisc__(struct netdev *netdev)
{
    size_t opt_offset;
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = HTB_RATE2QUANTUM;
    opt.version = 3;
    opt.defcls = 1;  /* Unclassified traffic goes to class 1:1. */

    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    return tc_transact(&request, NULL);
}
4169
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * Returns 0 on success, a positive errno value otherwise.  Requires the
 * device to have a known MTU, which is used to size the rate tables. */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
        opt.quantum = mtu;
    }
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
4229
4230 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
4231 * description of them into 'details'. The description complies with the
4232 * specification given in the vswitch database documentation for linux-htb
4233 * queue details. */
4234 static int
4235 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
4236 {
4237 static const struct nl_policy tca_htb_policy[] = {
4238 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
4239 .min_len = sizeof(struct tc_htb_opt) },
4240 };
4241
4242 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
4243 const struct tc_htb_opt *htb;
4244
4245 if (!nl_parse_nested(nl_options, tca_htb_policy,
4246 attrs, ARRAY_SIZE(tca_htb_policy))) {
4247 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
4248 return EPROTO;
4249 }
4250
4251 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
4252 class->min_rate = htb->rate.rate;
4253 class->max_rate = htb->ceil.rate;
4254 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
4255 class->priority = htb->prio;
4256 return 0;
4257 }
4258
4259 static int
4260 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4261 struct htb_class *options,
4262 struct netdev_queue_stats *stats)
4263 {
4264 struct nlattr *nl_options;
4265 unsigned int handle;
4266 int error;
4267
4268 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4269 if (!error && queue_id) {
4270 unsigned int major = tc_get_major(handle);
4271 unsigned int minor = tc_get_minor(handle);
4272 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4273 *queue_id = minor - 1;
4274 } else {
4275 error = EPROTO;
4276 }
4277 }
4278 if (!error && options) {
4279 error = htb_parse_tca_options__(nl_options, options);
4280 }
4281 return error;
4282 }
4283
4284 static void
4285 htb_parse_qdisc_details__(struct netdev *netdev_,
4286 const struct smap *details, struct htb_class *hc)
4287 {
4288 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4289
4290 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
4291 if (!hc->max_rate) {
4292 enum netdev_features current;
4293
4294 netdev_linux_read_features(netdev);
4295 current = !netdev->get_features_error ? netdev->current : 0;
4296 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4297 }
4298 hc->min_rate = hc->max_rate;
4299 hc->burst = 0;
4300 hc->priority = 0;
4301 }
4302
/* Extracts HTB class settings ("min-rate", "max-rate", "burst", "priority")
 * from 'details' into 'hc', clamping them against the device MTU and the
 * qdisc-wide maximum rate.  Rates in 'details' are in bit/s; 'hc' stores
 * bytes/s.  Returns 0 on success, otherwise a positive errno value. */
static int
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
{
    const struct htb *htb = htb_get__(netdev);
    int mtu, error;
    unsigned long long int max_rate_bit;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate: defaults to the qdisc-wide maximum and is kept within
     * [min_rate, htb->max_rate]. */
    max_rate_bit = smap_get_ullong(details, "max-rate", 0);
    hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = smap_get_ullong(details, "burst", 0) / 8;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority: defaults to 0 when absent. */
    hc->priority = smap_get_ullong(details, "priority", 0);

    return 0;
}
4347
/* Queries the kernel for the HTB class with the given 'handle' and 'parent'
 * and decodes the reply into 'options' and/or 'stats' (either may be NULL). */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
4363
4364 static int
4365 htb_tc_install(struct netdev *netdev, const struct smap *details)
4366 {
4367 int error;
4368
4369 error = htb_setup_qdisc__(netdev);
4370 if (!error) {
4371 struct htb_class hc;
4372
4373 htb_parse_qdisc_details__(netdev, details, &hc);
4374 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4375 tc_make_handle(1, 0), &hc);
4376 if (!error) {
4377 htb_install__(netdev, hc.max_rate);
4378 }
4379 }
4380 return error;
4381 }
4382
4383 static struct htb_class *
4384 htb_class_cast__(const struct tc_queue *queue)
4385 {
4386 return CONTAINER_OF(queue, struct htb_class, tc_queue);
4387 }
4388
4389 static void
4390 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
4391 const struct htb_class *hc)
4392 {
4393 struct htb *htb = htb_get__(netdev);
4394 size_t hash = hash_int(queue_id, 0);
4395 struct tc_queue *queue;
4396 struct htb_class *hcp;
4397
4398 queue = tc_find_queue__(netdev, queue_id, hash);
4399 if (queue) {
4400 hcp = htb_class_cast__(queue);
4401 } else {
4402 hcp = xmalloc(sizeof *hcp);
4403 queue = &hcp->tc_queue;
4404 queue->queue_id = queue_id;
4405 queue->created = time_msec();
4406 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
4407 }
4408
4409 hcp->min_rate = hc->min_rate;
4410 hcp->max_rate = hc->max_rate;
4411 hcp->burst = hc->burst;
4412 hcp->priority = hc->priority;
4413 }
4414
4415 static int
4416 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4417 {
4418 struct ofpbuf msg;
4419 struct queue_dump_state state;
4420 struct htb_class hc;
4421
4422 /* Get qdisc options. */
4423 hc.max_rate = 0;
4424 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4425 htb_install__(netdev, hc.max_rate);
4426
4427 /* Get queues. */
4428 if (!start_queue_dump(netdev, &state)) {
4429 return ENODEV;
4430 }
4431 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4432 unsigned int queue_id;
4433
4434 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4435 htb_update_queue__(netdev, queue_id, &hc);
4436 }
4437 }
4438 finish_queue_dump(&state);
4439
4440 return 0;
4441 }
4442
4443 static void
4444 htb_tc_destroy(struct tc *tc)
4445 {
4446 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4447 struct htb_class *hc;
4448
4449 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
4450 free(hc);
4451 }
4452 tc_destroy(tc);
4453 free(htb);
4454 }
4455
4456 static int
4457 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
4458 {
4459 const struct htb *htb = htb_get__(netdev);
4460 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
4461 return 0;
4462 }
4463
4464 static int
4465 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
4466 {
4467 struct htb_class hc;
4468 int error;
4469
4470 htb_parse_qdisc_details__(netdev, details, &hc);
4471 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4472 tc_make_handle(1, 0), &hc);
4473 if (!error) {
4474 htb_get__(netdev)->max_rate = hc.max_rate;
4475 }
4476 return error;
4477 }
4478
4479 static int
4480 htb_class_get(const struct netdev *netdev OVS_UNUSED,
4481 const struct tc_queue *queue, struct smap *details)
4482 {
4483 const struct htb_class *hc = htb_class_cast__(queue);
4484
4485 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4486 if (hc->min_rate != hc->max_rate) {
4487 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4488 }
4489 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
4490 if (hc->priority) {
4491 smap_add_format(details, "priority", "%u", hc->priority);
4492 }
4493 return 0;
4494 }
4495
4496 static int
4497 htb_class_set(struct netdev *netdev, unsigned int queue_id,
4498 const struct smap *details)
4499 {
4500 struct htb_class hc;
4501 int error;
4502
4503 error = htb_parse_class_details__(netdev, details, &hc);
4504 if (error) {
4505 return error;
4506 }
4507
4508 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4509 tc_make_handle(1, 0xfffe), &hc);
4510 if (error) {
4511 return error;
4512 }
4513
4514 htb_update_queue__(netdev, queue_id, &hc);
4515 return 0;
4516 }
4517
4518 static int
4519 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
4520 {
4521 struct htb_class *hc = htb_class_cast__(queue);
4522 struct htb *htb = htb_get__(netdev);
4523 int error;
4524
4525 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4526 if (!error) {
4527 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
4528 free(hc);
4529 }
4530 return error;
4531 }
4532
4533 static int
4534 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4535 struct netdev_queue_stats *stats)
4536 {
4537 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4538 tc_make_handle(1, 0xfffe), NULL, stats);
4539 }
4540
4541 static int
4542 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4543 const struct ofpbuf *nlmsg,
4544 netdev_dump_queue_stats_cb *cb, void *aux)
4545 {
4546 struct netdev_queue_stats stats;
4547 unsigned int handle, major, minor;
4548 int error;
4549
4550 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4551 if (error) {
4552 return error;
4553 }
4554
4555 major = tc_get_major(handle);
4556 minor = tc_get_minor(handle);
4557 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4558 (*cb)(minor - 1, &stats, aux);
4559 }
4560 return 0;
4561 }
4562
/* tc_ops implementation for the Linux "htb" (hierarchy token bucket) qdisc,
 * exposed to OVS as QoS type "linux-htb". */
static const struct tc_ops tc_ops_htb = {
    .linux_name = "htb",
    .ovs_name = "linux-htb",
    .n_queues = HTB_N_QUEUES,
    .tc_install = htb_tc_install,
    .tc_load = htb_tc_load,
    .tc_destroy = htb_tc_destroy,
    .qdisc_get = htb_qdisc_get,
    .qdisc_set = htb_qdisc_set,
    .class_get = htb_class_get,
    .class_set = htb_class_set,
    .class_delete = htb_class_delete,
    .class_get_stats = htb_class_get_stats,
    .class_dump_stats = htb_class_dump_stats
};
4578 \f
/* "linux-hfsc" traffic control class. */

/* Maximum number of queues supported per HFSC qdisc. */
#define HFSC_N_QUEUES 0xf000

/* State for an installed HFSC qdisc: the generic tc base plus the maximum
 * rate configured on the root class.  Rates here are in bytes/s (the parsing
 * helpers below divide bit/s inputs by 8). */
struct hfsc {
    struct tc tc;
    uint32_t max_rate;
};

/* One HFSC queue (kernel class) known to OVS. */
struct hfsc_class {
    struct tc_queue tc_queue;
    uint32_t min_rate;
    uint32_t max_rate;
};

/* Returns the HFSC state recorded on 'netdev_'.  Only valid while the
 * device's installed tc is an hfsc. */
static struct hfsc *
hfsc_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct hfsc, tc);
}

/* Converts a generic tc queue back to its enclosing HFSC class. */
static struct hfsc_class *
hfsc_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
}
4606
4607 static void
4608 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4609 {
4610 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4611 struct hfsc *hfsc;
4612
4613 hfsc = xmalloc(sizeof *hfsc);
4614 tc_init(&hfsc->tc, &tc_ops_hfsc);
4615 hfsc->max_rate = max_rate;
4616 netdev->tc = &hfsc->tc;
4617 }
4618
4619 static void
4620 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4621 const struct hfsc_class *hc)
4622 {
4623 size_t hash;
4624 struct hfsc *hfsc;
4625 struct hfsc_class *hcp;
4626 struct tc_queue *queue;
4627
4628 hfsc = hfsc_get__(netdev);
4629 hash = hash_int(queue_id, 0);
4630
4631 queue = tc_find_queue__(netdev, queue_id, hash);
4632 if (queue) {
4633 hcp = hfsc_class_cast__(queue);
4634 } else {
4635 hcp = xmalloc(sizeof *hcp);
4636 queue = &hcp->tc_queue;
4637 queue->queue_id = queue_id;
4638 queue->created = time_msec();
4639 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4640 }
4641
4642 hcp->min_rate = hc->min_rate;
4643 hcp->max_rate = hc->max_rate;
4644 }
4645
/* Parses Netlink attributes in 'nl_options' for HFSC parameters and stores
 * them into 'class'.  Only the simple configurations this file itself
 * creates (linear service curves with equal real-time and link-sharing
 * curves) are accepted; anything else is rejected with EPROTO. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    /* A nonzero m1 or d means a two-piece (non-linear) service curve, which
     * hfsc_setup_class__() never creates. */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    /* This file always writes the same curve for RSC and FSC (see
     * hfsc_setup_class__()), so a mismatch means foreign configuration. */
    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
4704
4705 static int
4706 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4707 struct hfsc_class *options,
4708 struct netdev_queue_stats *stats)
4709 {
4710 int error;
4711 unsigned int handle;
4712 struct nlattr *nl_options;
4713
4714 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4715 if (error) {
4716 return error;
4717 }
4718
4719 if (queue_id) {
4720 unsigned int major, minor;
4721
4722 major = tc_get_major(handle);
4723 minor = tc_get_minor(handle);
4724 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4725 *queue_id = minor - 1;
4726 } else {
4727 return EPROTO;
4728 }
4729 }
4730
4731 if (options) {
4732 error = hfsc_parse_tca_options__(nl_options, options);
4733 }
4734
4735 return error;
4736 }
4737
4738 static int
4739 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4740 unsigned int parent, struct hfsc_class *options,
4741 struct netdev_queue_stats *stats)
4742 {
4743 int error;
4744 struct ofpbuf *reply;
4745
4746 error = tc_query_class(netdev, handle, parent, &reply);
4747 if (error) {
4748 return error;
4749 }
4750
4751 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4752 ofpbuf_delete(reply);
4753 return error;
4754 }
4755
4756 static void
4757 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4758 struct hfsc_class *class)
4759 {
4760 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4761
4762 uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
4763 if (!max_rate) {
4764 enum netdev_features current;
4765
4766 netdev_linux_read_features(netdev);
4767 current = !netdev->get_features_error ? netdev->current : 0;
4768 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4769 }
4770
4771 class->min_rate = max_rate;
4772 class->max_rate = max_rate;
4773 }
4774
4775 static int
4776 hfsc_parse_class_details__(struct netdev *netdev,
4777 const struct smap *details,
4778 struct hfsc_class * class)
4779 {
4780 const struct hfsc *hfsc;
4781 uint32_t min_rate, max_rate;
4782
4783 hfsc = hfsc_get__(netdev);
4784
4785 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4786 min_rate = MAX(min_rate, 1);
4787 min_rate = MIN(min_rate, hfsc->max_rate);
4788
4789 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
4790 max_rate = MAX(max_rate, min_rate);
4791 max_rate = MIN(max_rate, hfsc->max_rate);
4792
4793 class->min_rate = min_rate;
4794 class->max_rate = max_rate;
4795
4796 return 0;
4797 }
4798
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1".
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_setup_qdisc__(struct netdev * netdev)
{
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    /* Remove whatever qdisc is currently installed before replacing it. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    opt.defcls = 1;            /* Unclassified traffic goes to class 1:1. */

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    return tc_transact(&request, NULL);
}
4829
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>"
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Linear service curves only: m1 and d stay zero, m2 carries the rate.
     * hfsc_parse_tca_options__() relies on this shape when reading back. */
    min.m1 = 0;
    min.d = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d = 0;
    max.m2 = class->max_rate;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    /* The same 'min' curve serves as both real-time (RSC) and link-sharing
     * (FSC) curve; 'max' is the upper limit (USC) curve. */
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
4881
4882 static int
4883 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4884 {
4885 int error;
4886 struct hfsc_class class;
4887
4888 error = hfsc_setup_qdisc__(netdev);
4889
4890 if (error) {
4891 return error;
4892 }
4893
4894 hfsc_parse_qdisc_details__(netdev, details, &class);
4895 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4896 tc_make_handle(1, 0), &class);
4897
4898 if (error) {
4899 return error;
4900 }
4901
4902 hfsc_install__(netdev, class.max_rate);
4903 return 0;
4904 }
4905
4906 static int
4907 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4908 {
4909 struct ofpbuf msg;
4910 struct queue_dump_state state;
4911 struct hfsc_class hc;
4912
4913 hc.max_rate = 0;
4914 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4915 hfsc_install__(netdev, hc.max_rate);
4916
4917 if (!start_queue_dump(netdev, &state)) {
4918 return ENODEV;
4919 }
4920
4921 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4922 unsigned int queue_id;
4923
4924 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4925 hfsc_update_queue__(netdev, queue_id, &hc);
4926 }
4927 }
4928
4929 finish_queue_dump(&state);
4930 return 0;
4931 }
4932
4933 static void
4934 hfsc_tc_destroy(struct tc *tc)
4935 {
4936 struct hfsc *hfsc;
4937 struct hfsc_class *hc, *next;
4938
4939 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4940
4941 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4942 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4943 free(hc);
4944 }
4945
4946 tc_destroy(tc);
4947 free(hfsc);
4948 }
4949
4950 static int
4951 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4952 {
4953 const struct hfsc *hfsc;
4954 hfsc = hfsc_get__(netdev);
4955 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
4956 return 0;
4957 }
4958
4959 static int
4960 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4961 {
4962 int error;
4963 struct hfsc_class class;
4964
4965 hfsc_parse_qdisc_details__(netdev, details, &class);
4966 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4967 tc_make_handle(1, 0), &class);
4968
4969 if (!error) {
4970 hfsc_get__(netdev)->max_rate = class.max_rate;
4971 }
4972
4973 return error;
4974 }
4975
4976 static int
4977 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4978 const struct tc_queue *queue, struct smap *details)
4979 {
4980 const struct hfsc_class *hc;
4981
4982 hc = hfsc_class_cast__(queue);
4983 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4984 if (hc->min_rate != hc->max_rate) {
4985 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4986 }
4987 return 0;
4988 }
4989
4990 static int
4991 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4992 const struct smap *details)
4993 {
4994 int error;
4995 struct hfsc_class class;
4996
4997 error = hfsc_parse_class_details__(netdev, details, &class);
4998 if (error) {
4999 return error;
5000 }
5001
5002 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
5003 tc_make_handle(1, 0xfffe), &class);
5004 if (error) {
5005 return error;
5006 }
5007
5008 hfsc_update_queue__(netdev, queue_id, &class);
5009 return 0;
5010 }
5011
5012 static int
5013 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
5014 {
5015 int error;
5016 struct hfsc *hfsc;
5017 struct hfsc_class *hc;
5018
5019 hc = hfsc_class_cast__(queue);
5020 hfsc = hfsc_get__(netdev);
5021
5022 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
5023 if (!error) {
5024 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5025 free(hc);
5026 }
5027 return error;
5028 }
5029
5030 static int
5031 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
5032 struct netdev_queue_stats *stats)
5033 {
5034 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
5035 tc_make_handle(1, 0xfffe), NULL, stats);
5036 }
5037
5038 static int
5039 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
5040 const struct ofpbuf *nlmsg,
5041 netdev_dump_queue_stats_cb *cb, void *aux)
5042 {
5043 struct netdev_queue_stats stats;
5044 unsigned int handle, major, minor;
5045 int error;
5046
5047 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
5048 if (error) {
5049 return error;
5050 }
5051
5052 major = tc_get_major(handle);
5053 minor = tc_get_minor(handle);
5054 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
5055 (*cb)(minor - 1, &stats, aux);
5056 }
5057 return 0;
5058 }
5059
/* tc_ops implementation for the Linux "hfsc" (hierarchical fair service
 * curve) qdisc, exposed to OVS as QoS type "linux-hfsc". */
static const struct tc_ops tc_ops_hfsc = {
    .linux_name = "hfsc",
    .ovs_name = "linux-hfsc",
    .n_queues = HFSC_N_QUEUES,
    .tc_install = hfsc_tc_install,
    .tc_load = hfsc_tc_load,
    .tc_destroy = hfsc_tc_destroy,
    .qdisc_get = hfsc_qdisc_get,
    .qdisc_set = hfsc_qdisc_set,
    .class_get = hfsc_class_get,
    .class_set = hfsc_class_set,
    .class_delete = hfsc_class_delete,
    .class_get_stats = hfsc_class_get_stats,
    .class_dump_stats = hfsc_class_dump_stats,
};
5075 \f
/* "linux-noop" traffic control class: leaves the kernel qdisc alone
 * entirely; OVS merely records a tc placeholder. */

static void
noop_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    /* NOTE(review): initializer references tc_ops_default, mirroring
     * default_install__() below. */
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* The shared object is never written through this pointer, so the
     * const cast is safe. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}

/* Installs the no-op tc; 'details' are ignored. */
static int
noop_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}

/* Adopts whatever is installed without touching it. */
static int
noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}

static const struct tc_ops tc_ops_noop = {
    .ovs_name = "linux-noop",               /* ovs_name */
    .tc_install = noop_tc_install,
    .tc_load = noop_tc_load,
};
5107 \f
/* "linux-default" traffic control class.
 *
 * This class represents the default, unnamed Linux qdisc.  It corresponds to
 * the "" (empty string) QoS type in the OVS database. */

static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}

/* Installs the default tc; 'details' are ignored. */
static int
default_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}

/* Adopts the kernel's default qdisc without modifying it. */
static int
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}

static const struct tc_ops tc_ops_default = {
    .ovs_name = "",                         /* ovs_name */
    .tc_install = default_tc_install,
    .tc_load = default_tc_load,
};
5144 \f
/* "linux-other" traffic control class.
 *
 * Represents any kernel qdisc that no other tc_ops in this file understands.
 * Only 'tc_load' is provided, so OVS can recognize and report such a qdisc
 * ("linux-other") but cannot install or reconfigure it. */

static int
other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
    return 0;
}

static const struct tc_ops tc_ops_other = {
    .ovs_name = "linux-other",
    .tc_load = other_tc_load,
};
5165 \f
/* Traffic control. */

/* Number of kernel "tc" ticks per second.  Initialized, together with
 * 'buffer_hz' below, by read_psched(). */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
5189
/* Starts a tc netlink request of the given 'type' and 'flags' for 'netdev',
 * building the message into 'request'.  Returns the embedded tcmsg header,
 * or NULL if the device's ifindex cannot be determined. */
static struct tcmsg *
netdev_linux_tc_make_request(const struct netdev *netdev, int type,
                             unsigned int flags, struct ofpbuf *request)
{
    int ifindex;

    if (get_ifindex(netdev, &ifindex)) {
        return NULL;
    }

    return tc_make_request(ifindex, type, flags, request);
}
5204
/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
 * of 'kbits_burst'.
 *
 * This function is equivalent to running:
 *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
 *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
 *              mtu 65535 drop
 *
 * The configuration and stats may be seen with the following command:
 *     /sbin/tc -s filter show dev <devname> parent ffff:
 *
 * Returns 0 if successful, otherwise a positive errno value.
 */
static int
tc_add_policer(struct netdev *netdev,
               uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct tc_police tc_police;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    size_t basic_offset;
    size_t police_offset;
    int error;
    int mtu = 65535;

    memset(&tc_police, 0, sizeof tc_police);
    tc_police.action = TC_POLICE_SHOT;   /* Drop packets over the rate. */
    tc_police.mtu = mtu;
    /* kbit/s -> bytes/s for the rate table. */
    tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);

    /* The following appears wrong in one way: In networking a kilobit is
     * usually 1000 bits but this uses 1024 bits.
     *
     * However if you "fix" those problems then "tc filter show ..." shows
     * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
     * 1,000,000 bits, whereas this actually ends up doing the right thing from
     * tc's point of view.  Whatever. */
    tc_police.burst = tc_bytes_to_ticks(
        tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* ffff:0 matches the "parent ffff:" in the command above. */
    tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
    /* Priority 49, matching all protocols, as in the command above. */
    tcmsg->tcm_info = tc_make_handle(49,
                                     (OVS_FORCE uint16_t) htons(ETH_P_ALL));

    nl_msg_put_string(&request, TCA_KIND, "basic");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
    nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
    tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
    nl_msg_end_nested(&request, police_offset);
    nl_msg_end_nested(&request, basic_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        return error;
    }

    return 0;
}
5269
5270 static void
5271 read_psched(void)
5272 {
5273 /* The values in psched are not individually very meaningful, but they are
5274 * important. The tables below show some values seen in the wild.
5275 *
5276 * Some notes:
5277 *
5278 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
5279 * (Before that, there are hints that it was 1000000000.)
5280 *
5281 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
5282 * above.
5283 *
5284 * /proc/net/psched
5285 * -----------------------------------
5286 * [1] 000c8000 000f4240 000f4240 00000064
5287 * [2] 000003e8 00000400 000f4240 3b9aca00
5288 * [3] 000003e8 00000400 000f4240 3b9aca00
5289 * [4] 000003e8 00000400 000f4240 00000064
5290 * [5] 000003e8 00000040 000f4240 3b9aca00
5291 * [6] 000003e8 00000040 000f4240 000000f9
5292 *
5293 * a b c d ticks_per_s buffer_hz
5294 * ------- --------- ---------- ------------- ----------- -------------
5295 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
5296 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5297 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5298 * [4] 1,000 1,024 1,000,000 100 976,562 100
5299 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
5300 * [6] 1,000 64 1,000,000 249 15,625,000 249
5301 *
5302 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
5303 * [2] 2.6.26-1-686-bigmem from Debian lenny
5304 * [3] 2.6.26-2-sparc64 from Debian lenny
5305 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
5306 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
5307 * [6] 2.6.34 from kernel.org on KVM
5308 */
5309 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5310 static const char fn[] = "/proc/net/psched";
5311 unsigned int a, b, c, d;
5312 FILE *stream;
5313
5314 if (!ovsthread_once_start(&once)) {
5315 return;
5316 }
5317
5318 ticks_per_s = 1.0;
5319 buffer_hz = 100;
5320
5321 stream = fopen(fn, "r");
5322 if (!stream) {
5323 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
5324 goto exit;
5325 }
5326
5327 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
5328 VLOG_WARN("%s: read failed", fn);
5329 fclose(stream);
5330 goto exit;
5331 }
5332 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
5333 fclose(stream);
5334
5335 if (!a || !b || !c) {
5336 VLOG_WARN("%s: invalid scheduler parameters", fn);
5337 goto exit;
5338 }
5339
5340 ticks_per_s = (double) a * c / b;
5341 if (c == 1000000) {
5342 buffer_hz = d;
5343 } else {
5344 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
5345 fn, a, b, c, d);
5346 }
5347 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
5348
5349 exit:
5350 ovsthread_once_done(&once);
5351 }
5352
5353 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5354 * rate of 'rate' bytes per second. */
5355 static unsigned int
5356 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
5357 {
5358 read_psched();
5359 return (rate * ticks) / ticks_per_s;
5360 }
5361
5362 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
5363 * rate of 'rate' bytes per second. */
5364 static unsigned int
5365 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
5366 {
5367 read_psched();
5368 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
5369 }
5370
5371 /* Returns the number of bytes that need to be reserved for qdisc buffering at
5372 * a transmission rate of 'rate' bytes per second. */
5373 static unsigned int
5374 tc_buffer_per_jiffy(unsigned int rate)
5375 {
5376 read_psched();
5377 return rate / buffer_hz;
5378 }
5379
5380 static uint32_t
5381 tc_time_to_ticks(uint32_t time) {
5382 read_psched();
5383 return time * (ticks_per_s / 1000000);
5384 }
5385
5386 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5387 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5388 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5389 * stores NULL into it if it is absent.
5390 *
5391 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5392 * 'msg'.
5393 *
5394 * Returns 0 if successful, otherwise a positive errno value. */
5395 static int
5396 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
5397 struct nlattr **options)
5398 {
5399 static const struct nl_policy tca_policy[] = {
5400 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
5401 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
5402 };
5403 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5404
5405 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5406 tca_policy, ta, ARRAY_SIZE(ta))) {
5407 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
5408 goto error;
5409 }
5410
5411 if (kind) {
5412 *kind = nl_attr_get_string(ta[TCA_KIND]);
5413 }
5414
5415 if (options) {
5416 *options = ta[TCA_OPTIONS];
5417 }
5418
5419 return 0;
5420
5421 error:
5422 if (kind) {
5423 *kind = NULL;
5424 }
5425 if (options) {
5426 *options = NULL;
5427 }
5428 return EPROTO;
5429 }
5430
/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
 * minor number of its class ID) into '*handlep', its TCA_OPTIONS attribute
 * into '*options', and its queue statistics into '*stats'.  Any of the output
 * arguments may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value.  On failure,
 * '*options' is set to NULL and '*stats' is zeroed (when nonnull). */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    /* Unlike in tc_parse_qdisc(), both attributes are mandatory here. */
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        /* The class ID lives in the fixed tcmsg header, not in an
         * attribute. */
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        /* TCA_STATS2 is itself a nested set of attributes; we need the
         * basic (bytes/packets) and queue (drops) sub-blocks. */
        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        /* Queue drops are the closest thing we have to tx errors here. */
        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
5505
5506 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5507 * on 'netdev'. */
5508 static int
5509 tc_query_class(const struct netdev *netdev,
5510 unsigned int handle, unsigned int parent,
5511 struct ofpbuf **replyp)
5512 {
5513 struct ofpbuf request;
5514 struct tcmsg *tcmsg;
5515 int error;
5516
5517 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
5518 &request);
5519 if (!tcmsg) {
5520 return ENODEV;
5521 }
5522 tcmsg->tcm_handle = handle;
5523 tcmsg->tcm_parent = parent;
5524
5525 error = tc_transact(&request, replyp);
5526 if (error) {
5527 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5528 netdev_get_name(netdev),
5529 tc_get_major(handle), tc_get_minor(handle),
5530 tc_get_major(parent), tc_get_minor(parent),
5531 ovs_strerror(error));
5532 }
5533 return error;
5534 }
5535
5536 /* Equivalent to "tc class del dev <name> handle <handle>". */
5537 static int
5538 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5539 {
5540 struct ofpbuf request;
5541 struct tcmsg *tcmsg;
5542 int error;
5543
5544 tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5545 if (!tcmsg) {
5546 return ENODEV;
5547 }
5548 tcmsg->tcm_handle = handle;
5549 tcmsg->tcm_parent = 0;
5550
5551 error = tc_transact(&request, NULL);
5552 if (error) {
5553 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5554 netdev_get_name(netdev),
5555 tc_get_major(handle), tc_get_minor(handle),
5556 ovs_strerror(error));
5557 }
5558 return error;
5559 }
5560
5561 /* Equivalent to "tc qdisc del dev <name> root". */
5562 static int
5563 tc_del_qdisc(struct netdev *netdev_)
5564 {
5565 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5566 struct ofpbuf request;
5567 struct tcmsg *tcmsg;
5568 int error;
5569
5570 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5571 if (!tcmsg) {
5572 return ENODEV;
5573 }
5574 tcmsg->tcm_handle = tc_make_handle(1, 0);
5575 tcmsg->tcm_parent = TC_H_ROOT;
5576
5577 error = tc_transact(&request, NULL);
5578 if (error == EINVAL) {
5579 /* EINVAL probably means that the default qdisc was in use, in which
5580 * case we've accomplished our purpose. */
5581 error = 0;
5582 }
5583 if (!error && netdev->tc) {
5584 if (netdev->tc->ops->tc_destroy) {
5585 netdev->tc->ops->tc_destroy(netdev->tc);
5586 }
5587 netdev->tc = NULL;
5588 }
5589 return error;
5590 }
5591
5592 static bool
5593 getqdisc_is_safe(void)
5594 {
5595 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5596 static bool safe = false;
5597
5598 if (ovsthread_once_start(&once)) {
5599 struct utsname utsname;
5600 int major, minor;
5601
5602 if (uname(&utsname) == -1) {
5603 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5604 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5605 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5606 } else if (major < 2 || (major == 2 && minor < 35)) {
5607 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5608 utsname.release);
5609 } else {
5610 safe = true;
5611 }
5612 ovsthread_once_done(&once);
5613 }
5614 return safe;
5615 }
5616
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value.
 *
 * On return (even on error), 'netdev->tc' is nonnull: an unknown or
 * unqueryable qdisc is represented by tc_ops_other, a built-in one by
 * tc_ops_default. */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    /* Already cached: nothing to do. */
    if (netdev->tc) {
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        /* Got a reply describing a qdisc: look up its OVS handler. */
        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
5696
5697 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5698 approximate the time to transmit packets of various lengths. For an MTU of
5699 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5700 represents two possible packet lengths; for a MTU of 513 through 1024, four
5701 possible lengths; and so on.
5702
5703 Returns, for the specified 'mtu', the number of bits that packet lengths
5704 need to be shifted right to fit within such a 256-entry table. */
5705 static int
5706 tc_calc_cell_log(unsigned int mtu)
5707 {
5708 int cell_log;
5709
5710 if (!mtu) {
5711 mtu = ETH_PAYLOAD_MAX;
5712 }
5713 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5714
5715 for (cell_log = 0; mtu >= 256; cell_log++) {
5716 mtu >>= 1;
5717 }
5718
5719 return cell_log;
5720 }
5721
/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'.
 *
 * Note that 'rate->rate' is only 32 bits wide, so 'Bps' is truncated if it
 * exceeds UINT32_MAX. */
static void
tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
{
    /* The memset also zeroes padding and any fields we cannot name (see
     * below), so the whole struct is deterministic when sent to the kernel. */
    memset(rate, 0, sizeof *rate);
    rate->cell_log = tc_calc_cell_log(mtu);
    /* rate->overhead = 0; */           /* New in 2.6.24, not yet in some */
    /* rate->cell_align = 0; */         /* distro headers. */
    rate->mpu = ETH_TOTAL_MIN;
    rate->rate = Bps;
}
5734
5735 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5736 * attribute of the specified "type".
5737 *
5738 * See tc_calc_cell_log() above for a description of "rtab"s. */
5739 void
5740 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5741 {
5742 uint32_t *rtab;
5743 unsigned int i;
5744
5745 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5746 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5747 unsigned packet_size = (i + 1) << rate->cell_log;
5748 if (packet_size < rate->mpu) {
5749 packet_size = rate->mpu;
5750 }
5751 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5752 }
5753 }
5754
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst;

    /* The burst must cover at least one jiffy's worth of traffic plus one
     * MTU, or the configured rate cannot be sustained. */
    min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
5765 \f
5766 /* Linux-only functions declared in netdev-linux.h */
5767
5768 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5769 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5770 int
5771 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5772 const char *flag_name, bool enable)
5773 {
5774 const char *netdev_name = netdev_get_name(netdev);
5775 struct ethtool_value evalue;
5776 uint32_t new_flags;
5777 int error;
5778
5779 COVERAGE_INC(netdev_get_ethtool);
5780 memset(&evalue, 0, sizeof evalue);
5781 error = netdev_linux_do_ethtool(netdev_name,
5782 (struct ethtool_cmd *)&evalue,
5783 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5784 if (error) {
5785 return error;
5786 }
5787
5788 COVERAGE_INC(netdev_set_ethtool);
5789 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5790 if (new_flags == evalue.data) {
5791 return 0;
5792 }
5793 evalue.data = new_flags;
5794 error = netdev_linux_do_ethtool(netdev_name,
5795 (struct ethtool_cmd *)&evalue,
5796 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5797 if (error) {
5798 return error;
5799 }
5800
5801 COVERAGE_INC(netdev_get_ethtool);
5802 memset(&evalue, 0, sizeof evalue);
5803 error = netdev_linux_do_ethtool(netdev_name,
5804 (struct ethtool_cmd *)&evalue,
5805 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5806 if (error) {
5807 return error;
5808 }
5809
5810 if (new_flags != evalue.data) {
5811 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5812 "device %s failed", enable ? "enable" : "disable",
5813 flag_name, netdev_name);
5814 return EOPNOTSUPP;
5815 }
5816
5817 return 0;
5818 }
5819 \f
5820 /* Utility functions. */
5821
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * This is a field-for-field copy of the kernel's 32-bit rtnl_link_stats into
 * OVS's netdev_stats; each 32-bit counter is widened on assignment.  Fields
 * of netdev_stats with no rtnl_link_stats counterpart are left untouched. */
static void
netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
                                  const struct rtnl_link_stats *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
5849
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * Same field-for-field mapping as netdev_stats_from_rtnl_link_stats(), but
 * from the kernel's 64-bit rtnl_link_stats64, so no counter truncation or
 * widening is involved.  Fields of netdev_stats with no rtnl_link_stats64
 * counterpart are left untouched. */
static void
netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
                                    const struct rtnl_link_stats64 *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
5877
5878 int
5879 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5880 {
5881 struct ofpbuf request;
5882 struct ofpbuf *reply;
5883 int error;
5884
5885 /* Filtering all counters by default */
5886 memset(stats, 0xFF, sizeof(struct netdev_stats));
5887
5888 ofpbuf_init(&request, 0);
5889 nl_msg_put_nlmsghdr(&request,
5890 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5891 RTM_GETLINK, NLM_F_REQUEST);
5892 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5893 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5894 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5895 ofpbuf_uninit(&request);
5896 if (error) {
5897 return error;
5898 }
5899
5900 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5901 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5902 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5903 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5904 error = 0;
5905 } else {
5906 a = nl_attr_find(reply, 0, IFLA_STATS);
5907 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5908 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5909 error = 0;
5910 } else {
5911 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5912 error = EPROTO;
5913 }
5914 }
5915 } else {
5916 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5917 error = EPROTO;
5918 }
5919
5920
5921 ofpbuf_delete(reply);
5922 return error;
5923 }
5924
5925 static int
5926 get_flags(const struct netdev *dev, unsigned int *flags)
5927 {
5928 struct ifreq ifr;
5929 int error;
5930
5931 *flags = 0;
5932 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5933 if (!error) {
5934 *flags = ifr.ifr_flags;
5935 }
5936 return error;
5937 }
5938
/* Sets the interface flags (IFF_*) of the device named 'name' to 'flags' via
 * SIOCSIFFLAGS.  Returns 0 if successful, otherwise a positive errno value. */
static int
set_flags(const char *name, unsigned int flags)
{
    struct ifreq ifr;

    /* Only ifr_flags is set here; presumably af_inet_ifreq_ioctl() fills in
     * ifr_name from 'name' -- TODO(review): confirm.  Note that ifr_flags is
     * a short, so 'flags' is truncated to 16 bits. */
    ifr.ifr_flags = flags;
    return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
}
5947
5948 int
5949 linux_get_ifindex(const char *netdev_name)
5950 {
5951 struct ifreq ifr;
5952 int error;
5953
5954 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5955 COVERAGE_INC(netdev_get_ifindex);
5956
5957 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5958 if (error) {
5959 /* ENODEV probably means that a vif disappeared asynchronously and
5960 * hasn't been removed from the database yet, so reduce the log level
5961 * to INFO for that case. */
5962 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
5963 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5964 netdev_name, ovs_strerror(error));
5965 return -error;
5966 }
5967 return ifr.ifr_ifindex;
5968 }
5969
5970 static int
5971 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5972 {
5973 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5974
5975 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5976 netdev_linux_update_via_netlink(netdev);
5977 }
5978
5979 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5980 /* Fall back to ioctl if netlink fails */
5981 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
5982
5983 if (ifindex < 0) {
5984 netdev->get_ifindex_error = -ifindex;
5985 netdev->ifindex = 0;
5986 } else {
5987 netdev->get_ifindex_error = 0;
5988 netdev->ifindex = ifindex;
5989 }
5990 netdev->cache_valid |= VALID_IFINDEX;
5991 }
5992
5993 *ifindexp = netdev->ifindex;
5994 return netdev->get_ifindex_error;
5995 }
5996
/* Refreshes 'netdev''s cached link state (flags, MTU, MAC, ifindex, LAG
 * membership) from a one-shot RTM_GETLINK query, bumping the netdev's change
 * sequence number if anything visible changed.  Returns 0 if successful,
 * otherwise a positive errno value. */
static int
netdev_linux_update_via_netlink(struct netdev_linux *netdev)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct rtnetlink_change chg;
    struct rtnetlink_change *change = &chg;
    int error;

    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ) +
                        NL_A_U32_SIZE, RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));

    /* The correct identifiers for a Linux device are netnsid and ifindex,
     * but ifindex changes as the port is moved to another network namespace
     * and the interface name statically stored in ovsdb. */
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
    if (netdev_linux_netnsid_is_remote(netdev)) {
        nl_msg_put_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
    }
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        ofpbuf_delete(reply);
        return error;
    }

    if (rtnetlink_parse(reply, change)
        && change->nlmsg_type == RTM_NEWLINK) {
        bool changed = false;
        error = 0;

        /* Update netdev from rtnl msg and increment its seq if needed. */
        if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
            /* Carrier (IFF_RUNNING) toggled since our last snapshot. */
            netdev->carrier_resets++;
            changed = true;
        }
        if (change->ifi_flags != netdev->ifi_flags) {
            netdev->ifi_flags = change->ifi_flags;
            changed = true;
        }
        if (change->mtu && change->mtu != netdev->mtu) {
            netdev->mtu = change->mtu;
            netdev->cache_valid |= VALID_MTU;
            netdev->netdev_mtu_error = 0;
            changed = true;
        }
        if (!eth_addr_is_zero(change->mac)
            && !eth_addr_equals(change->mac, netdev->etheraddr)) {
            netdev->etheraddr = change->mac;
            netdev->cache_valid |= VALID_ETHERADDR;
            netdev->ether_addr_error = 0;
            changed = true;
        }
        if (change->if_index != netdev->ifindex) {
            netdev->ifindex = change->if_index;
            netdev->cache_valid |= VALID_IFINDEX;
            netdev->get_ifindex_error = 0;
            changed = true;
        }
        if (change->master && netdev_linux_kind_is_lag(change->master)) {
            /* Enslaved to a bond/team/bridge-like master device. */
            netdev->is_lag_master = true;
        }
        if (changed) {
            netdev_change_seq_changed(&netdev->up);
        }
    } else {
        /* Reply was not a parseable RTM_NEWLINK message. */
        error = EINVAL;
    }

    ofpbuf_delete(reply);
    return error;
}
6072
6073 static int
6074 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
6075 {
6076 struct ifreq ifr;
6077 int hwaddr_family;
6078 int error;
6079
6080 memset(&ifr, 0, sizeof ifr);
6081 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6082 COVERAGE_INC(netdev_get_hwaddr);
6083 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
6084 if (error) {
6085 /* ENODEV probably means that a vif disappeared asynchronously and
6086 * hasn't been removed from the database yet, so reduce the log level
6087 * to INFO for that case. */
6088 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
6089 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
6090 netdev_name, ovs_strerror(error));
6091 return error;
6092 }
6093 hwaddr_family = ifr.ifr_hwaddr.sa_family;
6094 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
6095 hwaddr_family != ARPHRD_NONE) {
6096 VLOG_INFO("%s device has unknown hardware address family %d",
6097 netdev_name, hwaddr_family);
6098 return EINVAL;
6099 }
6100 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
6101 return 0;
6102 }
6103
6104 static int
6105 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
6106 {
6107 struct ifreq ifr;
6108 int error;
6109
6110 memset(&ifr, 0, sizeof ifr);
6111 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6112 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
6113 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
6114 COVERAGE_INC(netdev_set_hwaddr);
6115 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
6116 if (error) {
6117 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
6118 netdev_name, ovs_strerror(error));
6119 }
6120 return error;
6121 }
6122
6123 static int
6124 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
6125 int cmd, const char *cmd_name)
6126 {
6127 struct ifreq ifr;
6128 int error;
6129
6130 memset(&ifr, 0, sizeof ifr);
6131 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
6132 ifr.ifr_data = (caddr_t) ecmd;
6133
6134 ecmd->cmd = cmd;
6135 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
6136 if (error) {
6137 if (error != EOPNOTSUPP) {
6138 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
6139 "failed: %s", cmd_name, name, ovs_strerror(error));
6140 } else {
6141 /* The device doesn't support this operation. That's pretty
6142 * common, so there's no point in logging anything. */
6143 }
6144 }
6145 return error;
6146 }
6147
6148 /* Returns an AF_PACKET raw socket or a negative errno value. */
6149 static int
6150 af_packet_sock(void)
6151 {
6152 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
6153 static int sock;
6154
6155 if (ovsthread_once_start(&once)) {
6156 sock = socket(AF_PACKET, SOCK_RAW, 0);
6157 if (sock >= 0) {
6158 int error = set_nonblocking(sock);
6159 if (error) {
6160 close(sock);
6161 sock = -error;
6162 }
6163 } else {
6164 sock = -errno;
6165 VLOG_ERR("failed to create packet socket: %s",
6166 ovs_strerror(errno));
6167 }
6168 ovsthread_once_done(&once);
6169 }
6170
6171 return sock;
6172 }