]> git.proxmox.com Git - mirror_ovs.git/blob - lib/netdev-linux.c
netdev-offload: Use dpif type instead of class.
[mirror_ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20 #include "netdev-linux-private.h"
21
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <sys/types.h>
25 #include <netinet/in.h>
26 #include <arpa/inet.h>
27 #include <inttypes.h>
28 #include <math.h>
29 #include <linux/filter.h>
30 #include <linux/gen_stats.h>
31 #include <linux/if_ether.h>
32 #include <linux/if_packet.h>
33 #include <linux/if_tun.h>
34 #include <linux/types.h>
35 #include <linux/ethtool.h>
36 #include <linux/mii.h>
37 #include <linux/rtnetlink.h>
38 #include <linux/sockios.h>
39 #include <linux/virtio_net.h>
40 #include <sys/ioctl.h>
41 #include <sys/socket.h>
42 #include <sys/uio.h>
43 #include <sys/utsname.h>
44 #include <net/if.h>
45 #include <net/if_arp.h>
46 #include <net/route.h>
47 #include <poll.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <unistd.h>
51
52 #include "coverage.h"
53 #include "dp-packet.h"
54 #include "dpif-netlink.h"
55 #include "dpif-netdev.h"
56 #include "openvswitch/dynamic-string.h"
57 #include "fatal-signal.h"
58 #include "hash.h"
59 #include "openvswitch/hmap.h"
60 #include "netdev-afxdp.h"
61 #include "netdev-provider.h"
62 #include "netdev-vport.h"
63 #include "netlink-notifier.h"
64 #include "netlink-socket.h"
65 #include "netlink.h"
66 #include "netnsid.h"
67 #include "openvswitch/ofpbuf.h"
68 #include "openflow/openflow.h"
69 #include "ovs-atomic.h"
70 #include "ovs-numa.h"
71 #include "packets.h"
72 #include "openvswitch/poll-loop.h"
73 #include "rtnetlink.h"
74 #include "openvswitch/shash.h"
75 #include "socket-util.h"
76 #include "sset.h"
77 #include "tc.h"
78 #include "timer.h"
79 #include "unaligned.h"
80 #include "openvswitch/vlog.h"
81 #include "userspace-tso.h"
82 #include "util.h"
83
84 VLOG_DEFINE_THIS_MODULE(netdev_linux);
85
86 COVERAGE_DEFINE(netdev_set_policing);
87 COVERAGE_DEFINE(netdev_arp_lookup);
88 COVERAGE_DEFINE(netdev_get_ifindex);
89 COVERAGE_DEFINE(netdev_get_hwaddr);
90 COVERAGE_DEFINE(netdev_set_hwaddr);
91 COVERAGE_DEFINE(netdev_get_ethtool);
92 COVERAGE_DEFINE(netdev_set_ethtool);
93
94 \f
95 #ifndef IFLA_IF_NETNSID
96 #define IFLA_IF_NETNSID 0x45
97 #endif
98 /* These were introduced in Linux 2.6.14, so they might be missing if we have
99 * old headers. */
100 #ifndef ADVERTISED_Pause
101 #define ADVERTISED_Pause (1 << 13)
102 #endif
103 #ifndef ADVERTISED_Asym_Pause
104 #define ADVERTISED_Asym_Pause (1 << 14)
105 #endif
106
107 /* These were introduced in Linux 2.6.24, so they might be missing if we
108 * have old headers. */
109 #ifndef ETHTOOL_GFLAGS
110 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
111 #endif
112 #ifndef ETHTOOL_SFLAGS
113 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
114 #endif
115
116 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
117 * headers. */
118 #ifndef TC_RTAB_SIZE
119 #define TC_RTAB_SIZE 1024
120 #endif
121
122 /* Linux 2.6.21 introduced struct tpacket_auxdata.
123 * Linux 2.6.27 added the tp_vlan_tci member.
124 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
125 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
126 * TP_STATUS_VLAN_TPID_VALID.
127 *
128 * With all this churn it's easiest to unconditionally define a replacement
129 * structure that has everything we want.
130 */
131 #ifndef PACKET_AUXDATA
132 #define PACKET_AUXDATA 8
133 #endif
134 #ifndef TP_STATUS_VLAN_VALID
135 #define TP_STATUS_VLAN_VALID (1 << 4)
136 #endif
137 #ifndef TP_STATUS_VLAN_TPID_VALID
138 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
139 #endif
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
/* Unconditional replacement for the kernel's struct tpacket_auxdata (see the
 * comment above); matches the Linux 3.13+ layout, per packet(7). */
struct tpacket_auxdata {
    uint32_t tp_status;     /* TP_STATUS_* bits, e.g. TP_STATUS_VLAN_VALID. */
    uint32_t tp_len;        /* Packet length, per packet(7). */
    uint32_t tp_snaplen;    /* Captured length, per packet(7). */
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;   /* Valid only if TP_STATUS_VLAN_VALID is set. */
    uint16_t tp_vlan_tpid;  /* Valid only if TP_STATUS_VLAN_TPID_VALID. */
};
151
152 /* Linux 2.6.27 introduced ethtool_cmd_speed
153 *
154 * To avoid revisiting problems reported with using configure to detect
155 * compatibility (see report at
156 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
157 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    /* The link speed is split across two 16-bit fields; rejoin them with
     * 'speed_hi' as the upper half. */
    uint32_t hi = ep->speed_hi;

    return (hi << 16) | ep->speed;
}
163
164 /* Linux 2.6.30 introduced supported and advertised flags for
165 * 1G base KX, and 10G base KX4, KR and R. */
166 #ifndef SUPPORTED_1000baseKX_Full
167 #define SUPPORTED_1000baseKX_Full (1 << 17)
168 #define SUPPORTED_10000baseKX4_Full (1 << 18)
169 #define SUPPORTED_10000baseKR_Full (1 << 19)
170 #define SUPPORTED_10000baseR_FEC (1 << 20)
171 #define ADVERTISED_1000baseKX_Full (1 << 17)
172 #define ADVERTISED_10000baseKX4_Full (1 << 18)
173 #define ADVERTISED_10000baseKR_Full (1 << 19)
174 #define ADVERTISED_10000baseR_FEC (1 << 20)
175 #endif
176
177 /* Linux 3.5 introduced supported and advertised flags for
178 * 40G base KR4, CR4, SR4 and LR4. */
179 #ifndef SUPPORTED_40000baseKR4_Full
180 #define SUPPORTED_40000baseKR4_Full (1 << 23)
181 #define SUPPORTED_40000baseCR4_Full (1 << 24)
182 #define SUPPORTED_40000baseSR4_Full (1 << 25)
183 #define SUPPORTED_40000baseLR4_Full (1 << 26)
184 #define ADVERTISED_40000baseKR4_Full (1 << 23)
185 #define ADVERTISED_40000baseCR4_Full (1 << 24)
186 #define ADVERTISED_40000baseSR4_Full (1 << 25)
187 #define ADVERTISED_40000baseLR4_Full (1 << 26)
188 #endif
189
190 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
191 *
192 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
193 * 2.6.32-431.29.2.el6.x86_64 (see report at
194 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
195 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
196 * unconditionally define a replacement. */
197 #ifndef IFLA_STATS64
198 #define IFLA_STATS64 23
199 #endif
#define rtnl_link_stats64 rpl_rtnl_link_stats64
/* Unconditional replacement for the kernel's struct rtnl_link_stats64 (see
 * the comment above); field names and order follow <linux/if_link.h>. */
struct rtnl_link_stats64 {
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Detailed rx_errors. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Detailed tx_errors. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
229
/* Bits for the 'cache_valid' bitmap of struct netdev_linux.  Each set bit
 * means that the correspondingly named cached field(s) of the device are
 * believed to be up to date; netdev_linux_changed() clears bits to
 * invalidate the cache. */
enum {
    VALID_IFINDEX = 1 << 0,           /* 'ifindex' cached. */
    VALID_ETHERADDR = 1 << 1,         /* 'etheraddr' cached. */
    VALID_IN = 1 << 2,                /* IP addresses cached; clearing this
                                       * also flushes the global address
                                       * list (see netdev_linux_changed). */
    VALID_MTU = 1 << 3,               /* 'mtu' cached. */
    VALID_POLICING = 1 << 4,
    VALID_VPORT_STAT_ERROR = 1 << 5,
    VALID_DRVINFO = 1 << 6,
    VALID_FEATURES = 1 << 7,
    VALID_NUMA_ID = 1 << 8,
};
241
242 /* Use one for the packet buffer and another for the aux buffer to receive
243 * TSO packets. */
244 #define IOV_STD_SIZE 1
245 #define IOV_TSO_SIZE 2
246
247 enum {
248 IOV_PACKET = 0,
249 IOV_AUXBUF = 1,
250 };
251 \f
/* State tracked for one LAG (bond/team) slave whose master is a network
 * device in OVS.  Entries live in 'lag_shash', keyed by the slave's ifname
 * (see netdev_linux_update_lag()). */
struct linux_lag_slave {
    uint32_t block_id;          /* TC ingress block shared with the master. */
    struct shash_node *node;    /* This entry's node within 'lag_shash'. */
};
256
257 /* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
258 static struct ovs_mutex lag_mutex = OVS_MUTEX_INITIALIZER;
259
260 /* All slaves whose LAG masters are network devices in OvS. */
261 static struct shash lag_shash OVS_GUARDED_BY(lag_mutex)
262 = SHASH_INITIALIZER(&lag_shash);
263
264 /* Traffic control. */
265
266 /* An instance of a traffic control class. Always associated with a particular
267 * network device.
268 *
269 * Each TC implementation subclasses this with whatever additional data it
270 * needs. */
271 struct tc {
272 const struct tc_ops *ops;
273 struct hmap queues; /* Contains "struct tc_queue"s.
274 * Read by generic TC layer.
275 * Written only by TC implementation. */
276 };
277
278 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
279
280 /* One traffic control queue.
281 *
282 * Each TC implementation subclasses this with whatever additional data it
283 * needs. */
284 struct tc_queue {
285 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
286 unsigned int queue_id; /* OpenFlow queue ID. */
287 long long int created; /* Time queue was created, in msecs. */
288 };
289
290 /* A particular kind of traffic control. Each implementation generally maps to
291 * one particular Linux qdisc class.
292 *
293 * The functions below return 0 if successful or a positive errno value on
294 * failure, except where otherwise noted. All of them must be provided, except
295 * where otherwise noted. */
296 struct tc_ops {
297 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
298 * This is null for tc_ops_default and tc_ops_other, for which there are no
299 * appropriate values. */
300 const char *linux_name;
301
302 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
303 const char *ovs_name;
304
305 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
306 * queues. The queues are numbered 0 through n_queues - 1. */
307 unsigned int n_queues;
308
309 /* Called to install this TC class on 'netdev'. The implementation should
310 * make the Netlink calls required to set up 'netdev' with the right qdisc
311 * and configure it according to 'details'. The implementation may assume
312 * that the current qdisc is the default; that is, there is no need for it
313 * to delete the current qdisc before installing itself.
314 *
315 * The contents of 'details' should be documented as valid for 'ovs_name'
316 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
317 * (which is built as ovs-vswitchd.conf.db(8)).
318 *
319 * This function must return 0 if and only if it sets 'netdev->tc' to an
320 * initialized 'struct tc'.
321 *
322 * (This function is null for tc_ops_other, which cannot be installed. For
323 * other TC classes it should always be nonnull.) */
324 int (*tc_install)(struct netdev *netdev, const struct smap *details);
325
326 /* Called when the netdev code determines (through a Netlink query) that
327 * this TC class's qdisc is installed on 'netdev', but we didn't install
328 * it ourselves and so don't know any of the details.
329 *
330 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
331 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
332 * implementation should parse the other attributes of 'nlmsg' as
333 * necessary to determine its configuration. If necessary it should also
334 * use Netlink queries to determine the configuration of queues on
335 * 'netdev'.
336 *
337 * This function must return 0 if and only if it sets 'netdev->tc' to an
338 * initialized 'struct tc'. */
339 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
340
341 /* Destroys the data structures allocated by the implementation as part of
342 * 'tc'. (This includes destroying 'tc->queues' by calling
343 * tc_destroy(tc).
344 *
345 * The implementation should not need to perform any Netlink calls. If
346 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
347 * (But it may not be desirable.)
348 *
349 * This function may be null if 'tc' is trivial. */
350 void (*tc_destroy)(struct tc *tc);
351
352 /* Retrieves details of 'netdev->tc' configuration into 'details'.
353 *
354 * The implementation should not need to perform any Netlink calls, because
355 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
356 * cached the configuration.
357 *
358 * The contents of 'details' should be documented as valid for 'ovs_name'
359 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
360 * (which is built as ovs-vswitchd.conf.db(8)).
361 *
362 * This function may be null if 'tc' is not configurable.
363 */
364 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
365
366 /* Reconfigures 'netdev->tc' according to 'details', performing any
367 * required Netlink calls to complete the reconfiguration.
368 *
369 * The contents of 'details' should be documented as valid for 'ovs_name'
370 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
371 * (which is built as ovs-vswitchd.conf.db(8)).
372 *
373 * This function may be null if 'tc' is not configurable.
374 */
375 int (*qdisc_set)(struct netdev *, const struct smap *details);
376
377 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
378 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
379 *
380 * The contents of 'details' should be documented as valid for 'ovs_name'
381 * in the "other_config" column in the "Queue" table in
382 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
383 *
384 * The implementation should not need to perform any Netlink calls, because
385 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
386 * cached the queue configuration.
387 *
388 * This function may be null if 'tc' does not have queues ('n_queues' is
389 * 0). */
390 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
391 struct smap *details);
392
393 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
394 * 'details', perfoming any required Netlink calls to complete the
395 * reconfiguration. The caller ensures that 'queue_id' is less than
396 * 'n_queues'.
397 *
398 * The contents of 'details' should be documented as valid for 'ovs_name'
399 * in the "other_config" column in the "Queue" table in
400 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
401 *
402 * This function may be null if 'tc' does not have queues or its queues are
403 * not configurable. */
404 int (*class_set)(struct netdev *, unsigned int queue_id,
405 const struct smap *details);
406
407 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
408 * tc_queue's within 'netdev->tc->queues'.
409 *
410 * This function may be null if 'tc' does not have queues or its queues
411 * cannot be deleted. */
412 int (*class_delete)(struct netdev *, struct tc_queue *queue);
413
414 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
415 * 'struct tc_queue's within 'netdev->tc->queues'.
416 *
417 * On success, initializes '*stats'.
418 *
419 * This function may be null if 'tc' does not have queues or if it cannot
420 * report queue statistics. */
421 int (*class_get_stats)(const struct netdev *netdev,
422 const struct tc_queue *queue,
423 struct netdev_queue_stats *stats);
424
425 /* Extracts queue stats from 'nlmsg', which is a response to a
426 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
427 *
428 * This function may be null if 'tc' does not have queues or if it cannot
429 * report queue statistics. */
430 int (*class_dump_stats)(const struct netdev *netdev,
431 const struct ofpbuf *nlmsg,
432 netdev_dump_queue_stats_cb *cb, void *aux);
433 };
434
435 static void
436 tc_init(struct tc *tc, const struct tc_ops *ops)
437 {
438 tc->ops = ops;
439 hmap_init(&tc->queues);
440 }
441
/* Frees the generic part of 'tc' (the 'queues' hmap).  TC implementations
 * are expected to call this from their 'tc_destroy' callback after freeing
 * their own data (see the struct tc_ops comments above). */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
447
448 static const struct tc_ops tc_ops_htb;
449 static const struct tc_ops tc_ops_hfsc;
450 static const struct tc_ops tc_ops_codel;
451 static const struct tc_ops tc_ops_fqcodel;
452 static const struct tc_ops tc_ops_sfq;
453 static const struct tc_ops tc_ops_netem;
454 static const struct tc_ops tc_ops_default;
455 static const struct tc_ops tc_ops_noop;
456 static const struct tc_ops tc_ops_other;
457
458 static const struct tc_ops *const tcs[] = {
459 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
460 &tc_ops_hfsc, /* Hierarchical fair service curve. */
461 &tc_ops_codel, /* Controlled delay */
462 &tc_ops_fqcodel, /* Fair queue controlled delay */
463 &tc_ops_sfq, /* Stochastic fair queueing */
464 &tc_ops_netem, /* Network Emulator */
465 &tc_ops_noop, /* Non operating qos type. */
466 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
467 &tc_ops_other, /* Some other qdisc. */
468 NULL
469 };
470
471 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
472 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
473 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
474 static uint32_t tc_time_to_ticks(uint32_t time);
475
476 static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
477 int type,
478 unsigned int flags,
479 struct ofpbuf *);
480 static int tc_add_policer(struct netdev *,
481 uint32_t kbits_rate, uint32_t kbits_burst);
482
483 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
484 struct nlattr **options);
485 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
486 struct nlattr **options,
487 struct netdev_queue_stats *);
488 static int tc_query_class(const struct netdev *,
489 unsigned int handle, unsigned int parent,
490 struct ofpbuf **replyp);
491 static int tc_delete_class(const struct netdev *, unsigned int handle);
492
493 static int tc_del_qdisc(struct netdev *netdev);
494 static int tc_query_qdisc(const struct netdev *netdev);
495
496 void
497 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate);
498 static int tc_calc_cell_log(unsigned int mtu);
499 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
500 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
501 \f
502
503 /* This is set pretty low because we probably won't learn anything from the
504 * additional log messages. */
505 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
506
507 /* Polling miimon status for all ports causes performance degradation when
508 * handling a large number of ports. If there are no devices using miimon, then
509 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
510 *
511 * Readers do not depend on this variable synchronizing with the related
512 * changes in the device miimon status, so we can use atomic_count. */
513 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
514
515 static int netdev_linux_parse_vnet_hdr(struct dp_packet *b);
516 static void netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu);
517 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
518 int cmd, const char *cmd_name);
519 static int get_flags(const struct netdev *, unsigned int *flags);
520 static int set_flags(const char *, unsigned int flags);
521 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
522 enum netdev_flags on, enum netdev_flags *old_flagsp)
523 OVS_REQUIRES(netdev->mutex);
524 static int get_ifindex(const struct netdev *, int *ifindexp);
525 static int do_set_addr(struct netdev *netdev,
526 int ioctl_nr, const char *ioctl_name,
527 struct in_addr addr);
528 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
529 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
530 static int af_packet_sock(void);
531 static bool netdev_linux_miimon_enabled(void);
532 static void netdev_linux_miimon_run(void);
533 static void netdev_linux_miimon_wait(void);
534 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
535
536 static bool
537 is_tap_netdev(const struct netdev *netdev)
538 {
539 return netdev_get_class(netdev) == &netdev_tap_class;
540 }
541 \f
542 static int
543 netdev_linux_netnsid_update__(struct netdev_linux *netdev)
544 {
545 struct dpif_netlink_vport reply;
546 struct ofpbuf *buf;
547 int error;
548
549 error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
550 if (error) {
551 if (error == ENOENT) {
552 /* Assume it is local if there is no API (e.g. if the openvswitch
553 * kernel module is not loaded). */
554 netnsid_set_local(&netdev->netnsid);
555 } else {
556 netnsid_unset(&netdev->netnsid);
557 }
558 return error;
559 }
560
561 netnsid_set(&netdev->netnsid, reply.netnsid);
562 ofpbuf_delete(buf);
563 return 0;
564 }
565
566 static int
567 netdev_linux_netnsid_update(struct netdev_linux *netdev)
568 {
569 if (netnsid_is_unset(netdev->netnsid)) {
570 if (netdev_get_class(&netdev->up) == &netdev_tap_class) {
571 netnsid_set_local(&netdev->netnsid);
572 } else {
573 return netdev_linux_netnsid_update__(netdev);
574 }
575 }
576
577 return 0;
578 }
579
/* Returns true if 'netdev' is in the network namespace identified by
 * 'nsid', refreshing the cached namespace id first if necessary. */
static bool
netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_eq(netdev->netnsid, nsid);
}
586
/* Returns true if 'netdev' is in a network namespace other than the local
 * one, refreshing the cached namespace id first if necessary. */
static bool
netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_is_remote(netdev->netnsid);
}
593
594 static int netdev_linux_update_via_netlink(struct netdev_linux *);
595 static void netdev_linux_update(struct netdev_linux *netdev, int,
596 const struct rtnetlink_change *)
597 OVS_REQUIRES(netdev->mutex);
598 static void netdev_linux_changed(struct netdev_linux *netdev,
599 unsigned int ifi_flags, unsigned int mask)
600 OVS_REQUIRES(netdev->mutex);
601
602 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
603 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
604 * if no such socket could be created. */
605 static struct nl_sock *
606 netdev_linux_notify_sock(void)
607 {
608 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
609 static struct nl_sock *sock;
610 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
611 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
612
613 if (ovsthread_once_start(&once)) {
614 int error;
615
616 error = nl_sock_create(NETLINK_ROUTE, &sock);
617 if (!error) {
618 size_t i;
619
620 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
621 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
622 if (error) {
623 nl_sock_destroy(sock);
624 sock = NULL;
625 break;
626 }
627 }
628 }
629 nl_sock_listen_all_nsid(sock, true);
630 ovsthread_once_done(&once);
631 }
632
633 return sock;
634 }
635
/* Returns true if at least one device is currently using miimon polling
 * (i.e. 'miimon_cnt' is nonzero); used to skip the miimon run/wait paths
 * entirely when no device needs them. */
static bool
netdev_linux_miimon_enabled(void)
{
    return atomic_count_get(&miimon_cnt) > 0;
}
641
/* Returns true if 'kind' names a Linux link aggregation device type
 * ("bond" or "team"), false otherwise. */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    return !strcmp(kind, "bond") || !strcmp(kind, "team");
}
651
/* Keeps 'lag_shash' in sync with kernel LAG membership changes reported via
 * rtnetlink.
 *
 * If 'change' reports that a device became a slave of a bond/team master
 * that is an OVS Linux netdev, records the slave in 'lag_shash' and binds
 * the slave's TC ingress qdisc to the master's shared block.  If 'change'
 * reports that a tracked slave left its master ('master_ifindex' == 0),
 * removes the binding and forgets the slave. */
static void
netdev_linux_update_lag(struct rtnetlink_change *change)
    OVS_REQUIRES(lag_mutex)
{
    struct linux_lag_slave *lag;

    /* 'change->slave' carries the kind string of the enslaving master,
     * if this message describes a slave device. */
    if (change->slave && netdev_linux_kind_is_lag(change->slave)) {
        lag = shash_find_data(&lag_shash, change->ifname);

        if (!lag) {
            struct netdev *master_netdev;
            char master_name[IFNAMSIZ];
            uint32_t block_id;
            int error = 0;

            /* NOTE(review): if_indextoname() can fail and leave
             * 'master_name' uninitialized -- assumes the master ifindex is
             * always resolvable here; confirm. */
            if_indextoname(change->master_ifindex, master_name);
            master_netdev = netdev_from_name(master_name);
            if (!master_netdev) {
                return;
            }

            if (is_netdev_linux_class(master_netdev->netdev_class)) {
                block_id = netdev_get_block_id(master_netdev);
                if (!block_id) {
                    /* Master has no shared ingress block; nothing to bind. */
                    netdev_close(master_netdev);
                    return;
                }

                lag = xmalloc(sizeof *lag);
                lag->block_id = block_id;
                lag->node = shash_add(&lag_shash, change->ifname, lag);

                /* Delete the ingress block in case it exists. */
                tc_add_del_qdisc(change->if_index, false, 0, TC_INGRESS);
                /* LAG master is linux netdev so add slave to same block. */
                error = tc_add_del_qdisc(change->if_index, true, block_id,
                                         TC_INGRESS);
                if (error) {
                    /* Roll back the shash entry added above. */
                    VLOG_WARN("failed to bind LAG slave %s to master's block",
                              change->ifname);
                    shash_delete(&lag_shash, lag->node);
                    free(lag);
                }
            }

            netdev_close(master_netdev);
        }
    } else if (change->master_ifindex == 0) {
        /* Check if this was a lag slave that has been freed. */
        lag = shash_find_data(&lag_shash, change->ifname);

        if (lag) {
            tc_add_del_qdisc(change->if_index, false, lag->block_id,
                             TC_INGRESS);
            shash_delete(&lag_shash, lag->node);
            free(lag);
        }
    }
}
711
/* netdev_class 'run' callback: performs periodic work for Linux netdevs.
 *
 * Polls miimon status if any device uses it, then drains the shared
 * rtnetlink notification socket, applying each parsed link/address change
 * to the matching netdev and updating LAG bookkeeping for link messages.
 * On receive-queue overrun (ENOBUFS) the individual changes are lost, so
 * every device of the system class is refreshed from its current flags
 * instead.  Loops until the socket would block or fails. */
void
netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
{
    struct nl_sock *sock;
    int error;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_run();
    }

    sock = netdev_linux_notify_sock();
    if (!sock) {
        return;
    }

    do {
        uint64_t buf_stub[4096 / 8];
        int nsid;
        struct ofpbuf buf;

        ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
        error = nl_sock_recv(sock, &buf, &nsid, false);
        if (!error) {
            struct rtnetlink_change change;

            if (rtnetlink_parse(&buf, &change)) {
                struct netdev *netdev_ = NULL;
                char dev_name[IFNAMSIZ];

                /* Some messages carry no ifname; resolve it from the
                 * interface index when possible. */
                if (!change.ifname) {
                    change.ifname = if_indextoname(change.if_index, dev_name);
                }

                if (change.ifname) {
                    netdev_ = netdev_from_name(change.ifname);
                }
                if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
                    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

                    ovs_mutex_lock(&netdev->mutex);
                    netdev_linux_update(netdev, nsid, &change);
                    ovs_mutex_unlock(&netdev->mutex);
                }

                if (change.ifname &&
                    rtnetlink_type_is_rtnlgrp_link(change.nlmsg_type)) {

                    /* Need to try updating the LAG information. */
                    ovs_mutex_lock(&lag_mutex);
                    netdev_linux_update_lag(&change);
                    ovs_mutex_unlock(&lag_mutex);
                }
                /* netdev_close(NULL) is a no-op, so this is safe even when
                 * no netdev was found above. */
                netdev_close(netdev_);
            }
        } else if (error == ENOBUFS) {
            /* The kernel dropped notifications; resynchronize every system
             * netdev from its current state. */
            struct shash device_shash;
            struct shash_node *node;

            nl_sock_drain(sock);

            shash_init(&device_shash);
            netdev_get_devices(&netdev_linux_class, &device_shash);
            SHASH_FOR_EACH (node, &device_shash) {
                struct netdev *netdev_ = node->data;
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
                unsigned int flags;

                ovs_mutex_lock(&netdev->mutex);
                /* NOTE(review): get_flags() result is ignored here; on
                 * failure 'flags' is passed along possibly uninitialized --
                 * confirm get_flags() always sets it. */
                get_flags(netdev_, &flags);
                netdev_linux_changed(netdev, flags, 0);
                ovs_mutex_unlock(&netdev->mutex);

                netdev_close(netdev_);
            }
            shash_destroy(&device_shash);
        } else if (error != EAGAIN) {
            static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
            VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
                         ovs_strerror(error));
        }
        ofpbuf_uninit(&buf);
    } while (!error);
}
795
796 static void
797 netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
798 {
799 struct nl_sock *sock;
800
801 if (netdev_linux_miimon_enabled()) {
802 netdev_linux_miimon_wait();
803 }
804 sock = netdev_linux_notify_sock();
805 if (sock) {
806 nl_sock_wait(sock, POLLIN);
807 }
808 }
809
810 static void
811 netdev_linux_changed(struct netdev_linux *dev,
812 unsigned int ifi_flags, unsigned int mask)
813 OVS_REQUIRES(dev->mutex)
814 {
815 netdev_change_seq_changed(&dev->up);
816
817 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
818 dev->carrier_resets++;
819 }
820 dev->ifi_flags = ifi_flags;
821
822 dev->cache_valid &= mask;
823 if (!(mask & VALID_IN)) {
824 netdev_get_addrs_list_flush();
825 }
826 }
827
/* Applies the parsed rtnetlink message 'change' (already known to be for
 * 'dev' and for its namespace) to the cached state of 'dev'.
 *
 * RTM_NEWLINK refreshes flags, MTU, MAC address, LAG-master status, and
 * ifindex while keeping drv-info, IP addresses and NUMA id cached; other
 * link messages mark the device absent.  Address-group messages invalidate
 * only the cached IP addresses. */
static void
netdev_linux_update__(struct netdev_linux *dev,
                      const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, ip addresses, and NUMA id. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN | VALID_NUMA_ID);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;

                /* The mac addr has been changed, report it now. */
                rtnetlink_report_link();
            }

            if (change->master && netdev_linux_kind_is_lag(change->master)) {
                dev->is_lag_master = true;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
            dev->present = true;
        } else {
            /* FIXME */
            /* Presumably RTM_DELLINK: invalidate all caches, mark the
             * device gone, and forget its namespace id. */
            netdev_linux_changed(dev, change->ifi_flags, 0);
            dev->present = false;
            netnsid_unset(&dev->netnsid);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        /* The caller only feeds link and address messages here. */
        OVS_NOT_REACHED();
    }
}
876
/* Applies 'change' to 'dev', but only if the message originated in the
 * network namespace 'dev' resides in ('nsid' comparison); messages from
 * other namespaces are ignored. */
static void
netdev_linux_update(struct netdev_linux *dev, int nsid,
                    const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (netdev_linux_netnsid_is_eq(dev, nsid)) {
        netdev_linux_update__(dev, change);
    }
}
886
/* netdev_class 'alloc' callback: allocates a zero-initialized
 * netdev_linux and returns its embedded generic 'struct netdev'. */
static struct netdev *
netdev_linux_alloc(void)
{
    struct netdev_linux *netdev = xzalloc(sizeof *netdev);
    return &netdev->up;
}
893
894 static int
895 netdev_linux_common_construct(struct netdev *netdev_)
896 {
897 /* Prevent any attempt to create (or open) a network device named "default"
898 * or "all". These device names are effectively reserved on Linux because
899 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
900 * itself this wouldn't call for any special treatment, but in practice if
901 * a program tries to create devices with these names, it causes the kernel
902 * to fire a "new device" notification event even though creation failed,
903 * and in turn that causes OVS to wake up and try to create them again,
904 * which ends up as a 100% CPU loop. */
905 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
906 const char *name = netdev_->name;
907 if (!strcmp(name, "default") || !strcmp(name, "all")) {
908 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
909 VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
910 name);
911 return EINVAL;
912 }
913
914 /* The device could be in the same network namespace or in another one. */
915 netnsid_unset(&netdev->netnsid);
916 ovs_mutex_init(&netdev->mutex);
917
918 if (userspace_tso_enabled()) {
919 netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
920 netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
921 netdev_->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
922 netdev_->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
923 netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
924 }
925
926 return 0;
927 }
928
/* Creates system and internal devices.
 *
 * Returns 0 on success.  Returns ENODEV when the underlying kernel device
 * does not exist, except for "internal" netdevs, which legitimately exist
 * as netdev objects before the kernel device does. */
int
netdev_linux_construct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = netdev_linux_common_construct(netdev_);
    if (error) {
        return error;
    }

    error = get_flags(&netdev->up, &netdev->ifi_flags);
    if (error == ENODEV) {
        if (netdev->up.netdev_class != &netdev_internal_class) {
            /* The device does not exist, so don't allow it to be opened. */
            return ENODEV;
        } else {
            /* "Internal" netdevs have to be created as netdev objects before
             * they exist in the kernel, because creating them in the kernel
             * happens by passing a netdev object to dpif_port_add().
             * Therefore, ignore the error. */
        }
    }

    return 0;
}
954
955 /* For most types of netdevs we open the device for each call of
956 * netdev_open(). However, this is not the case with tap devices,
957 * since it is only possible to open the device once. In this
958 * situation we share a single file descriptor, and consequently
959 * buffers, across all readers. Therefore once data is read it will
960 * be unavailable to other reads for tap devices. */
961 static int
962 netdev_linux_construct_tap(struct netdev *netdev_)
963 {
964 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
965 static const char tap_dev[] = "/dev/net/tun";
966 const char *name = netdev_->name;
967 struct ifreq ifr;
968
969 int error = netdev_linux_common_construct(netdev_);
970 if (error) {
971 return error;
972 }
973
974 /* Open tap device. */
975 netdev->tap_fd = open(tap_dev, O_RDWR);
976 if (netdev->tap_fd < 0) {
977 error = errno;
978 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
979 return error;
980 }
981
982 /* Create tap device. */
983 get_flags(&netdev->up, &netdev->ifi_flags);
984 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
985 if (userspace_tso_enabled()) {
986 ifr.ifr_flags |= IFF_VNET_HDR;
987 }
988
989 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
990 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
991 VLOG_WARN("%s: creating tap device failed: %s", name,
992 ovs_strerror(errno));
993 error = errno;
994 goto error_close;
995 }
996
997 /* Make non-blocking. */
998 error = set_nonblocking(netdev->tap_fd);
999 if (error) {
1000 goto error_close;
1001 }
1002
1003 if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
1004 VLOG_WARN("%s: creating tap device failed (persist): %s", name,
1005 ovs_strerror(errno));
1006 error = errno;
1007 goto error_close;
1008 }
1009
1010 if (userspace_tso_enabled()) {
1011 /* Old kernels don't support TUNSETOFFLOAD. If TUNSETOFFLOAD is
1012 * available, it will return EINVAL when a flag is unknown.
1013 * Therefore, try enabling offload with no flags to check
1014 * if TUNSETOFFLOAD support is available or not. */
1015 if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, 0) == 0 || errno != EINVAL) {
1016 unsigned long oflags = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6;
1017
1018 if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, oflags) == -1) {
1019 VLOG_WARN("%s: enabling tap offloading failed: %s", name,
1020 ovs_strerror(errno));
1021 error = errno;
1022 goto error_close;
1023 }
1024 }
1025 }
1026
1027 netdev->present = true;
1028 return 0;
1029
1030 error_close:
1031 close(netdev->tap_fd);
1032 return error;
1033 }
1034
/* Destruct hook: releases the per-device state acquired at construction. */
static void
netdev_linux_destruct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    /* Tear down any traffic-control state attached to this device. */
    if (netdev->tc && netdev->tc->ops->tc_destroy) {
        netdev->tc->ops->tc_destroy(netdev->tc);
    }

    /* For tap devices, drop the persistence set at construction (so the
     * kernel removes the device) and release the shared tap fd. */
    if (netdev_get_class(netdev_) == &netdev_tap_class
        && netdev->tap_fd >= 0)
    {
        ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
        close(netdev->tap_fd);
    }

    /* Balance the atomic_count_inc() performed when miimon was enabled. */
    if (netdev->miimon_interval > 0) {
        atomic_count_dec(&miimon_cnt);
    }

    ovs_mutex_destroy(&netdev->mutex);
}
1057
/* Dealloc hook: frees the netdev_linux wrapper allocated by the class. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
1064
1065 static struct netdev_rxq *
1066 netdev_linux_rxq_alloc(void)
1067 {
1068 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
1069 return &rx->up;
1070 }
1071
1072 static int
1073 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
1074 {
1075 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1076 struct netdev *netdev_ = rx->up.netdev;
1077 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1078 int error;
1079
1080 ovs_mutex_lock(&netdev->mutex);
1081 rx->is_tap = is_tap_netdev(netdev_);
1082 if (rx->is_tap) {
1083 rx->fd = netdev->tap_fd;
1084 } else {
1085 struct sockaddr_ll sll;
1086 int ifindex, val;
1087 /* Result of tcpdump -dd inbound */
1088 static const struct sock_filter filt[] = {
1089 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
1090 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
1091 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
1092 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
1093 };
1094 static const struct sock_fprog fprog = {
1095 ARRAY_SIZE(filt), (struct sock_filter *) filt
1096 };
1097
1098 /* Create file descriptor. */
1099 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
1100 if (rx->fd < 0) {
1101 error = errno;
1102 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
1103 goto error;
1104 }
1105
1106 val = 1;
1107 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
1108 error = errno;
1109 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
1110 netdev_get_name(netdev_), ovs_strerror(error));
1111 goto error;
1112 }
1113
1114 if (userspace_tso_enabled()
1115 && setsockopt(rx->fd, SOL_PACKET, PACKET_VNET_HDR, &val,
1116 sizeof val)) {
1117 error = errno;
1118 VLOG_ERR("%s: failed to enable vnet hdr in txq raw socket: %s",
1119 netdev_get_name(netdev_), ovs_strerror(errno));
1120 goto error;
1121 }
1122
1123 /* Set non-blocking mode. */
1124 error = set_nonblocking(rx->fd);
1125 if (error) {
1126 goto error;
1127 }
1128
1129 /* Get ethernet device index. */
1130 error = get_ifindex(&netdev->up, &ifindex);
1131 if (error) {
1132 goto error;
1133 }
1134
1135 /* Bind to specific ethernet device. */
1136 memset(&sll, 0, sizeof sll);
1137 sll.sll_family = AF_PACKET;
1138 sll.sll_ifindex = ifindex;
1139 sll.sll_protocol = htons(ETH_P_ALL);
1140 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
1141 error = errno;
1142 VLOG_ERR("%s: failed to bind raw socket (%s)",
1143 netdev_get_name(netdev_), ovs_strerror(error));
1144 goto error;
1145 }
1146
1147 /* Filter for only inbound packets. */
1148 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
1149 sizeof fprog);
1150 if (error) {
1151 error = errno;
1152 VLOG_ERR("%s: failed to attach filter (%s)",
1153 netdev_get_name(netdev_), ovs_strerror(error));
1154 goto error;
1155 }
1156 }
1157 ovs_mutex_unlock(&netdev->mutex);
1158
1159 return 0;
1160
1161 error:
1162 if (rx->fd >= 0) {
1163 close(rx->fd);
1164 }
1165 ovs_mutex_unlock(&netdev->mutex);
1166 return error;
1167 }
1168
1169 static void
1170 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
1171 {
1172 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1173 int i;
1174
1175 if (!rx->is_tap) {
1176 close(rx->fd);
1177 }
1178
1179 for (i = 0; i < NETDEV_MAX_BURST; i++) {
1180 dp_packet_delete(rx->aux_bufs[i]);
1181 }
1182 }
1183
/* Frees the rxq wrapper allocated by netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
1191
1192 static ovs_be16
1193 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
1194 {
1195 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1196 return htons(aux->tp_vlan_tpid);
1197 } else if (double_tagged) {
1198 return htons(ETH_TYPE_VLAN_8021AD);
1199 } else {
1200 return htons(ETH_TYPE_VLAN_8021Q);
1201 }
1202 }
1203
/* Returns true if the PACKET_AUXDATA in 'aux' describes a stripped VLAN
 * tag: either the TCI itself is nonzero, or the kernel explicitly set the
 * "VLAN valid" status flag. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    if (aux->tp_vlan_tci != 0) {
        return true;
    }
    return (aux->tp_status & TP_STATUS_VLAN_VALID) != 0;
}
1209
1210 /*
1211 * Receive packets from raw socket in batch process for better performance,
1212 * it can receive NETDEV_MAX_BURST packets at most once, the received
1213 * packets are added into *batch. The return value is 0 or errno.
1214 *
1215 * It also used recvmmsg to reduce multiple syscalls overhead;
1216 */
/* Receives packets from the raw socket in 'rx' in a batch for better
 * performance: a single recvmmsg() call collects up to NETDEV_MAX_BURST
 * packets, which are appended to '*batch'.  Returns 0 on success or a
 * positive errno value on failure. */
static int
netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
                                 struct dp_packet_batch *batch)
{
    int iovlen;
    size_t std_len;
    ssize_t retval;
    int virtio_net_hdr_size;
    struct iovec iovs[NETDEV_MAX_BURST][IOV_TSO_SIZE];
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffers[NETDEV_MAX_BURST];
    struct mmsghdr mmsgs[NETDEV_MAX_BURST];
    struct dp_packet *buffers[NETDEV_MAX_BURST];
    int i;

    if (userspace_tso_enabled()) {
        /* Use the buffer from the allocated packet below to receive MTU
         * sized packets and an aux_buf for extra TSO data. */
        iovlen = IOV_TSO_SIZE;
        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
    } else {
        /* Use only the buffer from the allocated packet. */
        iovlen = IOV_STD_SIZE;
        virtio_net_hdr_size = 0;
    }

    /* The length here needs to be accounted in the same way when the
     * aux_buf is allocated so that it can be prepended to TSO buffer. */
    std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
        buffers[i] = dp_packet_new_with_headroom(std_len, DP_NETDEV_HEADROOM);
        iovs[i][IOV_PACKET].iov_base = dp_packet_data(buffers[i]);
        iovs[i][IOV_PACKET].iov_len = std_len;
        if (iovlen == IOV_TSO_SIZE) {
            iovs[i][IOV_AUXBUF].iov_base = dp_packet_data(rx->aux_bufs[i]);
            iovs[i][IOV_AUXBUF].iov_len = dp_packet_tailroom(rx->aux_bufs[i]);
        }

        mmsgs[i].msg_hdr.msg_name = NULL;
        mmsgs[i].msg_hdr.msg_namelen = 0;
        mmsgs[i].msg_hdr.msg_iov = iovs[i];
        mmsgs[i].msg_hdr.msg_iovlen = iovlen;
        /* Ancillary space per message for PACKET_AUXDATA (VLAN info). */
        mmsgs[i].msg_hdr.msg_control = &cmsg_buffers[i];
        mmsgs[i].msg_hdr.msg_controllen = sizeof cmsg_buffers[i];
        mmsgs[i].msg_hdr.msg_flags = 0;
    }

    /* Retry if interrupted by a signal before any message was received. */
    do {
        retval = recvmmsg(rx->fd, mmsgs, NETDEV_MAX_BURST, MSG_TRUNC, NULL);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        retval = errno;
        /* Nothing was received: all preallocated buffers are still ours. */
        for (i = 0; i < NETDEV_MAX_BURST; i++) {
            dp_packet_delete(buffers[i]);
        }

        return retval;
    }

    for (i = 0; i < retval; i++) {
        struct dp_packet *pkt;

        if (mmsgs[i].msg_len < ETH_HEADER_LEN) {
            /* Runt frame: too short to hold an Ethernet header; drop it
             * and account it on the netdev. */
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            dp_packet_delete(buffers[i]);
            netdev->rx_dropped += 1;
            VLOG_WARN_RL(&rl, "%s: Dropped packet: less than ether hdr size",
                         netdev_get_name(netdev_));
            continue;
        }

        if (mmsgs[i].msg_len > std_len) {
            /* Build a single linear TSO packet by prepending the data from
             * std_len buffer to the aux_buf. */
            pkt = rx->aux_bufs[i];
            dp_packet_set_size(pkt, mmsgs[i].msg_len - std_len);
            dp_packet_push(pkt, dp_packet_data(buffers[i]), std_len);
            /* The headroom should be the same in buffers[i], pkt and
             * DP_NETDEV_HEADROOM. */
            dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
            dp_packet_delete(buffers[i]);
            /* Ownership of the aux_buf moves into 'batch'; the NULL slot
             * is replenished by netdev_linux_rxq_recv(). */
            rx->aux_bufs[i] = NULL;
        } else {
            dp_packet_set_size(buffers[i], mmsgs[i].msg_len);
            pkt = buffers[i];
        }

        if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) {
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            /* Unexpected error situation: the virtio header is not present
             * or corrupted. Drop the packet but continue in case next ones
             * are correct. */
            dp_packet_delete(pkt);
            netdev->rx_dropped += 1;
            VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header",
                         netdev_get_name(netdev_));
            continue;
        }

        /* Walk ancillary data for PACKET_AUXDATA; if the kernel stripped a
         * VLAN tag, push it back into the packet data. */
        for (cmsg = CMSG_FIRSTHDR(&mmsgs[i].msg_hdr); cmsg;
             cmsg = CMSG_NXTHDR(&mmsgs[i].msg_hdr, cmsg)) {
            const struct tpacket_auxdata *aux;

            if (cmsg->cmsg_level != SOL_PACKET
                || cmsg->cmsg_type != PACKET_AUXDATA
                || cmsg->cmsg_len <
                       CMSG_LEN(sizeof(struct tpacket_auxdata))) {
                continue;
            }

            aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
            if (auxdata_has_vlan_tci(aux)) {
                struct eth_header *eth;
                bool double_tagged;

                eth = dp_packet_data(pkt);
                /* If the frame already carries an 802.1Q tag, the stripped
                 * outer tag must have been the 802.1ad one. */
                double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);

                eth_push_vlan(pkt,
                              auxdata_to_vlan_tpid(aux, double_tagged),
                              htons(aux->tp_vlan_tci));
                break;
            }
        }
        dp_packet_batch_add(batch, pkt);
    }

    /* Delete unused buffers. */
    for (; i < NETDEV_MAX_BURST; i++) {
        dp_packet_delete(buffers[i]);
    }

    return 0;
}
1359
1360 /*
1361 * Receive packets from tap by batch process for better performance,
1362 * it can receive NETDEV_MAX_BURST packets at most once, the received
1363 * packets are added into *batch. The return value is 0 or errno.
1364 */
/*
 * Receive packets from tap by batch process for better performance,
 * it can receive NETDEV_MAX_BURST packets at most once, the received
 * packets are added into *batch. The return value is 0 or errno.
 */
static int
netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
                                struct dp_packet_batch *batch)
{
    int virtio_net_hdr_size;
    ssize_t retval;
    size_t std_len;
    int iovlen;
    int i;

    if (userspace_tso_enabled()) {
        /* Use the buffer from the allocated packet below to receive MTU
         * sized packets and an aux_buf for extra TSO data. */
        iovlen = IOV_TSO_SIZE;
        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
    } else {
        /* Use only the buffer from the allocated packet. */
        iovlen = IOV_STD_SIZE;
        virtio_net_hdr_size = 0;
    }

    /* The length here needs to be accounted in the same way when the
     * aux_buf is allocated so that it can be prepended to TSO buffer. */
    std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
        struct dp_packet *buffer;
        struct dp_packet *pkt;
        struct iovec iov[IOV_TSO_SIZE];

        /* Assume Ethernet port. No need to set packet_type. */
        buffer = dp_packet_new_with_headroom(std_len, DP_NETDEV_HEADROOM);
        iov[IOV_PACKET].iov_base = dp_packet_data(buffer);
        iov[IOV_PACKET].iov_len = std_len;
        if (iovlen == IOV_TSO_SIZE) {
            iov[IOV_AUXBUF].iov_base = dp_packet_data(rx->aux_bufs[i]);
            iov[IOV_AUXBUF].iov_len = dp_packet_tailroom(rx->aux_bufs[i]);
        }

        /* Each readv() returns exactly one packet from the tap fd; retry
         * if interrupted by a signal. */
        do {
            retval = readv(rx->fd, iov, iovlen);
        } while (retval < 0 && errno == EINTR);

        if (retval < 0) {
            /* Stop on the first error (typically EAGAIN when the queue is
             * drained); 'errno' is examined after the loop. */
            dp_packet_delete(buffer);
            break;
        }

        if (retval > std_len) {
            /* Build a single linear TSO packet by prepending the data from
             * std_len buffer to the aux_buf. */
            pkt = rx->aux_bufs[i];
            dp_packet_set_size(pkt, retval - std_len);
            dp_packet_push(pkt, dp_packet_data(buffer), std_len);
            /* The headroom should be the same in buffers[i], pkt and
             * DP_NETDEV_HEADROOM. */
            dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
            dp_packet_delete(buffer);
            /* Ownership of the aux_buf moves into 'batch'; the NULL slot
             * is replenished by netdev_linux_rxq_recv(). */
            rx->aux_bufs[i] = NULL;
        } else {
            /* NOTE(review): this adds 'retval' to the packet's current
             * size, unlike the sock path which sets the size directly —
             * presumably the fresh packet's size is 0 here; confirm. */
            dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
            pkt = buffer;
        }

        if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) {
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            /* Unexpected error situation: the virtio header is not present
             * or corrupted. Drop the packet but continue in case next ones
             * are correct. */
            dp_packet_delete(pkt);
            netdev->rx_dropped += 1;
            VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header",
                         netdev_get_name(netdev_));
            continue;
        }

        dp_packet_batch_add(batch, pkt);
    }

    /* Report an error only if the very first read failed; a partially
     * filled batch is a success. */
    if ((i == 0) && (retval < 0)) {
        return errno;
    }

    return 0;
}
1451
1452 static int
1453 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
1454 int *qfill)
1455 {
1456 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1457 struct netdev *netdev = rx->up.netdev;
1458 ssize_t retval;
1459 int mtu;
1460
1461 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1462 mtu = ETH_PAYLOAD_MAX;
1463 }
1464
1465 if (userspace_tso_enabled()) {
1466 /* Allocate TSO packets. The packet has enough headroom to store
1467 * a full non-TSO packet. When a TSO packet is received, the data
1468 * from non-TSO buffer (std_len) is prepended to the TSO packet
1469 * (aux_buf). */
1470 size_t std_len = sizeof(struct virtio_net_hdr) + VLAN_ETH_HEADER_LEN
1471 + DP_NETDEV_HEADROOM + mtu;
1472 size_t data_len = LINUX_RXQ_TSO_MAX_LEN - std_len;
1473 for (int i = 0; i < NETDEV_MAX_BURST; i++) {
1474 if (rx->aux_bufs[i]) {
1475 continue;
1476 }
1477
1478 rx->aux_bufs[i] = dp_packet_new_with_headroom(data_len, std_len);
1479 }
1480 }
1481
1482 dp_packet_batch_init(batch);
1483 retval = (rx->is_tap
1484 ? netdev_linux_batch_rxq_recv_tap(rx, mtu, batch)
1485 : netdev_linux_batch_rxq_recv_sock(rx, mtu, batch));
1486
1487 if (retval) {
1488 if (retval != EAGAIN && retval != EMSGSIZE) {
1489 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1490 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1491 }
1492 }
1493
1494 if (qfill) {
1495 *qfill = -ENOTSUP;
1496 }
1497
1498 return retval;
1499 }
1500
1501 static void
1502 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1503 {
1504 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1505 poll_fd_wait(rx->fd, POLLIN);
1506 }
1507
1508 static int
1509 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1510 {
1511 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1512 if (rx->is_tap) {
1513 struct ifreq ifr;
1514 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1515 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1516 if (error) {
1517 return error;
1518 }
1519 drain_fd(rx->fd, ifr.ifr_qlen);
1520 return 0;
1521 } else {
1522 return drain_rcvbuf(rx->fd);
1523 }
1524 }
1525
/* Sends every packet in 'batch' on AF_PACKET socket 'sock' bound via
 * 'ifindex', using sendmmsg() to reduce syscall overhead.  If 'tso' is
 * true, a virtio-net header sized against 'mtu' is prepended to each
 * packet first.  Returns 0 on success or a positive errno value. */
static int
netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu,
                             struct dp_packet_batch *batch)
{
    const size_t size = dp_packet_batch_size(batch);
    /* We don't bother setting most fields in sockaddr_ll because the
     * kernel ignores them for SOCK_RAW. */
    struct sockaddr_ll sll = { .sll_family = AF_PACKET,
                               .sll_ifindex = ifindex };

    struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
    struct iovec *iov = xmalloc(sizeof(*iov) * size);

    /* Build one single-iovec message per packet.  'i' is the index
     * supplied by the DP_PACKET_BATCH_FOR_EACH macro. */
    struct dp_packet *packet;
    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        if (tso) {
            netdev_linux_prepend_vnet_hdr(packet, mtu);
        }

        iov[i].iov_base = dp_packet_data(packet);
        iov[i].iov_len = dp_packet_size(packet);
        mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
                                            .msg_namelen = sizeof sll,
                                            .msg_iov = &iov[i],
                                            .msg_iovlen = 1 };
    }

    /* sendmmsg() may send fewer than requested; resume from the first
     * unsent message until all are sent or an error occurs. */
    int error = 0;
    for (uint32_t ofs = 0; ofs < size; ) {
        ssize_t retval;
        do {
            retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);
        if (error) {
            break;
        }
        ofs += retval;
    }

    free(mmsg);
    free(iov);
    return error;
}
1570
1571 /* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1572 * essential, because packets sent to a tap device with an AF_PACKET socket
1573 * will loop back to be *received* again on the tap device. This doesn't occur
1574 * on other interface types because we attach a socket filter to the rx
1575 * socket. */
/* Use the tap fd to send 'batch' to tap device 'netdev'.  Using the tap fd is
 * essential, because packets sent to a tap device with an AF_PACKET socket
 * will loop back to be *received* again on the tap device.  This doesn't occur
 * on other interface types because we attach a socket filter to the rx
 * socket.
 *
 * Returns 0 on success (including the device-down case, where packets are
 * silently dropped and counted), otherwise a positive errno value. */
static int
netdev_linux_tap_batch_send(struct netdev *netdev_, bool tso, int mtu,
                            struct dp_packet_batch *batch)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct dp_packet *packet;

    /* The Linux tap driver returns EIO if the device is not up,
     * so if the device is not up, don't waste time sending it.
     * However, if the device is in another network namespace
     * then OVS can't retrieve the state. In that case, send the
     * packets anyway. */
    if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
        netdev->tx_dropped += dp_packet_batch_size(batch);
        return 0;
    }

    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        size_t size;
        ssize_t retval;
        int error;

        /* When TSO is on, the kernel expects a virtio-net header first. */
        if (tso) {
            netdev_linux_prepend_vnet_hdr(packet, mtu);
        }

        size = dp_packet_size(packet);
        /* Retry the write if interrupted by a signal. */
        do {
            retval = write(netdev->tap_fd, dp_packet_data(packet), size);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);

        if (error) {
            /* The Linux tap driver returns EIO if the device is not up. From
             * the OVS side this is not an error, so we ignore it; otherwise,
             * return the error. */
            if (error != EIO) {
                return error;
            }
        } else if (retval != size) {
            /* Short write: report it and give up on the rest of the batch. */
            VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
                              "bytes of %"PRIuSIZE") on %s",
                         retval, size, netdev_get_name(netdev_));
            return EMSGSIZE;
        }
    }
    return 0;
}
1624
/* Returns the NUMA node id of 'netdev' by reading
 * /sys/class/net/<name>/device/numa_node, caching the result under
 * VALID_NUMA_ID.  Falls back to node 0 on any failure.  Caller must hold
 * 'netdev->mutex'. */
static int
netdev_linux_get_numa_id__(struct netdev_linux *netdev)
    OVS_REQUIRES(netdev->mutex)
{
    char *numa_node_path;
    const char *name;
    int node_id;
    FILE *stream;

    /* Return the cached value when we already looked it up. */
    if (netdev->cache_valid & VALID_NUMA_ID) {
        return netdev->numa_id;
    }

    /* Cache the default up front: every early return below means "node 0". */
    netdev->numa_id = 0;
    netdev->cache_valid |= VALID_NUMA_ID;

    if (ovs_numa_get_n_numas() < 2) {
        /* No need to check on system with a single NUMA node. */
        return 0;
    }

    /* Reject names with path separators: 'name' is interpolated into a
     * sysfs path below. */
    name = netdev_get_name(&netdev->up);
    if (strpbrk(name, "/\\")) {
        VLOG_ERR_RL(&rl, "\"%s\" is not a valid name for a port. "
                    "A valid name must not include '/' or '\\'."
                    "Using numa_id 0", name);
        return 0;
    }

    numa_node_path = xasprintf("/sys/class/net/%s/device/numa_node", name);

    stream = fopen(numa_node_path, "r");
    if (!stream) {
        /* Virtual device does not have this info. */
        VLOG_INFO_RL(&rl, "%s: Can't open '%s': %s, using numa_id 0",
                     name, numa_node_path, ovs_strerror(errno));
        free(numa_node_path);
        return 0;
    }

    if (fscanf(stream, "%d", &node_id) != 1
        || !ovs_numa_numa_id_is_valid(node_id)) {
        VLOG_WARN_RL(&rl, "%s: Can't detect NUMA node, using numa_id 0", name);
        node_id = 0;
    }

    netdev->numa_id = node_id;
    fclose(stream);
    free(numa_node_path);
    return node_id;
}
1676
1677 static int OVS_UNUSED
1678 netdev_linux_get_numa_id(const struct netdev *netdev_)
1679 {
1680 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1681 int numa_id;
1682
1683 ovs_mutex_lock(&netdev->mutex);
1684 numa_id = netdev_linux_get_numa_id__(netdev);
1685 ovs_mutex_unlock(&netdev->mutex);
1686
1687 return numa_id;
1688 }
1689
/* Sends 'batch' on 'netdev'.  Returns 0 if successful, otherwise a positive
 * errno value.  Returns EAGAIN without blocking if the packet cannot be queued
 * immediately.  Returns EMSGSIZE if a partial packet was transmitted or if
 * the packet is too big or too small to transmit on the device.
 *
 * The kernel maintains a packet transmission queue, so the caller is not
 * expected to do additional queuing of packets.
 *
 * Always consumes 'batch': it is deleted on every path, success or not. */
static int
netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
                  struct dp_packet_batch *batch,
                  bool concurrent_txq OVS_UNUSED)
{
    bool tso = userspace_tso_enabled();
    int mtu = ETH_PAYLOAD_MAX;
    int error = 0;
    int sock = 0;

    /* The MTU is only needed to size the virtio-net header for TSO. */
    if (tso) {
        netdev_linux_get_mtu__(netdev_linux_cast(netdev_), &mtu);
    }

    if (!is_tap_netdev(netdev_)) {
        /* Devices in another network namespace cannot be reached through
         * the local AF_PACKET socket. */
        if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_))) {
            error = EOPNOTSUPP;
            goto free_batch;
        }

        /* af_packet_sock() returns a negative errno value on failure. */
        sock = af_packet_sock();
        if (sock < 0) {
            error = -sock;
            goto free_batch;
        }

        /* netdev_get_ifindex() returns a negative errno value on failure. */
        int ifindex = netdev_get_ifindex(netdev_);
        if (ifindex < 0) {
            error = -ifindex;
            goto free_batch;
        }

        error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, batch);
    } else {
        error = netdev_linux_tap_batch_send(netdev_, tso, mtu, batch);
    }
    if (error) {
        if (error == ENOBUFS) {
            /* The Linux AF_PACKET implementation never blocks waiting
             * for room for packets, instead returning ENOBUFS.
             * Translate this into EAGAIN for the caller. */
            error = EAGAIN;
        } else {
            VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
    }

free_batch:
    dp_packet_delete_batch(batch, true);
    return error;
}
1749
1750 /* Registers with the poll loop to wake up from the next call to poll_block()
1751 * when the packet transmission queue has sufficient room to transmit a packet
1752 * with netdev_send().
1753 *
1754 * The kernel maintains a packet transmission queue, so the client is not
1755 * expected to do additional queuing of packets. Thus, this function is
1756 * unlikely to ever be used. It is included for completeness. */
1757 static void
1758 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1759 {
1760 if (is_tap_netdev(netdev)) {
1761 /* TAP device always accepts packets.*/
1762 poll_immediate_wake();
1763 }
1764 }
1765
/* Attempts to set 'netdev''s MAC address to 'mac'.  Returns 0 if successful,
 * otherwise a positive errno value.  Returns EOPNOTSUPP for a device in a
 * remote network namespace.  On success the new address is cached under
 * VALID_ETHERADDR. */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Short-circuit when the cached address already matches, or when the
     * last lookup failed (re-report that error). */
    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    /* ENODEV is cached too so repeated attempts on a vanished device do
     * not keep issuing ioctls. */
    if (!error || error == ENODEV) {
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            netdev->etheraddr = mac;
        }
    }

    /* Bring the tap device back up only if it was up before. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1810
/* Copies 'netdev''s MAC address to 'mac' which is passed as param.  Returns
 * 0 on success or the positive errno recorded by the last lookup.  The
 * address is fetched via netlink first and via ioctl as a fallback, and is
 * cached under VALID_ETHERADDR. */
static int
netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    /* Primary path: a netlink query populates the cache on success. */
    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
        netdev_linux_update_via_netlink(netdev);
    }

    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
        /* Fall back to ioctl if netlink fails */
        netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
                                                 &netdev->etheraddr);
        netdev->cache_valid |= VALID_ETHERADDR;
    }

    error = netdev->ether_addr_error;
    if (!error) {
        *mac = netdev->etheraddr;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1838
1839 static int
1840 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1841 {
1842 int error;
1843
1844 if (!(netdev->cache_valid & VALID_MTU)) {
1845 netdev_linux_update_via_netlink(netdev);
1846 }
1847
1848 if (!(netdev->cache_valid & VALID_MTU)) {
1849 /* Fall back to ioctl if netlink fails */
1850 struct ifreq ifr;
1851
1852 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1853 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1854 netdev->mtu = ifr.ifr_mtu;
1855 netdev->cache_valid |= VALID_MTU;
1856 }
1857
1858 error = netdev->netdev_mtu_error;
1859 if (!error) {
1860 *mtup = netdev->mtu;
1861 }
1862
1863 return error;
1864 }
1865
1866 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1867 * in bytes, not including the hardware header; thus, this is typically 1500
1868 * bytes for Ethernet devices. */
1869 static int
1870 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1871 {
1872 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1873 int error;
1874
1875 ovs_mutex_lock(&netdev->mutex);
1876 error = netdev_linux_get_mtu__(netdev, mtup);
1877 ovs_mutex_unlock(&netdev->mutex);
1878
1879 return error;
1880 }
1881
/* Sets the maximum size of transmitted (MTU) for given device using linux
 * networking ioctl interface.
 *
 * Returns 0 on success, otherwise a positive errno value: EOPNOTSUPP for a
 * device in a remote network namespace, or the ioctl failure.  On success
 * (and on ENODEV) the result is cached under VALID_MTU. */
static int
netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ifreq ifr;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

#ifdef HAVE_AF_XDP
    /* AF_XDP devices have an upper bound on the supported MTU imposed by
     * their umem frame size; reject values that would not fit. */
    if (netdev_get_class(netdev_) == &netdev_afxdp_class) {
        error = netdev_afxdp_verify_mtu_size(netdev_, mtu);
        if (error) {
            goto exit;
        }
    }
#endif

    /* Short-circuit when the cached MTU already matches, or re-report a
     * previously cached lookup error. */
    if (netdev->cache_valid & VALID_MTU) {
        error = netdev->netdev_mtu_error;
        if (error || netdev->mtu == mtu) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_MTU;
    }
    ifr.ifr_mtu = mtu;
    error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
                                SIOCSIFMTU, "SIOCSIFMTU");
    /* ENODEV is cached too, so later calls on a vanished device do not
     * keep issuing ioctls. */
    if (!error || error == ENODEV) {
        netdev->netdev_mtu_error = error;
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }
exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1926
1927 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1928 * On failure, returns a negative errno value. */
1929 static int
1930 netdev_linux_get_ifindex(const struct netdev *netdev_)
1931 {
1932 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1933 int ifindex, error;
1934
1935 ovs_mutex_lock(&netdev->mutex);
1936 if (netdev_linux_netnsid_is_remote(netdev)) {
1937 error = EOPNOTSUPP;
1938 goto exit;
1939 }
1940 error = get_ifindex(netdev_, &ifindex);
1941
1942 exit:
1943 ovs_mutex_unlock(&netdev->mutex);
1944 return error ? -error : ifindex;
1945 }
1946
1947 static int
1948 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1949 {
1950 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1951
1952 ovs_mutex_lock(&netdev->mutex);
1953 if (netdev->miimon_interval > 0) {
1954 *carrier = netdev->miimon;
1955 } else {
1956 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1957 }
1958 ovs_mutex_unlock(&netdev->mutex);
1959
1960 return 0;
1961 }
1962
1963 static long long int
1964 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1965 {
1966 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1967 long long int carrier_resets;
1968
1969 ovs_mutex_lock(&netdev->mutex);
1970 carrier_resets = netdev->carrier_resets;
1971 ovs_mutex_unlock(&netdev->mutex);
1972
1973 return carrier_resets;
1974 }
1975
/* Issues MII ioctl 'cmd' ("cmd_name" for logging) on device 'name',
 * marshalling '*data' in and out of the ifreq.  Returns 0 on success or a
 * positive errno value. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    /* The mii_ioctl_data is copied over ifr.ifr_data (i.e. into the
     * ifr_ifru union area) rather than pointed to -- presumably because
     * the kernel's MII handlers treat that union as inline storage for
     * struct mii_ioctl_data; TODO confirm against the kernel ABI. */
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    /* Copy results (e.g. phy_id, val_out) back to the caller. */
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
1990
/* Queries link status for device 'name' into '*miimon'.  Tries MII
 * (SIOCGMIIPHY + SIOCGMIIREG/BMSR) first and falls back to ethtool's
 * ETHTOOL_GLINK if MII is unavailable.  Returns 0 on success or the error
 * from the last method attempted; '*miimon' defaults to false. */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            /* Link is up when the BMSR link-status bit is set. */
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        }
    }
    if (error) {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* ETHTOOL_GLINK fills an ethtool_value; the query was issued
             * through an ethtool_cmd buffer, so copy the prefix out. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
2033
2034 static int
2035 netdev_linux_set_miimon_interval(struct netdev *netdev_,
2036 long long int interval)
2037 {
2038 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2039
2040 ovs_mutex_lock(&netdev->mutex);
2041 interval = interval > 0 ? MAX(interval, 100) : 0;
2042 if (netdev->miimon_interval != interval) {
2043 if (interval && !netdev->miimon_interval) {
2044 atomic_count_inc(&miimon_cnt);
2045 } else if (!interval && netdev->miimon_interval) {
2046 atomic_count_dec(&miimon_cnt);
2047 }
2048
2049 netdev->miimon_interval = interval;
2050 timer_set_expired(&netdev->miimon_timer);
2051 }
2052 ovs_mutex_unlock(&netdev->mutex);
2053
2054 return 0;
2055 }
2056
/* Polls MII link state for every netdev-linux device whose miimon timer has
 * expired, updating the cached status and signaling a change notification
 * when the link flips.  Called periodically from the netdev run loop. */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            /* netdev_linux_get_miimon() leaves 'miimon' false on failure. */
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                dev->miimon = miimon;
                /* Wake up anyone watching this netdev for changes. */
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            /* Re-arm the timer for the next poll. */
            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* Drop the reference taken by netdev_get_devices(). */
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
2086
/* Arranges for the poll loop to wake up when any device's miimon timer
 * expires.  Companion to netdev_linux_miimon_run(). */
static void
netdev_linux_miimon_wait(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0) {
            timer_wait(&dev->miimon_timer);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* Drop the reference taken by netdev_get_devices(). */
        netdev_close(netdev);
    }
    shash_destroy(&device_shash);
}
2108
/* Exchanges the values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved = *a;

    *a = *b;
    *b = saved;
}
2116
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is allowed to be misaligned. */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct ovs_vport_stats *src)
{
    /* get_32aligned_u64() safely reads 64-bit counters that may only be
     * 32-bit aligned. */
    dst->rx_packets = get_32aligned_u64(&src->rx_packets);
    dst->tx_packets = get_32aligned_u64(&src->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&src->rx_errors);
    dst->tx_errors = get_32aligned_u64(&src->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
    /* struct ovs_vport_stats has no equivalents for the remaining
     * netdev_stats counters, so zero them. */
    dst->multicast = 0;
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
}
2146
/* Fetches datapath vport statistics for 'netdev' into '*stats'.  Returns 0
 * on success, EOPNOTSUPP if the vport reply carried no statistics, or
 * another positive errno value on failure. */
static int
get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
{
    struct dpif_netlink_vport reply;
    struct ofpbuf *buf;
    int error;

    error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
    if (error) {
        return error;
    } else if (!reply.stats) {
        /* The vport exists but reported no stats attribute. */
        ofpbuf_delete(buf);
        return EOPNOTSUPP;
    }

    netdev_stats_from_ovs_vport_stats(stats, reply.stats);

    ofpbuf_delete(buf);

    return 0;
}
2168
/* Updates '*stats' from the datapath vport layer for 'netdev_'.  The outcome
 * is cached in netdev->vport_stats_error (with VALID_VPORT_STAT_ERROR set):
 * once a query has failed, it is not retried while the cache remains valid.
 * Caller holds netdev->mutex. */
static void
get_stats_via_vport(const struct netdev *netdev_,
                    struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    /* Query only if the last attempt succeeded or we have no cached
     * result yet. */
    if (!netdev->vport_stats_error ||
        !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
        int error;

        error = get_stats_via_vport__(netdev_, stats);
        if (error && error != ENOENT && error != ENODEV) {
            /* ENOENT/ENODEV just mean "not a vport"; anything else is worth
             * logging. */
            VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
                         "(%s)",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
        netdev->vport_stats_error = error;
        netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
    }
}
2189
/* Retrieves current device stats for 'netdev-linux'.  Prefers the datapath
 * vport counters, supplementing them with the error/detail counters that the
 * vport layer zeroes out; falls back entirely to kernel netdev counters when
 * vport stats are unavailable.  Returns 0 on success, otherwise a positive
 * errno value. */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    /* Primary source: datapath vport counters (fills '*stats' on success and
     * records the outcome in netdev->vport_stats_error). */
    get_stats_via_vport(netdev_, stats);
    /* Secondary source: kernel netdev counters. */
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; still report success if vport stats were usable. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Both sources succeeded: keep vport packet/byte counters, fold in
         * the counters that only the kernel tracks (the vport conversion
         * zeroes these fields). */
        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
        stats->rx_length_errors += dev_stats.rx_length_errors;
        stats->rx_over_errors += dev_stats.rx_over_errors;
        stats->rx_crc_errors += dev_stats.rx_crc_errors;
        stats->rx_frame_errors += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2228
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal.  Like netdev_linux_get_stats(), but accounts for the
 * fact that for a tap device the kernel's rx/tx directions are reversed
 * relative to the switch's view.  Returns 0 on success, otherwise a
 * positive errno value. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    /* Primary source: datapath vport counters. */
    get_stats_via_vport(netdev_, stats);
    /* Secondary source: kernel netdev counters. */
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; still report success if vport stats were usable. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer. For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        /* Detail counters are direction-specific and cannot be swapped
         * meaningfully, so they are cleared. */
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped += dev_stats.tx_dropped;
        stats->tx_dropped += dev_stats.rx_dropped;

        stats->rx_errors += dev_stats.tx_errors;
        stats->tx_errors += dev_stats.rx_errors;

        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
    }
    /* Include drops that netdev-linux itself counted while sending and
     * receiving. */
    stats->tx_dropped += netdev->tx_dropped;
    stats->rx_dropped += netdev->rx_dropped;
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2292
2293 static int
2294 netdev_internal_get_stats(const struct netdev *netdev_,
2295 struct netdev_stats *stats)
2296 {
2297 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2298 int error;
2299
2300 ovs_mutex_lock(&netdev->mutex);
2301 get_stats_via_vport(netdev_, stats);
2302 error = netdev->vport_stats_error;
2303 ovs_mutex_unlock(&netdev->mutex);
2304
2305 return error;
2306 }
2307
/* Queries ethtool (ETHTOOL_GSET) for 'netdev''s link capabilities and caches
 * the supported, advertised, and current feature sets (NETDEV_F_* bitmaps)
 * in 'netdev'.  No-op when VALID_FEATURES is already cached.  On failure
 * only 'get_features_error' is meaningful.  Called with 'netdev->mutex'
 * held (see netdev_linux_get_features()). */
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
    struct ethtool_cmd ecmd;
    uint32_t speed;
    int error;

    if (netdev->cache_valid & VALID_FEATURES) {
        return;
    }

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto out;
    }

    /* Supported features: map each ethtool SUPPORTED_* bit to the
     * corresponding NETDEV_F_* bit. */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    }
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Half) {
        netdev->supported |= NETDEV_F_100MB_HD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    }
    /* Several physical-layer variants collapse into one speed/duplex bit. */
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
        netdev->supported |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
        netdev->supported |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
        netdev->supported |= NETDEV_F_40GB_FD;
    }
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    }
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    }
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    }
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    }
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    }

    /* Advertised features: same mapping for the ADVERTISED_* bits. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
        netdev->advertised |= NETDEV_F_40GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    }
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    }
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    }
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    }
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    }

    /* Current settings. */
    speed = ethtool_cmd_speed(&ecmd);
    if (speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    /* NOTE(review): raw numeric speeds below are presumably used because the
     * SPEED_40000/SPEED_100000 macros are missing from older kernel headers
     * -- confirm before replacing them with the macros. */
    } else if (speed == 40000) {
        netdev->current = NETDEV_F_40GB_FD;
    } else if (speed == 100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (speed == 1000000) {
        netdev->current = NETDEV_F_1TB_FD;
    } else {
        netdev->current = 0;
    }

    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
    }

    if (ecmd.autoneg) {
        netdev->current |= NETDEV_F_AUTONEG;
    }

out:
    /* Cache even failures; netdev_linux_changed() invalidates the cache. */
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
}
2459
2460 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
2461 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2462 * Returns 0 if successful, otherwise a positive errno value. */
2463 static int
2464 netdev_linux_get_features(const struct netdev *netdev_,
2465 enum netdev_features *current,
2466 enum netdev_features *advertised,
2467 enum netdev_features *supported,
2468 enum netdev_features *peer)
2469 {
2470 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2471 int error;
2472
2473 ovs_mutex_lock(&netdev->mutex);
2474 if (netdev_linux_netnsid_is_remote(netdev)) {
2475 error = EOPNOTSUPP;
2476 goto exit;
2477 }
2478
2479 netdev_linux_read_features(netdev);
2480 if (!netdev->get_features_error) {
2481 *current = netdev->current;
2482 *advertised = netdev->advertised;
2483 *supported = netdev->supported;
2484 *peer = 0; /* XXX */
2485 }
2486 error = netdev->get_features_error;
2487
2488 exit:
2489 ovs_mutex_unlock(&netdev->mutex);
2490 return error;
2491 }
2492
/* Set the features advertised by 'netdev' to 'advertise'.  Reads the current
 * ethtool settings, rewrites only the advertising mask from the NETDEV_F_*
 * bits in 'advertise', and writes the settings back.  Returns 0 on success,
 * otherwise a positive errno value (EOPNOTSUPP for devices in a remote
 * network namespace). */
static int
netdev_linux_set_advertisements(struct netdev *netdev_,
                                enum netdev_features advertise)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ethtool_cmd ecmd;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    COVERAGE_INC(netdev_get_ethtool);

    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Fetch current settings so everything except 'advertising' is
     * preserved on the subsequent ETHTOOL_SSET. */
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto exit;
    }

    /* Translate NETDEV_F_* bits into ethtool ADVERTISED_* bits. */
    ecmd.advertising = 0;
    if (advertise & NETDEV_F_10MB_HD) {
        ecmd.advertising |= ADVERTISED_10baseT_Half;
    }
    if (advertise & NETDEV_F_10MB_FD) {
        ecmd.advertising |= ADVERTISED_10baseT_Full;
    }
    if (advertise & NETDEV_F_100MB_HD) {
        ecmd.advertising |= ADVERTISED_100baseT_Half;
    }
    if (advertise & NETDEV_F_100MB_FD) {
        ecmd.advertising |= ADVERTISED_100baseT_Full;
    }
    if (advertise & NETDEV_F_1GB_HD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Half;
    }
    if (advertise & NETDEV_F_1GB_FD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Full;
    }
    if (advertise & NETDEV_F_10GB_FD) {
        ecmd.advertising |= ADVERTISED_10000baseT_Full;
    }
    if (advertise & NETDEV_F_COPPER) {
        ecmd.advertising |= ADVERTISED_TP;
    }
    if (advertise & NETDEV_F_FIBER) {
        ecmd.advertising |= ADVERTISED_FIBRE;
    }
    if (advertise & NETDEV_F_AUTONEG) {
        ecmd.advertising |= ADVERTISED_Autoneg;
    }
    if (advertise & NETDEV_F_PAUSE) {
        ecmd.advertising |= ADVERTISED_Pause;
    }
    if (advertise & NETDEV_F_PAUSE_ASYM) {
        ecmd.advertising |= ADVERTISED_Asym_Pause;
    }
    COVERAGE_INC(netdev_set_ethtool);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_SSET, "ETHTOOL_SSET");

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2563
2564 static struct tc_police
2565 tc_matchall_fill_police(uint32_t kbits_rate, uint32_t kbits_burst)
2566 {
2567 unsigned int bsize = MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 64;
2568 unsigned int bps = ((uint64_t) kbits_rate * 1000) / 8;
2569 struct tc_police police;
2570 struct tc_ratespec rate;
2571 int mtu = 65535;
2572
2573 memset(&rate, 0, sizeof rate);
2574 rate.rate = bps;
2575 rate.cell_log = tc_calc_cell_log(mtu);
2576 rate.mpu = ETH_TOTAL_MIN;
2577
2578 memset(&police, 0, sizeof police);
2579 police.burst = tc_bytes_to_ticks(bps, bsize);
2580 police.action = TC_POLICE_SHOT;
2581 police.rate = rate;
2582 police.mtu = mtu;
2583
2584 return police;
2585 }
2586
/* Appends a "police" tc action configured from 'police' to the netlink
 * 'request', including the rate table (TCA_POLICE_RATE) that the kernel
 * requires alongside the TCA_POLICE_TBF parameters. */
static void
nl_msg_put_act_police(struct ofpbuf *request, struct tc_police police)
{
    size_t offset;

    nl_msg_put_string(request, TCA_ACT_KIND, "police");
    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
    nl_msg_put_unspec(request, TCA_POLICE_TBF, &police, sizeof police);
    tc_put_rtab(request, TCA_POLICE_RATE, &police.rate);
    /* Let conforming packets continue through the action chain. */
    nl_msg_put_u32(request, TCA_POLICE_RESULT, TC_ACT_UNSPEC);
    nl_msg_end_nested(request, offset);
}
2599
/* Installs a matchall ingress classifier on 'netdev' with a police action
 * that drops traffic exceeding 'kbits_rate'/'kbits_burst'.  Returns 0 on
 * success, otherwise a positive errno value. */
static int
tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate,
                        uint32_t kbits_burst)
{
    uint16_t eth_type = (OVS_FORCE uint16_t) htons(ETH_P_ALL);
    size_t basic_offset, action_offset, inner_offset;
    uint16_t prio = TC_RESERVED_PRIORITY_POLICE;
    int ifindex, err = 0;
    struct tc_police pol_act;
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct tcmsg *tcmsg;
    uint32_t handle = 1;

    err = get_ifindex(netdev, &ifindex);
    if (err) {
        return err;
    }

    /* RTM_NEWTFILTER on the ingress qdisc; the reserved priority keeps this
     * filter from clashing with offloaded flower filters. */
    tcmsg = tc_make_request(ifindex, RTM_NEWTFILTER, NLM_F_CREATE | NLM_F_ECHO,
                            &request);
    tcmsg->tcm_parent = TC_INGRESS_PARENT;
    tcmsg->tcm_info = tc_make_handle(prio, eth_type);
    tcmsg->tcm_handle = handle;

    /* matchall options -> action list -> action #1 -> police. */
    pol_act = tc_matchall_fill_police(kbits_rate, kbits_burst);
    nl_msg_put_string(&request, TCA_KIND, "matchall");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    action_offset = nl_msg_start_nested(&request, TCA_MATCHALL_ACT);
    inner_offset = nl_msg_start_nested(&request, 1);
    nl_msg_put_act_police(&request, pol_act);
    nl_msg_end_nested(&request, inner_offset);
    nl_msg_end_nested(&request, action_offset);
    nl_msg_end_nested(&request, basic_offset);

    err = tc_transact(&request, &reply);
    if (!err) {
        /* Sanity-check the echoed reply, then discard it. */
        struct tcmsg *tc =
            ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc);
        ofpbuf_delete(reply);
    }

    return err;
}
2644
2645 static int
2646 tc_del_matchall_policer(struct netdev *netdev)
2647 {
2648 int prio = TC_RESERVED_PRIORITY_POLICE;
2649 uint32_t block_id = 0;
2650 struct tcf_id id;
2651 int ifindex;
2652 int err;
2653
2654 err = get_ifindex(netdev, &ifindex);
2655 if (err) {
2656 return err;
2657 }
2658
2659 id = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS);
2660 err = tc_del_filter(&id);
2661 if (err) {
2662 return err;
2663 }
2664
2665 return 0;
2666 }
2667
/* Attempts to set input rate limiting (policing) policy.  Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * 'kbits_rate' of 0 disables policing.  When tc flow offload is enabled the
 * policer is installed as a matchall filter so it can coexist with flower
 * offload; otherwise it is installed via a dedicated ingress qdisc.  Results
 * are cached (VALID_POLICING) so unchanged settings are not re-applied. */
static int
netdev_linux_set_policing(struct netdev *netdev_,
                          uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int ifindex;
    int error;

    kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
                   : kbits_burst);       /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto out;
    }

    if (netdev->cache_valid & VALID_POLICING) {
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kbits_burst == kbits_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        netdev->cache_valid &= ~VALID_POLICING;
    }

    COVERAGE_INC(netdev_set_policing);

    /* Use matchall for policing when offloadling ovs with tc-flower. */
    if (netdev_is_flow_api_enabled()) {
        /* Best-effort removal of any old policer; its result is overwritten
         * below when a new one is installed. */
        error = tc_del_matchall_policer(netdev_);
        if (kbits_rate) {
            error = tc_add_matchall_policer(netdev_, kbits_rate, kbits_burst);
        }
        /* Note: this path intentionally bypasses the cache update at 'out'. */
        ovs_mutex_unlock(&netdev->mutex);
        return error;
    }

    error = get_ifindex(netdev_, &ifindex);
    if (error) {
        goto out;
    }

    /* Remove any existing ingress qdisc. */
    error = tc_add_del_qdisc(ifindex, false, 0, TC_INGRESS);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate) {
        /* Recreate the ingress qdisc and attach the policer to it. */
        error = tc_add_del_qdisc(ifindex, true, 0, TC_INGRESS);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }
    }

    netdev->kbits_rate = kbits_rate;
    netdev->kbits_burst = kbits_burst;

out:
    /* Cache success and "device gone" so they are not retried; other errors
     * will be attempted again on the next call. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2751
2752 static int
2753 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2754 struct sset *types)
2755 {
2756 const struct tc_ops *const *opsp;
2757 for (opsp = tcs; *opsp != NULL; opsp++) {
2758 const struct tc_ops *ops = *opsp;
2759 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2760 sset_add(types, ops->ovs_name);
2761 }
2762 }
2763 return 0;
2764 }
2765
2766 static const struct tc_ops *
2767 tc_lookup_ovs_name(const char *name)
2768 {
2769 const struct tc_ops *const *opsp;
2770
2771 for (opsp = tcs; *opsp != NULL; opsp++) {
2772 const struct tc_ops *ops = *opsp;
2773 if (!strcmp(name, ops->ovs_name)) {
2774 return ops;
2775 }
2776 }
2777 return NULL;
2778 }
2779
2780 static const struct tc_ops *
2781 tc_lookup_linux_name(const char *name)
2782 {
2783 const struct tc_ops *const *opsp;
2784
2785 for (opsp = tcs; *opsp != NULL; opsp++) {
2786 const struct tc_ops *ops = *opsp;
2787 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2788 return ops;
2789 }
2790 }
2791 return NULL;
2792 }
2793
2794 static struct tc_queue *
2795 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2796 size_t hash)
2797 {
2798 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2799 struct tc_queue *queue;
2800
2801 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2802 if (queue->queue_id == queue_id) {
2803 return queue;
2804 }
2805 }
2806 return NULL;
2807 }
2808
/* Returns the queue with 'queue_id' in 'netdev''s current qdisc, or NULL if
 * there is none. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2814
/* Stores the number of queues supported by QoS type 'type' in '*caps'.
 * Returns 0 on success, or EOPNOTSUPP if 'type' is not a known QoS type. */
static int
netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
                                  const char *type,
                                  struct netdev_qos_capabilities *caps)
{
    const struct tc_ops *ops = tc_lookup_ovs_name(type);
    if (!ops) {
        return EOPNOTSUPP;
    }
    caps->n_queues = ops->n_queues;
    return 0;
}
2827
/* Retrieves the OVS name of 'netdev_''s installed qdisc into '*typep' and
 * its configuration into 'details'.  Returns 0 on success, otherwise a
 * positive errno value (EOPNOTSUPP for devices in a remote network
 * namespace). */
static int
netdev_linux_get_qos(const struct netdev *netdev_,
                     const char **typep, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Ensure netdev->tc reflects the kernel's current qdisc. */
    error = tc_query_qdisc(netdev_);
    if (!error) {
        *typep = netdev->tc->ops->ovs_name;
        /* Disciplines without a qdisc_get callback have no details. */
        error = (netdev->tc->ops->qdisc_get
                 ? netdev->tc->ops->qdisc_get(netdev_, details)
                 : 0);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2853
/* Installs QoS discipline 'type' with configuration 'details' on 'netdev_',
 * replacing any existing qdisc of a different type or reconfiguring one of
 * the same type in place.  Returns 0 on success, EOPNOTSUPP for an unknown
 * or uninstallable 'type' (or a remote-netns device), otherwise a positive
 * errno value. */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    /* The noop discipline manages no kernel state, so it skips the
     * query/delete dance below. */
    if (new_ops == &tc_ops_noop) {
        return new_ops->tc_install(netdev_, details);
    }

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Ensure netdev->tc reflects the kernel's current qdisc. */
    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same discipline: reconfigure in place when supported. */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            goto exit;
        }
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc. */
        error = new_ops->tc_install(netdev_, details);
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2901
2902 static int
2903 netdev_linux_get_queue(const struct netdev *netdev_,
2904 unsigned int queue_id, struct smap *details)
2905 {
2906 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2907 int error;
2908
2909 ovs_mutex_lock(&netdev->mutex);
2910 if (netdev_linux_netnsid_is_remote(netdev)) {
2911 error = EOPNOTSUPP;
2912 goto exit;
2913 }
2914
2915 error = tc_query_qdisc(netdev_);
2916 if (!error) {
2917 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2918 error = (queue
2919 ? netdev->tc->ops->class_get(netdev_, queue, details)
2920 : ENOENT);
2921 }
2922
2923 exit:
2924 ovs_mutex_unlock(&netdev->mutex);
2925 return error;
2926 }
2927
2928 static int
2929 netdev_linux_set_queue(struct netdev *netdev_,
2930 unsigned int queue_id, const struct smap *details)
2931 {
2932 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2933 int error;
2934
2935 ovs_mutex_lock(&netdev->mutex);
2936 if (netdev_linux_netnsid_is_remote(netdev)) {
2937 error = EOPNOTSUPP;
2938 goto exit;
2939 }
2940
2941 error = tc_query_qdisc(netdev_);
2942 if (!error) {
2943 error = (queue_id < netdev->tc->ops->n_queues
2944 && netdev->tc->ops->class_set
2945 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2946 : EINVAL);
2947 }
2948
2949 exit:
2950 ovs_mutex_unlock(&netdev->mutex);
2951 return error;
2952 }
2953
2954 static int
2955 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2956 {
2957 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2958 int error;
2959
2960 ovs_mutex_lock(&netdev->mutex);
2961 if (netdev_linux_netnsid_is_remote(netdev)) {
2962 error = EOPNOTSUPP;
2963 goto exit;
2964 }
2965
2966 error = tc_query_qdisc(netdev_);
2967 if (!error) {
2968 if (netdev->tc->ops->class_delete) {
2969 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2970 error = (queue
2971 ? netdev->tc->ops->class_delete(netdev_, queue)
2972 : ENOENT);
2973 } else {
2974 error = EINVAL;
2975 }
2976 }
2977
2978 exit:
2979 ovs_mutex_unlock(&netdev->mutex);
2980 return error;
2981 }
2982
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into '*stats'.
 * Returns 0 on success, ENOENT if the queue does not exist, EOPNOTSUPP if
 * the discipline cannot report queue stats (or the device is in a remote
 * network namespace), otherwise a positive errno value. */
static int
netdev_linux_get_queue_stats(const struct netdev *netdev_,
                             unsigned int queue_id,
                             struct netdev_queue_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Ensure netdev->tc reflects the kernel's current qdisc. */
    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get_stats) {
            const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
            if (queue) {
                /* Creation time comes from our own bookkeeping, not tc. */
                stats->created = queue->created;
                error = netdev->tc->ops->class_get_stats(netdev_, queue,
                                                         stats);
            } else {
                error = ENOENT;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3017
/* State of an in-progress netlink dump of a device's traffic classes. */
struct queue_dump_state {
    struct nl_dump dump;    /* The RTM_GETTCLASS netlink dump itself. */
    struct ofpbuf buf;      /* Reusable receive buffer for dump replies. */
};
3022
/* Begins a netlink dump of 'netdev''s traffic classes (RTM_GETTCLASS) into
 * 'state'.  Returns true on success, false if the request could not be
 * composed.  On success the caller must eventually call
 * finish_queue_dump(). */
static bool
start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
    if (!tcmsg) {
        return false;
    }
    /* tcm_parent of 0 requests classes from every qdisc on the device. */
    tcmsg->tcm_parent = 0;
    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
    ofpbuf_uninit(&request);

    ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
    return true;
}
3040
3041 static int
3042 finish_queue_dump(struct queue_dump_state *state)
3043 {
3044 ofpbuf_uninit(&state->buf);
3045 return nl_dump_done(&state->dump);
3046 }
3047
/* Iterator state for the queue dump callbacks: a snapshot of the queue IDs
 * taken when the dump started. */
struct netdev_linux_queue_state {
    unsigned int *queues;   /* Array of queue IDs, sized 'n_queues'. */
    size_t cur_queue;       /* Index of the next queue to report. */
    size_t n_queues;        /* Number of entries in 'queues'. */
};
3053
/* Starts a dump of 'netdev_''s queues, storing iterator state in '*statep'.
 * Snapshots the current queue IDs so the dump is stable even if queues
 * change during iteration.  Returns 0 on success, EOPNOTSUPP if the
 * discipline cannot report queues (or the device is in a remote network
 * namespace), otherwise a positive errno value. */
static int
netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Ensure netdev->tc reflects the kernel's current qdisc. */
    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get) {
            struct netdev_linux_queue_state *state;
            struct tc_queue *queue;
            size_t i;

            *statep = state = xmalloc(sizeof *state);
            state->n_queues = hmap_count(&netdev->tc->queues);
            state->cur_queue = 0;
            state->queues = xmalloc(state->n_queues * sizeof *state->queues);

            /* Record only the IDs; each queue is re-looked-up at
             * dump_next time in case it has since been deleted. */
            i = 0;
            HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
                state->queues[i++] = queue->queue_id;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3091
/* Advances the queue dump started by netdev_linux_queue_dump_start(),
 * storing the next queue's ID in '*queue_idp' and its configuration in
 * 'details'.  Returns 0 on success, EOF when no queues remain, EOPNOTSUPP
 * for a remote-netns device, otherwise a positive errno value. */
static int
netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
                             unsigned int *queue_idp, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_linux_queue_state *state = state_;
    int error = EOF;    /* EOF signals "no more queues" to the caller. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Skip over queue IDs that were deleted since the dump started. */
    while (state->cur_queue < state->n_queues) {
        unsigned int queue_id = state->queues[state->cur_queue++];
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);

        if (queue) {
            *queue_idp = queue_id;
            error = netdev->tc->ops->class_get(netdev_, queue, details);
            break;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3121
3122 static int
3123 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
3124 void *state_)
3125 {
3126 struct netdev_linux_queue_state *state = state_;
3127
3128 free(state->queues);
3129 free(state);
3130 return 0;
3131 }
3132
3133 static int
3134 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
3135 netdev_dump_queue_stats_cb *cb, void *aux)
3136 {
3137 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3138 int error;
3139
3140 ovs_mutex_lock(&netdev->mutex);
3141 if (netdev_linux_netnsid_is_remote(netdev)) {
3142 error = EOPNOTSUPP;
3143 goto exit;
3144 }
3145
3146 error = tc_query_qdisc(netdev_);
3147 if (!error) {
3148 struct queue_dump_state state;
3149
3150 if (!netdev->tc->ops->class_dump_stats) {
3151 error = EOPNOTSUPP;
3152 } else if (!start_queue_dump(netdev_, &state)) {
3153 error = ENODEV;
3154 } else {
3155 struct ofpbuf msg;
3156 int retval;
3157
3158 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3159 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
3160 cb, aux);
3161 if (retval) {
3162 error = retval;
3163 }
3164 }
3165
3166 retval = finish_queue_dump(&state);
3167 if (retval) {
3168 error = retval;
3169 }
3170 }
3171 }
3172
3173 exit:
3174 ovs_mutex_unlock(&netdev->mutex);
3175 return error;
3176 }
3177
3178 static int
3179 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
3180 struct in_addr netmask)
3181 {
3182 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3183 int error;
3184
3185 ovs_mutex_lock(&netdev->mutex);
3186 if (netdev_linux_netnsid_is_remote(netdev)) {
3187 error = EOPNOTSUPP;
3188 goto exit;
3189 }
3190
3191 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
3192 if (!error) {
3193 if (address.s_addr != INADDR_ANY) {
3194 error = do_set_addr(netdev_, SIOCSIFNETMASK,
3195 "SIOCSIFNETMASK", netmask);
3196 }
3197 }
3198
3199 exit:
3200 ovs_mutex_unlock(&netdev->mutex);
3201 return error;
3202 }
3203
/* Retrieves all IP addresses configured on 'netdev_'.  On success, stores
 * malloc'd arrays of addresses and netmasks in '*addr' and '*mask' (which
 * the caller must free), the element count in '*n_cnt', and returns 0.
 * Returns a positive errno value on failure. */
static int
netdev_linux_get_addr_list(const struct netdev *netdev_,
                          struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        /* Devices in a remote network namespace are not supported. */
        error = EOPNOTSUPP;
        goto exit;
    }

    error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3226
/* Fills '*sa' with an AF_INET sockaddr holding IPv4 address 'addr' and
 * port 0, zeroing any remaining bytes of '*sa'. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
3239
3240 static int
3241 do_set_addr(struct netdev *netdev,
3242 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
3243 {
3244 struct ifreq ifr;
3245
3246 make_in4_sockaddr(&ifr.ifr_addr, addr);
3247 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
3248 ioctl_name);
3249 }
3250
3251 /* Adds 'router' as a default IP gateway. */
3252 static int
3253 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
3254 {
3255 struct in_addr any = { INADDR_ANY };
3256 struct rtentry rt;
3257 int error;
3258
3259 memset(&rt, 0, sizeof rt);
3260 make_in4_sockaddr(&rt.rt_dst, any);
3261 make_in4_sockaddr(&rt.rt_gateway, router);
3262 make_in4_sockaddr(&rt.rt_genmask, any);
3263 rt.rt_flags = RTF_UP | RTF_GATEWAY;
3264 error = af_inet_ioctl(SIOCADDRT, &rt);
3265 if (error) {
3266 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
3267 }
3268 return error;
3269 }
3270
/* Looks up the next hop toward 'host' in /proc/net/route.  On success,
 * returns 0, stores the gateway in '*next_hop' (0 when the host is directly
 * reachable) and a malloc'd device name in '*netdev_name' (caller must
 * free).  Returns a positive errno value on failure; in particular ENXIO
 * when no matching route exists. */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        /* Skip line 1, which is the column header. */
        if (++ln >= 2) {
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                             fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* 'dest', 'mask', and 'gateway' are printed by the kernel in
             * network byte order, so no endian conversions are needed. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    fclose(stream);
    return ENXIO;
}
3330
/* Fills 'smap' with driver information ("driver_name", "driver_version",
 * "firmware_version") obtained through the ETHTOOL_GDRVINFO ioctl.  The
 * result is cached under VALID_DRVINFO so the ioctl runs at most once per
 * device.  Returns 0 on success or a positive errno value. */
static int
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* netdev_linux_do_ethtool() takes a struct ethtool_cmd *, so the
         * drvinfo buffer is passed through that type; the command id
         * (ETHTOOL_GDRVINFO) selects the actual operation performed. */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        cmd,
                                        ETHTOOL_GDRVINFO,
                                        "ETHTOOL_GDRVINFO");
        if (!error) {
            netdev->cache_valid |= VALID_DRVINFO;
        }
    }

    if (!error) {
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
        smap_add(smap, "driver_version", netdev->drvinfo.version);
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
3361
/* get_status callback for "internal" devices: no kernel driver backs them,
 * so report a fixed driver name. */
static int
netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
                           struct smap *smap)
{
    smap_add(smap, "driver_name", "openvswitch");
    return 0;
}
3369
3370 static uint32_t
3371 netdev_linux_get_block_id(struct netdev *netdev_)
3372 {
3373 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3374 uint32_t block_id = 0;
3375
3376 ovs_mutex_lock(&netdev->mutex);
3377 /* Ensure the linux netdev has had its fields populated. */
3378 if (!(netdev->cache_valid & VALID_IFINDEX)) {
3379 netdev_linux_update_via_netlink(netdev);
3380 }
3381
3382 /* Only assigning block ids to linux netdevs that are LAG masters. */
3383 if (netdev->is_lag_master) {
3384 block_id = netdev->ifindex;
3385 }
3386 ovs_mutex_unlock(&netdev->mutex);
3387
3388 return block_id;
3389 }
3390
3391 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
3392 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
3393 * returns 0. Otherwise, it returns a positive errno value; in particular,
3394 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
3395 static int
3396 netdev_linux_arp_lookup(const struct netdev *netdev,
3397 ovs_be32 ip, struct eth_addr *mac)
3398 {
3399 struct arpreq r;
3400 struct sockaddr_in sin;
3401 int retval;
3402
3403 memset(&r, 0, sizeof r);
3404 memset(&sin, 0, sizeof sin);
3405 sin.sin_family = AF_INET;
3406 sin.sin_addr.s_addr = ip;
3407 sin.sin_port = 0;
3408 memcpy(&r.arp_pa, &sin, sizeof sin);
3409 r.arp_ha.sa_family = ARPHRD_ETHER;
3410 r.arp_flags = 0;
3411 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
3412 COVERAGE_INC(netdev_arp_lookup);
3413 retval = af_inet_ioctl(SIOCGARP, &r);
3414 if (!retval) {
3415 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
3416 } else if (retval != ENXIO) {
3417 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
3418 netdev_get_name(netdev), IP_ARGS(ip),
3419 ovs_strerror(retval));
3420 }
3421 return retval;
3422 }
3423
3424 static unsigned int
3425 nd_to_iff_flags(enum netdev_flags nd)
3426 {
3427 unsigned int iff = 0;
3428 if (nd & NETDEV_UP) {
3429 iff |= IFF_UP;
3430 }
3431 if (nd & NETDEV_PROMISC) {
3432 iff |= IFF_PROMISC;
3433 }
3434 if (nd & NETDEV_LOOPBACK) {
3435 iff |= IFF_LOOPBACK;
3436 }
3437 return iff;
3438 }
3439
3440 static int
3441 iff_to_nd_flags(unsigned int iff)
3442 {
3443 enum netdev_flags nd = 0;
3444 if (iff & IFF_UP) {
3445 nd |= NETDEV_UP;
3446 }
3447 if (iff & IFF_PROMISC) {
3448 nd |= NETDEV_PROMISC;
3449 }
3450 if (iff & IFF_LOOPBACK) {
3451 nd |= NETDEV_LOOPBACK;
3452 }
3453 return nd;
3454 }
3455
3456 static int
3457 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
3458 enum netdev_flags on, enum netdev_flags *old_flagsp)
3459 OVS_REQUIRES(netdev->mutex)
3460 {
3461 unsigned int old_flags, new_flags;
3462 int error = 0;
3463
3464 old_flags = netdev->ifi_flags;
3465 *old_flagsp = iff_to_nd_flags(old_flags);
3466 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
3467 if (new_flags != old_flags) {
3468 error = set_flags(netdev_get_name(&netdev->up), new_flags);
3469 get_flags(&netdev->up, &netdev->ifi_flags);
3470 }
3471
3472 return error;
3473 }
3474
/* netdev_class update_flags callback: clears 'off' and sets 'on' in the
 * device flags, reporting the previous flags in '*old_flagsp'.  When no
 * bits are to be changed, first tries to refresh the flags over netlink
 * before falling back to the ioctl path. */
static int
netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
                          enum netdev_flags on, enum netdev_flags *old_flagsp)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (on || off) {
        /* Changing flags over netlink isn't supported yet. */
        if (netdev_linux_netnsid_is_remote(netdev)) {
            error = EOPNOTSUPP;
            goto exit;
        }
        error = update_flags(netdev, off, on, old_flagsp);
    } else {
        /* Try reading flags over netlink, or fall back to ioctl. */
        if (!netdev_linux_update_via_netlink(netdev)) {
            *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
        } else {
            error = update_flags(netdev, off, on, old_flagsp);
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3503
/* netdev_class callbacks shared by all Linux-backed device types ("system",
 * "tap", "internal", and the AF_XDP variants below). */
#define NETDEV_LINUX_CLASS_COMMON                               \
    .run = netdev_linux_run,                                    \
    .wait = netdev_linux_wait,                                  \
    .alloc = netdev_linux_alloc,                                \
    .dealloc = netdev_linux_dealloc,                            \
    .send_wait = netdev_linux_send_wait,                        \
    .set_etheraddr = netdev_linux_set_etheraddr,                \
    .get_etheraddr = netdev_linux_get_etheraddr,                \
    .get_mtu = netdev_linux_get_mtu,                            \
    .set_mtu = netdev_linux_set_mtu,                            \
    .get_ifindex = netdev_linux_get_ifindex,                    \
    .get_carrier = netdev_linux_get_carrier,                    \
    .get_carrier_resets = netdev_linux_get_carrier_resets,      \
    .set_miimon_interval = netdev_linux_set_miimon_interval,    \
    .set_advertisements = netdev_linux_set_advertisements,      \
    .set_policing = netdev_linux_set_policing,                  \
    .get_qos_types = netdev_linux_get_qos_types,                \
    .get_qos_capabilities = netdev_linux_get_qos_capabilities,  \
    .get_qos = netdev_linux_get_qos,                            \
    .set_qos = netdev_linux_set_qos,                            \
    .get_queue = netdev_linux_get_queue,                        \
    .set_queue = netdev_linux_set_queue,                        \
    .delete_queue = netdev_linux_delete_queue,                  \
    .get_queue_stats = netdev_linux_get_queue_stats,            \
    .queue_dump_start = netdev_linux_queue_dump_start,          \
    .queue_dump_next = netdev_linux_queue_dump_next,            \
    .queue_dump_done = netdev_linux_queue_dump_done,            \
    .dump_queue_stats = netdev_linux_dump_queue_stats,          \
    .set_in4 = netdev_linux_set_in4,                            \
    .get_addr_list = netdev_linux_get_addr_list,                \
    .add_router = netdev_linux_add_router,                      \
    .get_next_hop = netdev_linux_get_next_hop,                  \
    .arp_lookup = netdev_linux_arp_lookup,                      \
    .update_flags = netdev_linux_update_flags,                  \
    .rxq_alloc = netdev_linux_rxq_alloc,                        \
    .rxq_dealloc = netdev_linux_rxq_dealloc,                    \
    .rxq_wait = netdev_linux_rxq_wait,                          \
    .rxq_drain = netdev_linux_rxq_drain
3542
/* The "system" netdev class: ordinary kernel network devices. */
const struct netdev_class netdev_linux_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "system",
    .is_pmd = false,
    .construct = netdev_linux_construct,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_linux_get_stats,
    .get_features = netdev_linux_get_features,
    .get_status = netdev_linux_get_status,
    .get_block_id = netdev_linux_get_block_id,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
3558
/* The "tap" netdev class: TAP devices created and owned by OVS; note the
 * tap-specific construct and stats callbacks. */
const struct netdev_class netdev_tap_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "tap",
    .is_pmd = false,
    .construct = netdev_linux_construct_tap,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_tap_get_stats,
    .get_features = netdev_linux_get_features,
    .get_status = netdev_linux_get_status,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
3573
/* The "internal" netdev class: uses internal-specific stats and status
 * callbacks. */
const struct netdev_class netdev_internal_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "internal",
    .is_pmd = false,
    .construct = netdev_linux_construct,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_internal_get_stats,
    .get_status = netdev_internal_get_status,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
3587
#ifdef HAVE_AF_XDP
/* Callbacks shared by the two AF_XDP class variants below. */
#define NETDEV_AFXDP_CLASS_COMMON                       \
    .init = netdev_afxdp_init,                          \
    .construct = netdev_afxdp_construct,                \
    .destruct = netdev_afxdp_destruct,                  \
    .get_stats = netdev_afxdp_get_stats,                \
    .get_custom_stats = netdev_afxdp_get_custom_stats,  \
    .get_status = netdev_linux_get_status,              \
    .set_config = netdev_afxdp_set_config,              \
    .get_config = netdev_afxdp_get_config,              \
    .reconfigure = netdev_afxdp_reconfigure,            \
    .get_numa_id = netdev_linux_get_numa_id,            \
    .send = netdev_afxdp_batch_send,                    \
    .rxq_construct = netdev_afxdp_rxq_construct,        \
    .rxq_destruct = netdev_afxdp_rxq_destruct,          \
    .rxq_recv = netdev_afxdp_rxq_recv

/* AF_XDP netdev class variants; they differ only in type name and the
 * 'is_pmd' setting. */
const struct netdev_class netdev_afxdp_class = {
    NETDEV_LINUX_CLASS_COMMON,
    NETDEV_AFXDP_CLASS_COMMON,
    .type = "afxdp",
    .is_pmd = true,
};

const struct netdev_class netdev_afxdp_nonpmd_class = {
    NETDEV_LINUX_CLASS_COMMON,
    NETDEV_AFXDP_CLASS_COMMON,
    .type = "afxdp-nonpmd",
    .is_pmd = false,
};
#endif
3619 \f
3620
/* CoDel traffic control class. */

#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET 1
#define TCA_CODEL_LIMIT 2
#define TCA_CODEL_INTERVAL 3

/* Cached configuration of a "codel" qdisc. */
struct codel {
    struct tc tc;          /* Base tc instance. */
    uint32_t target;       /* TCA_CODEL_TARGET attribute value. */
    uint32_t limit;        /* TCA_CODEL_LIMIT attribute value. */
    uint32_t interval;     /* TCA_CODEL_INTERVAL attribute value. */
};
3637
3638 static struct codel *
3639 codel_get__(const struct netdev *netdev_)
3640 {
3641 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3642 return CONTAINER_OF(netdev->tc, struct codel, tc);
3643 }
3644
3645 static void
3646 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3647 uint32_t interval)
3648 {
3649 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3650 struct codel *codel;
3651
3652 codel = xmalloc(sizeof *codel);
3653 tc_init(&codel->tc, &tc_ops_codel);
3654 codel->target = target;
3655 codel->limit = limit;
3656 codel->interval = interval;
3657
3658 netdev->tc = &codel->tc;
3659 }
3660
3661 static int
3662 codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3663 uint32_t interval)
3664 {
3665 size_t opt_offset;
3666 struct ofpbuf request;
3667 struct tcmsg *tcmsg;
3668 uint32_t otarget, olimit, ointerval;
3669 int error;
3670
3671 tc_del_qdisc(netdev);
3672
3673 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3674 NLM_F_EXCL | NLM_F_CREATE, &request);
3675 if (!tcmsg) {
3676 return ENODEV;
3677 }
3678 tcmsg->tcm_handle = tc_make_handle(1, 0);
3679 tcmsg->tcm_parent = TC_H_ROOT;
3680
3681 otarget = target ? target : 5000;
3682 olimit = limit ? limit : 10240;
3683 ointerval = interval ? interval : 100000;
3684
3685 nl_msg_put_string(&request, TCA_KIND, "codel");
3686 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3687 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
3688 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
3689 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
3690 nl_msg_end_nested(&request, opt_offset);
3691
3692 error = tc_transact(&request, NULL);
3693 if (error) {
3694 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3695 "target %u, limit %u, interval %u error %d(%s)",
3696 netdev_get_name(netdev),
3697 otarget, olimit, ointerval,
3698 error, ovs_strerror(error));
3699 }
3700 return error;
3701 }
3702
3703 static void
3704 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3705 const struct smap *details, struct codel *codel)
3706 {
3707 codel->target = smap_get_ullong(details, "target", 0);
3708 codel->limit = smap_get_ullong(details, "limit", 0);
3709 codel->interval = smap_get_ullong(details, "interval", 0);
3710
3711 if (!codel->target) {
3712 codel->target = 5000;
3713 }
3714 if (!codel->limit) {
3715 codel->limit = 10240;
3716 }
3717 if (!codel->interval) {
3718 codel->interval = 100000;
3719 }
3720 }
3721
3722 static int
3723 codel_tc_install(struct netdev *netdev, const struct smap *details)
3724 {
3725 int error;
3726 struct codel codel;
3727
3728 codel_parse_qdisc_details__(netdev, details, &codel);
3729 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3730 codel.interval);
3731 if (!error) {
3732 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3733 }
3734 return error;
3735 }
3736
3737 static int
3738 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3739 {
3740 static const struct nl_policy tca_codel_policy[] = {
3741 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3742 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3743 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3744 };
3745
3746 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3747
3748 if (!nl_parse_nested(nl_options, tca_codel_policy,
3749 attrs, ARRAY_SIZE(tca_codel_policy))) {
3750 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3751 return EPROTO;
3752 }
3753
3754 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3755 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3756 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
3757 return 0;
3758 }
3759
3760 static int
3761 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3762 {
3763 struct nlattr *nlattr;
3764 const char * kind;
3765 int error;
3766 struct codel codel;
3767
3768 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3769 if (error != 0) {
3770 return error;
3771 }
3772
3773 error = codel_parse_tca_options__(nlattr, &codel);
3774 if (error != 0) {
3775 return error;
3776 }
3777
3778 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3779 return 0;
3780 }
3781
3782
3783 static void
3784 codel_tc_destroy(struct tc *tc)
3785 {
3786 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3787 tc_destroy(tc);
3788 free(codel);
3789 }
3790
3791 static int
3792 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3793 {
3794 const struct codel *codel = codel_get__(netdev);
3795 smap_add_format(details, "target", "%u", codel->target);
3796 smap_add_format(details, "limit", "%u", codel->limit);
3797 smap_add_format(details, "interval", "%u", codel->interval);
3798 return 0;
3799 }
3800
/* qdisc_set callback: re-reads codel parameters from 'details' and updates
 * the cached configuration. */
static int
codel_qdisc_set(struct netdev *netdev, const struct smap *details)
{
    struct codel codel;

    codel_parse_qdisc_details__(netdev, details, &codel);
    /* NOTE(review): codel_install__() allocates a fresh struct codel and
     * replaces netdev->tc with it.  When a codel tc is already installed
     * (presumably the usual case for qdisc_set), the old allocation appears
     * to be leaked, and the assignments below re-store values the new struct
     * already holds -- verify against the netdev_set_qos() call path. */
    codel_install__(netdev, codel.target, codel.limit, codel.interval);
    codel_get__(netdev)->target = codel.target;
    codel_get__(netdev)->limit = codel.limit;
    codel_get__(netdev)->interval = codel.interval;
    return 0;
}
3813
/* Operations for the "codel" qdisc (OVS QoS type "linux-codel"). */
static const struct tc_ops tc_ops_codel = {
    .linux_name = "codel",
    .ovs_name = "linux-codel",
    .n_queues = CODEL_N_QUEUES,
    .tc_install = codel_tc_install,
    .tc_load = codel_tc_load,
    .tc_destroy = codel_tc_destroy,
    .qdisc_get = codel_qdisc_get,
    .qdisc_set = codel_qdisc_set,
};
3824 \f
/* FQ-CoDel traffic control class. */

#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET 1
#define TCA_FQ_CODEL_LIMIT 2
#define TCA_FQ_CODEL_INTERVAL 3
#define TCA_FQ_CODEL_ECN 4
#define TCA_FQ_CODEL_FLOWS 5
#define TCA_FQ_CODEL_QUANTUM 6

/* Cached configuration of an "fq_codel" qdisc. */
struct fqcodel {
    struct tc tc;          /* Base tc instance. */
    uint32_t target;       /* TCA_FQ_CODEL_TARGET attribute value. */
    uint32_t limit;        /* TCA_FQ_CODEL_LIMIT attribute value. */
    uint32_t interval;     /* TCA_FQ_CODEL_INTERVAL attribute value. */
    uint32_t flows;        /* TCA_FQ_CODEL_FLOWS attribute value. */
    uint32_t quantum;      /* TCA_FQ_CODEL_QUANTUM attribute value. */
};
3848
3849 static struct fqcodel *
3850 fqcodel_get__(const struct netdev *netdev_)
3851 {
3852 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3853 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3854 }
3855
3856 static void
3857 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3858 uint32_t interval, uint32_t flows, uint32_t quantum)
3859 {
3860 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3861 struct fqcodel *fqcodel;
3862
3863 fqcodel = xmalloc(sizeof *fqcodel);
3864 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3865 fqcodel->target = target;
3866 fqcodel->limit = limit;
3867 fqcodel->interval = interval;
3868 fqcodel->flows = flows;
3869 fqcodel->quantum = quantum;
3870
3871 netdev->tc = &fqcodel->tc;
3872 }
3873
3874 static int
3875 fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3876 uint32_t interval, uint32_t flows, uint32_t quantum)
3877 {
3878 size_t opt_offset;
3879 struct ofpbuf request;
3880 struct tcmsg *tcmsg;
3881 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3882 int error;
3883
3884 tc_del_qdisc(netdev);
3885
3886 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3887 NLM_F_EXCL | NLM_F_CREATE, &request);
3888 if (!tcmsg) {
3889 return ENODEV;
3890 }
3891 tcmsg->tcm_handle = tc_make_handle(1, 0);
3892 tcmsg->tcm_parent = TC_H_ROOT;
3893
3894 otarget = target ? target : 5000;
3895 olimit = limit ? limit : 10240;
3896 ointerval = interval ? interval : 100000;
3897 oflows = flows ? flows : 1024;
3898 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3899 not mtu */
3900
3901 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3902 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3903 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3904 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3905 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3906 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3907 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3908 nl_msg_end_nested(&request, opt_offset);
3909
3910 error = tc_transact(&request, NULL);
3911 if (error) {
3912 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3913 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3914 netdev_get_name(netdev),
3915 otarget, olimit, ointerval, oflows, oquantum,
3916 error, ovs_strerror(error));
3917 }
3918 return error;
3919 }
3920
3921 static void
3922 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3923 const struct smap *details, struct fqcodel *fqcodel)
3924 {
3925 fqcodel->target = smap_get_ullong(details, "target", 0);
3926 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3927 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3928 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3929 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3930
3931 if (!fqcodel->target) {
3932 fqcodel->target = 5000;
3933 }
3934 if (!fqcodel->limit) {
3935 fqcodel->limit = 10240;
3936 }
3937 if (!fqcodel->interval) {
3938 fqcodel->interval = 1000000;
3939 }
3940 if (!fqcodel->flows) {
3941 fqcodel->flows = 1024;
3942 }
3943 if (!fqcodel->quantum) {
3944 fqcodel->quantum = 1514;
3945 }
3946 }
3947
3948 static int
3949 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3950 {
3951 int error;
3952 struct fqcodel fqcodel;
3953
3954 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3955 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3956 fqcodel.interval, fqcodel.flows,
3957 fqcodel.quantum);
3958 if (!error) {
3959 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3960 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3961 }
3962 return error;
3963 }
3964
3965 static int
3966 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3967 {
3968 static const struct nl_policy tca_fqcodel_policy[] = {
3969 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3970 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3971 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3972 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3973 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3974 };
3975
3976 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3977
3978 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3979 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3980 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3981 return EPROTO;
3982 }
3983
3984 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3985 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3986 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3987 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3988 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3989 return 0;
3990 }
3991
3992 static int
3993 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3994 {
3995 struct nlattr *nlattr;
3996 const char * kind;
3997 int error;
3998 struct fqcodel fqcodel;
3999
4000 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4001 if (error != 0) {
4002 return error;
4003 }
4004
4005 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
4006 if (error != 0) {
4007 return error;
4008 }
4009
4010 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
4011 fqcodel.flows, fqcodel.quantum);
4012 return 0;
4013 }
4014
4015 static void
4016 fqcodel_tc_destroy(struct tc *tc)
4017 {
4018 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
4019 tc_destroy(tc);
4020 free(fqcodel);
4021 }
4022
4023 static int
4024 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
4025 {
4026 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
4027 smap_add_format(details, "target", "%u", fqcodel->target);
4028 smap_add_format(details, "limit", "%u", fqcodel->limit);
4029 smap_add_format(details, "interval", "%u", fqcodel->interval);
4030 smap_add_format(details, "flows", "%u", fqcodel->flows);
4031 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
4032 return 0;
4033 }
4034
/* qdisc_set callback: re-reads fq_codel parameters from 'details' and
 * updates the cached configuration. */
static int
fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
{
    struct fqcodel fqcodel;

    fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
    /* NOTE(review): fqcodel_install__() allocates a fresh struct fqcodel and
     * replaces netdev->tc with it; the previously installed instance appears
     * to be leaked, and the assignments below re-store values the new struct
     * already holds -- verify against the netdev_set_qos() call path. */
    fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
                      fqcodel.flows, fqcodel.quantum);
    fqcodel_get__(netdev)->target = fqcodel.target;
    fqcodel_get__(netdev)->limit = fqcodel.limit;
    fqcodel_get__(netdev)->interval = fqcodel.interval;
    fqcodel_get__(netdev)->flows = fqcodel.flows;
    fqcodel_get__(netdev)->quantum = fqcodel.quantum;
    return 0;
}
4050
/* Operations for the "fq_codel" qdisc (OVS QoS type "linux-fq_codel"). */
static const struct tc_ops tc_ops_fqcodel = {
    .linux_name = "fq_codel",
    .ovs_name = "linux-fq_codel",
    .n_queues = FQCODEL_N_QUEUES,
    .tc_install = fqcodel_tc_install,
    .tc_load = fqcodel_tc_load,
    .tc_destroy = fqcodel_tc_destroy,
    .qdisc_get = fqcodel_qdisc_get,
    .qdisc_set = fqcodel_qdisc_set,
};
4061 \f
/* SFQ traffic control class. */

#define SFQ_N_QUEUES 0x0000

/* Cached configuration of an "sfq" qdisc. */
struct sfq {
    struct tc tc;          /* Base tc instance. */
    uint32_t quantum;      /* tc_sfq_qopt 'quantum' value. */
    uint32_t perturb;      /* tc_sfq_qopt 'perturb_period' value. */
};
4071
4072 static struct sfq *
4073 sfq_get__(const struct netdev *netdev_)
4074 {
4075 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4076 return CONTAINER_OF(netdev->tc, struct sfq, tc);
4077 }
4078
4079 static void
4080 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
4081 {
4082 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4083 struct sfq *sfq;
4084
4085 sfq = xmalloc(sizeof *sfq);
4086 tc_init(&sfq->tc, &tc_ops_sfq);
4087 sfq->perturb = perturb;
4088 sfq->quantum = quantum;
4089
4090 netdev->tc = &sfq->tc;
4091 }
4092
4093 static int
4094 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
4095 {
4096 struct tc_sfq_qopt opt;
4097 struct ofpbuf request;
4098 struct tcmsg *tcmsg;
4099 int mtu;
4100 int mtu_error, error;
4101 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
4102
4103 tc_del_qdisc(netdev);
4104
4105 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4106 NLM_F_EXCL | NLM_F_CREATE, &request);
4107 if (!tcmsg) {
4108 return ENODEV;
4109 }
4110 tcmsg->tcm_handle = tc_make_handle(1, 0);
4111 tcmsg->tcm_parent = TC_H_ROOT;
4112
4113 memset(&opt, 0, sizeof opt);
4114 if (!quantum) {
4115 if (!mtu_error) {
4116 opt.quantum = mtu; /* if we cannot find mtu, use default */
4117 }
4118 } else {
4119 opt.quantum = quantum;
4120 }
4121
4122 if (!perturb) {
4123 opt.perturb_period = 10;
4124 } else {
4125 opt.perturb_period = perturb;
4126 }
4127
4128 nl_msg_put_string(&request, TCA_KIND, "sfq");
4129 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4130
4131 error = tc_transact(&request, NULL);
4132 if (error) {
4133 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
4134 "quantum %u, perturb %u error %d(%s)",
4135 netdev_get_name(netdev),
4136 opt.quantum, opt.perturb_period,
4137 error, ovs_strerror(error));
4138 }
4139 return error;
4140 }
4141
/* Fills 'sfq' from the "perturb" and "quantum" keys of 'details'.
 * A missing/zero perturb defaults to 10; a missing/zero quantum defaults to
 * the device MTU.  If the MTU cannot be read, quantum is left 0 and a
 * warning is logged. */
static void
sfq_parse_qdisc_details__(struct netdev *netdev,
                          const struct smap *details, struct sfq *sfq)
{
    sfq->perturb = smap_get_ullong(details, "perturb", 0);
    sfq->quantum = smap_get_ullong(details, "quantum", 0);

    if (!sfq->perturb) {
        sfq->perturb = 10;
    }

    if (!sfq->quantum) {
        int mtu;
        if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
            sfq->quantum = mtu;
        } else {
            VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
                         "device without mtu");
        }
    }
}
4163
4164 static int
4165 sfq_tc_install(struct netdev *netdev, const struct smap *details)
4166 {
4167 int error;
4168 struct sfq sfq;
4169
4170 sfq_parse_qdisc_details__(netdev, details, &sfq);
4171 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
4172 if (!error) {
4173 sfq_install__(netdev, sfq.quantum, sfq.perturb);
4174 }
4175 return error;
4176 }
4177
/* tc_load callback for "linux-sfq": rebuilds the in-memory sfq state from a
 * kernel qdisc dump message.  Returns 0 on success, otherwise a positive
 * errno value from tc_parse_qdisc(). */
static int
sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
{
    const struct tc_sfq_qopt *sfq;
    struct nlattr *nlattr;
    const char * kind;
    int error;

    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
    if (error == 0) {
        /* TCA_OPTIONS for sfq is a raw struct tc_sfq_qopt. */
        sfq = nl_attr_get(nlattr);
        sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
        return 0;
    }

    return error;
}
4195
4196 static void
4197 sfq_tc_destroy(struct tc *tc)
4198 {
4199 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
4200 tc_destroy(tc);
4201 free(sfq);
4202 }
4203
4204 static int
4205 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
4206 {
4207 const struct sfq *sfq = sfq_get__(netdev);
4208 smap_add_format(details, "quantum", "%u", sfq->quantum);
4209 smap_add_format(details, "perturb", "%u", sfq->perturb);
4210 return 0;
4211 }
4212
4213 static int
4214 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
4215 {
4216 struct sfq sfq;
4217
4218 sfq_parse_qdisc_details__(netdev, details, &sfq);
4219 sfq_install__(netdev, sfq.quantum, sfq.perturb);
4220 sfq_get__(netdev)->quantum = sfq.quantum;
4221 sfq_get__(netdev)->perturb = sfq.perturb;
4222 return 0;
4223 }
4224
/* tc_ops for the kernel "sfq" qdisc, exposed as the OVS "linux-sfq" QoS
 * type.  SFQ has no configurable classes, so no class_* callbacks are set. */
static const struct tc_ops tc_ops_sfq = {
    .linux_name = "sfq",
    .ovs_name = "linux-sfq",
    .n_queues = SFQ_N_QUEUES,
    .tc_install = sfq_tc_install,
    .tc_load = sfq_tc_load,
    .tc_destroy = sfq_tc_destroy,
    .qdisc_get = sfq_qdisc_get,
    .qdisc_set = sfq_qdisc_set,
};
4235 \f
4236 /* netem traffic control class. */
4237
/* In-memory state for an installed netem qdisc. */
struct netem {
    struct tc tc;
    uint32_t latency;   /* QoS "latency" detail; converted with
                         * tc_time_to_ticks() when programming the kernel. */
    uint32_t limit;     /* Queue limit in packets; 0 is replaced by 1000. */
    uint32_t loss;      /* Packet loss percentage, 0-100. */
};
4244
4245 static struct netem *
4246 netem_get__(const struct netdev *netdev_)
4247 {
4248 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4249 return CONTAINER_OF(netdev->tc, struct netem, tc);
4250 }
4251
4252 static void
4253 netem_install__(struct netdev *netdev_, uint32_t latency,
4254 uint32_t limit, uint32_t loss)
4255 {
4256 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4257 struct netem *netem;
4258
4259 netem = xmalloc(sizeof *netem);
4260 tc_init(&netem->tc, &tc_ops_netem);
4261 netem->latency = latency;
4262 netem->limit = limit;
4263 netem->loss = loss;
4264
4265 netdev->tc = &netem->tc;
4266 }
4267
/* Creates a netem root qdisc on 'netdev', roughly equivalent to
 * "tc qdisc replace dev <dev> root handle 1: netem ...".
 *
 * A zero 'limit' defaults to 1000 packets.  'loss' is a percentage; values
 * above 100 are rejected with EINVAL.  Returns 0 on success, otherwise a
 * positive errno value. */
static int
netem_setup_qdisc__(struct netdev *netdev, uint32_t latency,
                    uint32_t limit, uint32_t loss)
{
    struct tc_netem_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    /* Remove any existing root qdisc before installing the new one. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);

    if (!limit) {
        opt.limit = 1000;
    } else {
        opt.limit = limit;
    }

    if (loss) {
        if (loss > 100) {
            VLOG_WARN_RL(&rl,
                         "loss should be a percentage value between 0 to 100, "
                         "loss was %u", loss);
            return EINVAL;
        }
        /* The kernel expects loss scaled so that UINT32_MAX == 100%. */
        opt.loss = floor(UINT32_MAX * (loss / 100.0));
    }

    opt.latency = tc_time_to_ticks(latency);

    nl_msg_put_string(&request, TCA_KIND, "netem");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "latency %u, limit %u, loss %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.latency, opt.limit, opt.loss,
                     error, ovs_strerror(error));
    }
    return error;
}
4320
/* Fills 'netem' from the "latency", "limit" and "loss" keys of 'details'.
 * A missing/zero limit defaults to 1000 packets; latency and loss default
 * to 0. */
static void
netem_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
                            const struct smap *details, struct netem *netem)
{
    netem->latency = smap_get_ullong(details, "latency", 0);
    netem->limit = smap_get_ullong(details, "limit", 0);
    netem->loss = smap_get_ullong(details, "loss", 0);

    if (!netem->limit) {
        netem->limit = 1000;
    }
}
4333
4334 static int
4335 netem_tc_install(struct netdev *netdev, const struct smap *details)
4336 {
4337 int error;
4338 struct netem netem;
4339
4340 netem_parse_qdisc_details__(netdev, details, &netem);
4341 error = netem_setup_qdisc__(netdev, netem.latency,
4342 netem.limit, netem.loss);
4343 if (!error) {
4344 netem_install__(netdev, netem.latency, netem.limit, netem.loss);
4345 }
4346 return error;
4347 }
4348
/* tc_load callback for "linux-netem": rebuilds the in-memory netem state
 * from a kernel qdisc dump message.  Returns 0 on success, otherwise a
 * positive errno value from tc_parse_qdisc(). */
static int
netem_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
{
    const struct tc_netem_qopt *netem;
    struct nlattr *nlattr;
    const char *kind;
    int error;

    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
    if (error == 0) {
        /* TCA_OPTIONS for netem is a raw struct tc_netem_qopt. */
        netem = nl_attr_get(nlattr);
        netem_install__(netdev, netem->latency, netem->limit, netem->loss);
        return 0;
    }

    return error;
}
4366
4367 static void
4368 netem_tc_destroy(struct tc *tc)
4369 {
4370 struct netem *netem = CONTAINER_OF(tc, struct netem, tc);
4371 tc_destroy(tc);
4372 free(netem);
4373 }
4374
4375 static int
4376 netem_qdisc_get(const struct netdev *netdev, struct smap *details)
4377 {
4378 const struct netem *netem = netem_get__(netdev);
4379 smap_add_format(details, "latency", "%u", netem->latency);
4380 smap_add_format(details, "limit", "%u", netem->limit);
4381 smap_add_format(details, "loss", "%u", netem->loss);
4382 return 0;
4383 }
4384
4385 static int
4386 netem_qdisc_set(struct netdev *netdev, const struct smap *details)
4387 {
4388 struct netem netem;
4389
4390 netem_parse_qdisc_details__(netdev, details, &netem);
4391 netem_install__(netdev, netem.latency, netem.limit, netem.loss);
4392 netem_get__(netdev)->latency = netem.latency;
4393 netem_get__(netdev)->limit = netem.limit;
4394 netem_get__(netdev)->loss = netem.loss;
4395 return 0;
4396 }
4397
/* tc_ops for the kernel "netem" qdisc, exposed as the OVS "linux-netem" QoS
 * type.  Netem has no configurable classes, so no class_* callbacks are
 * set. */
static const struct tc_ops tc_ops_netem = {
    .linux_name = "netem",
    .ovs_name = "linux-netem",
    .n_queues = 0,
    .tc_install = netem_tc_install,
    .tc_load = netem_tc_load,
    .tc_destroy = netem_tc_destroy,
    .qdisc_get = netem_qdisc_get,
    .qdisc_set = netem_qdisc_set,
};
4408 \f
4409 /* HTB traffic control class. */
4410
#define HTB_N_QUEUES 0xf000         /* Maximum number of HTB queues. */
#define HTB_RATE2QUANTUM 10         /* HTB "r2q" divisor. */

/* In-memory state for an installed HTB qdisc.
 *
 * NOTE(review): 'max_rate' is 32 bits even though htb_install__() accepts a
 * uint64_t, so rates at or above 2**32 bytes/s are truncated here — confirm
 * whether wider rates need to be supported. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};

/* Per-queue configuration for one HTB class. */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
4426
4427 static struct htb *
4428 htb_get__(const struct netdev *netdev_)
4429 {
4430 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4431 return CONTAINER_OF(netdev->tc, struct htb, tc);
4432 }
4433
/* Records a new in-memory HTB state object, with the given 'max_rate' (in
 * bytes/s), as the current tc state of 'netdev_'.
 *
 * NOTE(review): struct htb stores max_rate as unsigned int, so a uint64_t
 * argument >= 2**32 is silently truncated here. */
static void
htb_install__(struct netdev *netdev_, uint64_t max_rate)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct htb *htb;

    htb = xmalloc(sizeof *htb);
    tc_init(&htb->tc, &tc_ops_htb);
    htb->max_rate = max_rate;

    netdev->tc = &htb->tc;
}
4446
/* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 * Returns 0 on success, otherwise a positive errno value. */
static int
htb_setup_qdisc__(struct netdev *netdev)
{
    size_t opt_offset;
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    /* Remove any existing root qdisc before installing the new one. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = HTB_RATE2QUANTUM;
    opt.version = 3;
    opt.defcls = 1;             /* Unclassified traffic goes to class 1:1. */

    /* HTB options are nested inside TCA_OPTIONS, unlike sfq/netem. */
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    return tc_transact(&request, NULL);
}
4481
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * Requires the device MTU to size the rate tables and the minimum quantum;
 * fails with the MTU lookup error otherwise.  Returns 0 on success,
 * otherwise a positive errno value. */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
        opt.quantum = mtu;
    }
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    /* The kernel needs explicit rate tables for both rate and ceil. */
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
4541
/* Parses Netlink attributes in 'nl_options' for HTB parameters and stores
 * them into 'class'.  (The old comment said "into 'details'", which was
 * stale: this function deals in a struct htb_class, not an smap.)
 *
 * Returns 0 on success, EPROTO if TCA_HTB_PARMS is missing or malformed. */
static int
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
{
    static const struct nl_policy tca_htb_policy[] = {
        [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
                            .min_len = sizeof(struct tc_htb_opt) },
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
    const struct tc_htb_opt *htb;

    if (!nl_parse_nested(nl_options, tca_htb_policy,
                         attrs, ARRAY_SIZE(tca_htb_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HTB class options");
        return EPROTO;
    }

    htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
    class->min_rate = htb->rate.rate;
    class->max_rate = htb->ceil.rate;
    /* The kernel reports burst as a buffer in ticks; convert back to bytes. */
    class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
    class->priority = htb->prio;
    return 0;
}
4570
/* Parses an RTM_NEWTCLASS message in 'tcmsg' for one HTB class.
 *
 * On success stores the OVS queue number (class minor - 1) in '*queue_id'
 * if nonnull, the class parameters in '*options' if nonnull, and statistics
 * in '*stats' if nonnull.  Classes outside major 1 / minor 1..HTB_N_QUEUES
 * yield EPROTO. */
static int
htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                  struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct nlattr *nl_options;
    unsigned int handle;
    int error;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (!error && queue_id) {
        unsigned int major = tc_get_major(handle);
        unsigned int minor = tc_get_minor(handle);
        if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
            *queue_id = minor - 1;
        } else {
            error = EPROTO;
        }
    }
    if (!error && options) {
        error = htb_parse_tca_options__(nl_options, options);
    }
    return error;
}
4595
/* Fills 'hc' with the qdisc-level HTB configuration from 'details': the
 * "max-rate" key (in bits/s, converted to bytes/s).  If absent or zero, the
 * link speed is used, defaulting to 100 Mbps when the link features cannot
 * be read.  min_rate is set equal to max_rate; burst and priority are
 * zeroed (this describes the default class, not a user queue). */
static void
htb_parse_qdisc_details__(struct netdev *netdev_,
                          const struct smap *details, struct htb_class *hc)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
    if (!hc->max_rate) {
        enum netdev_features current;

        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }
    hc->min_rate = hc->max_rate;
    hc->burst = 0;
    hc->priority = 0;
}
4614
/* Fills 'hc' with per-queue HTB configuration from 'details' ("min-rate",
 * "max-rate", "burst" in bits, "priority"), clamping rates into
 * [MTU, qdisc max_rate].  Returns 0 on success or the MTU lookup error. */
static int
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
{
    const struct htb *htb = htb_get__(netdev);
    int mtu, error;
    unsigned long long int max_rate_bit;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate */
    max_rate_bit = smap_get_ullong(details, "max-rate", 0);
    hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = smap_get_ullong(details, "burst", 0) / 8;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority */
    hc->priority = smap_get_ullong(details, "priority", 0);

    return 0;
}
4659
4660 static int
4661 htb_query_class__(const struct netdev *netdev, unsigned int handle,
4662 unsigned int parent, struct htb_class *options,
4663 struct netdev_queue_stats *stats)
4664 {
4665 struct ofpbuf *reply;
4666 int error;
4667
4668 error = tc_query_class(netdev, handle, parent, &reply);
4669 if (!error) {
4670 error = htb_parse_tcmsg__(reply, NULL, options, stats);
4671 ofpbuf_delete(reply);
4672 }
4673 return error;
4674 }
4675
/* tc_install callback for "linux-htb": creates the root HTB qdisc, its
 * default class 1:fffe, and on full success records the in-memory state.
 * Returns 0 on success, otherwise a positive errno value. */
static int
htb_tc_install(struct netdev *netdev, const struct smap *details)
{
    int error;

    error = htb_setup_qdisc__(netdev);
    if (!error) {
        struct htb_class hc;

        htb_parse_qdisc_details__(netdev, details, &hc);
        /* 1:fffe is the default class that catches unclassified traffic. */
        error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
                                  tc_make_handle(1, 0), &hc);
        if (!error) {
            htb_install__(netdev, hc.max_rate);
        }
    }
    return error;
}
4694
/* Returns the htb_class that embeds the generic 'queue'. */
static struct htb_class *
htb_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct htb_class, tc_queue);
}
4700
/* Creates or updates the in-memory record for queue 'queue_id' with the
 * configuration in 'hc'.  A new record is inserted into the tc queue hmap
 * if none exists yet. */
static void
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
                   const struct htb_class *hc)
{
    struct htb *htb = htb_get__(netdev);
    size_t hash = hash_int(queue_id, 0);
    struct tc_queue *queue;
    struct htb_class *hcp;

    queue = tc_find_queue__(netdev, queue_id, hash);
    if (queue) {
        hcp = htb_class_cast__(queue);
    } else {
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
    }

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
    hcp->burst = hc->burst;
    hcp->priority = hc->priority;
}
4726
/* tc_load callback for "linux-htb": reconstructs the in-memory HTB state
 * from the kernel by querying the default class (for max_rate) and dumping
 * all classes (for the queues).  Classes that fail to parse are skipped.
 * Returns 0 on success, ENODEV if the class dump cannot be started. */
static int
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct htb_class hc;

    /* Get qdisc options. */
    hc.max_rate = 0;
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    htb_install__(netdev, hc.max_rate);

    /* Get queues. */
    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            htb_update_queue__(netdev, queue_id, &hc);
        }
    }
    finish_queue_dump(&state);

    return 0;
}
4754
4755 static void
4756 htb_tc_destroy(struct tc *tc)
4757 {
4758 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4759 struct htb_class *hc;
4760
4761 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
4762 free(hc);
4763 }
4764 tc_destroy(tc);
4765 free(htb);
4766 }
4767
4768 static int
4769 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
4770 {
4771 const struct htb *htb = htb_get__(netdev);
4772 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
4773 return 0;
4774 }
4775
/* qdisc_set callback for "linux-htb": reconfigures the default class
 * 1:fffe with the new max-rate from 'details' and updates the cached value
 * on success.  (The qdisc itself has no parameters to change, so it is not
 * rebuilt.)  Returns 0 on success, otherwise a positive errno value. */
static int
htb_qdisc_set(struct netdev *netdev, const struct smap *details)
{
    struct htb_class hc;
    int error;

    htb_parse_qdisc_details__(netdev, details, &hc);
    error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
                              tc_make_handle(1, 0), &hc);
    if (!error) {
        htb_get__(netdev)->max_rate = hc.max_rate;
    }
    return error;
}
4790
/* class_get callback for "linux-htb": reports one queue's configuration
 * into 'details', in bits (rates in bits/s).  max-rate is omitted when it
 * equals min-rate, and priority when it is 0.  Always returns 0. */
static int
htb_class_get(const struct netdev *netdev OVS_UNUSED,
              const struct tc_queue *queue, struct smap *details)
{
    const struct htb_class *hc = htb_class_cast__(queue);

    smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
    if (hc->min_rate != hc->max_rate) {
        smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
    }
    smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
    if (hc->priority) {
        smap_add_format(details, "priority", "%u", hc->priority);
    }
    return 0;
}
4807
/* class_set callback for "linux-htb": configures kernel class
 * 1:(queue_id+1) from 'details' and mirrors the result in the in-memory
 * queue record.  Returns 0 on success, otherwise a positive errno value. */
static int
htb_class_set(struct netdev *netdev, unsigned int queue_id,
              const struct smap *details)
{
    struct htb_class hc;
    int error;

    error = htb_parse_class_details__(netdev, details, &hc);
    if (error) {
        return error;
    }

    /* Queues are children of the default class 1:fffe. */
    error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
                              tc_make_handle(1, 0xfffe), &hc);
    if (error) {
        return error;
    }

    htb_update_queue__(netdev, queue_id, &hc);
    return 0;
}
4829
4830 static int
4831 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
4832 {
4833 struct htb_class *hc = htb_class_cast__(queue);
4834 struct htb *htb = htb_get__(netdev);
4835 int error;
4836
4837 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4838 if (!error) {
4839 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
4840 free(hc);
4841 }
4842 return error;
4843 }
4844
/* class_get_stats callback for "linux-htb": queries the kernel for the
 * statistics of class 1:(queue_id+1) and stores them in '*stats'. */
static int
htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
                    struct netdev_queue_stats *stats)
{
    return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
                             tc_make_handle(1, 0xfffe), NULL, stats);
}
4852
4853 static int
4854 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4855 const struct ofpbuf *nlmsg,
4856 netdev_dump_queue_stats_cb *cb, void *aux)
4857 {
4858 struct netdev_queue_stats stats;
4859 unsigned int handle, major, minor;
4860 int error;
4861
4862 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4863 if (error) {
4864 return error;
4865 }
4866
4867 major = tc_get_major(handle);
4868 minor = tc_get_minor(handle);
4869 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4870 (*cb)(minor - 1, &stats, aux);
4871 }
4872 return 0;
4873 }
4874
/* tc_ops for the kernel "htb" qdisc, exposed as the OVS "linux-htb" QoS
 * type, with full per-queue class support. */
static const struct tc_ops tc_ops_htb = {
    .linux_name = "htb",
    .ovs_name = "linux-htb",
    .n_queues = HTB_N_QUEUES,
    .tc_install = htb_tc_install,
    .tc_load = htb_tc_load,
    .tc_destroy = htb_tc_destroy,
    .qdisc_get = htb_qdisc_get,
    .qdisc_set = htb_qdisc_set,
    .class_get = htb_class_get,
    .class_set = htb_class_set,
    .class_delete = htb_class_delete,
    .class_get_stats = htb_class_get_stats,
    .class_dump_stats = htb_class_dump_stats
};
4890 \f
4891 /* "linux-hfsc" traffic control class. */
4892
#define HFSC_N_QUEUES 0xf000        /* Maximum number of HFSC queues. */

/* In-memory state for an installed HFSC qdisc. */
struct hfsc {
    struct tc tc;
    uint32_t max_rate;          /* In bytes/s. */
};

/* Per-queue configuration for one HFSC class. */
struct hfsc_class {
    struct tc_queue tc_queue;
    uint32_t min_rate;          /* In bytes/s. */
    uint32_t max_rate;          /* In bytes/s. */
};
4905
4906 static struct hfsc *
4907 hfsc_get__(const struct netdev *netdev_)
4908 {
4909 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4910 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
4911 }
4912
/* Returns the hfsc_class that embeds the generic 'queue'. */
static struct hfsc_class *
hfsc_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
}
4918
4919 static void
4920 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4921 {
4922 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4923 struct hfsc *hfsc;
4924
4925 hfsc = xmalloc(sizeof *hfsc);
4926 tc_init(&hfsc->tc, &tc_ops_hfsc);
4927 hfsc->max_rate = max_rate;
4928 netdev->tc = &hfsc->tc;
4929 }
4930
/* Creates or updates the in-memory record for queue 'queue_id' with the
 * configuration in 'hc'.  A new record is inserted into the tc queue hmap
 * if none exists yet. */
static void
hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
                    const struct hfsc_class *hc)
{
    size_t hash;
    struct hfsc *hfsc;
    struct hfsc_class *hcp;
    struct tc_queue *queue;

    hfsc = hfsc_get__(netdev);
    hash = hash_int(queue_id, 0);

    queue = tc_find_queue__(netdev, queue_id, hash);
    if (queue) {
        hcp = hfsc_class_cast__(queue);
    } else {
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
    }

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
}
4957
/* Parses Netlink attributes in 'nl_options' for HFSC parameters and stores
 * them into 'class'.
 *
 * Only the restricted form that OVS itself installs is accepted: linear
 * service curves (m1 == 0, d == 0), identical real-time and link-sharing
 * curves, and an upper-limit curve at least as large as the others.
 * Anything else yields EPROTO. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
5016
/* Parses an RTM_NEWTCLASS message in 'tcmsg' for one HFSC class.
 *
 * On success stores the OVS queue number (class minor - 1) in '*queue_id'
 * if nonnull, the class parameters in '*options' if nonnull, and statistics
 * in '*stats' if nonnull.  Classes outside major 1 / minor 1..HFSC_N_QUEUES
 * yield EPROTO. */
static int
hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                   struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    int error;
    unsigned int handle;
    struct nlattr *nl_options;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (error) {
        return error;
    }

    if (queue_id) {
        unsigned int major, minor;

        major = tc_get_major(handle);
        minor = tc_get_minor(handle);
        if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
            *queue_id = minor - 1;
        } else {
            return EPROTO;
        }
    }

    if (options) {
        error = hfsc_parse_tca_options__(nl_options, options);
    }

    return error;
}
5049
5050 static int
5051 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
5052 unsigned int parent, struct hfsc_class *options,
5053 struct netdev_queue_stats *stats)
5054 {
5055 int error;
5056 struct ofpbuf *reply;
5057
5058 error = tc_query_class(netdev, handle, parent, &reply);
5059 if (error) {
5060 return error;
5061 }
5062
5063 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
5064 ofpbuf_delete(reply);
5065 return error;
5066 }
5067
/* Fills 'class' with the qdisc-level HFSC configuration from 'details':
 * the "max-rate" key (in bits/s, converted to bytes/s).  If absent or zero,
 * the link speed is used, defaulting to 100 Mbps when the link features
 * cannot be read.  min_rate is set equal to max_rate (this describes the
 * default class, not a user queue). */
static void
hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
                           struct hfsc_class *class)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
    if (!max_rate) {
        enum netdev_features current;

        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }

    class->min_rate = max_rate;
    class->max_rate = max_rate;
}
5086
/* Fills 'class' with per-queue HFSC configuration from 'details'
 * ("min-rate" and "max-rate", in bits/s), clamping both into
 * [1, qdisc max_rate] bytes/s with max_rate >= min_rate.  Always returns
 * 0. */
static int
hfsc_parse_class_details__(struct netdev *netdev,
                           const struct smap *details,
                           struct hfsc_class * class)
{
    const struct hfsc *hfsc;
    uint32_t min_rate, max_rate;

    hfsc = hfsc_get__(netdev);

    min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
    min_rate = MAX(min_rate, 1);
    min_rate = MIN(min_rate, hfsc->max_rate);

    max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
    max_rate = MAX(max_rate, min_rate);
    max_rate = MIN(max_rate, hfsc->max_rate);

    class->min_rate = min_rate;
    class->max_rate = max_rate;

    return 0;
}
5110
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1".
 * Returns 0 on success, otherwise a positive errno value. */
static int
hfsc_setup_qdisc__(struct netdev * netdev)
{
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    /* Remove any existing root qdisc before installing the new one. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    opt.defcls = 1;             /* Unclassified traffic goes to class 1:1. */

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    return tc_transact(&request, NULL);
}
5141
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>".  Both real-time (RSC) and
 * link-sharing (FSC) curves are set to 'min', matching the restricted form
 * that hfsc_parse_tca_options__() accepts.  Returns 0 on success,
 * otherwise a positive errno value. */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Linear service curves: no initial slope (m1) or transition delay (d). */
    min.m1 = 0;
    min.d = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d = 0;
    max.m2 = class->max_rate;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
5193
5194 static int
5195 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
5196 {
5197 int error;
5198 struct hfsc_class class;
5199
5200 error = hfsc_setup_qdisc__(netdev);
5201
5202 if (error) {
5203 return error;
5204 }
5205
5206 hfsc_parse_qdisc_details__(netdev, details, &class);
5207 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5208 tc_make_handle(1, 0), &class);
5209
5210 if (error) {
5211 return error;
5212 }
5213
5214 hfsc_install__(netdev, class.max_rate);
5215 return 0;
5216 }
5217
/* tc_load callback for "linux-hfsc": reconstructs the in-memory HFSC state
 * from the kernel by querying the default class (for max_rate) and dumping
 * all classes (for the queues).  Classes that fail to parse are skipped.
 * Returns 0 on success, ENODEV if the class dump cannot be started. */
static int
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct hfsc_class hc;

    hc.max_rate = 0;
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    hfsc_install__(netdev, hc.max_rate);

    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }

    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            hfsc_update_queue__(netdev, queue_id, &hc);
        }
    }

    finish_queue_dump(&state);
    return 0;
}
5244
5245 static void
5246 hfsc_tc_destroy(struct tc *tc)
5247 {
5248 struct hfsc *hfsc;
5249 struct hfsc_class *hc, *next;
5250
5251 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
5252
5253 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
5254 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5255 free(hc);
5256 }
5257
5258 tc_destroy(tc);
5259 free(hfsc);
5260 }
5261
5262 static int
5263 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
5264 {
5265 const struct hfsc *hfsc;
5266 hfsc = hfsc_get__(netdev);
5267 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
5268 return 0;
5269 }
5270
/* qdisc_set callback for "linux-hfsc": reconfigures the default class
 * 1:fffe with the new max-rate from 'details' and updates the cached value
 * on success.  (The qdisc itself has no parameters to change, so it is not
 * rebuilt.)  Returns 0 on success, otherwise a positive errno value. */
static int
hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
{
    int error;
    struct hfsc_class class;

    hfsc_parse_qdisc_details__(netdev, details, &class);
    error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
                               tc_make_handle(1, 0), &class);

    if (!error) {
        hfsc_get__(netdev)->max_rate = class.max_rate;
    }

    return error;
}
5287
5288 static int
5289 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
5290 const struct tc_queue *queue, struct smap *details)
5291 {
5292 const struct hfsc_class *hc;
5293
5294 hc = hfsc_class_cast__(queue);
5295 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
5296 if (hc->min_rate != hc->max_rate) {
5297 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
5298 }
5299 return 0;
5300 }
5301
5302 static int
5303 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
5304 const struct smap *details)
5305 {
5306 int error;
5307 struct hfsc_class class;
5308
5309 error = hfsc_parse_class_details__(netdev, details, &class);
5310 if (error) {
5311 return error;
5312 }
5313
5314 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
5315 tc_make_handle(1, 0xfffe), &class);
5316 if (error) {
5317 return error;
5318 }
5319
5320 hfsc_update_queue__(netdev, queue_id, &class);
5321 return 0;
5322 }
5323
5324 static int
5325 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
5326 {
5327 int error;
5328 struct hfsc *hfsc;
5329 struct hfsc_class *hc;
5330
5331 hc = hfsc_class_cast__(queue);
5332 hfsc = hfsc_get__(netdev);
5333
5334 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
5335 if (!error) {
5336 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5337 free(hc);
5338 }
5339 return error;
5340 }
5341
5342 static int
5343 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
5344 struct netdev_queue_stats *stats)
5345 {
5346 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
5347 tc_make_handle(1, 0xfffe), NULL, stats);
5348 }
5349
5350 static int
5351 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
5352 const struct ofpbuf *nlmsg,
5353 netdev_dump_queue_stats_cb *cb, void *aux)
5354 {
5355 struct netdev_queue_stats stats;
5356 unsigned int handle, major, minor;
5357 int error;
5358
5359 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
5360 if (error) {
5361 return error;
5362 }
5363
5364 major = tc_get_major(handle);
5365 minor = tc_get_minor(handle);
5366 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
5367 (*cb)(minor - 1, &stats, aux);
5368 }
5369 return 0;
5370 }
5371
/* Operations for the "linux-hfsc" QoS type, backed by the kernel "hfsc"
 * qdisc.  Queues are kernel classes 1:1 .. 1:HFSC_N_QUEUES, all children of
 * the root class 1:0xfffe. */
static const struct tc_ops tc_ops_hfsc = {
    .linux_name = "hfsc",
    .ovs_name = "linux-hfsc",
    .n_queues = HFSC_N_QUEUES,
    .tc_install = hfsc_tc_install,
    .tc_load = hfsc_tc_load,
    .tc_destroy = hfsc_tc_destroy,
    .qdisc_get = hfsc_qdisc_get,
    .qdisc_set = hfsc_qdisc_set,
    .class_get = hfsc_class_get,
    .class_set = hfsc_class_set,
    .class_delete = hfsc_class_delete,
    .class_get_stats = hfsc_class_get_stats,
    .class_dump_stats = hfsc_class_dump_stats,
};
5387 \f
5388 /* "linux-noop" traffic control class. */
5389
/* Points 'netdev_' at a shared, immutable tc instance.
 *
 * NOTE(review): this uses 'tc_ops_default' rather than 'tc_ops_noop' for
 * the shared object's ops -- it appears deliberate (the noop class never
 * invokes ops through 'netdev->tc'), but confirm before changing. */
static void
noop_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    netdev->tc = CONST_CAST(struct tc *, &tc);
}
5398
5399 static int
5400 noop_tc_install(struct netdev *netdev,
5401 const struct smap *details OVS_UNUSED)
5402 {
5403 noop_install__(netdev);
5404 return 0;
5405 }
5406
5407 static int
5408 noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5409 {
5410 noop_install__(netdev);
5411 return 0;
5412 }
5413
/* Operations for the "linux-noop" QoS type.  No '.linux_name' is set and
 * only install/load hooks exist, both of which merely attach a shared,
 * immutable tc object. */
static const struct tc_ops tc_ops_noop = {
    .ovs_name = "linux-noop",
    .tc_install = noop_tc_install,
    .tc_load = noop_tc_load,
};
5419 \f
5420 /* "linux-default" traffic control class.
5421 *
5422 * This class represents the default, unnamed Linux qdisc. It corresponds to
5423 * the "" (empty string) QoS type in the OVS database. */
5424
/* Attaches to 'netdev_' the shared tc instance representing the default,
 * unnamed Linux qdisc. */
static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
5435
5436 static int
5437 default_tc_install(struct netdev *netdev,
5438 const struct smap *details OVS_UNUSED)
5439 {
5440 default_install__(netdev);
5441 return 0;
5442 }
5443
5444 static int
5445 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5446 {
5447 default_install__(netdev);
5448 return 0;
5449 }
5450
/* Operations for the default, unnamed Linux qdisc, i.e. the "" QoS type in
 * the OVS database.  Only install/load hooks are provided. */
static const struct tc_ops tc_ops_default = {
    .ovs_name = "",
    .tc_install = default_tc_install,
    .tc_load = default_tc_load,
};
5456 \f
5457 /* "linux-other" traffic control class.
5458 *
5459 * */
5460
/* Attaches to 'netdev_' the shared tc instance for "linux-other", the
 * catch-all used for qdiscs that OVS does not recognize. */
static int
other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
    return 0;
}
5472
/* Operations for "linux-other".  Only '.tc_load' is provided: this class
 * can observe an existing qdisc but never installs or configures one. */
static const struct tc_ops tc_ops_other = {
    .ovs_name = "linux-other",
    .tc_load = other_tc_load,
};
5477 \f
5478 /* Traffic control. */
5479
5480 /* Number of kernel "tc" ticks per second. */
5481 static double ticks_per_s;
5482
5483 /* Number of kernel "jiffies" per second. This is used for the purpose of
5484 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
5485 * one jiffy's worth of data.
5486 *
5487 * There are two possibilities here:
5488 *
5489 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
5490 * approximate range of 100 to 1024. That means that we really need to
5491 * make sure that the qdisc can buffer that much data.
5492 *
5493 * - 'buffer_hz' is an absurdly large number. That means that the kernel
5494 * has finely granular timers and there's no need to fudge additional room
5495 * for buffers. (There's no extra effort needed to implement that: the
5496 * large 'buffer_hz' is used as a divisor, so practically any number will
5497 * come out as 0 in the division. Small integer results in the case of
5498 * really high dividends won't have any real effect anyhow.)
5499 */
5500 static unsigned int buffer_hz;
5501
5502 static struct tcmsg *
5503 netdev_linux_tc_make_request(const struct netdev *netdev, int type,
5504 unsigned int flags, struct ofpbuf *request)
5505 {
5506 int ifindex;
5507 int error;
5508
5509 error = get_ifindex(netdev, &ifindex);
5510 if (error) {
5511 return NULL;
5512 }
5513
5514 return tc_make_request(ifindex, type, flags, request);
5515 }
5516
5517 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
5518 * of 'kbits_burst'.
5519 *
5520 * This function is equivalent to running:
5521 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
5522 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
5523 * mtu 65535 drop
5524 *
5525 * The configuration and stats may be seen with the following command:
5526 * /sbin/tc -s filter show dev <devname> parent ffff:
5527 *
5528 * Returns 0 if successful, otherwise a positive errno value.
5529 */
static int
tc_add_policer(struct netdev *netdev,
               uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct tc_police tc_police;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    size_t basic_offset;
    size_t police_offset;
    int error;
    int mtu = 65535;

    /* Build the police action: anything over the rate is dropped ("shot").
     * The rate itself is configured in bytes per second. */
    memset(&tc_police, 0, sizeof tc_police);
    tc_police.action = TC_POLICE_SHOT;
    tc_police.mtu = mtu;
    tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);

    /* The following appears wrong in one way: In networking a kilobit is
     * usually 1000 bits but this uses 1024 bits.
     *
     * However if you "fix" those problems then "tc filter show ..." shows
     * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
     * 1,000,000 bits, whereas this actually ends up doing the right thing from
     * tc's point of view.  Whatever. */
    tc_police.burst = tc_bytes_to_ticks(
        tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* Attach under the ingress qdisc (ffff:) at priority 49, matching all
     * protocols (see the command equivalent in the function comment). */
    tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
    tcmsg->tcm_info = tc_make_handle(49,
                                     (OVS_FORCE uint16_t) htons(ETH_P_ALL));

    /* Attribute nesting order matters: the "basic" classifier carries a
     * nested police action, which in turn carries the TBF parameters and
     * the rate table the kernel uses to cost packets. */
    nl_msg_put_string(&request, TCA_KIND, "basic");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
    nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
    tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
    nl_msg_end_nested(&request, police_offset);
    nl_msg_end_nested(&request, basic_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        return error;
    }

    return 0;
}
5581
/* Initializes the global 'ticks_per_s' and 'buffer_hz' from
 * /proc/net/psched, exactly once per process.  Safe to call repeatedly and
 * from multiple threads. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *     (Before that, there are hints that it was 1000000000.)
     *
     *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *     above.
     *
     *                        /proc/net/psched
     *     -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d ticks_per_s     buffer_hz
     *     ------- --------- ---------- ------------- ----------- -------------
     * [1] 819,200 1,000,000  1,000,000           100     819,200           100
     * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [4]   1,000     1,024  1,000,000           100     976,562           100
     * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
     * [6]   1,000        64  1,000,000           249  15,625,000           249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    if (!ovsthread_once_start(&once)) {
        return;
    }

    /* Conservative fallbacks, used if the file is missing or malformed. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    /* Any zero parameter would make the derivations below meaningless (and
     * divide by zero). */
    if (!a || !b || !c) {
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
5664
5665 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5666 * rate of 'rate' bytes per second. */
5667 static unsigned int
5668 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
5669 {
5670 read_psched();
5671 return (rate * ticks) / ticks_per_s;
5672 }
5673
5674 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
5675 * rate of 'rate' bytes per second. */
5676 static unsigned int
5677 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
5678 {
5679 read_psched();
5680 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
5681 }
5682
5683 /* Returns the number of bytes that need to be reserved for qdisc buffering at
5684 * a transmission rate of 'rate' bytes per second. */
5685 static unsigned int
5686 tc_buffer_per_jiffy(unsigned int rate)
5687 {
5688 read_psched();
5689 return rate / buffer_hz;
5690 }
5691
5692 static uint32_t
5693 tc_time_to_ticks(uint32_t time) {
5694 read_psched();
5695 return time * (ticks_per_s / 1000000);
5696 }
5697
5698 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5699 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5700 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5701 * stores NULL into it if it is absent.
5702 *
5703 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5704 * 'msg'.
5705 *
5706 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
               struct nlattr **options)
{
    /* Only the qdisc's kind (mandatory) and options (optional) matter. */
    static const struct nl_policy tca_policy[] = {
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    /* Attributes follow the fixed-size tcmsg header in the message. */
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");
        goto error;
    }

    if (kind) {
        *kind = nl_attr_get_string(ta[TCA_KIND]);
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    return 0;

error:
    /* Null out the outputs so callers cannot act on stale values. */
    if (kind) {
        *kind = NULL;
    }
    if (options) {
        *options = NULL;
    }
    return EPROTO;
}
5742
5743 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5744 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5745 * into '*options', and its queue statistics into '*stats'. Any of the output
5746 * arguments may be null.
5747 *
5748 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    /* Both the options and the statistics attribute are required here. */
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    /* The full class handle (major:minor) comes from the tcmsg header, not
     * from an attribute. */
    if (handlep) {
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        /* TCA_STATS2 nests the basic (bytes/packets) and queue (drops)
         * statistics blocks. */
        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    /* Clear the outputs so callers cannot act on stale values. */
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
5817
5818 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5819 * on 'netdev'. */
static int
tc_query_class(const struct netdev *netdev,
               unsigned int handle, unsigned int parent,
               struct ofpbuf **replyp)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    /* NLM_F_ECHO asks the kernel to echo the class back; tc_transact()
     * stores that reply in '*replyp'. */
    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    error = tc_transact(&request, replyp);
    if (error) {
        VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     ovs_strerror(error));
    }
    return error;
}
5847
5848 /* Equivalent to "tc class del dev <name> handle <handle>". */
static int
tc_delete_class(const struct netdev *netdev, unsigned int handle)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* The class is addressed by its full handle; no parent is specified. */
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = 0;

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     ovs_strerror(error));
    }
    return error;
}
5872
5873 /* Equivalent to "tc qdisc del dev <name> root". */
5874 static int
5875 tc_del_qdisc(struct netdev *netdev_)
5876 {
5877 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5878 struct ofpbuf request;
5879 struct tcmsg *tcmsg;
5880 int error;
5881
5882 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5883 if (!tcmsg) {
5884 return ENODEV;
5885 }
5886 tcmsg->tcm_handle = tc_make_handle(1, 0);
5887 tcmsg->tcm_parent = TC_H_ROOT;
5888
5889 error = tc_transact(&request, NULL);
5890 if (error == EINVAL) {
5891 /* EINVAL probably means that the default qdisc was in use, in which
5892 * case we've accomplished our purpose. */
5893 error = 0;
5894 }
5895 if (!error && netdev->tc) {
5896 if (netdev->tc->ops->tc_destroy) {
5897 netdev->tc->ops->tc_destroy(netdev->tc);
5898 }
5899 netdev->tc = NULL;
5900 }
5901 return error;
5902 }
5903
/* Returns true if RTM_GETQDISC can be used without risk of a kernel OOPS,
 * i.e. the running kernel is 2.6.35 or newer (see the big comment in
 * tc_query_qdisc()).  The check runs once; the result is cached. */
static bool
getqdisc_is_safe(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static bool safe = false;

    if (ovsthread_once_start(&once)) {
        struct utsname utsname;
        int major, minor;

        /* Any failure to determine the version leaves 'safe' false, the
         * conservative choice. */
        if (uname(&utsname) == -1) {
            VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
        } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
            VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
        } else if (major < 2 || (major == 2 && minor < 35)) {
            VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
                      utsname.release);
        } else {
            safe = true;
        }
        ovsthread_once_done(&once);
    }
    return safe;
}
5928
5929 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5930 * kernel to determine what they are. Returns 0 if successful, otherwise a
5931 * positive errno value. */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    /* 'netdev->tc' non-null means the qdisc is already known; nothing to
     * do. */
    if (netdev->tc) {
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            ops = &tc_ops_other;
        } else {
            /* Map the kernel qdisc name to one of our tc implementations;
             * unknown kinds fall back to the read-only "linux-other". */
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
6008
6009 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
6010 approximate the time to transmit packets of various lengths. For an MTU of
6011 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
6012 represents two possible packet lengths; for a MTU of 513 through 1024, four
6013 possible lengths; and so on.
6014
6015 Returns, for the specified 'mtu', the number of bits that packet lengths
6016 need to be shifted right to fit within such a 256-entry table. */
6017 static int
6018 tc_calc_cell_log(unsigned int mtu)
6019 {
6020 int cell_log;
6021
6022 if (!mtu) {
6023 mtu = ETH_PAYLOAD_MAX;
6024 }
6025 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
6026
6027 for (cell_log = 0; mtu >= 256; cell_log++) {
6028 mtu >>= 1;
6029 }
6030
6031 return cell_log;
6032 }
6033
6034 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
6035 * of 'mtu'. */
6036 static void
6037 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
6038 {
6039 memset(rate, 0, sizeof *rate);
6040 rate->cell_log = tc_calc_cell_log(mtu);
6041 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
6042 /* rate->cell_align = 0; */ /* distro headers. */
6043 rate->mpu = ETH_TOTAL_MIN;
6044 rate->rate = Bps;
6045 }
6046
6047 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
6048 * attribute of the specified "type".
6049 *
6050 * See tc_calc_cell_log() above for a description of "rtab"s. */
6051 void
6052 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
6053 {
6054 uint32_t *rtab;
6055 unsigned int i;
6056
6057 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
6058 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
6059 unsigned packet_size = (i + 1) << rate->cell_log;
6060 if (packet_size < rate->mpu) {
6061 packet_size = rate->mpu;
6062 }
6063 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
6064 }
6065 }
6066
6067 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
6068 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
6069 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
6070 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Buffer at least one jiffy of data plus a full frame, even if the user
     * requested a smaller burst. */
    uint64_t burst = burst_bytes;
    unsigned int floor = tc_buffer_per_jiffy(Bps) + mtu;

    if (burst < floor) {
        burst = floor;
    }
    return tc_bytes_to_ticks(Bps, burst);
}
6077 \f
6078 /* Linux-only functions declared in netdev-linux.h */
6079
6080 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
6081 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
int
netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
                              const char *flag_name, bool enable)
{
    const char *netdev_name = netdev_get_name(netdev);
    struct ethtool_value evalue;
    uint32_t new_flags;
    int error;

    /* Step 1: read the current flags. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    /* Step 2: write the flags back with 'flag' set or cleared, skipping the
     * write entirely if it would be a no-op. */
    COVERAGE_INC(netdev_set_ethtool);
    new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
    if (new_flags == evalue.data) {
        return 0;
    }
    evalue.data = new_flags;
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
    if (error) {
        return error;
    }

    /* Step 3: read the flags again to verify that the change stuck; some
     * drivers accept ETHTOOL_SFLAGS without actually applying it. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    if (new_flags != evalue.data) {
        VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
                     "device %s failed", enable ? "enable" : "disable",
                     flag_name, netdev_name);
        return EOPNOTSUPP;
    }

    return 0;
}
6131 \f
6132 /* Utility functions. */
6133
6134 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Copies 'src' into 'dst', performing format conversion in the process.
 * Fields of 'dst' with no counterpart in 'src' are left unmodified (callers
 * pre-fill them; see get_stats_via_netlink()). */
static void
netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
                                  const struct rtnl_link_stats *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
6161
6162 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Copies 'src' into 'dst', performing format conversion in the process.
 * Same as netdev_stats_from_rtnl_link_stats() but for the 64-bit kernel
 * counters (IFLA_STATS64). */
static void
netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
                                    const struct rtnl_link_stats64 *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
6189
/* Retrieves interface statistics for 'netdev_' into 'stats' via an
 * RTM_GETLINK request, preferring the 64-bit counters when the kernel
 * provides them.  Returns 0 on success, otherwise a positive errno. */
int
get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    int error;

    /* Pre-fill every counter with all-ones; counters the reply does not
     * supply are left that way (presumably the "not supported" marker --
     * confirm against readers of struct netdev_stats). */
    memset(stats, 0xFF, sizeof(struct netdev_stats));

    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
                        RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        return error;
    }

    if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
        /* Prefer IFLA_STATS64; fall back to the 32-bit IFLA_STATS. */
        const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
        if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
            netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
            error = 0;
        } else {
            a = nl_attr_find(reply, 0, IFLA_STATS);
            if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
                netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
                error = 0;
            } else {
                VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
                error = EPROTO;
            }
        }
    } else {
        VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
        error = EPROTO;
    }


    ofpbuf_delete(reply);
    return error;
}
6236
6237 static int
6238 get_flags(const struct netdev *dev, unsigned int *flags)
6239 {
6240 struct ifreq ifr;
6241 int error;
6242
6243 *flags = 0;
6244 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
6245 if (!error) {
6246 *flags = ifr.ifr_flags;
6247 }
6248 return error;
6249 }
6250
6251 static int
6252 set_flags(const char *name, unsigned int flags)
6253 {
6254 struct ifreq ifr;
6255
6256 ifr.ifr_flags = flags;
6257 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
6258 }
6259
6260 int
6261 linux_get_ifindex(const char *netdev_name)
6262 {
6263 struct ifreq ifr;
6264 int error;
6265
6266 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6267 COVERAGE_INC(netdev_get_ifindex);
6268
6269 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
6270 if (error) {
6271 /* ENODEV probably means that a vif disappeared asynchronously and
6272 * hasn't been removed from the database yet, so reduce the log level
6273 * to INFO for that case. */
6274 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
6275 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
6276 netdev_name, ovs_strerror(error));
6277 return -error;
6278 }
6279 return ifr.ifr_ifindex;
6280 }
6281
6282 static int
6283 get_ifindex(const struct netdev *netdev_, int *ifindexp)
6284 {
6285 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6286
6287 if (!(netdev->cache_valid & VALID_IFINDEX)) {
6288 netdev_linux_update_via_netlink(netdev);
6289 }
6290
6291 if (!(netdev->cache_valid & VALID_IFINDEX)) {
6292 /* Fall back to ioctl if netlink fails */
6293 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
6294
6295 if (ifindex < 0) {
6296 netdev->get_ifindex_error = -ifindex;
6297 netdev->ifindex = 0;
6298 } else {
6299 netdev->get_ifindex_error = 0;
6300 netdev->ifindex = ifindex;
6301 }
6302 netdev->cache_valid |= VALID_IFINDEX;
6303 }
6304
6305 *ifindexp = netdev->ifindex;
6306 return netdev->get_ifindex_error;
6307 }
6308
/* Refreshes the cached attributes of 'netdev' (flags, MTU, MAC address,
 * ifindex, LAG membership) with a single RTM_GETLINK request/response over
 * NETLINK_ROUTE.  Bumps the netdev's change sequence if anything changed.
 * Returns 0 on success, a positive errno value on failure (EINVAL if the
 * kernel's reply could not be parsed as an RTM_NEWLINK message). */
static int
netdev_linux_update_via_netlink(struct netdev_linux *netdev)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct rtnetlink_change chg;
    struct rtnetlink_change *change = &chg;
    int error;

    /* Build an RTM_GETLINK request keyed by interface name (and, for
     * interfaces in other network namespaces, by netnsid). */
    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ) +
                        NL_A_U32_SIZE, RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));

    /* The correct identifiers for a Linux device are netnsid and ifindex,
     * but ifindex changes as the port is moved to another network namespace
     * and the interface name statically stored in ovsdb. */
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
    if (netdev_linux_netnsid_is_remote(netdev)) {
        nl_msg_put_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
    }
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        ofpbuf_delete(reply);
        return error;
    }

    if (rtnetlink_parse(reply, change)
        && change->nlmsg_type == RTM_NEWLINK) {
        bool changed = false;
        error = 0;

        /* Update netdev from rtnl msg and increment its seq if needed. */
        if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
            /* IFF_RUNNING toggled, i.e. the carrier went up or down. */
            netdev->carrier_resets++;
            changed = true;
        }
        if (change->ifi_flags != netdev->ifi_flags) {
            netdev->ifi_flags = change->ifi_flags;
            changed = true;
        }
        if (change->mtu && change->mtu != netdev->mtu) {
            netdev->mtu = change->mtu;
            netdev->cache_valid |= VALID_MTU;
            netdev->netdev_mtu_error = 0;
            changed = true;
        }
        if (!eth_addr_is_zero(change->mac)
            && !eth_addr_equals(change->mac, netdev->etheraddr)) {
            netdev->etheraddr = change->mac;
            netdev->cache_valid |= VALID_ETHERADDR;
            netdev->ether_addr_error = 0;
            changed = true;
        }
        if (change->if_index != netdev->ifindex) {
            netdev->ifindex = change->if_index;
            netdev->cache_valid |= VALID_IFINDEX;
            netdev->get_ifindex_error = 0;
            changed = true;
        }
        if (change->master && netdev_linux_kind_is_lag(change->master)) {
            /* NOTE(review): only set, never cleared here -- presumably a
             * device cannot leave a LAG without being recreated; confirm. */
            netdev->is_lag_master = true;
        }
        if (changed) {
            netdev_change_seq_changed(&netdev->up);
        }
    } else {
        error = EINVAL;
    }

    ofpbuf_delete(reply);
    return error;
}
6384
6385 static int
6386 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
6387 {
6388 struct ifreq ifr;
6389 int hwaddr_family;
6390 int error;
6391
6392 memset(&ifr, 0, sizeof ifr);
6393 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6394 COVERAGE_INC(netdev_get_hwaddr);
6395 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
6396 if (error) {
6397 /* ENODEV probably means that a vif disappeared asynchronously and
6398 * hasn't been removed from the database yet, so reduce the log level
6399 * to INFO for that case. */
6400 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
6401 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
6402 netdev_name, ovs_strerror(error));
6403 return error;
6404 }
6405 hwaddr_family = ifr.ifr_hwaddr.sa_family;
6406 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
6407 hwaddr_family != ARPHRD_NONE) {
6408 VLOG_INFO("%s device has unknown hardware address family %d",
6409 netdev_name, hwaddr_family);
6410 return EINVAL;
6411 }
6412 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
6413 return 0;
6414 }
6415
6416 static int
6417 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
6418 {
6419 struct ifreq ifr;
6420 int error;
6421
6422 memset(&ifr, 0, sizeof ifr);
6423 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6424 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
6425 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
6426 COVERAGE_INC(netdev_set_hwaddr);
6427 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
6428 if (error) {
6429 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
6430 netdev_name, ovs_strerror(error));
6431 }
6432 return error;
6433 }
6434
6435 static int
6436 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
6437 int cmd, const char *cmd_name)
6438 {
6439 struct ifreq ifr;
6440 int error;
6441
6442 memset(&ifr, 0, sizeof ifr);
6443 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
6444 ifr.ifr_data = (caddr_t) ecmd;
6445
6446 ecmd->cmd = cmd;
6447 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
6448 if (error) {
6449 if (error != EOPNOTSUPP) {
6450 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
6451 "failed: %s", cmd_name, name, ovs_strerror(error));
6452 } else {
6453 /* The device doesn't support this operation. That's pretty
6454 * common, so there's no point in logging anything. */
6455 }
6456 }
6457 return error;
6458 }
6459
6460 /* Returns an AF_PACKET raw socket or a negative errno value. */
6461 static int
6462 af_packet_sock(void)
6463 {
6464 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
6465 static int sock;
6466
6467 if (ovsthread_once_start(&once)) {
6468 sock = socket(AF_PACKET, SOCK_RAW, 0);
6469 if (sock >= 0) {
6470 int error = set_nonblocking(sock);
6471 if (error) {
6472 close(sock);
6473 sock = -error;
6474 } else if (userspace_tso_enabled()) {
6475 int val = 1;
6476 error = setsockopt(sock, SOL_PACKET, PACKET_VNET_HDR, &val,
6477 sizeof val);
6478 if (error) {
6479 error = errno;
6480 VLOG_ERR("failed to enable vnet hdr in raw socket: %s",
6481 ovs_strerror(errno));
6482 close(sock);
6483 sock = -error;
6484 }
6485 }
6486 } else {
6487 sock = -errno;
6488 VLOG_ERR("failed to create packet socket: %s",
6489 ovs_strerror(errno));
6490 }
6491 ovsthread_once_done(&once);
6492 }
6493
6494 return sock;
6495 }
6496
6497 static int
6498 netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
6499 {
6500 struct eth_header *eth_hdr;
6501 ovs_be16 eth_type;
6502 int l2_len;
6503
6504 eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
6505 if (!eth_hdr) {
6506 return -EINVAL;
6507 }
6508
6509 l2_len = ETH_HEADER_LEN;
6510 eth_type = eth_hdr->eth_type;
6511 if (eth_type_vlan(eth_type)) {
6512 struct vlan_header *vlan = dp_packet_at(b, l2_len, VLAN_HEADER_LEN);
6513
6514 if (!vlan) {
6515 return -EINVAL;
6516 }
6517
6518 eth_type = vlan->vlan_next_type;
6519 l2_len += VLAN_HEADER_LEN;
6520 }
6521
6522 if (eth_type == htons(ETH_TYPE_IP)) {
6523 struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN);
6524
6525 if (!ip_hdr) {
6526 return -EINVAL;
6527 }
6528
6529 *l4proto = ip_hdr->ip_proto;
6530 dp_packet_hwol_set_tx_ipv4(b);
6531 } else if (eth_type == htons(ETH_TYPE_IPV6)) {
6532 struct ovs_16aligned_ip6_hdr *nh6;
6533
6534 nh6 = dp_packet_at(b, l2_len, IPV6_HEADER_LEN);
6535 if (!nh6) {
6536 return -EINVAL;
6537 }
6538
6539 *l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
6540 dp_packet_hwol_set_tx_ipv6(b);
6541 }
6542
6543 return 0;
6544 }
6545
6546 static int
6547 netdev_linux_parse_vnet_hdr(struct dp_packet *b)
6548 {
6549 struct virtio_net_hdr *vnet = dp_packet_pull(b, sizeof *vnet);
6550 uint16_t l4proto = 0;
6551
6552 if (OVS_UNLIKELY(!vnet)) {
6553 return -EINVAL;
6554 }
6555
6556 if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) {
6557 return 0;
6558 }
6559
6560 if (netdev_linux_parse_l2(b, &l4proto)) {
6561 return -EINVAL;
6562 }
6563
6564 if (vnet->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
6565 if (l4proto == IPPROTO_TCP) {
6566 dp_packet_hwol_set_csum_tcp(b);
6567 } else if (l4proto == IPPROTO_UDP) {
6568 dp_packet_hwol_set_csum_udp(b);
6569 } else if (l4proto == IPPROTO_SCTP) {
6570 dp_packet_hwol_set_csum_sctp(b);
6571 }
6572 }
6573
6574 if (l4proto && vnet->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
6575 uint8_t allowed_mask = VIRTIO_NET_HDR_GSO_TCPV4
6576 | VIRTIO_NET_HDR_GSO_TCPV6
6577 | VIRTIO_NET_HDR_GSO_UDP;
6578 uint8_t type = vnet->gso_type & allowed_mask;
6579
6580 if (type == VIRTIO_NET_HDR_GSO_TCPV4
6581 || type == VIRTIO_NET_HDR_GSO_TCPV6) {
6582 dp_packet_hwol_set_tcp_seg(b);
6583 }
6584 }
6585
6586 return 0;
6587 }
6588
6589 static void
6590 netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu)
6591 {
6592 struct virtio_net_hdr *vnet = dp_packet_push_zeros(b, sizeof *vnet);
6593
6594 if (dp_packet_hwol_is_tso(b)) {
6595 uint16_t hdr_len = ((char *)dp_packet_l4(b) - (char *)dp_packet_eth(b))
6596 + TCP_HEADER_LEN;
6597
6598 vnet->hdr_len = (OVS_FORCE __virtio16)hdr_len;
6599 vnet->gso_size = (OVS_FORCE __virtio16)(mtu - hdr_len);
6600 if (dp_packet_hwol_is_ipv4(b)) {
6601 vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
6602 } else {
6603 vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
6604 }
6605
6606 } else {
6607 vnet->flags = VIRTIO_NET_HDR_GSO_NONE;
6608 }
6609
6610 if (dp_packet_hwol_l4_mask(b)) {
6611 vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
6612 vnet->csum_start = (OVS_FORCE __virtio16)((char *)dp_packet_l4(b)
6613 - (char *)dp_packet_eth(b));
6614
6615 if (dp_packet_hwol_l4_is_tcp(b)) {
6616 vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
6617 struct tcp_header, tcp_csum);
6618 } else if (dp_packet_hwol_l4_is_udp(b)) {
6619 vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
6620 struct udp_header, udp_csum);
6621 } else if (dp_packet_hwol_l4_is_sctp(b)) {
6622 vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
6623 struct sctp_header, sctp_csum);
6624 } else {
6625 VLOG_WARN_RL(&rl, "Unsupported L4 protocol");
6626 }
6627 }
6628 }