[ovs.git] lib/netdev-linux.c - netdev-linux: Enable TSO in the TAP device.
/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>

#include "netdev-linux.h"
#include "netdev-linux-private.h"

#include <errno.h>
#include <fcntl.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <inttypes.h>
#include <math.h>
#include <linux/filter.h>
#include <linux/gen_stats.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/if_tun.h>
#include <linux/types.h>
#include <linux/ethtool.h>
#include <linux/mii.h>
#include <linux/rtnetlink.h>
#include <linux/sockios.h>
#include <linux/virtio_net.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/utsname.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/route.h>
#include <poll.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "coverage.h"
#include "dp-packet.h"
#include "dpif-netlink.h"
#include "dpif-netdev.h"
#include "openvswitch/dynamic-string.h"
#include "fatal-signal.h"
#include "hash.h"
#include "openvswitch/hmap.h"
#include "netdev-afxdp.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "netlink-notifier.h"
#include "netlink-socket.h"
#include "netlink.h"
#include "netnsid.h"
#include "openvswitch/ofpbuf.h"
#include "openflow/openflow.h"
#include "ovs-atomic.h"
#include "ovs-numa.h"
#include "packets.h"
#include "openvswitch/poll-loop.h"
#include "rtnetlink.h"
#include "openvswitch/shash.h"
#include "socket-util.h"
#include "sset.h"
#include "tc.h"
#include "timer.h"
#include "unaligned.h"
#include "openvswitch/vlog.h"
#include "userspace-tso.h"
#include "util.h"

VLOG_DEFINE_THIS_MODULE(netdev_linux);

COVERAGE_DEFINE(netdev_set_policing);
COVERAGE_DEFINE(netdev_arp_lookup);
COVERAGE_DEFINE(netdev_get_ifindex);
COVERAGE_DEFINE(netdev_get_hwaddr);
COVERAGE_DEFINE(netdev_set_hwaddr);
COVERAGE_DEFINE(netdev_get_ethtool);
COVERAGE_DEFINE(netdev_set_ethtool);

\f
#ifndef IFLA_IF_NETNSID
#define IFLA_IF_NETNSID 0x45
#endif
/* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause (1 << 14)
#endif

/* These were introduced in Linux 2.6.24, so they might be missing if we
 * have old headers. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap (ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap (ethtool_value) */
#endif

/* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif

#ifndef TCM_IFINDEX_MAGIC_BLOCK
#define TCM_IFINDEX_MAGIC_BLOCK (0xFFFFFFFFU)
#endif

/* Linux 2.6.21 introduced struct tpacket_auxdata.
 * Linux 2.6.27 added the tp_vlan_tci member.
 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
 * TP_STATUS_VLAN_TPID_VALID.
 *
 * With all this churn it's easiest to unconditionally define a replacement
 * structure that has everything we want.
 */
#ifndef PACKET_AUXDATA
#define PACKET_AUXDATA 8
#endif
#ifndef TP_STATUS_VLAN_VALID
#define TP_STATUS_VLAN_VALID (1 << 4)
#endif
#ifndef TP_STATUS_VLAN_TPID_VALID
#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
#endif
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;
    uint16_t tp_vlan_tpid;
};

/* Linux 2.6.27 introduced ethtool_cmd_speed
 *
 * To avoid revisiting problems reported with using configure to detect
 * compatibility (see report at
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    return ep->speed | (ep->speed_hi << 16);
}

/* Linux 2.6.30 introduced supported and advertised flags for
 * 1G base KX, and 10G base KX4, KR and R. */
#ifndef SUPPORTED_1000baseKX_Full
#define SUPPORTED_1000baseKX_Full (1 << 17)
#define SUPPORTED_10000baseKX4_Full (1 << 18)
#define SUPPORTED_10000baseKR_Full (1 << 19)
#define SUPPORTED_10000baseR_FEC (1 << 20)
#define ADVERTISED_1000baseKX_Full (1 << 17)
#define ADVERTISED_10000baseKX4_Full (1 << 18)
#define ADVERTISED_10000baseKR_Full (1 << 19)
#define ADVERTISED_10000baseR_FEC (1 << 20)
#endif

/* Linux 3.5 introduced supported and advertised flags for
 * 40G base KR4, CR4, SR4 and LR4. */
#ifndef SUPPORTED_40000baseKR4_Full
#define SUPPORTED_40000baseKR4_Full (1 << 23)
#define SUPPORTED_40000baseCR4_Full (1 << 24)
#define SUPPORTED_40000baseSR4_Full (1 << 25)
#define SUPPORTED_40000baseLR4_Full (1 << 26)
#define ADVERTISED_40000baseKR4_Full (1 << 23)
#define ADVERTISED_40000baseCR4_Full (1 << 24)
#define ADVERTISED_40000baseSR4_Full (1 << 25)
#define ADVERTISED_40000baseLR4_Full (1 << 26)
#endif

/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
 *
 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
 * 2.6.32-431.29.2.el6.x86_64 (see report at
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
 * unconditionally define a replacement. */
#ifndef IFLA_STATS64
#define IFLA_STATS64 23
#endif
#define rtnl_link_stats64 rpl_rtnl_link_stats64
struct rtnl_link_stats64 {
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    uint64_t rx_compressed;
    uint64_t tx_compressed;
};

enum {
    VALID_IFINDEX           = 1 << 0,
    VALID_ETHERADDR         = 1 << 1,
    VALID_IN                = 1 << 2,
    VALID_MTU               = 1 << 3,
    VALID_POLICING          = 1 << 4,
    VALID_VPORT_STAT_ERROR  = 1 << 5,
    VALID_DRVINFO           = 1 << 6,
    VALID_FEATURES          = 1 << 7,
    VALID_NUMA_ID           = 1 << 8,
};

/* Use one for the packet buffer and another for the aux buffer to receive
 * TSO packets. */
#define IOV_STD_SIZE 1
#define IOV_TSO_SIZE 2

enum {
    IOV_PACKET = 0,
    IOV_AUXBUF = 1,
};
\f
struct linux_lag_slave {
    uint32_t block_id;
    struct shash_node *node;
};

/* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
static struct ovs_mutex lag_mutex = OVS_MUTEX_INITIALIZER;

/* All slaves whose LAG masters are network devices in OvS. */
static struct shash lag_shash OVS_GUARDED_BY(lag_mutex)
    = SHASH_INITIALIZER(&lag_shash);

/* Traffic control. */

/* An instance of a traffic control class. Always associated with a particular
 * network device.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc {
    const struct tc_ops *ops;
    struct hmap queues;         /* Contains "struct tc_queue"s.
                                 * Read by generic TC layer.
                                 * Written only by TC implementation. */
};

#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }

/* One traffic control queue.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc_queue {
    struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
    unsigned int queue_id;      /* OpenFlow queue ID. */
    long long int created;      /* Time queue was created, in msecs. */
};

/* A particular kind of traffic control. Each implementation generally maps to
 * one particular Linux qdisc class.
 *
 * The functions below return 0 if successful or a positive errno value on
 * failure, except where otherwise noted. All of them must be provided, except
 * where otherwise noted. */
struct tc_ops {
    /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
     * This is null for tc_ops_default and tc_ops_other, for which there are no
     * appropriate values. */
    const char *linux_name;

    /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
    const char *ovs_name;

    /* Number of supported OpenFlow queues, 0 for qdiscs that have no
     * queues. The queues are numbered 0 through n_queues - 1. */
    unsigned int n_queues;

    /* Called to install this TC class on 'netdev'. The implementation should
     * make the Netlink calls required to set up 'netdev' with the right qdisc
     * and configure it according to 'details'. The implementation may assume
     * that the current qdisc is the default; that is, there is no need for it
     * to delete the current qdisc before installing itself.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'.
     *
     * (This function is null for tc_ops_other, which cannot be installed. For
     * other TC classes it should always be nonnull.) */
    int (*tc_install)(struct netdev *netdev, const struct smap *details);

    /* Called when the netdev code determines (through a Netlink query) that
     * this TC class's qdisc is installed on 'netdev', but we didn't install
     * it ourselves and so don't know any of the details.
     *
     * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
     * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
     * implementation should parse the other attributes of 'nlmsg' as
     * necessary to determine its configuration. If necessary it should also
     * use Netlink queries to determine the configuration of queues on
     * 'netdev'.
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'. */
    int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);

    /* Destroys the data structures allocated by the implementation as part of
     * 'tc'. (This includes destroying 'tc->queues' by calling
     * tc_destroy(tc).)
     *
     * The implementation should not need to perform any Netlink calls. If
     * desirable, the caller is responsible for deconfiguring the kernel qdisc.
     * (But it may not be desirable.)
     *
     * This function may be null if 'tc' is trivial. */
    void (*tc_destroy)(struct tc *tc);

    /* Retrieves details of 'netdev->tc' configuration into 'details'.
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the configuration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_get)(const struct netdev *netdev, struct smap *details);

    /* Reconfigures 'netdev->tc' according to 'details', performing any
     * required Netlink calls to complete the reconfiguration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_set)(struct netdev *, const struct smap *details);

    /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
     * one of the 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the queue configuration.
     *
     * This function may be null if 'tc' does not have queues ('n_queues' is
     * 0). */
    int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
                     struct smap *details);

    /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
     * 'details', performing any required Netlink calls to complete the
     * reconfiguration. The caller ensures that 'queue_id' is less than
     * 'n_queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' does not have queues or its queues are
     * not configurable. */
    int (*class_set)(struct netdev *, unsigned int queue_id,
                     const struct smap *details);

    /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
     * tc_queue's within 'netdev->tc->queues'.
     *
     * This function may be null if 'tc' does not have queues or its queues
     * cannot be deleted. */
    int (*class_delete)(struct netdev *, struct tc_queue *queue);

    /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
     * 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * On success, initializes '*stats'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_get_stats)(const struct netdev *netdev,
                           const struct tc_queue *queue,
                           struct netdev_queue_stats *stats);

    /* Extracts queue stats from 'nlmsg', which is a response to a
     * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_dump_stats)(const struct netdev *netdev,
                            const struct ofpbuf *nlmsg,
                            netdev_dump_queue_stats_cb *cb, void *aux);
};

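/* Initializes 'tc' as a generic TC instance that uses 'ops'. A TC
 * implementation embeds a struct tc in its own structure and calls this from
 * its 'tc_install' or 'tc_load' function before setting 'netdev->tc'. A
 * minimal sketch (the real implementations appear later in this file):
 *
 *     struct htb *htb = xmalloc(sizeof *htb);
 *     tc_init(&htb->tc, &tc_ops_htb);
 */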
static void
tc_init(struct tc *tc, const struct tc_ops *ops)
{
    tc->ops = ops;
    hmap_init(&tc->queues);
}

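/* Destroys the 'queues' hmap embedded in 'tc'. Implementations' 'tc_destroy'
 * callbacks call this after freeing their own queue data; it does not free
 * 'tc' itself and makes no Netlink calls. */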
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}

static const struct tc_ops tc_ops_htb;
static const struct tc_ops tc_ops_hfsc;
static const struct tc_ops tc_ops_codel;
static const struct tc_ops tc_ops_fqcodel;
static const struct tc_ops tc_ops_sfq;
static const struct tc_ops tc_ops_netem;
static const struct tc_ops tc_ops_default;
static const struct tc_ops tc_ops_noop;
static const struct tc_ops tc_ops_other;

static const struct tc_ops *const tcs[] = {
    &tc_ops_htb,                /* Hierarchy token bucket (see tc-htb(8)). */
    &tc_ops_hfsc,               /* Hierarchical fair service curve. */
    &tc_ops_codel,              /* Controlled delay */
    &tc_ops_fqcodel,            /* Fair queue controlled delay */
    &tc_ops_sfq,                /* Stochastic fair queueing */
    &tc_ops_netem,              /* Network Emulator */
    &tc_ops_noop,               /* Non operating qos type. */
    &tc_ops_default,            /* Default qdisc (see tc-pfifo_fast(8)). */
    &tc_ops_other,              /* Some other qdisc. */
    NULL
};

static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
static unsigned int tc_buffer_per_jiffy(unsigned int rate);
static uint32_t tc_time_to_ticks(uint32_t time);

static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
                                                  int type,
                                                  unsigned int flags,
                                                  struct ofpbuf *);
static int tc_add_policer(struct netdev *,
                          uint32_t kbits_rate, uint32_t kbits_burst);

static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
                          struct nlattr **options);
static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
                          struct nlattr **options,
                          struct netdev_queue_stats *);
static int tc_query_class(const struct netdev *,
                          unsigned int handle, unsigned int parent,
                          struct ofpbuf **replyp);
static int tc_delete_class(const struct netdev *, unsigned int handle);

static int tc_del_qdisc(struct netdev *netdev);
static int tc_query_qdisc(const struct netdev *netdev);

void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate);
static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
\f

/* This is set pretty low because we probably won't learn anything from the
 * additional log messages. */
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);

/* Polling miimon status for all ports causes performance degradation when
 * handling a large number of ports. If there are no devices using miimon, then
 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
 *
 * Readers do not depend on this variable synchronizing with the related
 * changes in the device miimon status, so we can use atomic_count. */
static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);

static int netdev_linux_parse_vnet_hdr(struct dp_packet *b);
static void netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu);
static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
                                   int cmd, const char *cmd_name);
static int get_flags(const struct netdev *, unsigned int *flags);
static int set_flags(const char *, unsigned int flags);
static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
                        enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex);
static int get_ifindex(const struct netdev *, int *ifindexp);
static int do_set_addr(struct netdev *netdev,
                       int ioctl_nr, const char *ioctl_name,
                       struct in_addr addr);
static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
static int set_etheraddr(const char *netdev_name, const struct eth_addr);
static int af_packet_sock(void);
static bool netdev_linux_miimon_enabled(void);
static void netdev_linux_miimon_run(void);
static void netdev_linux_miimon_wait(void);
static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);

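/* Returns true if 'netdev' is a tap device created by the "tap" netdev
 * class, false for system and internal devices. */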
static bool
is_tap_netdev(const struct netdev *netdev)
{
    return netdev_get_class(netdev) == &netdev_tap_class;
}
\f
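/* Queries the kernel datapath (via the vport API) for the network namespace
 * ID of 'netdev' and caches it in 'netdev->netnsid'. If the datapath has no
 * record of the device (e.g. the openvswitch kernel module is not loaded),
 * the device is assumed to be in the local namespace. Returns 0 if
 * successful, otherwise a positive errno value. */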
static int
netdev_linux_netnsid_update__(struct netdev_linux *netdev)
{
    struct dpif_netlink_vport reply;
    struct ofpbuf *buf;
    int error;

    error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
    if (error) {
        if (error == ENOENT) {
            /* Assume it is local if there is no API (e.g. if the openvswitch
             * kernel module is not loaded). */
            netnsid_set_local(&netdev->netnsid);
        } else {
            netnsid_unset(&netdev->netnsid);
        }
        return error;
    }

    netnsid_set(&netdev->netnsid, reply.netnsid);
    ofpbuf_delete(buf);
    return 0;
}

static int
netdev_linux_netnsid_update(struct netdev_linux *netdev)
{
    if (netnsid_is_unset(netdev->netnsid)) {
        if (netdev_get_class(&netdev->up) == &netdev_tap_class) {
            netnsid_set_local(&netdev->netnsid);
        } else {
            return netdev_linux_netnsid_update__(netdev);
        }
    }

    return 0;
}

static bool
netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_eq(netdev->netnsid, nsid);
}

static bool
netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_is_remote(netdev->netnsid);
}

static int netdev_linux_update_via_netlink(struct netdev_linux *);
static void netdev_linux_update(struct netdev_linux *netdev, int,
                                const struct rtnetlink_change *)
    OVS_REQUIRES(netdev->mutex);
static void netdev_linux_changed(struct netdev_linux *netdev,
                                 unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(netdev->mutex);

/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
 * if no such socket could be created. */
static struct nl_sock *
netdev_linux_notify_sock(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static struct nl_sock *sock;
    unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
                               RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};

    if (ovsthread_once_start(&once)) {
        int error;

        error = nl_sock_create(NETLINK_ROUTE, &sock);
        if (!error) {
            size_t i;

            for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
                error = nl_sock_join_mcgroup(sock, mcgroups[i]);
                if (error) {
                    nl_sock_destroy(sock);
                    sock = NULL;
                    break;
                }
            }
        }
        nl_sock_listen_all_nsid(sock, true);
        ovsthread_once_done(&once);
    }

    return sock;
}

static bool
netdev_linux_miimon_enabled(void)
{
    return atomic_count_get(&miimon_cnt) > 0;
}

static bool
netdev_linux_kind_is_lag(const char *kind)
{
    if (!strcmp(kind, "bond") || !strcmp(kind, "team")) {
        return true;
    }

    return false;
}

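/* Reacts to a rtnetlink change on a device that is not itself in OVS: if the
 * device became a slave of a bond or team master that is a Linux netdev in
 * OVS, binds the slave's ingress qdisc to the master's TC block so that
 * flows installed on the block also apply to the slave; if the slave was
 * freed, removes the binding again. */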
static void
netdev_linux_update_lag(struct rtnetlink_change *change)
    OVS_REQUIRES(lag_mutex)
{
    struct linux_lag_slave *lag;

    if (!rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
        return;
    }

    if (change->slave && netdev_linux_kind_is_lag(change->slave)) {
        lag = shash_find_data(&lag_shash, change->ifname);

        if (!lag) {
            struct netdev *master_netdev;
            char master_name[IFNAMSIZ];
            uint32_t block_id;
            int error = 0;

            if_indextoname(change->master_ifindex, master_name);
            master_netdev = netdev_from_name(master_name);
            if (!master_netdev) {
                return;
            }

            if (is_netdev_linux_class(master_netdev->netdev_class)) {
                block_id = netdev_get_block_id(master_netdev);
                if (!block_id) {
                    netdev_close(master_netdev);
                    return;
                }

                lag = xmalloc(sizeof *lag);
                lag->block_id = block_id;
                lag->node = shash_add(&lag_shash, change->ifname, lag);

                /* Delete the ingress block in case it exists. */
693 tc_add_del_qdisc(change->if_index, false, 0, TC_INGRESS);
694 /* LAG master is linux netdev so add slave to same block. */
695 error = tc_add_del_qdisc(change->if_index, true, block_id,
696 TC_INGRESS);
697 if (error) {
698 VLOG_WARN("failed to bind LAG slave %s to master's block",
699 change->ifname);
700 shash_delete(&lag_shash, lag->node);
701 free(lag);
702 }
703 }
704
705 netdev_close(master_netdev);
706 }
707 } else if (change->master_ifindex == 0) {
708 /* Check if this was a lag slave that has been freed. */
709 lag = shash_find_data(&lag_shash, change->ifname);
710
711 if (lag) {
712 tc_add_del_qdisc(change->if_index, false, lag->block_id,
713 TC_INGRESS);
714 shash_delete(&lag_shash, lag->node);
715 free(lag);
716 }
717 }
718 }
719
720 void
721 netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
722 {
723 struct nl_sock *sock;
724 int error;
725
726 if (netdev_linux_miimon_enabled()) {
727 netdev_linux_miimon_run();
728 }
729
730 sock = netdev_linux_notify_sock();
731 if (!sock) {
732 return;
733 }
734
735 do {
736 uint64_t buf_stub[4096 / 8];
737 int nsid;
738 struct ofpbuf buf;
739
740 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
741 error = nl_sock_recv(sock, &buf, &nsid, false);
742 if (!error) {
743 struct rtnetlink_change change;
744
745 if (rtnetlink_parse(&buf, &change)) {
746 struct netdev *netdev_ = NULL;
747 char dev_name[IFNAMSIZ];
748
749 if (!change.ifname) {
750 change.ifname = if_indextoname(change.if_index, dev_name);
751 }
752
753 if (change.ifname) {
754 netdev_ = netdev_from_name(change.ifname);
755 }
756 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
757 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
758
759 ovs_mutex_lock(&netdev->mutex);
760 netdev_linux_update(netdev, nsid, &change);
761 ovs_mutex_unlock(&netdev->mutex);
762 }
763 else if (!netdev_ && change.ifname) {
764 /* Netdev is not present in OvS but its master could be. */
765 ovs_mutex_lock(&lag_mutex);
766 netdev_linux_update_lag(&change);
767 ovs_mutex_unlock(&lag_mutex);
768 }
769 netdev_close(netdev_);
770 }
771 } else if (error == ENOBUFS) {
772 struct shash device_shash;
773 struct shash_node *node;
774
775 nl_sock_drain(sock);
776
777 shash_init(&device_shash);
778 netdev_get_devices(&netdev_linux_class, &device_shash);
779 SHASH_FOR_EACH (node, &device_shash) {
780 struct netdev *netdev_ = node->data;
781 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
782 unsigned int flags;
783
784 ovs_mutex_lock(&netdev->mutex);
785 get_flags(netdev_, &flags);
786 netdev_linux_changed(netdev, flags, 0);
787 ovs_mutex_unlock(&netdev->mutex);
788
789 netdev_close(netdev_);
790 }
791 shash_destroy(&device_shash);
792 } else if (error != EAGAIN) {
793 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
794 VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
795 ovs_strerror(error));
796 }
797 ofpbuf_uninit(&buf);
798 } while (!error);
799 }
800
801 static void
802 netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
803 {
804 struct nl_sock *sock;
805
806 if (netdev_linux_miimon_enabled()) {
807 netdev_linux_miimon_wait();
808 }
809 sock = netdev_linux_notify_sock();
810 if (sock) {
811 nl_sock_wait(sock, POLLIN);
812 }
813 }
814
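/* Records that 'dev' changed: bumps the netdev change sequence, counts a
 * carrier reset when IFF_RUNNING toggles, and invalidates all cached state
 * except the bits kept by 'mask'. */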
static void
netdev_linux_changed(struct netdev_linux *dev,
                     unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(dev->mutex)
{
    netdev_change_seq_changed(&dev->up);

    if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
        dev->carrier_resets++;
    }
    dev->ifi_flags = ifi_flags;

    dev->cache_valid &= mask;
    if (!(mask & VALID_IN)) {
        netdev_get_addrs_list_flush();
    }
}

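/* Applies a parsed rtnetlink 'change' to 'dev': RTM_NEWLINK messages refresh
 * the cached MTU, MAC address, ifindex and LAG state, RTM_DELLINK messages
 * mark the device as absent, and address-group messages just invalidate the
 * cached IP addresses. */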
static void
netdev_linux_update__(struct netdev_linux *dev,
                      const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, ip addresses, and NUMA id. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN | VALID_NUMA_ID);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;

                /* The mac addr has been changed, report it now. */
                rtnetlink_report_link();
            }

            if (change->master && netdev_linux_kind_is_lag(change->master)) {
                dev->is_lag_master = true;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
            dev->present = true;
        } else {
            /* FIXME */
            netdev_linux_changed(dev, change->ifi_flags, 0);
            dev->present = false;
            netnsid_unset(&dev->netnsid);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        OVS_NOT_REACHED();
    }
}

static void
netdev_linux_update(struct netdev_linux *dev, int nsid,
                    const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (netdev_linux_netnsid_is_eq(dev, nsid)) {
        netdev_linux_update__(dev, change);
    }
}

static struct netdev *
netdev_linux_alloc(void)
{
    struct netdev_linux *netdev = xzalloc(sizeof *netdev);
    return &netdev->up;
}

static int
netdev_linux_common_construct(struct netdev *netdev_)
{
    /* Prevent any attempt to create (or open) a network device named "default"
     * or "all". These device names are effectively reserved on Linux because
     * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
     * itself this wouldn't call for any special treatment, but in practice if
     * a program tries to create devices with these names, it causes the kernel
     * to fire a "new device" notification event even though creation failed,
     * and in turn that causes OVS to wake up and try to create them again,
     * which ends up as a 100% CPU loop. */
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *name = netdev_->name;
    if (!strcmp(name, "default") || !strcmp(name, "all")) {
        static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
        VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
                     name);
        return EINVAL;
    }

    /* The device could be in the same network namespace or in another one. */
    netnsid_unset(&netdev->netnsid);
    ovs_mutex_init(&netdev->mutex);

    if (userspace_tso_enabled()) {
        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
    }

    return 0;
}

/* Creates system and internal devices. */
int
netdev_linux_construct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = netdev_linux_common_construct(netdev_);
    if (error) {
        return error;
    }

    error = get_flags(&netdev->up, &netdev->ifi_flags);
    if (error == ENODEV) {
        if (netdev->up.netdev_class != &netdev_internal_class) {
            /* The device does not exist, so don't allow it to be opened. */
            return ENODEV;
        } else {
            /* "Internal" netdevs have to be created as netdev objects before
             * they exist in the kernel, because creating them in the kernel
             * happens by passing a netdev object to dpif_port_add().
             * Therefore, ignore the error. */
        }
    }

    return 0;
}

/* For most types of netdevs we open the device for each call of
 * netdev_open(). However, this is not the case with tap devices,
 * since it is only possible to open the device once. In this
 * situation we share a single file descriptor, and consequently
 * buffers, across all readers. Therefore once data is read it will
 * be unavailable to other reads for tap devices. */
static int
netdev_linux_construct_tap(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const char tap_dev[] = "/dev/net/tun";
    const char *name = netdev_->name;
    struct ifreq ifr;

    int error = netdev_linux_common_construct(netdev_);
    if (error) {
        return error;
    }

    /* Open tap device. */
    netdev->tap_fd = open(tap_dev, O_RDWR);
    if (netdev->tap_fd < 0) {
        error = errno;
        VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
        return error;
    }

    /* Create tap device. */
    get_flags(&netdev->up, &netdev->ifi_flags);
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
    if (userspace_tso_enabled()) {
        ifr.ifr_flags |= IFF_VNET_HDR;
    }

    ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
    if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
        VLOG_WARN("%s: creating tap device failed: %s", name,
                  ovs_strerror(errno));
        error = errno;
        goto error_close;
    }

    /* Make non-blocking. */
    error = set_nonblocking(netdev->tap_fd);
    if (error) {
        goto error_close;
    }

    if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
        VLOG_WARN("%s: creating tap device failed (persist): %s", name,
                  ovs_strerror(errno));
        error = errno;
        goto error_close;
    }

    if (userspace_tso_enabled()) {
        /* Old kernels don't support TUNSETOFFLOAD. If TUNSETOFFLOAD is
         * available, it will return EINVAL when a flag is unknown.
         * Therefore, try enabling offload with no flags to check
         * if TUNSETOFFLOAD support is available or not. */
        if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, 0) == 0 || errno != EINVAL) {
            unsigned long oflags = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6;

            if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, oflags) == -1) {
                VLOG_WARN("%s: enabling tap offloading failed: %s", name,
                          ovs_strerror(errno));
                error = errno;
                goto error_close;
            }
        }
    }

    netdev->present = true;
    return 0;

error_close:
    close(netdev->tap_fd);
    return error;
}

static void
netdev_linux_destruct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (netdev->tc && netdev->tc->ops->tc_destroy) {
        netdev->tc->ops->tc_destroy(netdev->tc);
    }

    if (netdev_get_class(netdev_) == &netdev_tap_class
        && netdev->tap_fd >= 0)
    {
        ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
        close(netdev->tap_fd);
    }

    if (netdev->miimon_interval > 0) {
        atomic_count_dec(&miimon_cnt);
    }

    ovs_mutex_destroy(&netdev->mutex);
}

static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    free(netdev);
}

static struct netdev_rxq *
netdev_linux_rxq_alloc(void)
{
    struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
    return &rx->up;
}

static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4     jt 2  jf 3 */
            { 0x6, 0, 0, 0x00000000 },  /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff }   /* ret #65535 */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        if (userspace_tso_enabled()
            && setsockopt(rx->fd, SOL_PACKET, PACKET_VNET_HDR, &val,
                          sizeof val)) {
            error = errno;
1123 VLOG_ERR("%s: failed to enable vnet hdr in txq raw socket: %s",
1124 netdev_get_name(netdev_), ovs_strerror(errno));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}

static void
netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    int i;

    if (!rx->is_tap) {
        close(rx->fd);
    }

    for (i = 0; i < NETDEV_MAX_BURST; i++) {
        dp_packet_delete(rx->aux_bufs[i]);
    }
}

static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

    free(rx);
}

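/* Returns the VLAN TPID for the packet described by 'aux': the TPID reported
 * by the kernel when available, otherwise 802.1ad for a double-tagged packet
 * and 802.1Q for a single-tagged one. */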
static ovs_be16
auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
{
    if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
        return htons(aux->tp_vlan_tpid);
    } else if (double_tagged) {
        return htons(ETH_TYPE_VLAN_8021AD);
    } else {
        return htons(ETH_TYPE_VLAN_8021Q);
    }
}

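/* Returns true if 'aux' indicates that the kernel stripped a VLAN tag from
 * the packet. Kernels that predate TP_STATUS_VLAN_VALID cannot report a
 * valid zero TCI, so a nonzero TCI is also accepted. */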
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
}

/*
 * Receives packets from a raw socket in batches for better performance. It
 * can receive up to NETDEV_MAX_BURST packets at once; the received packets
 * are added to *batch. The return value is 0 or a positive errno.
 *
 * It also uses recvmmsg to reduce the overhead of multiple syscalls.
 */
static int
netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
                                 struct dp_packet_batch *batch)
{
    int iovlen;
    size_t std_len;
    ssize_t retval;
    int virtio_net_hdr_size;
    struct iovec iovs[NETDEV_MAX_BURST][IOV_TSO_SIZE];
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffers[NETDEV_MAX_BURST];
    struct mmsghdr mmsgs[NETDEV_MAX_BURST];
    struct dp_packet *buffers[NETDEV_MAX_BURST];
    int i;

    if (userspace_tso_enabled()) {
        /* Use the buffer from the allocated packet below to receive MTU
         * sized packets and an aux_buf for extra TSO data. */
        iovlen = IOV_TSO_SIZE;
        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
    } else {
        /* Use only the buffer from the allocated packet. */
        iovlen = IOV_STD_SIZE;
        virtio_net_hdr_size = 0;
    }

    /* The length here needs to be accounted in the same way as when the
     * aux_buf is allocated, so that it can be prepended to the TSO buffer. */
    std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
        buffers[i] = dp_packet_new_with_headroom(std_len, DP_NETDEV_HEADROOM);
        iovs[i][IOV_PACKET].iov_base = dp_packet_data(buffers[i]);
        iovs[i][IOV_PACKET].iov_len = std_len;
        if (iovlen == IOV_TSO_SIZE) {
            iovs[i][IOV_AUXBUF].iov_base = dp_packet_data(rx->aux_bufs[i]);
            iovs[i][IOV_AUXBUF].iov_len = dp_packet_tailroom(rx->aux_bufs[i]);
        }

        mmsgs[i].msg_hdr.msg_name = NULL;
        mmsgs[i].msg_hdr.msg_namelen = 0;
        mmsgs[i].msg_hdr.msg_iov = iovs[i];
        mmsgs[i].msg_hdr.msg_iovlen = iovlen;
        mmsgs[i].msg_hdr.msg_control = &cmsg_buffers[i];
        mmsgs[i].msg_hdr.msg_controllen = sizeof cmsg_buffers[i];
        mmsgs[i].msg_hdr.msg_flags = 0;
    }

    do {
        retval = recvmmsg(rx->fd, mmsgs, NETDEV_MAX_BURST, MSG_TRUNC, NULL);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        retval = errno;
        for (i = 0; i < NETDEV_MAX_BURST; i++) {
            dp_packet_delete(buffers[i]);
        }

        return retval;
    }

    for (i = 0; i < retval; i++) {
        struct dp_packet *pkt;

        if (mmsgs[i].msg_len < ETH_HEADER_LEN) {
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            dp_packet_delete(buffers[i]);
            netdev->rx_dropped += 1;
            VLOG_WARN_RL(&rl, "%s: Dropped packet: less than ether hdr size",
                         netdev_get_name(netdev_));
            continue;
        }

        if (mmsgs[i].msg_len > std_len) {
            /* Build a single linear TSO packet by prepending the data from
             * std_len buffer to the aux_buf. */
            pkt = rx->aux_bufs[i];
            dp_packet_set_size(pkt, mmsgs[i].msg_len - std_len);
            dp_packet_push(pkt, dp_packet_data(buffers[i]), std_len);
            /* The headroom should be the same in buffers[i], pkt and
             * DP_NETDEV_HEADROOM. */
            dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
            dp_packet_delete(buffers[i]);
            rx->aux_bufs[i] = NULL;
        } else {
            dp_packet_set_size(buffers[i], mmsgs[i].msg_len);
            pkt = buffers[i];
        }

        if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) {
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            /* Unexpected error situation: the virtio header is not present
             * or corrupted. Drop the packet but continue in case next ones
             * are correct. */
            dp_packet_delete(pkt);
            netdev->rx_dropped += 1;
            VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header",
                         netdev_get_name(netdev_));
            continue;
        }

        for (cmsg = CMSG_FIRSTHDR(&mmsgs[i].msg_hdr); cmsg;
             cmsg = CMSG_NXTHDR(&mmsgs[i].msg_hdr, cmsg)) {
            const struct tpacket_auxdata *aux;

            if (cmsg->cmsg_level != SOL_PACKET
                || cmsg->cmsg_type != PACKET_AUXDATA
                || cmsg->cmsg_len <
                       CMSG_LEN(sizeof(struct tpacket_auxdata))) {
                continue;
            }

            aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
            if (auxdata_has_vlan_tci(aux)) {
                struct eth_header *eth;
                bool double_tagged;

                eth = dp_packet_data(pkt);
                double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);

                eth_push_vlan(pkt,
                              auxdata_to_vlan_tpid(aux, double_tagged),
                              htons(aux->tp_vlan_tci));
                break;
            }
        }
        dp_packet_batch_add(batch, pkt);
    }

    /* Delete unused buffers. */
    for (; i < NETDEV_MAX_BURST; i++) {
        dp_packet_delete(buffers[i]);
    }

    return 0;
}

/*
 * Receives packets from a tap device in batches for better performance. It
 * can receive up to NETDEV_MAX_BURST packets at once; the received packets
 * are added to *batch. The return value is 0 or a positive errno.
 */
static int
netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
                                struct dp_packet_batch *batch)
{
    int virtio_net_hdr_size;
    ssize_t retval;
    size_t std_len;
    int iovlen;
    int i;

    if (userspace_tso_enabled()) {
        /* Use the buffer from the allocated packet below to receive MTU
         * sized packets and an aux_buf for extra TSO data. */
        iovlen = IOV_TSO_SIZE;
        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
    } else {
        /* Use only the buffer from the allocated packet. */
        iovlen = IOV_STD_SIZE;
        virtio_net_hdr_size = 0;
    }

    /* The length here needs to be accounted in the same way as when the
     * aux_buf is allocated, so that it can be prepended to the TSO buffer. */
    std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
        struct dp_packet *buffer;
        struct dp_packet *pkt;
        struct iovec iov[IOV_TSO_SIZE];

        /* Assume Ethernet port. No need to set packet_type. */
        buffer = dp_packet_new_with_headroom(std_len, DP_NETDEV_HEADROOM);
        iov[IOV_PACKET].iov_base = dp_packet_data(buffer);
        iov[IOV_PACKET].iov_len = std_len;
        if (iovlen == IOV_TSO_SIZE) {
            iov[IOV_AUXBUF].iov_base = dp_packet_data(rx->aux_bufs[i]);
            iov[IOV_AUXBUF].iov_len = dp_packet_tailroom(rx->aux_bufs[i]);
        }

        do {
            retval = readv(rx->fd, iov, iovlen);
        } while (retval < 0 && errno == EINTR);

        if (retval < 0) {
            dp_packet_delete(buffer);
            break;
        }

        if (retval > std_len) {
            /* Build a single linear TSO packet by prepending the data from
             * std_len buffer to the aux_buf. */
            pkt = rx->aux_bufs[i];
            dp_packet_set_size(pkt, retval - std_len);
            dp_packet_push(pkt, dp_packet_data(buffer), std_len);
            /* The headroom should be the same in 'buffer', 'pkt' and
             * DP_NETDEV_HEADROOM. */
            dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
            dp_packet_delete(buffer);
            rx->aux_bufs[i] = NULL;
        } else {
            dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
            pkt = buffer;
        }

        if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) {
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            /* Unexpected error situation: the virtio header is not present
             * or corrupted. Drop the packet but continue in case next ones
             * are correct. */
            dp_packet_delete(pkt);
            netdev->rx_dropped += 1;
            VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header",
                         netdev_get_name(netdev_));
            continue;
        }

        dp_packet_batch_add(batch, pkt);
    }

    if ((i == 0) && (retval < 0)) {
        return errno;
    }

    return 0;
}

static int
netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
                      int *qfill)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev = rx->up.netdev;
    ssize_t retval;
    int mtu;

    if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
        mtu = ETH_PAYLOAD_MAX;
    }

    if (userspace_tso_enabled()) {
        /* Allocate TSO packets. The packet has enough headroom to store
         * a full non-TSO packet. When a TSO packet is received, the data
         * from non-TSO buffer (std_len) is prepended to the TSO packet
         * (aux_buf). */
        size_t std_len = sizeof(struct virtio_net_hdr) + VLAN_ETH_HEADER_LEN
                         + DP_NETDEV_HEADROOM + mtu;
        size_t data_len = LINUX_RXQ_TSO_MAX_LEN - std_len;
        for (int i = 0; i < NETDEV_MAX_BURST; i++) {
            if (rx->aux_bufs[i]) {
                continue;
            }

            rx->aux_bufs[i] = dp_packet_new_with_headroom(data_len, std_len);
        }
    }

    dp_packet_batch_init(batch);
    retval = (rx->is_tap
              ? netdev_linux_batch_rxq_recv_tap(rx, mtu, batch)
              : netdev_linux_batch_rxq_recv_sock(rx, mtu, batch));

    if (retval) {
        if (retval != EAGAIN && retval != EMSGSIZE) {
            /* The recv helpers return errno values, so use 'retval' rather
             * than 'errno', which may be stale by now. */
            VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
                         netdev_rxq_get_name(rxq_), ovs_strerror(retval));
        }
    }

    if (qfill) {
        *qfill = -ENOTSUP;
    }

    return retval;
}

static void
netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    poll_fd_wait(rx->fd, POLLIN);
}

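/* Discards any packets queued on 'rxq_': for a tap device, reads and drops
 * up to the interface's queue length of packets from the fd; for other
 * devices, drains the socket receive buffer. */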
static int
netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    if (rx->is_tap) {
        struct ifreq ifr;
        int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
                                        SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
        if (error) {
            return error;
        }
        drain_fd(rx->fd, ifr.ifr_qlen);
        return 0;
    } else {
        return drain_rcvbuf(rx->fd);
    }
}

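/* Sends 'batch' on the AF_PACKET socket 'sock' bound to 'ifindex', using
 * sendmmsg to transmit multiple packets per system call. When 'tso' is
 * enabled, a virtio_net header sized against 'mtu' is prepended to each
 * packet. Returns 0 if successful, otherwise a positive errno value. */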
static int
netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu,
                             struct dp_packet_batch *batch)
{
    const size_t size = dp_packet_batch_size(batch);
    /* We don't bother setting most fields in sockaddr_ll because the
     * kernel ignores them for SOCK_RAW. */
    struct sockaddr_ll sll = { .sll_family = AF_PACKET,
                               .sll_ifindex = ifindex };

    struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
    struct iovec *iov = xmalloc(sizeof(*iov) * size);

    struct dp_packet *packet;
    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        if (tso) {
            netdev_linux_prepend_vnet_hdr(packet, mtu);
        }

        iov[i].iov_base = dp_packet_data(packet);
        iov[i].iov_len = dp_packet_size(packet);
        mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
                                            .msg_namelen = sizeof sll,
                                            .msg_iov = &iov[i],
                                            .msg_iovlen = 1 };
    }

    int error = 0;
    for (uint32_t ofs = 0; ofs < size; ) {
        ssize_t retval;
        do {
            retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);
        if (error) {
            break;
        }
        ofs += retval;
    }

    free(mmsg);
    free(iov);
    return error;
}

/* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
 * essential, because packets sent to a tap device with an AF_PACKET socket
 * will loop back to be *received* again on the tap device. This doesn't occur
 * on other interface types because we attach a socket filter to the rx
 * socket. */
static int
netdev_linux_tap_batch_send(struct netdev *netdev_, bool tso, int mtu,
                            struct dp_packet_batch *batch)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct dp_packet *packet;

    /* The Linux tap driver returns EIO if the device is not up,
     * so if the device is not up, don't waste time sending it.
     * However, if the device is in another network namespace
     * then OVS can't retrieve the state. In that case, send the
     * packets anyway. */
    if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
        netdev->tx_dropped += dp_packet_batch_size(batch);
        return 0;
    }

    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        size_t size;
        ssize_t retval;
        int error;

        if (tso) {
            netdev_linux_prepend_vnet_hdr(packet, mtu);
        }

        size = dp_packet_size(packet);
        do {
            retval = write(netdev->tap_fd, dp_packet_data(packet), size);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);

        if (error) {
            /* The Linux tap driver returns EIO if the device is not up. From
             * the OVS side this is not an error, so we ignore it; otherwise,
             * return the error. */
            if (error != EIO) {
                return error;
            }
        } else if (retval != size) {
            VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
                         "bytes of %"PRIuSIZE") on %s",
                         retval, size, netdev_get_name(netdev_));
            return EMSGSIZE;
        }
    }
    return 0;
}

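/* Returns the NUMA node id of 'netdev', read from
 * /sys/class/net/<name>/device/numa_node and cached afterwards. Falls back
 * to node 0 for virtual devices, single-node systems, or when the id cannot
 * be determined. */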
1630 static int
1631 netdev_linux_get_numa_id__(struct netdev_linux *netdev)
1632 OVS_REQUIRES(netdev->mutex)
1633 {
1634 char *numa_node_path;
1635 const char *name;
1636 int node_id;
1637 FILE *stream;
1638
1639 if (netdev->cache_valid & VALID_NUMA_ID) {
1640 return netdev->numa_id;
1641 }
1642
1643 netdev->numa_id = 0;
1644 netdev->cache_valid |= VALID_NUMA_ID;
1645
1646 if (ovs_numa_get_n_numas() < 2) {
1647 /* No need to check on system with a single NUMA node. */
1648 return 0;
1649 }
1650
1651 name = netdev_get_name(&netdev->up);
1652 if (strpbrk(name, "/\\")) {
1653 VLOG_ERR_RL(&rl, "\"%s\" is not a valid name for a port. "
1654 "A valid name must not include '/' or '\\'."
1655 "Using numa_id 0", name);
1656 return 0;
1657 }
1658
1659 numa_node_path = xasprintf("/sys/class/net/%s/device/numa_node", name);
1660
1661 stream = fopen(numa_node_path, "r");
1662 if (!stream) {
1663 /* Virtual device does not have this info. */
1664 VLOG_INFO_RL(&rl, "%s: Can't open '%s': %s, using numa_id 0",
1665 name, numa_node_path, ovs_strerror(errno));
1666 free(numa_node_path);
1667 return 0;
1668 }
1669
1670 if (fscanf(stream, "%d", &node_id) != 1
1671 || !ovs_numa_numa_id_is_valid(node_id)) {
1672 VLOG_WARN_RL(&rl, "%s: Can't detect NUMA node, using numa_id 0", name);
1673 node_id = 0;
1674 }
1675
1676 netdev->numa_id = node_id;
1677 fclose(stream);
1678 free(numa_node_path);
1679 return node_id;
1680 }
1681
1682 static int OVS_UNUSED
1683 netdev_linux_get_numa_id(const struct netdev *netdev_)
1684 {
1685 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1686 int numa_id;
1687
1688 ovs_mutex_lock(&netdev->mutex);
1689 numa_id = netdev_linux_get_numa_id__(netdev);
1690 ovs_mutex_unlock(&netdev->mutex);
1691
1692 return numa_id;
1693 }
1694
1695 /* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
1696 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1697 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1698 * the packet is too big or too small to transmit on the device.
1699 *
1700 * The kernel maintains a packet transmission queue, so the caller is not
1701 * expected to do additional queuing of packets. */
1702 static int
1703 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1704 struct dp_packet_batch *batch,
1705 bool concurrent_txq OVS_UNUSED)
1706 {
1707 bool tso = userspace_tso_enabled();
1708 int mtu = ETH_PAYLOAD_MAX;
1709 int error = 0;
1710 int sock = 0;
1711
1712 if (tso) {
1713 netdev_linux_get_mtu__(netdev_linux_cast(netdev_), &mtu);
1714 }
1715
1716 if (!is_tap_netdev(netdev_)) {
1717 if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_))) {
1718 error = EOPNOTSUPP;
1719 goto free_batch;
1720 }
1721
1722 sock = af_packet_sock();
1723 if (sock < 0) {
1724 error = -sock;
1725 goto free_batch;
1726 }
1727
1728 int ifindex = netdev_get_ifindex(netdev_);
1729 if (ifindex < 0) {
1730 error = -ifindex;
1731 goto free_batch;
1732 }
1733
1734 error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, batch);
1735 } else {
1736 error = netdev_linux_tap_batch_send(netdev_, tso, mtu, batch);
1737 }
1738 if (error) {
1739 if (error == ENOBUFS) {
1740 /* The Linux AF_PACKET implementation never blocks waiting
1741 * for room for packets, instead returning ENOBUFS.
1742 * Translate this into EAGAIN for the caller. */
1743 error = EAGAIN;
1744 } else {
1745 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1746 netdev_get_name(netdev_), ovs_strerror(error));
1747 }
1748 }
1749
1750 free_batch:
1751 dp_packet_delete_batch(batch, true);
1752 return error;
1753 }
1754
1755 /* Registers with the poll loop to wake up from the next call to poll_block()
1756 * when the packet transmission queue has sufficient room to transmit a packet
1757 * with netdev_send().
1758 *
1759 * The kernel maintains a packet transmission queue, so the client is not
1760 * expected to do additional queuing of packets. Thus, this function is
1761 * unlikely to ever be used. It is included for completeness. */
1762 static void
1763 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1764 {
1765 if (is_tap_netdev(netdev)) {
1766         /* TAP device always accepts packets. */
1767 poll_immediate_wake();
1768 }
1769 }
1770
1771 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1772 * otherwise a positive errno value. */
1773 static int
1774 netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
1775 {
1776 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1777 enum netdev_flags old_flags = 0;
1778 int error;
1779
1780 ovs_mutex_lock(&netdev->mutex);
1781 if (netdev_linux_netnsid_is_remote(netdev)) {
1782 error = EOPNOTSUPP;
1783 goto exit;
1784 }
1785
1786 if (netdev->cache_valid & VALID_ETHERADDR) {
1787 error = netdev->ether_addr_error;
1788 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1789 goto exit;
1790 }
1791 netdev->cache_valid &= ~VALID_ETHERADDR;
1792 }
1793
1794 /* Tap devices must be brought down before setting the address. */
1795 if (is_tap_netdev(netdev_)) {
1796 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1797 }
1798 error = set_etheraddr(netdev_get_name(netdev_), mac);
1799 if (!error || error == ENODEV) {
1800 netdev->ether_addr_error = error;
1801 netdev->cache_valid |= VALID_ETHERADDR;
1802 if (!error) {
1803 netdev->etheraddr = mac;
1804 }
1805 }
1806
1807 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1808 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1809 }
1810
1811 exit:
1812 ovs_mutex_unlock(&netdev->mutex);
1813 return error;
1814 }
1815
1816 /* Copies 'netdev''s MAC address into the caller-provided 'mac'. */
1817 static int
1818 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1819 {
1820 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1821 int error;
1822
1823 ovs_mutex_lock(&netdev->mutex);
1824 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1825 netdev_linux_update_via_netlink(netdev);
1826 }
1827
1828 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1829         /* Fall back to ioctl if netlink fails. */
1830 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1831 &netdev->etheraddr);
1832 netdev->cache_valid |= VALID_ETHERADDR;
1833 }
1834
1835 error = netdev->ether_addr_error;
1836 if (!error) {
1837 *mac = netdev->etheraddr;
1838 }
1839 ovs_mutex_unlock(&netdev->mutex);
1840
1841 return error;
1842 }
1843
1844 static int
1845 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1846 {
1847 int error;
1848
1849 if (!(netdev->cache_valid & VALID_MTU)) {
1850 netdev_linux_update_via_netlink(netdev);
1851 }
1852
1853 if (!(netdev->cache_valid & VALID_MTU)) {
1854         /* Fall back to ioctl if netlink fails. */
1855 struct ifreq ifr;
1856
1857 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1858 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1859 netdev->mtu = ifr.ifr_mtu;
1860 netdev->cache_valid |= VALID_MTU;
1861 }
1862
1863 error = netdev->netdev_mtu_error;
1864 if (!error) {
1865 *mtup = netdev->mtu;
1866 }
1867
1868 return error;
1869 }
1870
1871 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1872 * in bytes, not including the hardware header; thus, this is typically 1500
1873 * bytes for Ethernet devices. */
1874 static int
1875 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1876 {
1877 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1878 int error;
1879
1880 ovs_mutex_lock(&netdev->mutex);
1881 error = netdev_linux_get_mtu__(netdev, mtup);
1882 ovs_mutex_unlock(&netdev->mutex);
1883
1884 return error;
1885 }
1886
1887 /* Sets the maximum size of transmitted packets (MTU), in bytes, for the
1888  * given device, using the Linux networking ioctl interface.
1889  */
1890 static int
1891 netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
1892 {
1893 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1894 struct ifreq ifr;
1895 int error;
1896
1897 ovs_mutex_lock(&netdev->mutex);
1898 if (netdev_linux_netnsid_is_remote(netdev)) {
1899 error = EOPNOTSUPP;
1900 goto exit;
1901 }
1902
1903 #ifdef HAVE_AF_XDP
1904 if (netdev_get_class(netdev_) == &netdev_afxdp_class) {
1905 error = netdev_afxdp_verify_mtu_size(netdev_, mtu);
1906 if (error) {
1907 goto exit;
1908 }
1909 }
1910 #endif
1911
1912 if (netdev->cache_valid & VALID_MTU) {
1913 error = netdev->netdev_mtu_error;
1914 if (error || netdev->mtu == mtu) {
1915 goto exit;
1916 }
1917 netdev->cache_valid &= ~VALID_MTU;
1918 }
1919 ifr.ifr_mtu = mtu;
1920 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1921 SIOCSIFMTU, "SIOCSIFMTU");
1922 if (!error || error == ENODEV) {
1923 netdev->netdev_mtu_error = error;
1924 netdev->mtu = ifr.ifr_mtu;
1925 netdev->cache_valid |= VALID_MTU;
1926 }
1927 exit:
1928 ovs_mutex_unlock(&netdev->mutex);
1929 return error;
1930 }
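
/* Usage sketch (illustrative): this typically runs in response to a database
 * change such as
 *
 *     ovs-vsctl set interface eth0 mtu_request=9000
 *
 * after which the bridge layer calls netdev_set_mtu(netdev, 9000).  'eth0'
 * and the value are placeholders. */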
1931
1932 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1933 * On failure, returns a negative errno value. */
1934 static int
1935 netdev_linux_get_ifindex(const struct netdev *netdev_)
1936 {
1937 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1938 int ifindex, error;
1939
1940 ovs_mutex_lock(&netdev->mutex);
1941 if (netdev_linux_netnsid_is_remote(netdev)) {
1942 error = EOPNOTSUPP;
1943 goto exit;
1944 }
1945 error = get_ifindex(netdev_, &ifindex);
1946
1947 exit:
1948 ovs_mutex_unlock(&netdev->mutex);
1949 return error ? -error : ifindex;
1950 }
1951
1952 static int
1953 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1954 {
1955 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1956
1957 ovs_mutex_lock(&netdev->mutex);
1958 if (netdev->miimon_interval > 0) {
1959 *carrier = netdev->miimon;
1960 } else {
1961 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1962 }
1963 ovs_mutex_unlock(&netdev->mutex);
1964
1965 return 0;
1966 }
1967
1968 static long long int
1969 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1970 {
1971 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1972 long long int carrier_resets;
1973
1974 ovs_mutex_lock(&netdev->mutex);
1975 carrier_resets = netdev->carrier_resets;
1976 ovs_mutex_unlock(&netdev->mutex);
1977
1978 return carrier_resets;
1979 }
1980
1981 static int
1982 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1983 struct mii_ioctl_data *data)
1984 {
1985 struct ifreq ifr;
1986 int error;
1987
1988 memset(&ifr, 0, sizeof ifr);
1989 memcpy(&ifr.ifr_data, data, sizeof *data);
1990 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1991 memcpy(data, &ifr.ifr_data, sizeof *data);
1992
1993 return error;
1994 }
1995
1996 static int
1997 netdev_linux_get_miimon(const char *name, bool *miimon)
1998 {
1999 struct mii_ioctl_data data;
2000 int error;
2001
2002 *miimon = false;
2003
2004 memset(&data, 0, sizeof data);
2005 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
2006 if (!error) {
2007         /* data.phy_id was filled in by the SIOCGMIIPHY call above. */
2008 data.reg_num = MII_BMSR;
2009 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
2010 &data);
2011
2012 if (!error) {
2013 *miimon = !!(data.val_out & BMSR_LSTATUS);
2014 }
2015 }
2016 if (error) {
2017 struct ethtool_cmd ecmd;
2018
2019 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
2020 name);
2021
2022 COVERAGE_INC(netdev_get_ethtool);
2023 memset(&ecmd, 0, sizeof ecmd);
2024 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
2025 "ETHTOOL_GLINK");
2026 if (!error) {
2027 struct ethtool_value eval;
2028
2029 memcpy(&eval, &ecmd, sizeof eval);
2030 *miimon = !!eval.data;
2031 } else {
2032 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
2033 }
2034 }
2035
2036 return error;
2037 }
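
/* Illustration: BMSR_LSTATUS is the link-status bit (0x0004) of the MII
 * Basic Mode Status Register, so e.g. a raw val_out of 0x796d reads as
 * link up.  The register value shown is only an example. */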
2038
2039 static int
2040 netdev_linux_set_miimon_interval(struct netdev *netdev_,
2041 long long int interval)
2042 {
2043 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2044
2045 ovs_mutex_lock(&netdev->mutex);
2046 interval = interval > 0 ? MAX(interval, 100) : 0;
2047 if (netdev->miimon_interval != interval) {
2048 if (interval && !netdev->miimon_interval) {
2049 atomic_count_inc(&miimon_cnt);
2050 } else if (!interval && netdev->miimon_interval) {
2051 atomic_count_dec(&miimon_cnt);
2052 }
2053
2054 netdev->miimon_interval = interval;
2055 timer_set_expired(&netdev->miimon_timer);
2056 }
2057 ovs_mutex_unlock(&netdev->mutex);
2058
2059 return 0;
2060 }
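
/* Usage sketch (illustrative): miimon polling is normally enabled from the
 * database for bonded ports, e.g.
 *
 *     ovs-vsctl set port bond0 other_config:bond-miimon-interval=200
 *
 * Intervals below 100 ms are clamped to 100 ms above; 'bond0' and the value
 * are placeholders. */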
2061
2062 static void
2063 netdev_linux_miimon_run(void)
2064 {
2065 struct shash device_shash;
2066 struct shash_node *node;
2067
2068 shash_init(&device_shash);
2069 netdev_get_devices(&netdev_linux_class, &device_shash);
2070 SHASH_FOR_EACH (node, &device_shash) {
2071 struct netdev *netdev = node->data;
2072 struct netdev_linux *dev = netdev_linux_cast(netdev);
2073 bool miimon;
2074
2075 ovs_mutex_lock(&dev->mutex);
2076 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
2077 netdev_linux_get_miimon(dev->up.name, &miimon);
2078 if (miimon != dev->miimon) {
2079 dev->miimon = miimon;
2080 netdev_linux_changed(dev, dev->ifi_flags, 0);
2081 }
2082
2083 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
2084 }
2085 ovs_mutex_unlock(&dev->mutex);
2086 netdev_close(netdev);
2087 }
2088
2089 shash_destroy(&device_shash);
2090 }
2091
2092 static void
2093 netdev_linux_miimon_wait(void)
2094 {
2095 struct shash device_shash;
2096 struct shash_node *node;
2097
2098 shash_init(&device_shash);
2099 netdev_get_devices(&netdev_linux_class, &device_shash);
2100 SHASH_FOR_EACH (node, &device_shash) {
2101 struct netdev *netdev = node->data;
2102 struct netdev_linux *dev = netdev_linux_cast(netdev);
2103
2104 ovs_mutex_lock(&dev->mutex);
2105 if (dev->miimon_interval > 0) {
2106 timer_wait(&dev->miimon_timer);
2107 }
2108 ovs_mutex_unlock(&dev->mutex);
2109 netdev_close(netdev);
2110 }
2111 shash_destroy(&device_shash);
2112 }
2113
2114 static void
2115 swap_uint64(uint64_t *a, uint64_t *b)
2116 {
2117 uint64_t tmp = *a;
2118 *a = *b;
2119 *b = tmp;
2120 }
2121
2122 /* Copies 'src' into 'dst', performing format conversion in the process.
2123 *
2124 * 'src' is allowed to be misaligned. */
2125 static void
2126 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
2127 const struct ovs_vport_stats *src)
2128 {
2129 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
2130 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
2131 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
2132 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
2133 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
2134 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
2135 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
2136 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
2137 dst->multicast = 0;
2138 dst->collisions = 0;
2139 dst->rx_length_errors = 0;
2140 dst->rx_over_errors = 0;
2141 dst->rx_crc_errors = 0;
2142 dst->rx_frame_errors = 0;
2143 dst->rx_fifo_errors = 0;
2144 dst->rx_missed_errors = 0;
2145 dst->tx_aborted_errors = 0;
2146 dst->tx_carrier_errors = 0;
2147 dst->tx_fifo_errors = 0;
2148 dst->tx_heartbeat_errors = 0;
2149 dst->tx_window_errors = 0;
2150 }
2151
2152 static int
2153 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
2154 {
2155 struct dpif_netlink_vport reply;
2156 struct ofpbuf *buf;
2157 int error;
2158
2159 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
2160 if (error) {
2161 return error;
2162 } else if (!reply.stats) {
2163 ofpbuf_delete(buf);
2164 return EOPNOTSUPP;
2165 }
2166
2167 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
2168
2169 ofpbuf_delete(buf);
2170
2171 return 0;
2172 }
2173
2174 static void
2175 get_stats_via_vport(const struct netdev *netdev_,
2176 struct netdev_stats *stats)
2177 {
2178 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2179
2180 if (!netdev->vport_stats_error ||
2181 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
2182 int error;
2183
2184 error = get_stats_via_vport__(netdev_, stats);
2185 if (error && error != ENOENT && error != ENODEV) {
2186 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
2187 "(%s)",
2188 netdev_get_name(netdev_), ovs_strerror(error));
2189 }
2190 netdev->vport_stats_error = error;
2191 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
2192 }
2193 }
2194
2195 /* Retrieves current device stats for 'netdev-linux'. */
2196 static int
2197 netdev_linux_get_stats(const struct netdev *netdev_,
2198 struct netdev_stats *stats)
2199 {
2200 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2201 struct netdev_stats dev_stats;
2202 int error;
2203
2204 ovs_mutex_lock(&netdev->mutex);
2205 get_stats_via_vport(netdev_, stats);
2206 error = get_stats_via_netlink(netdev_, &dev_stats);
2207 if (error) {
2208 if (!netdev->vport_stats_error) {
2209 error = 0;
2210 }
2211 } else if (netdev->vport_stats_error) {
2212         /* Stats are not available from OVS, so use kernel netdev stats. */
2213 *stats = dev_stats;
2214 } else {
2215 /* Use kernel netdev's packet and byte counts since vport's counters
2216 * do not reflect packet counts on the wire when GSO, TSO or GRO are
2217 * enabled. */
2218 stats->rx_packets = dev_stats.rx_packets;
2219 stats->rx_bytes = dev_stats.rx_bytes;
2220 stats->tx_packets = dev_stats.tx_packets;
2221 stats->tx_bytes = dev_stats.tx_bytes;
2222
2223 stats->rx_errors += dev_stats.rx_errors;
2224 stats->tx_errors += dev_stats.tx_errors;
2225 stats->rx_dropped += dev_stats.rx_dropped;
2226 stats->tx_dropped += dev_stats.tx_dropped;
2227 stats->multicast += dev_stats.multicast;
2228 stats->collisions += dev_stats.collisions;
2229 stats->rx_length_errors += dev_stats.rx_length_errors;
2230 stats->rx_over_errors += dev_stats.rx_over_errors;
2231 stats->rx_crc_errors += dev_stats.rx_crc_errors;
2232 stats->rx_frame_errors += dev_stats.rx_frame_errors;
2233 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
2234 stats->rx_missed_errors += dev_stats.rx_missed_errors;
2235 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
2236 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
2237 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
2238 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
2239 stats->tx_window_errors += dev_stats.tx_window_errors;
2240 }
2241 ovs_mutex_unlock(&netdev->mutex);
2242
2243 return error;
2244 }
2245
2246 /* Retrieves current device stats for a 'netdev-tap' or 'netdev-internal'
2247  * device. */
2248 static int
2249 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
2250 {
2251 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2252 struct netdev_stats dev_stats;
2253 int error;
2254
2255 ovs_mutex_lock(&netdev->mutex);
2256 get_stats_via_vport(netdev_, stats);
2257 error = get_stats_via_netlink(netdev_, &dev_stats);
2258 if (error) {
2259 if (!netdev->vport_stats_error) {
2260 error = 0;
2261 }
2262 } else if (netdev->vport_stats_error) {
2263 /* Transmit and receive stats will appear to be swapped relative to the
2264 * other ports since we are the one sending the data, not a remote
2265 * computer. For consistency, we swap them back here. This does not
2266 * apply if we are getting stats from the vport layer because it always
2267 * tracks stats from the perspective of the switch. */
2268
2269 *stats = dev_stats;
2270 swap_uint64(&stats->rx_packets, &stats->tx_packets);
2271 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
2272 swap_uint64(&stats->rx_errors, &stats->tx_errors);
2273 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
2274 stats->rx_length_errors = 0;
2275 stats->rx_over_errors = 0;
2276 stats->rx_crc_errors = 0;
2277 stats->rx_frame_errors = 0;
2278 stats->rx_fifo_errors = 0;
2279 stats->rx_missed_errors = 0;
2280 stats->tx_aborted_errors = 0;
2281 stats->tx_carrier_errors = 0;
2282 stats->tx_fifo_errors = 0;
2283 stats->tx_heartbeat_errors = 0;
2284 stats->tx_window_errors = 0;
2285 } else {
2286 /* Use kernel netdev's packet and byte counts since vport counters
2287 * do not reflect packet counts on the wire when GSO, TSO or GRO
2288 * are enabled. */
2289 stats->rx_packets = dev_stats.tx_packets;
2290 stats->rx_bytes = dev_stats.tx_bytes;
2291 stats->tx_packets = dev_stats.rx_packets;
2292 stats->tx_bytes = dev_stats.rx_bytes;
2293
2294 stats->rx_dropped += dev_stats.tx_dropped;
2295 stats->tx_dropped += dev_stats.rx_dropped;
2296
2297 stats->rx_errors += dev_stats.tx_errors;
2298 stats->tx_errors += dev_stats.rx_errors;
2299
2300 stats->multicast += dev_stats.multicast;
2301 stats->collisions += dev_stats.collisions;
2302 }
2303 stats->tx_dropped += netdev->tx_dropped;
2304 stats->rx_dropped += netdev->rx_dropped;
2305 ovs_mutex_unlock(&netdev->mutex);
2306
2307 return error;
2308 }
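
/* Example of the swap above (numbers illustrative): if the kernel reports
 * tx_packets=100 for a tap device, those are packets the guest sent into
 * the switch, so the switch-side view reports them as rx_packets=100. */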
2309
2310 static int
2311 netdev_internal_get_stats(const struct netdev *netdev_,
2312 struct netdev_stats *stats)
2313 {
2314 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2315 int error;
2316
2317 ovs_mutex_lock(&netdev->mutex);
2318 get_stats_via_vport(netdev_, stats);
2319 error = netdev->vport_stats_error;
2320 ovs_mutex_unlock(&netdev->mutex);
2321
2322 return error;
2323 }
2324
2325 static void
2326 netdev_linux_read_features(struct netdev_linux *netdev)
2327 {
2328 struct ethtool_cmd ecmd;
2329 uint32_t speed;
2330 int error;
2331
2332 if (netdev->cache_valid & VALID_FEATURES) {
2333 return;
2334 }
2335
2336 COVERAGE_INC(netdev_get_ethtool);
2337 memset(&ecmd, 0, sizeof ecmd);
2338 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
2339 ETHTOOL_GSET, "ETHTOOL_GSET");
2340 if (error) {
2341 goto out;
2342 }
2343
2344 /* Supported features. */
2345 netdev->supported = 0;
2346 if (ecmd.supported & SUPPORTED_10baseT_Half) {
2347 netdev->supported |= NETDEV_F_10MB_HD;
2348 }
2349 if (ecmd.supported & SUPPORTED_10baseT_Full) {
2350 netdev->supported |= NETDEV_F_10MB_FD;
2351 }
2352 if (ecmd.supported & SUPPORTED_100baseT_Half) {
2353 netdev->supported |= NETDEV_F_100MB_HD;
2354 }
2355 if (ecmd.supported & SUPPORTED_100baseT_Full) {
2356 netdev->supported |= NETDEV_F_100MB_FD;
2357 }
2358 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
2359 netdev->supported |= NETDEV_F_1GB_HD;
2360 }
2361 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
2362 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
2363 netdev->supported |= NETDEV_F_1GB_FD;
2364 }
2365 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
2366 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
2367 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
2368 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
2369 netdev->supported |= NETDEV_F_10GB_FD;
2370 }
2371 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
2372 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
2373 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
2374 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
2375 netdev->supported |= NETDEV_F_40GB_FD;
2376 }
2377 if (ecmd.supported & SUPPORTED_TP) {
2378 netdev->supported |= NETDEV_F_COPPER;
2379 }
2380 if (ecmd.supported & SUPPORTED_FIBRE) {
2381 netdev->supported |= NETDEV_F_FIBER;
2382 }
2383 if (ecmd.supported & SUPPORTED_Autoneg) {
2384 netdev->supported |= NETDEV_F_AUTONEG;
2385 }
2386 if (ecmd.supported & SUPPORTED_Pause) {
2387 netdev->supported |= NETDEV_F_PAUSE;
2388 }
2389 if (ecmd.supported & SUPPORTED_Asym_Pause) {
2390 netdev->supported |= NETDEV_F_PAUSE_ASYM;
2391 }
2392
2393 /* Advertised features. */
2394 netdev->advertised = 0;
2395 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
2396 netdev->advertised |= NETDEV_F_10MB_HD;
2397 }
2398 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
2399 netdev->advertised |= NETDEV_F_10MB_FD;
2400 }
2401 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
2402 netdev->advertised |= NETDEV_F_100MB_HD;
2403 }
2404 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
2405 netdev->advertised |= NETDEV_F_100MB_FD;
2406 }
2407 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
2408 netdev->advertised |= NETDEV_F_1GB_HD;
2409 }
2410 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
2411 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
2412 netdev->advertised |= NETDEV_F_1GB_FD;
2413 }
2414 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
2415 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
2416 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
2417 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
2418 netdev->advertised |= NETDEV_F_10GB_FD;
2419 }
2420 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
2421 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
2422 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
2423 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
2424 netdev->advertised |= NETDEV_F_40GB_FD;
2425 }
2426 if (ecmd.advertising & ADVERTISED_TP) {
2427 netdev->advertised |= NETDEV_F_COPPER;
2428 }
2429 if (ecmd.advertising & ADVERTISED_FIBRE) {
2430 netdev->advertised |= NETDEV_F_FIBER;
2431 }
2432 if (ecmd.advertising & ADVERTISED_Autoneg) {
2433 netdev->advertised |= NETDEV_F_AUTONEG;
2434 }
2435 if (ecmd.advertising & ADVERTISED_Pause) {
2436 netdev->advertised |= NETDEV_F_PAUSE;
2437 }
2438 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
2439 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
2440 }
2441
2442 /* Current settings. */
2443 speed = ethtool_cmd_speed(&ecmd);
2444 if (speed == SPEED_10) {
2445 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
2446 } else if (speed == SPEED_100) {
2447 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
2448 } else if (speed == SPEED_1000) {
2449 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
2450 } else if (speed == SPEED_10000) {
2451 netdev->current = NETDEV_F_10GB_FD;
2452 } else if (speed == 40000) {
2453 netdev->current = NETDEV_F_40GB_FD;
2454 } else if (speed == 100000) {
2455 netdev->current = NETDEV_F_100GB_FD;
2456 } else if (speed == 1000000) {
2457 netdev->current = NETDEV_F_1TB_FD;
2458 } else {
2459 netdev->current = 0;
2460 }
2461
2462 if (ecmd.port == PORT_TP) {
2463 netdev->current |= NETDEV_F_COPPER;
2464 } else if (ecmd.port == PORT_FIBRE) {
2465 netdev->current |= NETDEV_F_FIBER;
2466 }
2467
2468 if (ecmd.autoneg) {
2469 netdev->current |= NETDEV_F_AUTONEG;
2470 }
2471
2472 out:
2473 netdev->cache_valid |= VALID_FEATURES;
2474 netdev->get_features_error = error;
2475 }
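
/* Example (illustrative): a gigabit copper NIC that completed
 * autonegotiation at full duplex ends up with
 * current == NETDEV_F_1GB_FD | NETDEV_F_COPPER | NETDEV_F_AUTONEG. */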
2476
2477 /* Stores the features supported by 'netdev' into '*current', '*advertised',
2478 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2479 * Returns 0 if successful, otherwise a positive errno value. */
2480 static int
2481 netdev_linux_get_features(const struct netdev *netdev_,
2482 enum netdev_features *current,
2483 enum netdev_features *advertised,
2484 enum netdev_features *supported,
2485 enum netdev_features *peer)
2486 {
2487 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2488 int error;
2489
2490 ovs_mutex_lock(&netdev->mutex);
2491 if (netdev_linux_netnsid_is_remote(netdev)) {
2492 error = EOPNOTSUPP;
2493 goto exit;
2494 }
2495
2496 netdev_linux_read_features(netdev);
2497 if (!netdev->get_features_error) {
2498 *current = netdev->current;
2499 *advertised = netdev->advertised;
2500 *supported = netdev->supported;
2501 *peer = 0; /* XXX */
2502 }
2503 error = netdev->get_features_error;
2504
2505 exit:
2506 ovs_mutex_unlock(&netdev->mutex);
2507 return error;
2508 }
2509
2510 /* Set the features advertised by 'netdev' to 'advertise'. */
2511 static int
2512 netdev_linux_set_advertisements(struct netdev *netdev_,
2513 enum netdev_features advertise)
2514 {
2515 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2516 struct ethtool_cmd ecmd;
2517 int error;
2518
2519 ovs_mutex_lock(&netdev->mutex);
2520
2521 COVERAGE_INC(netdev_get_ethtool);
2522
2523 if (netdev_linux_netnsid_is_remote(netdev)) {
2524 error = EOPNOTSUPP;
2525 goto exit;
2526 }
2527
2528 memset(&ecmd, 0, sizeof ecmd);
2529 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2530 ETHTOOL_GSET, "ETHTOOL_GSET");
2531 if (error) {
2532 goto exit;
2533 }
2534
2535 ecmd.advertising = 0;
2536 if (advertise & NETDEV_F_10MB_HD) {
2537 ecmd.advertising |= ADVERTISED_10baseT_Half;
2538 }
2539 if (advertise & NETDEV_F_10MB_FD) {
2540 ecmd.advertising |= ADVERTISED_10baseT_Full;
2541 }
2542 if (advertise & NETDEV_F_100MB_HD) {
2543 ecmd.advertising |= ADVERTISED_100baseT_Half;
2544 }
2545 if (advertise & NETDEV_F_100MB_FD) {
2546 ecmd.advertising |= ADVERTISED_100baseT_Full;
2547 }
2548 if (advertise & NETDEV_F_1GB_HD) {
2549 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2550 }
2551 if (advertise & NETDEV_F_1GB_FD) {
2552 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2553 }
2554 if (advertise & NETDEV_F_10GB_FD) {
2555 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2556 }
2557 if (advertise & NETDEV_F_COPPER) {
2558 ecmd.advertising |= ADVERTISED_TP;
2559 }
2560 if (advertise & NETDEV_F_FIBER) {
2561 ecmd.advertising |= ADVERTISED_FIBRE;
2562 }
2563 if (advertise & NETDEV_F_AUTONEG) {
2564 ecmd.advertising |= ADVERTISED_Autoneg;
2565 }
2566 if (advertise & NETDEV_F_PAUSE) {
2567 ecmd.advertising |= ADVERTISED_Pause;
2568 }
2569 if (advertise & NETDEV_F_PAUSE_ASYM) {
2570 ecmd.advertising |= ADVERTISED_Asym_Pause;
2571 }
2572 COVERAGE_INC(netdev_set_ethtool);
2573 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2574 ETHTOOL_SSET, "ETHTOOL_SSET");
2575
2576 exit:
2577 ovs_mutex_unlock(&netdev->mutex);
2578 return error;
2579 }
2580
2581 static struct tc_police
2582 tc_matchall_fill_police(uint32_t kbits_rate, uint32_t kbits_burst)
2583 {
2584 unsigned int bsize = MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 64;
2585 unsigned int bps = ((uint64_t) kbits_rate * 1000) / 8;
2586 struct tc_police police;
2587 struct tc_ratespec rate;
2588 int mtu = 65535;
2589
2590 memset(&rate, 0, sizeof rate);
2591 rate.rate = bps;
2592 rate.cell_log = tc_calc_cell_log(mtu);
2593 rate.mpu = ETH_TOTAL_MIN;
2594
2595 memset(&police, 0, sizeof police);
2596 police.burst = tc_bytes_to_ticks(bps, bsize);
2597 police.action = TC_POLICE_SHOT;
2598 police.rate = rate;
2599 police.mtu = mtu;
2600
2601 return police;
2602 }
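
/* Worked example (values illustrative): kbits_rate = 8000 yields
 * bps = 8000 * 1000 / 8 = 1,000,000 bytes/s, i.e. an 8 Mbit/s policer. */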
2603
2604 static void
2605 nl_msg_put_act_police(struct ofpbuf *request, struct tc_police police)
2606 {
2607 size_t offset;
2608
2609 nl_msg_put_string(request, TCA_ACT_KIND, "police");
2610 offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
2611 nl_msg_put_unspec(request, TCA_POLICE_TBF, &police, sizeof police);
2612 tc_put_rtab(request, TCA_POLICE_RATE, &police.rate);
2613 nl_msg_put_u32(request, TCA_POLICE_RESULT, TC_ACT_UNSPEC);
2614 nl_msg_end_nested(request, offset);
2615 }
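
/* The nested attributes built above form, roughly:
 *
 *     TCA_ACT_KIND    = "police"
 *     TCA_ACT_OPTIONS =
 *         TCA_POLICE_TBF    = struct tc_police
 *         TCA_POLICE_RATE   = rate table
 *         TCA_POLICE_RESULT = TC_ACT_UNSPEC
 */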
2616
2617 static int
2618 tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate,
2619 uint32_t kbits_burst)
2620 {
2621 uint16_t eth_type = (OVS_FORCE uint16_t) htons(ETH_P_ALL);
2622 size_t basic_offset, action_offset, inner_offset;
2623 uint16_t prio = TC_RESERVED_PRIORITY_POLICE;
2624 int ifindex, index, err = 0;
2625 struct tc_police pol_act;
2626 uint32_t block_id = 0;
2627 struct ofpbuf request;
2628 struct ofpbuf *reply;
2629 struct tcmsg *tcmsg;
2630 uint32_t handle = 1;
2631
2632 err = get_ifindex(netdev, &ifindex);
2633 if (err) {
2634 return err;
2635 }
2636
2637 index = block_id ? TCM_IFINDEX_MAGIC_BLOCK : ifindex;
2638 tcmsg = tc_make_request(index, RTM_NEWTFILTER, NLM_F_CREATE | NLM_F_ECHO,
2639 &request);
2640 tcmsg->tcm_parent = block_id ? : TC_INGRESS_PARENT;
2641 tcmsg->tcm_info = tc_make_handle(prio, eth_type);
2642 tcmsg->tcm_handle = handle;
2643
2644 pol_act = tc_matchall_fill_police(kbits_rate, kbits_burst);
2645 nl_msg_put_string(&request, TCA_KIND, "matchall");
2646 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2647 action_offset = nl_msg_start_nested(&request, TCA_MATCHALL_ACT);
2648 inner_offset = nl_msg_start_nested(&request, 1);
2649 nl_msg_put_act_police(&request, pol_act);
2650 nl_msg_end_nested(&request, inner_offset);
2651 nl_msg_end_nested(&request, action_offset);
2652 nl_msg_end_nested(&request, basic_offset);
2653
2654 err = tc_transact(&request, &reply);
2655 if (!err) {
2656 struct tcmsg *tc =
2657 ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc);
2658 ofpbuf_delete(reply);
2659 }
2660
2661 return err;
2662 }
2663
2664 static int
2665 tc_del_matchall_policer(struct netdev *netdev)
2666 {
2667 int prio = TC_RESERVED_PRIORITY_POLICE;
2668 uint32_t block_id = 0;
2669 struct tcf_id id;
2670 int ifindex;
2671 int err;
2672
2673 err = get_ifindex(netdev, &ifindex);
2674 if (err) {
2675 return err;
2676 }
2677
2678 id = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS);
2679 err = tc_del_filter(&id);
2680 if (err) {
2681 return err;
2682 }
2683
2684 return 0;
2685 }
2686
2687 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2688 * successful, otherwise a positive errno value. */
2689 static int
2690 netdev_linux_set_policing(struct netdev *netdev_,
2691 uint32_t kbits_rate, uint32_t kbits_burst)
2692 {
2693 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2694 const char *netdev_name = netdev_get_name(netdev_);
2695 int ifindex;
2696 int error;
2697
2698 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
2699 : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
2700 : kbits_burst); /* Stick with user-specified value. */
2701
2702 ovs_mutex_lock(&netdev->mutex);
2703 if (netdev_linux_netnsid_is_remote(netdev)) {
2704 error = EOPNOTSUPP;
2705 goto out;
2706 }
2707
2708 if (netdev->cache_valid & VALID_POLICING) {
2709 error = netdev->netdev_policing_error;
2710 if (error || (netdev->kbits_rate == kbits_rate &&
2711 netdev->kbits_burst == kbits_burst)) {
2712 /* Assume that settings haven't changed since we last set them. */
2713 goto out;
2714 }
2715 netdev->cache_valid &= ~VALID_POLICING;
2716 }
2717
2718 COVERAGE_INC(netdev_set_policing);
2719
2720     /* Use matchall for policing when offloading OVS with tc-flower. */
2721 if (netdev_is_flow_api_enabled()) {
2722 error = tc_del_matchall_policer(netdev_);
2723 if (kbits_rate) {
2724 error = tc_add_matchall_policer(netdev_, kbits_rate, kbits_burst);
2725 }
2726 ovs_mutex_unlock(&netdev->mutex);
2727 return error;
2728 }
2729
2730 error = get_ifindex(netdev_, &ifindex);
2731 if (error) {
2732 goto out;
2733 }
2734
2735 /* Remove any existing ingress qdisc. */
2736 error = tc_add_del_qdisc(ifindex, false, 0, TC_INGRESS);
2737 if (error) {
2738 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
2739 netdev_name, ovs_strerror(error));
2740 goto out;
2741 }
2742
2743 if (kbits_rate) {
2744 error = tc_add_del_qdisc(ifindex, true, 0, TC_INGRESS);
2745 if (error) {
2746 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
2747 netdev_name, ovs_strerror(error));
2748 goto out;
2749 }
2750
2751 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2752         if (error) {
2753 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2754 netdev_name, ovs_strerror(error));
2755 goto out;
2756 }
2757 }
2758
2759 netdev->kbits_rate = kbits_rate;
2760 netdev->kbits_burst = kbits_burst;
2761
2762 out:
2763 if (!error || error == ENODEV) {
2764 netdev->netdev_policing_error = error;
2765 netdev->cache_valid |= VALID_POLICING;
2766 }
2767 ovs_mutex_unlock(&netdev->mutex);
2768 return error;
2769 }
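
/* Usage sketch (illustrative): ingress policing is normally configured
 * through the database, e.g.
 *
 *     ovs-vsctl set interface eth0 ingress_policing_rate=10000 \
 *                                  ingress_policing_burst=8000
 *
 * 'eth0' and the values are placeholders. */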
2770
2771 static int
2772 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2773 struct sset *types)
2774 {
2775 const struct tc_ops *const *opsp;
2776 for (opsp = tcs; *opsp != NULL; opsp++) {
2777 const struct tc_ops *ops = *opsp;
2778 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2779 sset_add(types, ops->ovs_name);
2780 }
2781 }
2782 return 0;
2783 }
2784
2785 static const struct tc_ops *
2786 tc_lookup_ovs_name(const char *name)
2787 {
2788 const struct tc_ops *const *opsp;
2789
2790 for (opsp = tcs; *opsp != NULL; opsp++) {
2791 const struct tc_ops *ops = *opsp;
2792 if (!strcmp(name, ops->ovs_name)) {
2793 return ops;
2794 }
2795 }
2796 return NULL;
2797 }
2798
2799 static const struct tc_ops *
2800 tc_lookup_linux_name(const char *name)
2801 {
2802 const struct tc_ops *const *opsp;
2803
2804 for (opsp = tcs; *opsp != NULL; opsp++) {
2805 const struct tc_ops *ops = *opsp;
2806 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2807 return ops;
2808 }
2809 }
2810 return NULL;
2811 }
2812
2813 static struct tc_queue *
2814 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2815 size_t hash)
2816 {
2817 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2818 struct tc_queue *queue;
2819
2820 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2821 if (queue->queue_id == queue_id) {
2822 return queue;
2823 }
2824 }
2825 return NULL;
2826 }
2827
2828 static struct tc_queue *
2829 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2830 {
2831 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
2832 }
2833
2834 static int
2835 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2836 const char *type,
2837 struct netdev_qos_capabilities *caps)
2838 {
2839 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2840 if (!ops) {
2841 return EOPNOTSUPP;
2842 }
2843 caps->n_queues = ops->n_queues;
2844 return 0;
2845 }
2846
2847 static int
2848 netdev_linux_get_qos(const struct netdev *netdev_,
2849 const char **typep, struct smap *details)
2850 {
2851 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2852 int error;
2853
2854 ovs_mutex_lock(&netdev->mutex);
2855 if (netdev_linux_netnsid_is_remote(netdev)) {
2856 error = EOPNOTSUPP;
2857 goto exit;
2858 }
2859
2860 error = tc_query_qdisc(netdev_);
2861 if (!error) {
2862 *typep = netdev->tc->ops->ovs_name;
2863 error = (netdev->tc->ops->qdisc_get
2864 ? netdev->tc->ops->qdisc_get(netdev_, details)
2865 : 0);
2866 }
2867
2868 exit:
2869 ovs_mutex_unlock(&netdev->mutex);
2870 return error;
2871 }
2872
2873 static int
2874 netdev_linux_set_qos(struct netdev *netdev_,
2875 const char *type, const struct smap *details)
2876 {
2877 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2878 const struct tc_ops *new_ops;
2879 int error;
2880
2881 new_ops = tc_lookup_ovs_name(type);
2882 if (!new_ops || !new_ops->tc_install) {
2883 return EOPNOTSUPP;
2884 }
2885
2886 if (new_ops == &tc_ops_noop) {
2887 return new_ops->tc_install(netdev_, details);
2888 }
2889
2890 ovs_mutex_lock(&netdev->mutex);
2891 if (netdev_linux_netnsid_is_remote(netdev)) {
2892 error = EOPNOTSUPP;
2893 goto exit;
2894 }
2895
2896 error = tc_query_qdisc(netdev_);
2897 if (error) {
2898 goto exit;
2899 }
2900
2901 if (new_ops == netdev->tc->ops) {
2902 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2903 } else {
2904 /* Delete existing qdisc. */
2905 error = tc_del_qdisc(netdev_);
2906 if (error) {
2907 goto exit;
2908 }
2909 ovs_assert(netdev->tc == NULL);
2910
2911 /* Install new qdisc. */
2912 error = new_ops->tc_install(netdev_, details);
2913 ovs_assert((error == 0) == (netdev->tc != NULL));
2914 }
2915
2916 exit:
2917 ovs_mutex_unlock(&netdev->mutex);
2918 return error;
2919 }
2920
2921 static int
2922 netdev_linux_get_queue(const struct netdev *netdev_,
2923 unsigned int queue_id, struct smap *details)
2924 {
2925 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2926 int error;
2927
2928 ovs_mutex_lock(&netdev->mutex);
2929 if (netdev_linux_netnsid_is_remote(netdev)) {
2930 error = EOPNOTSUPP;
2931 goto exit;
2932 }
2933
2934 error = tc_query_qdisc(netdev_);
2935 if (!error) {
2936 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2937 error = (queue
2938 ? netdev->tc->ops->class_get(netdev_, queue, details)
2939 : ENOENT);
2940 }
2941
2942 exit:
2943 ovs_mutex_unlock(&netdev->mutex);
2944 return error;
2945 }
2946
2947 static int
2948 netdev_linux_set_queue(struct netdev *netdev_,
2949 unsigned int queue_id, const struct smap *details)
2950 {
2951 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2952 int error;
2953
2954 ovs_mutex_lock(&netdev->mutex);
2955 if (netdev_linux_netnsid_is_remote(netdev)) {
2956 error = EOPNOTSUPP;
2957 goto exit;
2958 }
2959
2960 error = tc_query_qdisc(netdev_);
2961 if (!error) {
2962 error = (queue_id < netdev->tc->ops->n_queues
2963 && netdev->tc->ops->class_set
2964 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2965 : EINVAL);
2966 }
2967
2968 exit:
2969 ovs_mutex_unlock(&netdev->mutex);
2970 return error;
2971 }
2972
2973 static int
2974 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2975 {
2976 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2977 int error;
2978
2979 ovs_mutex_lock(&netdev->mutex);
2980 if (netdev_linux_netnsid_is_remote(netdev)) {
2981 error = EOPNOTSUPP;
2982 goto exit;
2983 }
2984
2985 error = tc_query_qdisc(netdev_);
2986 if (!error) {
2987 if (netdev->tc->ops->class_delete) {
2988 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2989 error = (queue
2990 ? netdev->tc->ops->class_delete(netdev_, queue)
2991 : ENOENT);
2992 } else {
2993 error = EINVAL;
2994 }
2995 }
2996
2997 exit:
2998 ovs_mutex_unlock(&netdev->mutex);
2999 return error;
3000 }
3001
3002 static int
3003 netdev_linux_get_queue_stats(const struct netdev *netdev_,
3004 unsigned int queue_id,
3005 struct netdev_queue_stats *stats)
3006 {
3007 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3008 int error;
3009
3010 ovs_mutex_lock(&netdev->mutex);
3011 if (netdev_linux_netnsid_is_remote(netdev)) {
3012 error = EOPNOTSUPP;
3013 goto exit;
3014 }
3015
3016 error = tc_query_qdisc(netdev_);
3017 if (!error) {
3018 if (netdev->tc->ops->class_get_stats) {
3019 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
3020 if (queue) {
3021 stats->created = queue->created;
3022 error = netdev->tc->ops->class_get_stats(netdev_, queue,
3023 stats);
3024 } else {
3025 error = ENOENT;
3026 }
3027 } else {
3028 error = EOPNOTSUPP;
3029 }
3030 }
3031
3032 exit:
3033 ovs_mutex_unlock(&netdev->mutex);
3034 return error;
3035 }
3036
3037 struct queue_dump_state {
3038 struct nl_dump dump;
3039 struct ofpbuf buf;
3040 };
3041
3042 static bool
3043 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
3044 {
3045 struct ofpbuf request;
3046 struct tcmsg *tcmsg;
3047
3048 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
3049 if (!tcmsg) {
3050 return false;
3051 }
3052 tcmsg->tcm_parent = 0;
3053 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
3054 ofpbuf_uninit(&request);
3055
3056 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
3057 return true;
3058 }
3059
3060 static int
3061 finish_queue_dump(struct queue_dump_state *state)
3062 {
3063 ofpbuf_uninit(&state->buf);
3064 return nl_dump_done(&state->dump);
3065 }
3066
3067 struct netdev_linux_queue_state {
3068 unsigned int *queues;
3069 size_t cur_queue;
3070 size_t n_queues;
3071 };
3072
3073 static int
3074 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
3075 {
3076 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3077 int error;
3078
3079 ovs_mutex_lock(&netdev->mutex);
3080 if (netdev_linux_netnsid_is_remote(netdev)) {
3081 error = EOPNOTSUPP;
3082 goto exit;
3083 }
3084
3085 error = tc_query_qdisc(netdev_);
3086 if (!error) {
3087 if (netdev->tc->ops->class_get) {
3088 struct netdev_linux_queue_state *state;
3089 struct tc_queue *queue;
3090 size_t i;
3091
3092 *statep = state = xmalloc(sizeof *state);
3093 state->n_queues = hmap_count(&netdev->tc->queues);
3094 state->cur_queue = 0;
3095 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
3096
3097 i = 0;
3098 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
3099 state->queues[i++] = queue->queue_id;
3100 }
3101 } else {
3102 error = EOPNOTSUPP;
3103 }
3104 }
3105
3106 exit:
3107 ovs_mutex_unlock(&netdev->mutex);
3108 return error;
3109 }
3110
3111 static int
3112 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
3113 unsigned int *queue_idp, struct smap *details)
3114 {
3115 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3116 struct netdev_linux_queue_state *state = state_;
3117 int error = EOF;
3118
3119 ovs_mutex_lock(&netdev->mutex);
3120 if (netdev_linux_netnsid_is_remote(netdev)) {
3121 error = EOPNOTSUPP;
3122 goto exit;
3123 }
3124
3125 while (state->cur_queue < state->n_queues) {
3126 unsigned int queue_id = state->queues[state->cur_queue++];
3127 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
3128
3129 if (queue) {
3130 *queue_idp = queue_id;
3131 error = netdev->tc->ops->class_get(netdev_, queue, details);
3132 break;
3133 }
3134 }
3135
3136 exit:
3137 ovs_mutex_unlock(&netdev->mutex);
3138 return error;
3139 }
3140
3141 static int
3142 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
3143 void *state_)
3144 {
3145 struct netdev_linux_queue_state *state = state_;
3146
3147 free(state->queues);
3148 free(state);
3149 return 0;
3150 }
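
/* Usage sketch (illustrative): these three callbacks back the generic queue
 * iteration in lib/netdev.h, roughly
 *
 *     struct netdev_queue_dump dump;
 *     unsigned int queue_id;
 *     struct smap details = SMAP_INITIALIZER(&details);
 *
 *     NETDEV_QUEUE_FOR_EACH (&queue_id, &details, &dump, netdev) {
 *         ...inspect one queue...
 *     }
 */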
3151
3152 static int
3153 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
3154 netdev_dump_queue_stats_cb *cb, void *aux)
3155 {
3156 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3157 int error;
3158
3159 ovs_mutex_lock(&netdev->mutex);
3160 if (netdev_linux_netnsid_is_remote(netdev)) {
3161 error = EOPNOTSUPP;
3162 goto exit;
3163 }
3164
3165 error = tc_query_qdisc(netdev_);
3166 if (!error) {
3167 struct queue_dump_state state;
3168
3169 if (!netdev->tc->ops->class_dump_stats) {
3170 error = EOPNOTSUPP;
3171 } else if (!start_queue_dump(netdev_, &state)) {
3172 error = ENODEV;
3173 } else {
3174 struct ofpbuf msg;
3175 int retval;
3176
3177 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3178 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
3179 cb, aux);
3180 if (retval) {
3181 error = retval;
3182 }
3183 }
3184
3185 retval = finish_queue_dump(&state);
3186 if (retval) {
3187 error = retval;
3188 }
3189 }
3190 }
3191
3192 exit:
3193 ovs_mutex_unlock(&netdev->mutex);
3194 return error;
3195 }
3196
3197 static int
3198 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
3199 struct in_addr netmask)
3200 {
3201 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3202 int error;
3203
3204 ovs_mutex_lock(&netdev->mutex);
3205 if (netdev_linux_netnsid_is_remote(netdev)) {
3206 error = EOPNOTSUPP;
3207 goto exit;
3208 }
3209
3210 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
3211 if (!error) {
3212 if (address.s_addr != INADDR_ANY) {
3213 error = do_set_addr(netdev_, SIOCSIFNETMASK,
3214 "SIOCSIFNETMASK", netmask);
3215 }
3216 }
3217
3218 exit:
3219 ovs_mutex_unlock(&netdev->mutex);
3220 return error;
3221 }
3222
3223 /* Retrieves the IP addresses assigned to 'netdev' into the dynamically
3224  * allocated arrays '*addr' and '*mask', storing the number of entries in
3225  * '*n_cnt'.  Returns 0 if successful, otherwise a positive errno value. */
3226 static int
3227 netdev_linux_get_addr_list(const struct netdev *netdev_,
3228 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
3229 {
3230 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3231 int error;
3232
3233 ovs_mutex_lock(&netdev->mutex);
3234 if (netdev_linux_netnsid_is_remote(netdev)) {
3235 error = EOPNOTSUPP;
3236 goto exit;
3237 }
3238
3239 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
3240
3241 exit:
3242 ovs_mutex_unlock(&netdev->mutex);
3243 return error;
3244 }
3245
3246 static void
3247 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
3248 {
3249 struct sockaddr_in sin;
3250 memset(&sin, 0, sizeof sin);
3251 sin.sin_family = AF_INET;
3252 sin.sin_addr = addr;
3253 sin.sin_port = 0;
3254
3255 memset(sa, 0, sizeof *sa);
3256 memcpy(sa, &sin, sizeof sin);
3257 }
3258
3259 static int
3260 do_set_addr(struct netdev *netdev,
3261 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
3262 {
3263 struct ifreq ifr;
3264
3265 make_in4_sockaddr(&ifr.ifr_addr, addr);
3266 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
3267 ioctl_name);
3268 }
3269
3270 /* Adds 'router' as a default IP gateway. */
3271 static int
3272 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
3273 {
3274 struct in_addr any = { INADDR_ANY };
3275 struct rtentry rt;
3276 int error;
3277
3278 memset(&rt, 0, sizeof rt);
3279 make_in4_sockaddr(&rt.rt_dst, any);
3280 make_in4_sockaddr(&rt.rt_gateway, router);
3281 make_in4_sockaddr(&rt.rt_genmask, any);
3282 rt.rt_flags = RTF_UP | RTF_GATEWAY;
3283 error = af_inet_ioctl(SIOCADDRT, &rt);
3284 if (error) {
3285 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
3286 }
3287 return error;
3288 }
3289
3290 static int
3291 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
3292 char **netdev_name)
3293 {
3294 static const char fn[] = "/proc/net/route";
3295 FILE *stream;
3296 char line[256];
3297 int ln;
3298
3299 *netdev_name = NULL;
3300 stream = fopen(fn, "r");
3301 if (stream == NULL) {
3302 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
3303 return errno;
3304 }
3305
3306 ln = 0;
3307 while (fgets(line, sizeof line, stream)) {
3308 if (++ln >= 2) {
3309 char iface[17];
3310 ovs_be32 dest, gateway, mask;
3311 int refcnt, metric, mtu;
3312 unsigned int flags, use, window, irtt;
3313
3314 if (!ovs_scan(line,
3315 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
3316 " %d %u %u\n",
3317 iface, &dest, &gateway, &flags, &refcnt,
3318 &use, &metric, &mask, &mtu, &window, &irtt)) {
3319 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
3320 fn, ln, line);
3321 continue;
3322 }
3323 if (!(flags & RTF_UP)) {
3324 /* Skip routes that aren't up. */
3325 continue;
3326 }
3327
3328             /* The values of 'dest', 'mask', and 'gateway' are given in
3329              * network byte order, so we don't need any endian
3330              * conversions here. */
3331 if ((dest & mask) == (host->s_addr & mask)) {
3332 if (!gateway) {
3333 /* The host is directly reachable. */
3334 next_hop->s_addr = 0;
3335 } else {
3336 /* To reach the host, we must go through a gateway. */
3337 next_hop->s_addr = gateway;
3338 }
3339 *netdev_name = xstrdup(iface);
3340 fclose(stream);
3341 return 0;
3342 }
3343 }
3344 }
3345
3346 fclose(stream);
3347 return ENXIO;
3348 }
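
/* Example of the lines parsed above (illustrative, little-endian host):
 *
 *     Iface Destination Gateway  Flags RefCnt Use Metric Mask     ...
 *     eth0  00000000    0101A8C0 0003  0      0   0      00000000 ...
 *
 * The all-zeros destination/mask is the default route, and the gateway
 * 0101A8C0 is 192.168.1.1 in network byte order. */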
3349
3350 static int
3351 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
3352 {
3353 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3354 int error = 0;
3355
3356 ovs_mutex_lock(&netdev->mutex);
3357 if (!(netdev->cache_valid & VALID_DRVINFO)) {
3358 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
3359
3360 COVERAGE_INC(netdev_get_ethtool);
3361 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
3362 error = netdev_linux_do_ethtool(netdev->up.name,
3363 cmd,
3364 ETHTOOL_GDRVINFO,
3365 "ETHTOOL_GDRVINFO");
3366 if (!error) {
3367 netdev->cache_valid |= VALID_DRVINFO;
3368 }
3369 }
3370
3371 if (!error) {
3372 smap_add(smap, "driver_name", netdev->drvinfo.driver);
3373 smap_add(smap, "driver_version", netdev->drvinfo.version);
3374 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
3375 }
3376 ovs_mutex_unlock(&netdev->mutex);
3377
3378 return error;
3379 }
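
/* Example (values illustrative): the resulting smap appears in the
 * interface's database 'status' column, e.g.
 *     driver_name=ixgbe, driver_version=5.1.0, firmware_version=0x123 */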
3380
3381 static int
3382 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
3383 struct smap *smap)
3384 {
3385 smap_add(smap, "driver_name", "openvswitch");
3386 return 0;
3387 }
3388
3389 static uint32_t
3390 netdev_linux_get_block_id(struct netdev *netdev_)
3391 {
3392 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3393 uint32_t block_id = 0;
3394
3395 ovs_mutex_lock(&netdev->mutex);
3396 /* Ensure the linux netdev has had its fields populated. */
3397 if (!(netdev->cache_valid & VALID_IFINDEX)) {
3398 netdev_linux_update_via_netlink(netdev);
3399 }
3400
3401     /* Only assign block ids to Linux netdevs that are LAG masters. */
3402 if (netdev->is_lag_master) {
3403 block_id = netdev->ifindex;
3404 }
3405 ovs_mutex_unlock(&netdev->mutex);
3406
3407 return block_id;
3408 }
3409
3410 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
3411 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
3412 * returns 0. Otherwise, it returns a positive errno value; in particular,
3413  * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
3414 static int
3415 netdev_linux_arp_lookup(const struct netdev *netdev,
3416 ovs_be32 ip, struct eth_addr *mac)
3417 {
3418 struct arpreq r;
3419 struct sockaddr_in sin;
3420 int retval;
3421
3422 memset(&r, 0, sizeof r);
3423 memset(&sin, 0, sizeof sin);
3424 sin.sin_family = AF_INET;
3425 sin.sin_addr.s_addr = ip;
3426 sin.sin_port = 0;
3427 memcpy(&r.arp_pa, &sin, sizeof sin);
3428 r.arp_ha.sa_family = ARPHRD_ETHER;
3429 r.arp_flags = 0;
3430 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
3431 COVERAGE_INC(netdev_arp_lookup);
3432 retval = af_inet_ioctl(SIOCGARP, &r);
3433 if (!retval) {
3434 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
3435 } else if (retval != ENXIO) {
3436 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
3437 netdev_get_name(netdev), IP_ARGS(ip),
3438 ovs_strerror(retval));
3439 }
3440 return retval;
3441 }
3442
3443 static unsigned int
3444 nd_to_iff_flags(enum netdev_flags nd)
3445 {
3446 unsigned int iff = 0;
3447 if (nd & NETDEV_UP) {
3448 iff |= IFF_UP;
3449 }
3450 if (nd & NETDEV_PROMISC) {
3451 iff |= IFF_PROMISC;
3452 }
3453 if (nd & NETDEV_LOOPBACK) {
3454 iff |= IFF_LOOPBACK;
3455 }
3456 return iff;
3457 }
3458
3459 static int
3460 iff_to_nd_flags(unsigned int iff)
3461 {
3462 enum netdev_flags nd = 0;
3463 if (iff & IFF_UP) {
3464 nd |= NETDEV_UP;
3465 }
3466 if (iff & IFF_PROMISC) {
3467 nd |= NETDEV_PROMISC;
3468 }
3469 if (iff & IFF_LOOPBACK) {
3470 nd |= NETDEV_LOOPBACK;
3471 }
3472 return nd;
3473 }
3474
3475 static int
3476 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
3477 enum netdev_flags on, enum netdev_flags *old_flagsp)
3478 OVS_REQUIRES(netdev->mutex)
3479 {
3480 unsigned int old_flags, new_flags;
3481 int error = 0;
3482
3483 old_flags = netdev->ifi_flags;
3484 *old_flagsp = iff_to_nd_flags(old_flags);
3485 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
3486 if (new_flags != old_flags) {
3487 error = set_flags(netdev_get_name(&netdev->up), new_flags);
3488 get_flags(&netdev->up, &netdev->ifi_flags);
3489 }
3490
3491 return error;
3492 }
3493
3494 static int
3495 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
3496 enum netdev_flags on, enum netdev_flags *old_flagsp)
3497 {
3498 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3499 int error = 0;
3500
3501 ovs_mutex_lock(&netdev->mutex);
3502 if (on || off) {
3503         /* Changing flags over netlink isn't supported yet. */
3504 if (netdev_linux_netnsid_is_remote(netdev)) {
3505 error = EOPNOTSUPP;
3506 goto exit;
3507 }
3508 error = update_flags(netdev, off, on, old_flagsp);
3509 } else {
3510 /* Try reading flags over netlink, or fall back to ioctl. */
3511 if (!netdev_linux_update_via_netlink(netdev)) {
3512 *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
3513 } else {
3514 error = update_flags(netdev, off, on, old_flagsp);
3515 }
3516 }
3517
3518 exit:
3519 ovs_mutex_unlock(&netdev->mutex);
3520 return error;
3521 }
3522
3523 #define NETDEV_LINUX_CLASS_COMMON \
3524 .run = netdev_linux_run, \
3525 .wait = netdev_linux_wait, \
3526 .alloc = netdev_linux_alloc, \
3527 .dealloc = netdev_linux_dealloc, \
3528 .send_wait = netdev_linux_send_wait, \
3529 .set_etheraddr = netdev_linux_set_etheraddr, \
3530 .get_etheraddr = netdev_linux_get_etheraddr, \
3531 .get_mtu = netdev_linux_get_mtu, \
3532 .set_mtu = netdev_linux_set_mtu, \
3533 .get_ifindex = netdev_linux_get_ifindex, \
3534 .get_carrier = netdev_linux_get_carrier, \
3535 .get_carrier_resets = netdev_linux_get_carrier_resets, \
3536 .set_miimon_interval = netdev_linux_set_miimon_interval, \
3537 .set_advertisements = netdev_linux_set_advertisements, \
3538 .set_policing = netdev_linux_set_policing, \
3539 .get_qos_types = netdev_linux_get_qos_types, \
3540 .get_qos_capabilities = netdev_linux_get_qos_capabilities, \
3541 .get_qos = netdev_linux_get_qos, \
3542 .set_qos = netdev_linux_set_qos, \
3543 .get_queue = netdev_linux_get_queue, \
3544 .set_queue = netdev_linux_set_queue, \
3545 .delete_queue = netdev_linux_delete_queue, \
3546 .get_queue_stats = netdev_linux_get_queue_stats, \
3547 .queue_dump_start = netdev_linux_queue_dump_start, \
3548 .queue_dump_next = netdev_linux_queue_dump_next, \
3549 .queue_dump_done = netdev_linux_queue_dump_done, \
3550 .dump_queue_stats = netdev_linux_dump_queue_stats, \
3551 .set_in4 = netdev_linux_set_in4, \
3552 .get_addr_list = netdev_linux_get_addr_list, \
3553 .add_router = netdev_linux_add_router, \
3554 .get_next_hop = netdev_linux_get_next_hop, \
3555 .arp_lookup = netdev_linux_arp_lookup, \
3556 .update_flags = netdev_linux_update_flags, \
3557 .rxq_alloc = netdev_linux_rxq_alloc, \
3558 .rxq_dealloc = netdev_linux_rxq_dealloc, \
3559 .rxq_wait = netdev_linux_rxq_wait, \
3560 .rxq_drain = netdev_linux_rxq_drain
3561
3562 const struct netdev_class netdev_linux_class = {
3563 NETDEV_LINUX_CLASS_COMMON,
3564 .type = "system",
3565 .is_pmd = false,
3566 .construct = netdev_linux_construct,
3567 .destruct = netdev_linux_destruct,
3568 .get_stats = netdev_linux_get_stats,
3569 .get_features = netdev_linux_get_features,
3570 .get_status = netdev_linux_get_status,
3571 .get_block_id = netdev_linux_get_block_id,
3572 .send = netdev_linux_send,
3573 .rxq_construct = netdev_linux_rxq_construct,
3574 .rxq_destruct = netdev_linux_rxq_destruct,
3575 .rxq_recv = netdev_linux_rxq_recv,
3576 };
3577
3578 const struct netdev_class netdev_tap_class = {
3579 NETDEV_LINUX_CLASS_COMMON,
3580 .type = "tap",
3581 .is_pmd = false,
3582 .construct = netdev_linux_construct_tap,
3583 .destruct = netdev_linux_destruct,
3584 .get_stats = netdev_tap_get_stats,
3585 .get_features = netdev_linux_get_features,
3586 .get_status = netdev_linux_get_status,
3587 .send = netdev_linux_send,
3588 .rxq_construct = netdev_linux_rxq_construct,
3589 .rxq_destruct = netdev_linux_rxq_destruct,
3590 .rxq_recv = netdev_linux_rxq_recv,
3591 };
3592
3593 const struct netdev_class netdev_internal_class = {
3594 NETDEV_LINUX_CLASS_COMMON,
3595 .type = "internal",
3596 .is_pmd = false,
3597 .construct = netdev_linux_construct,
3598 .destruct = netdev_linux_destruct,
3599 .get_stats = netdev_internal_get_stats,
3600 .get_status = netdev_internal_get_status,
3601 .send = netdev_linux_send,
3602 .rxq_construct = netdev_linux_rxq_construct,
3603 .rxq_destruct = netdev_linux_rxq_destruct,
3604 .rxq_recv = netdev_linux_rxq_recv,
3605 };
3606
3607 #ifdef HAVE_AF_XDP
3608 const struct netdev_class netdev_afxdp_class = {
3609 NETDEV_LINUX_CLASS_COMMON,
3610 .type = "afxdp",
3611 .is_pmd = true,
3612 .init = netdev_afxdp_init,
3613 .construct = netdev_afxdp_construct,
3614 .destruct = netdev_afxdp_destruct,
3615 .get_stats = netdev_afxdp_get_stats,
3616 .get_custom_stats = netdev_afxdp_get_custom_stats,
3617 .get_status = netdev_linux_get_status,
3618 .set_config = netdev_afxdp_set_config,
3619 .get_config = netdev_afxdp_get_config,
3620 .reconfigure = netdev_afxdp_reconfigure,
3621 .get_numa_id = netdev_linux_get_numa_id,
3622 .send = netdev_afxdp_batch_send,
3623 .rxq_construct = netdev_afxdp_rxq_construct,
3624 .rxq_destruct = netdev_afxdp_rxq_destruct,
3625 .rxq_recv = netdev_afxdp_rxq_recv,
3626 };
3627 #endif
3628 \f
3629
3630 #define CODEL_N_QUEUES 0x0000
3631
3632 /* In sufficiently new kernel headers these are defined as enums in
3633 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3634 * kernels. (This overrides any enum definition in the header file but that's
3635 * harmless.) */
3636 #define TCA_CODEL_TARGET 1
3637 #define TCA_CODEL_LIMIT 2
3638 #define TCA_CODEL_INTERVAL 3
3639
3640 struct codel {
3641 struct tc tc;
3642 uint32_t target;
3643 uint32_t limit;
3644 uint32_t interval;
3645 };
3646
3647 static struct codel *
3648 codel_get__(const struct netdev *netdev_)
3649 {
3650 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3651 return CONTAINER_OF(netdev->tc, struct codel, tc);
3652 }
3653
3654 static void
3655 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3656 uint32_t interval)
3657 {
3658 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3659 struct codel *codel;
3660
3661 codel = xmalloc(sizeof *codel);
3662 tc_init(&codel->tc, &tc_ops_codel);
3663 codel->target = target;
3664 codel->limit = limit;
3665 codel->interval = interval;
3666
3667 netdev->tc = &codel->tc;
3668 }
3669
3670 static int
3671 codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3672 uint32_t interval)
3673 {
3674 size_t opt_offset;
3675 struct ofpbuf request;
3676 struct tcmsg *tcmsg;
3677 uint32_t otarget, olimit, ointerval;
3678 int error;
3679
3680 tc_del_qdisc(netdev);
3681
3682 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3683 NLM_F_EXCL | NLM_F_CREATE, &request);
3684 if (!tcmsg) {
3685 return ENODEV;
3686 }
3687 tcmsg->tcm_handle = tc_make_handle(1, 0);
3688 tcmsg->tcm_parent = TC_H_ROOT;
3689
3690 otarget = target ? target : 5000;
3691 olimit = limit ? limit : 10240;
3692 ointerval = interval ? interval : 100000;
3693
3694 nl_msg_put_string(&request, TCA_KIND, "codel");
3695 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3696 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
3697 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
3698 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
3699 nl_msg_end_nested(&request, opt_offset);
3700
3701 error = tc_transact(&request, NULL);
3702 if (error) {
3703 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3704                      "target %u, limit %u, interval %u, error %d (%s)",
3705 netdev_get_name(netdev),
3706 otarget, olimit, ointerval,
3707 error, ovs_strerror(error));
3708 }
3709 return error;
3710 }
3711
3712 static void
3713 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3714 const struct smap *details, struct codel *codel)
3715 {
3716 codel->target = smap_get_ullong(details, "target", 0);
3717 codel->limit = smap_get_ullong(details, "limit", 0);
3718 codel->interval = smap_get_ullong(details, "interval", 0);
3719
3720 if (!codel->target) {
3721 codel->target = 5000;
3722 }
3723 if (!codel->limit) {
3724 codel->limit = 10240;
3725 }
3726 if (!codel->interval) {
3727 codel->interval = 100000;
3728 }
3729 }
3730
3731 static int
3732 codel_tc_install(struct netdev *netdev, const struct smap *details)
3733 {
3734 int error;
3735 struct codel codel;
3736
3737 codel_parse_qdisc_details__(netdev, details, &codel);
3738 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3739 codel.interval);
3740 if (!error) {
3741 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3742 }
3743 return error;
3744 }
3745
3746 static int
3747 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3748 {
3749 static const struct nl_policy tca_codel_policy[] = {
3750 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3751 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3752 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3753 };
3754
3755 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3756
3757 if (!nl_parse_nested(nl_options, tca_codel_policy,
3758 attrs, ARRAY_SIZE(tca_codel_policy))) {
3759 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3760 return EPROTO;
3761 }
3762
3763 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3764 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3765 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
3766 return 0;
3767 }
3768
3769 static int
3770 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3771 {
3772 struct nlattr *nlattr;
3773     const char *kind;
3774 int error;
3775 struct codel codel;
3776
3777 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3778 if (error != 0) {
3779 return error;
3780 }
3781
3782 error = codel_parse_tca_options__(nlattr, &codel);
3783 if (error != 0) {
3784 return error;
3785 }
3786
3787 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3788 return 0;
3789 }
3790
3791
3792 static void
3793 codel_tc_destroy(struct tc *tc)
3794 {
3795 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3796 tc_destroy(tc);
3797 free(codel);
3798 }
3799
3800 static int
3801 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3802 {
3803 const struct codel *codel = codel_get__(netdev);
3804 smap_add_format(details, "target", "%u", codel->target);
3805 smap_add_format(details, "limit", "%u", codel->limit);
3806 smap_add_format(details, "interval", "%u", codel->interval);
3807 return 0;
3808 }
3809
3810 static int
3811 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3812 {
3813 struct codel codel;
3814
3815 codel_parse_qdisc_details__(netdev, details, &codel);
3816 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3817 codel_get__(netdev)->target = codel.target;
3818 codel_get__(netdev)->limit = codel.limit;
3819 codel_get__(netdev)->interval = codel.interval;
3820 return 0;
3821 }
3822
3823 static const struct tc_ops tc_ops_codel = {
3824 .linux_name = "codel",
3825 .ovs_name = "linux-codel",
3826 .n_queues = CODEL_N_QUEUES,
3827 .tc_install = codel_tc_install,
3828 .tc_load = codel_tc_load,
3829 .tc_destroy = codel_tc_destroy,
3830 .qdisc_get = codel_qdisc_get,
3831 .qdisc_set = codel_qdisc_set,
3832 };
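/* Configuration sketch for the "linux-codel" QoS type (illustrative only;
 * the port name "eth0" is hypothetical).  The other-config keys map to the
 * "details" smap parsed by codel_parse_qdisc_details__() above:
 *
 *     ovs-vsctl set port eth0 qos=@q -- \
 *         --id=@q create qos type=linux-codel \
 *         other-config:target=5000 other-config:limit=10240 \
 *         other-config:interval=100000
 */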
3833 \f
3834 /* FQ-CoDel traffic control class. */
3835
3836 #define FQCODEL_N_QUEUES 0x0000
3837
3838 /* In sufficiently new kernel headers these are defined as enums in
3839 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3840 * kernels. (This overrides any enum definition in the header file but that's
3841 * harmless.) */
3842 #define TCA_FQ_CODEL_TARGET 1
3843 #define TCA_FQ_CODEL_LIMIT 2
3844 #define TCA_FQ_CODEL_INTERVAL 3
3845 #define TCA_FQ_CODEL_ECN 4
3846 #define TCA_FQ_CODEL_FLOWS 5
3847 #define TCA_FQ_CODEL_QUANTUM 6
3848
3849 struct fqcodel {
3850 struct tc tc;
3851 uint32_t target;
3852 uint32_t limit;
3853 uint32_t interval;
3854 uint32_t flows;
3855 uint32_t quantum;
3856 };
3857
3858 static struct fqcodel *
3859 fqcodel_get__(const struct netdev *netdev_)
3860 {
3861 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3862 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3863 }
3864
3865 static void
3866 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3867 uint32_t interval, uint32_t flows, uint32_t quantum)
3868 {
3869 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3870 struct fqcodel *fqcodel;
3871
3872 fqcodel = xmalloc(sizeof *fqcodel);
3873 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3874 fqcodel->target = target;
3875 fqcodel->limit = limit;
3876 fqcodel->interval = interval;
3877 fqcodel->flows = flows;
3878 fqcodel->quantum = quantum;
3879
3880 netdev->tc = &fqcodel->tc;
3881 }
3882
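/* Creates an fq_codel qdisc rooted at 'netdev'.  Roughly equivalent to:
 *
 *     tc qdisc replace dev <dev> root handle 1: fq_codel target <target> \
 *         limit <limit> interval <interval> flows <flows> quantum <quantum>
 */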
3883 static int
3884 fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3885 uint32_t interval, uint32_t flows, uint32_t quantum)
3886 {
3887 size_t opt_offset;
3888 struct ofpbuf request;
3889 struct tcmsg *tcmsg;
3890 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3891 int error;
3892
3893 tc_del_qdisc(netdev);
3894
3895 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3896 NLM_F_EXCL | NLM_F_CREATE, &request);
3897 if (!tcmsg) {
3898 return ENODEV;
3899 }
3900 tcmsg->tcm_handle = tc_make_handle(1, 0);
3901 tcmsg->tcm_parent = TC_H_ROOT;
3902
3903 otarget = target ? target : 5000;
3904 olimit = limit ? limit : 10240;
3905 ointerval = interval ? interval : 100000;
3906 oflows = flows ? flows : 1024;
3907     oquantum = quantum ? quantum : 1514; /* fq_codel's default quantum
3908                                           * is 1514 bytes, not the MTU. */
3909
3910 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3911 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3912 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3913 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3914 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3915 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3916 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3917 nl_msg_end_nested(&request, opt_offset);
3918
3919 error = tc_transact(&request, NULL);
3920 if (error) {
3921 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3922                  "target %u, limit %u, interval %u, flows %u, quantum %u, error %d (%s)",
3923 netdev_get_name(netdev),
3924 otarget, olimit, ointerval, oflows, oquantum,
3925 error, ovs_strerror(error));
3926 }
3927 return error;
3928 }
3929
3930 static void
3931 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3932 const struct smap *details, struct fqcodel *fqcodel)
3933 {
3934 fqcodel->target = smap_get_ullong(details, "target", 0);
3935 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3936 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3937 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3938 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3939
3940 if (!fqcodel->target) {
3941 fqcodel->target = 5000;
3942 }
3943 if (!fqcodel->limit) {
3944 fqcodel->limit = 10240;
3945 }
3946 if (!fqcodel->interval) {
3947 fqcodel->interval = 1000000;
3948 }
3949 if (!fqcodel->flows) {
3950 fqcodel->flows = 1024;
3951 }
3952 if (!fqcodel->quantum) {
3953 fqcodel->quantum = 1514;
3954 }
3955 }
3956
3957 static int
3958 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3959 {
3960 int error;
3961 struct fqcodel fqcodel;
3962
3963 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3964 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3965 fqcodel.interval, fqcodel.flows,
3966 fqcodel.quantum);
3967 if (!error) {
3968 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3969 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3970 }
3971 return error;
3972 }
3973
3974 static int
3975 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3976 {
3977 static const struct nl_policy tca_fqcodel_policy[] = {
3978 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3979 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3980 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3981 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3982 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3983 };
3984
3985 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3986
3987 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3988 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3989 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3990 return EPROTO;
3991 }
3992
3993 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3994 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3995     fqcodel->interval = nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3996 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3997 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3998 return 0;
3999 }
4000
4001 static int
4002 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4003 {
4004 struct nlattr *nlattr;
4005     const char *kind;
4006 int error;
4007 struct fqcodel fqcodel;
4008
4009 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4010 if (error != 0) {
4011 return error;
4012 }
4013
4014 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
4015 if (error != 0) {
4016 return error;
4017 }
4018
4019 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
4020 fqcodel.flows, fqcodel.quantum);
4021 return 0;
4022 }
4023
4024 static void
4025 fqcodel_tc_destroy(struct tc *tc)
4026 {
4027 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
4028 tc_destroy(tc);
4029 free(fqcodel);
4030 }
4031
4032 static int
4033 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
4034 {
4035 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
4036 smap_add_format(details, "target", "%u", fqcodel->target);
4037 smap_add_format(details, "limit", "%u", fqcodel->limit);
4038 smap_add_format(details, "interval", "%u", fqcodel->interval);
4039 smap_add_format(details, "flows", "%u", fqcodel->flows);
4040 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
4041 return 0;
4042 }
4043
4044 static int
4045 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
4046 {
4047 struct fqcodel fqcodel;
4048
4049 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
4050 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
4051 fqcodel.flows, fqcodel.quantum);
4052 fqcodel_get__(netdev)->target = fqcodel.target;
4053 fqcodel_get__(netdev)->limit = fqcodel.limit;
4054 fqcodel_get__(netdev)->interval = fqcodel.interval;
4055 fqcodel_get__(netdev)->flows = fqcodel.flows;
4056 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
4057 return 0;
4058 }
4059
4060 static const struct tc_ops tc_ops_fqcodel = {
4061 .linux_name = "fq_codel",
4062 .ovs_name = "linux-fq_codel",
4063 .n_queues = FQCODEL_N_QUEUES,
4064 .tc_install = fqcodel_tc_install,
4065 .tc_load = fqcodel_tc_load,
4066 .tc_destroy = fqcodel_tc_destroy,
4067 .qdisc_get = fqcodel_qdisc_get,
4068 .qdisc_set = fqcodel_qdisc_set,
4069 };
4070 \f
4071 /* SFQ traffic control class. */
4072
4073 #define SFQ_N_QUEUES 0x0000
4074
4075 struct sfq {
4076 struct tc tc;
4077 uint32_t quantum;
4078 uint32_t perturb;
4079 };
4080
4081 static struct sfq *
4082 sfq_get__(const struct netdev *netdev_)
4083 {
4084 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4085 return CONTAINER_OF(netdev->tc, struct sfq, tc);
4086 }
4087
4088 static void
4089 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
4090 {
4091 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4092 struct sfq *sfq;
4093
4094 sfq = xmalloc(sizeof *sfq);
4095 tc_init(&sfq->tc, &tc_ops_sfq);
4096 sfq->perturb = perturb;
4097 sfq->quantum = quantum;
4098
4099 netdev->tc = &sfq->tc;
4100 }
4101
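/* Creates an SFQ qdisc rooted at 'netdev'.  Roughly equivalent to:
 *
 *     tc qdisc replace dev <dev> root handle 1: sfq \
 *         quantum <quantum> perturb <perturb>
 *
 * A zero 'quantum' falls back to the device MTU (or, failing that, the
 * kernel's default); a zero 'perturb' falls back to a 10 s period. */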
4102 static int
4103 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
4104 {
4105 struct tc_sfq_qopt opt;
4106 struct ofpbuf request;
4107 struct tcmsg *tcmsg;
4108 int mtu;
4109 int mtu_error, error;
4110 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
4111
4112 tc_del_qdisc(netdev);
4113
4114 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4115 NLM_F_EXCL | NLM_F_CREATE, &request);
4116 if (!tcmsg) {
4117 return ENODEV;
4118 }
4119 tcmsg->tcm_handle = tc_make_handle(1, 0);
4120 tcmsg->tcm_parent = TC_H_ROOT;
4121
4122 memset(&opt, 0, sizeof opt);
4123 if (!quantum) {
4124 if (!mtu_error) {
4125             opt.quantum = mtu; /* If MTU is unknown, 0 keeps the kernel
4126                                 * default. */
4126 }
4127 } else {
4128 opt.quantum = quantum;
4129 }
4130
4131 if (!perturb) {
4132 opt.perturb_period = 10;
4133 } else {
4134 opt.perturb_period = perturb;
4135 }
4136
4137 nl_msg_put_string(&request, TCA_KIND, "sfq");
4138 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4139
4140 error = tc_transact(&request, NULL);
4141 if (error) {
4142 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
4143                      "quantum %u, perturb %u, error %d (%s)",
4144 netdev_get_name(netdev),
4145 opt.quantum, opt.perturb_period,
4146 error, ovs_strerror(error));
4147 }
4148 return error;
4149 }
4150
4151 static void
4152 sfq_parse_qdisc_details__(struct netdev *netdev,
4153 const struct smap *details, struct sfq *sfq)
4154 {
4155 sfq->perturb = smap_get_ullong(details, "perturb", 0);
4156 sfq->quantum = smap_get_ullong(details, "quantum", 0);
4157
4158 if (!sfq->perturb) {
4159 sfq->perturb = 10;
4160 }
4161
4162 if (!sfq->quantum) {
4163 int mtu;
4164 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
4165 sfq->quantum = mtu;
4166 } else {
4167             VLOG_WARN_RL(&rl, "when using SFQ on a device without an MTU, "
4168                          "the quantum must be specified explicitly");
4169 }
4170 }
4171 }
4172
4173 static int
4174 sfq_tc_install(struct netdev *netdev, const struct smap *details)
4175 {
4176 int error;
4177 struct sfq sfq;
4178
4179 sfq_parse_qdisc_details__(netdev, details, &sfq);
4180 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
4181 if (!error) {
4182 sfq_install__(netdev, sfq.quantum, sfq.perturb);
4183 }
4184 return error;
4185 }
4186
4187 static int
4188 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4189 {
4190 const struct tc_sfq_qopt *sfq;
4191 struct nlattr *nlattr;
4192     const char *kind;
4193 int error;
4194
4195 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4196 if (error == 0) {
4197 sfq = nl_attr_get(nlattr);
4198 sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
4199 return 0;
4200 }
4201
4202 return error;
4203 }
4204
4205 static void
4206 sfq_tc_destroy(struct tc *tc)
4207 {
4208 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
4209 tc_destroy(tc);
4210 free(sfq);
4211 }
4212
4213 static int
4214 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
4215 {
4216 const struct sfq *sfq = sfq_get__(netdev);
4217 smap_add_format(details, "quantum", "%u", sfq->quantum);
4218 smap_add_format(details, "perturb", "%u", sfq->perturb);
4219 return 0;
4220 }
4221
4222 static int
4223 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
4224 {
4225 struct sfq sfq;
4226
4227 sfq_parse_qdisc_details__(netdev, details, &sfq);
4228 sfq_install__(netdev, sfq.quantum, sfq.perturb);
4229 sfq_get__(netdev)->quantum = sfq.quantum;
4230 sfq_get__(netdev)->perturb = sfq.perturb;
4231 return 0;
4232 }
4233
4234 static const struct tc_ops tc_ops_sfq = {
4235 .linux_name = "sfq",
4236 .ovs_name = "linux-sfq",
4237 .n_queues = SFQ_N_QUEUES,
4238 .tc_install = sfq_tc_install,
4239 .tc_load = sfq_tc_load,
4240 .tc_destroy = sfq_tc_destroy,
4241 .qdisc_get = sfq_qdisc_get,
4242 .qdisc_set = sfq_qdisc_set,
4243 };
4244 \f
4245 /* netem traffic control class. */
4246
4247 struct netem {
4248 struct tc tc;
4249 uint32_t latency;
4250 uint32_t limit;
4251 uint32_t loss;
4252 };
4253
4254 static struct netem *
4255 netem_get__(const struct netdev *netdev_)
4256 {
4257 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4258 return CONTAINER_OF(netdev->tc, struct netem, tc);
4259 }
4260
4261 static void
4262 netem_install__(struct netdev *netdev_, uint32_t latency,
4263 uint32_t limit, uint32_t loss)
4264 {
4265 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4266 struct netem *netem;
4267
4268 netem = xmalloc(sizeof *netem);
4269 tc_init(&netem->tc, &tc_ops_netem);
4270 netem->latency = latency;
4271 netem->limit = limit;
4272 netem->loss = loss;
4273
4274 netdev->tc = &netem->tc;
4275 }
4276
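/* Creates a netem qdisc rooted at 'netdev'.  Roughly equivalent to:
 *
 *     tc qdisc replace dev <dev> root handle 1: netem \
 *         latency <latency>us limit <limit> loss <loss>%
 *
 * 'latency' is in microseconds and 'loss' is a percentage in [0,100];
 * the loss percentage is rescaled below to the kernel's fixed-point
 * probability (a fraction of UINT32_MAX). */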
4277 static int
4278 netem_setup_qdisc__(struct netdev *netdev, uint32_t latency,
4279 uint32_t limit, uint32_t loss)
4280 {
4281 struct tc_netem_qopt opt;
4282 struct ofpbuf request;
4283 struct tcmsg *tcmsg;
4284 int error;
4285
4286 tc_del_qdisc(netdev);
4287
4288 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4289 NLM_F_EXCL | NLM_F_CREATE, &request);
4290 if (!tcmsg) {
4291 return ENODEV;
4292 }
4293 tcmsg->tcm_handle = tc_make_handle(1, 0);
4294 tcmsg->tcm_parent = TC_H_ROOT;
4295
4296 memset(&opt, 0, sizeof opt);
4297
4298 if (!limit) {
4299 opt.limit = 1000;
4300 } else {
4301 opt.limit = limit;
4302 }
4303
4304 if (loss) {
4305 if (loss > 100) {
4306 VLOG_WARN_RL(&rl,
4307                          "loss should be a percentage value between 0 and 100, "
4308 "loss was %u", loss);
4309 return EINVAL;
4310 }
4311 opt.loss = floor(UINT32_MAX * (loss / 100.0));
4312 }
4313
4314 opt.latency = tc_time_to_ticks(latency);
4315
4316 nl_msg_put_string(&request, TCA_KIND, "netem");
4317 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4318
4319 error = tc_transact(&request, NULL);
4320 if (error) {
4321 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
4322                      "latency %u, limit %u, loss %u, error %d (%s)",
4323 netdev_get_name(netdev),
4324 opt.latency, opt.limit, opt.loss,
4325 error, ovs_strerror(error));
4326 }
4327 return error;
4328 }
4329
4330 static void
4331 netem_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
4332 const struct smap *details, struct netem *netem)
4333 {
4334 netem->latency = smap_get_ullong(details, "latency", 0);
4335 netem->limit = smap_get_ullong(details, "limit", 0);
4336 netem->loss = smap_get_ullong(details, "loss", 0);
4337
4338 if (!netem->limit) {
4339 netem->limit = 1000;
4340 }
4341 }
4342
4343 static int
4344 netem_tc_install(struct netdev *netdev, const struct smap *details)
4345 {
4346 int error;
4347 struct netem netem;
4348
4349 netem_parse_qdisc_details__(netdev, details, &netem);
4350 error = netem_setup_qdisc__(netdev, netem.latency,
4351 netem.limit, netem.loss);
4352 if (!error) {
4353 netem_install__(netdev, netem.latency, netem.limit, netem.loss);
4354 }
4355 return error;
4356 }
4357
4358 static int
4359 netem_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4360 {
4361 const struct tc_netem_qopt *netem;
4362 struct nlattr *nlattr;
4363 const char *kind;
4364 int error;
4365
4366 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4367 if (error == 0) {
4368 netem = nl_attr_get(nlattr);
4369 netem_install__(netdev, netem->latency, netem->limit, netem->loss);
4370 return 0;
4371 }
4372
4373 return error;
4374 }
4375
4376 static void
4377 netem_tc_destroy(struct tc *tc)
4378 {
4379 struct netem *netem = CONTAINER_OF(tc, struct netem, tc);
4380 tc_destroy(tc);
4381 free(netem);
4382 }
4383
4384 static int
4385 netem_qdisc_get(const struct netdev *netdev, struct smap *details)
4386 {
4387 const struct netem *netem = netem_get__(netdev);
4388 smap_add_format(details, "latency", "%u", netem->latency);
4389 smap_add_format(details, "limit", "%u", netem->limit);
4390 smap_add_format(details, "loss", "%u", netem->loss);
4391 return 0;
4392 }
4393
4394 static int
4395 netem_qdisc_set(struct netdev *netdev, const struct smap *details)
4396 {
4397 struct netem netem;
4398
4399 netem_parse_qdisc_details__(netdev, details, &netem);
4400 netem_install__(netdev, netem.latency, netem.limit, netem.loss);
4401 netem_get__(netdev)->latency = netem.latency;
4402 netem_get__(netdev)->limit = netem.limit;
4403 netem_get__(netdev)->loss = netem.loss;
4404 return 0;
4405 }
4406
4407 static const struct tc_ops tc_ops_netem = {
4408 .linux_name = "netem",
4409 .ovs_name = "linux-netem",
4410 .n_queues = 0,
4411 .tc_install = netem_tc_install,
4412 .tc_load = netem_tc_load,
4413 .tc_destroy = netem_tc_destroy,
4414 .qdisc_get = netem_qdisc_get,
4415 .qdisc_set = netem_qdisc_set,
4416 };
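/* Configuration sketch for the "linux-netem" QoS type (illustrative only;
 * the port name "eth0" is hypothetical), adding 10 ms of delay and 5% loss:
 *
 *     ovs-vsctl set port eth0 qos=@q -- \
 *         --id=@q create qos type=linux-netem \
 *         other-config:latency=10000 other-config:loss=5
 */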
4417 \f
4418 /* HTB traffic control class. */
4419
4420 #define HTB_N_QUEUES 0xf000
4421 #define HTB_RATE2QUANTUM 10
4422
4423 struct htb {
4424 struct tc tc;
4425 unsigned int max_rate; /* In bytes/s. */
4426 };
4427
4428 struct htb_class {
4429 struct tc_queue tc_queue;
4430 unsigned int min_rate; /* In bytes/s. */
4431 unsigned int max_rate; /* In bytes/s. */
4432 unsigned int burst; /* In bytes. */
4433 unsigned int priority; /* Lower values are higher priorities. */
4434 };
4435
4436 static struct htb *
4437 htb_get__(const struct netdev *netdev_)
4438 {
4439 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4440 return CONTAINER_OF(netdev->tc, struct htb, tc);
4441 }
4442
4443 static void
4444 htb_install__(struct netdev *netdev_, uint64_t max_rate)
4445 {
4446 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4447 struct htb *htb;
4448
4449 htb = xmalloc(sizeof *htb);
4450 tc_init(&htb->tc, &tc_ops_htb);
4451 htb->max_rate = max_rate;
4452
4453 netdev->tc = &htb->tc;
4454 }
4455
4456 /* Create an HTB qdisc.
4457 *
4458 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
4459 static int
4460 htb_setup_qdisc__(struct netdev *netdev)
4461 {
4462 size_t opt_offset;
4463 struct tc_htb_glob opt;
4464 struct ofpbuf request;
4465 struct tcmsg *tcmsg;
4466
4467 tc_del_qdisc(netdev);
4468
4469 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4470 NLM_F_EXCL | NLM_F_CREATE, &request);
4471 if (!tcmsg) {
4472 return ENODEV;
4473 }
4474 tcmsg->tcm_handle = tc_make_handle(1, 0);
4475 tcmsg->tcm_parent = TC_H_ROOT;
4476
4477 nl_msg_put_string(&request, TCA_KIND, "htb");
4478
4479 memset(&opt, 0, sizeof opt);
4480 opt.rate2quantum = HTB_RATE2QUANTUM;
4481 opt.version = 3;
4482 opt.defcls = 1;
4483
4484 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4485 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
4486 nl_msg_end_nested(&request, opt_offset);
4487
4488 return tc_transact(&request, NULL);
4489 }
4490
4491 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
4492 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
4493 static int
4494 htb_setup_class__(struct netdev *netdev, unsigned int handle,
4495 unsigned int parent, struct htb_class *class)
4496 {
4497 size_t opt_offset;
4498 struct tc_htb_opt opt;
4499 struct ofpbuf request;
4500 struct tcmsg *tcmsg;
4501 int error;
4502 int mtu;
4503
4504 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
4505 if (error) {
4506 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
4507 netdev_get_name(netdev));
4508 return error;
4509 }
4510
4511 memset(&opt, 0, sizeof opt);
4512 tc_fill_rate(&opt.rate, class->min_rate, mtu);
4513 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
4514 /* Makes sure the quantum is at least MTU. Setting quantum will
4515 * make htb ignore the r2q for this class. */
4516 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
4517 opt.quantum = mtu;
4518 }
4519 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
4520 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
4521 opt.prio = class->priority;
4522
4523 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
4524 &request);
4525 if (!tcmsg) {
4526 return ENODEV;
4527 }
4528 tcmsg->tcm_handle = handle;
4529 tcmsg->tcm_parent = parent;
4530
4531 nl_msg_put_string(&request, TCA_KIND, "htb");
4532 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4533 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
4534 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
4535 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
4536 nl_msg_end_nested(&request, opt_offset);
4537
4538 error = tc_transact(&request, NULL);
4539 if (error) {
4540 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4541 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
4542 netdev_get_name(netdev),
4543 tc_get_major(handle), tc_get_minor(handle),
4544 tc_get_major(parent), tc_get_minor(parent),
4545 class->min_rate, class->max_rate,
4546 class->burst, class->priority, ovs_strerror(error));
4547 }
4548 return error;
4549 }
4550
4551 /* Parses Netlink attributes in 'nl_options' for HTB parameters and
4552  * stores them into 'class'.  The parameters correspond to the ones
4553  * described in the vswitch database documentation for linux-htb queue
4554  * details. */
4555 static int
4556 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
4557 {
4558 static const struct nl_policy tca_htb_policy[] = {
4559 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
4560 .min_len = sizeof(struct tc_htb_opt) },
4561 };
4562
4563 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
4564 const struct tc_htb_opt *htb;
4565
4566 if (!nl_parse_nested(nl_options, tca_htb_policy,
4567 attrs, ARRAY_SIZE(tca_htb_policy))) {
4568 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
4569 return EPROTO;
4570 }
4571
4572 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
4573 class->min_rate = htb->rate.rate;
4574 class->max_rate = htb->ceil.rate;
4575 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
4576 class->priority = htb->prio;
4577 return 0;
4578 }
4579
4580 static int
4581 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4582 struct htb_class *options,
4583 struct netdev_queue_stats *stats)
4584 {
4585 struct nlattr *nl_options;
4586 unsigned int handle;
4587 int error;
4588
4589 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4590 if (!error && queue_id) {
4591 unsigned int major = tc_get_major(handle);
4592 unsigned int minor = tc_get_minor(handle);
4593 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4594 *queue_id = minor - 1;
4595 } else {
4596 error = EPROTO;
4597 }
4598 }
4599 if (!error && options) {
4600 error = htb_parse_tca_options__(nl_options, options);
4601 }
4602 return error;
4603 }
4604
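/* Fills 'hc' from the "max-rate" key in 'details' (bits per second).  If
 * the key is absent or zero, falls back to the link's current advertised
 * speed, or to 100 Mbit/s when that cannot be determined.  The default
 * class's min-rate is pinned to its max-rate. */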
4605 static void
4606 htb_parse_qdisc_details__(struct netdev *netdev_,
4607 const struct smap *details, struct htb_class *hc)
4608 {
4609 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4610
4611 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
4612 if (!hc->max_rate) {
4613 enum netdev_features current;
4614
4615 netdev_linux_read_features(netdev);
4616 current = !netdev->get_features_error ? netdev->current : 0;
4617 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4618 }
4619 hc->min_rate = hc->max_rate;
4620 hc->burst = 0;
4621 hc->priority = 0;
4622 }
4623
4624 static int
4625 htb_parse_class_details__(struct netdev *netdev,
4626 const struct smap *details, struct htb_class *hc)
4627 {
4628 const struct htb *htb = htb_get__(netdev);
4629 int mtu, error;
4630 unsigned long long int max_rate_bit;
4631
4632 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
4633 if (error) {
4634 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
4635 netdev_get_name(netdev));
4636 return error;
4637 }
4638
4639 /* HTB requires at least an mtu sized min-rate to send any traffic even
4640 * on uncongested links. */
4641 hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4642 hc->min_rate = MAX(hc->min_rate, mtu);
4643 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
4644
4645 /* max-rate */
4646 max_rate_bit = smap_get_ullong(details, "max-rate", 0);
4647 hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
4648 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
4649 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
4650
4651 /* burst
4652 *
4653 * According to hints in the documentation that I've read, it is important
4654 * that 'burst' be at least as big as the largest frame that might be
4655 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
4656 * but having it a bit too small is a problem. Since netdev_get_mtu()
4657 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
4658 * the MTU. We actually add 64, instead of 14, as a guard against
4659  * additional headers getting tacked on somewhere that we're not aware of. */
4660 hc->burst = smap_get_ullong(details, "burst", 0) / 8;
4661 hc->burst = MAX(hc->burst, mtu + 64);
4662
4663 /* priority */
4664 hc->priority = smap_get_ullong(details, "priority", 0);
4665
4666 return 0;
4667 }
4668
4669 static int
4670 htb_query_class__(const struct netdev *netdev, unsigned int handle,
4671 unsigned int parent, struct htb_class *options,
4672 struct netdev_queue_stats *stats)
4673 {
4674 struct ofpbuf *reply;
4675 int error;
4676
4677 error = tc_query_class(netdev, handle, parent, &reply);
4678 if (!error) {
4679 error = htb_parse_tcmsg__(reply, NULL, options, stats);
4680 ofpbuf_delete(reply);
4681 }
4682 return error;
4683 }
4684
4685 static int
4686 htb_tc_install(struct netdev *netdev, const struct smap *details)
4687 {
4688 int error;
4689
4690 error = htb_setup_qdisc__(netdev);
4691 if (!error) {
4692 struct htb_class hc;
4693
4694 htb_parse_qdisc_details__(netdev, details, &hc);
4695 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4696 tc_make_handle(1, 0), &hc);
4697 if (!error) {
4698 htb_install__(netdev, hc.max_rate);
4699 }
4700 }
4701 return error;
4702 }
4703
4704 static struct htb_class *
4705 htb_class_cast__(const struct tc_queue *queue)
4706 {
4707 return CONTAINER_OF(queue, struct htb_class, tc_queue);
4708 }
4709
4710 static void
4711 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
4712 const struct htb_class *hc)
4713 {
4714 struct htb *htb = htb_get__(netdev);
4715 size_t hash = hash_int(queue_id, 0);
4716 struct tc_queue *queue;
4717 struct htb_class *hcp;
4718
4719 queue = tc_find_queue__(netdev, queue_id, hash);
4720 if (queue) {
4721 hcp = htb_class_cast__(queue);
4722 } else {
4723 hcp = xmalloc(sizeof *hcp);
4724 queue = &hcp->tc_queue;
4725 queue->queue_id = queue_id;
4726 queue->created = time_msec();
4727 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
4728 }
4729
4730 hcp->min_rate = hc->min_rate;
4731 hcp->max_rate = hc->max_rate;
4732 hcp->burst = hc->burst;
4733 hcp->priority = hc->priority;
4734 }
4735
4736 static int
4737 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4738 {
4739 struct ofpbuf msg;
4740 struct queue_dump_state state;
4741 struct htb_class hc;
4742
4743 /* Get qdisc options. */
4744 hc.max_rate = 0;
4745 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4746 htb_install__(netdev, hc.max_rate);
4747
4748 /* Get queues. */
4749 if (!start_queue_dump(netdev, &state)) {
4750 return ENODEV;
4751 }
4752 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4753 unsigned int queue_id;
4754
4755 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4756 htb_update_queue__(netdev, queue_id, &hc);
4757 }
4758 }
4759 finish_queue_dump(&state);
4760
4761 return 0;
4762 }
4763
4764 static void
4765 htb_tc_destroy(struct tc *tc)
4766 {
4767 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4768 struct htb_class *hc;
4769
4770 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
4771 free(hc);
4772 }
4773 tc_destroy(tc);
4774 free(htb);
4775 }
4776
4777 static int
4778 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
4779 {
4780 const struct htb *htb = htb_get__(netdev);
4781 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
4782 return 0;
4783 }
4784
4785 static int
4786 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
4787 {
4788 struct htb_class hc;
4789 int error;
4790
4791 htb_parse_qdisc_details__(netdev, details, &hc);
4792 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4793 tc_make_handle(1, 0), &hc);
4794 if (!error) {
4795 htb_get__(netdev)->max_rate = hc.max_rate;
4796 }
4797 return error;
4798 }
4799
4800 static int
4801 htb_class_get(const struct netdev *netdev OVS_UNUSED,
4802 const struct tc_queue *queue, struct smap *details)
4803 {
4804 const struct htb_class *hc = htb_class_cast__(queue);
4805
4806 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4807 if (hc->min_rate != hc->max_rate) {
4808 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4809 }
4810 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
4811 if (hc->priority) {
4812 smap_add_format(details, "priority", "%u", hc->priority);
4813 }
4814 return 0;
4815 }
4816
4817 static int
4818 htb_class_set(struct netdev *netdev, unsigned int queue_id,
4819 const struct smap *details)
4820 {
4821 struct htb_class hc;
4822 int error;
4823
4824 error = htb_parse_class_details__(netdev, details, &hc);
4825 if (error) {
4826 return error;
4827 }
4828
4829 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4830 tc_make_handle(1, 0xfffe), &hc);
4831 if (error) {
4832 return error;
4833 }
4834
4835 htb_update_queue__(netdev, queue_id, &hc);
4836 return 0;
4837 }
4838
4839 static int
4840 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
4841 {
4842 struct htb_class *hc = htb_class_cast__(queue);
4843 struct htb *htb = htb_get__(netdev);
4844 int error;
4845
4846 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4847 if (!error) {
4848 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
4849 free(hc);
4850 }
4851 return error;
4852 }
4853
4854 static int
4855 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4856 struct netdev_queue_stats *stats)
4857 {
4858 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4859 tc_make_handle(1, 0xfffe), NULL, stats);
4860 }
4861
4862 static int
4863 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4864 const struct ofpbuf *nlmsg,
4865 netdev_dump_queue_stats_cb *cb, void *aux)
4866 {
4867 struct netdev_queue_stats stats;
4868 unsigned int handle, major, minor;
4869 int error;
4870
4871 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4872 if (error) {
4873 return error;
4874 }
4875
4876 major = tc_get_major(handle);
4877 minor = tc_get_minor(handle);
4878 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4879 (*cb)(minor - 1, &stats, aux);
4880 }
4881 return 0;
4882 }
4883
4884 static const struct tc_ops tc_ops_htb = {
4885 .linux_name = "htb",
4886 .ovs_name = "linux-htb",
4887 .n_queues = HTB_N_QUEUES,
4888 .tc_install = htb_tc_install,
4889 .tc_load = htb_tc_load,
4890 .tc_destroy = htb_tc_destroy,
4891 .qdisc_get = htb_qdisc_get,
4892 .qdisc_set = htb_qdisc_set,
4893 .class_get = htb_class_get,
4894 .class_set = htb_class_set,
4895 .class_delete = htb_class_delete,
4896 .class_get_stats = htb_class_get_stats,
4897     .class_dump_stats = htb_class_dump_stats,
4898 };
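/* Configuration sketch for "linux-htb" with a single queue (illustrative
 * only; the port name "eth0" is hypothetical, rates are in bits per
 * second):
 *
 *     ovs-vsctl set port eth0 qos=@q -- \
 *         --id=@q create qos type=linux-htb \
 *         other-config:max-rate=100000000 queues:0=@q0 -- \
 *         --id=@q0 create queue \
 *         other-config:min-rate=10000000 other-config:max-rate=50000000
 */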
4899 \f
4900 /* "linux-hfsc" traffic control class. */
4901
4902 #define HFSC_N_QUEUES 0xf000
4903
4904 struct hfsc {
4905 struct tc tc;
4906 uint32_t max_rate;
4907 };
4908
4909 struct hfsc_class {
4910 struct tc_queue tc_queue;
4911 uint32_t min_rate;
4912 uint32_t max_rate;
4913 };
4914
4915 static struct hfsc *
4916 hfsc_get__(const struct netdev *netdev_)
4917 {
4918 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4919 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
4920 }
4921
4922 static struct hfsc_class *
4923 hfsc_class_cast__(const struct tc_queue *queue)
4924 {
4925 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4926 }
4927
4928 static void
4929 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4930 {
4931 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4932 struct hfsc *hfsc;
4933
4934 hfsc = xmalloc(sizeof *hfsc);
4935 tc_init(&hfsc->tc, &tc_ops_hfsc);
4936 hfsc->max_rate = max_rate;
4937 netdev->tc = &hfsc->tc;
4938 }
4939
4940 static void
4941 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4942 const struct hfsc_class *hc)
4943 {
4944 size_t hash;
4945 struct hfsc *hfsc;
4946 struct hfsc_class *hcp;
4947 struct tc_queue *queue;
4948
4949 hfsc = hfsc_get__(netdev);
4950 hash = hash_int(queue_id, 0);
4951
4952 queue = tc_find_queue__(netdev, queue_id, hash);
4953 if (queue) {
4954 hcp = hfsc_class_cast__(queue);
4955 } else {
4956 hcp = xmalloc(sizeof *hcp);
4957 queue = &hcp->tc_queue;
4958 queue->queue_id = queue_id;
4959 queue->created = time_msec();
4960 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4961 }
4962
4963 hcp->min_rate = hc->min_rate;
4964 hcp->max_rate = hc->max_rate;
4965 }
4966
4967 static int
4968 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4969 {
4970 const struct tc_service_curve *rsc, *fsc, *usc;
4971 static const struct nl_policy tca_hfsc_policy[] = {
4972 [TCA_HFSC_RSC] = {
4973 .type = NL_A_UNSPEC,
4974 .optional = false,
4975 .min_len = sizeof(struct tc_service_curve),
4976 },
4977 [TCA_HFSC_FSC] = {
4978 .type = NL_A_UNSPEC,
4979 .optional = false,
4980 .min_len = sizeof(struct tc_service_curve),
4981 },
4982 [TCA_HFSC_USC] = {
4983 .type = NL_A_UNSPEC,
4984 .optional = false,
4985 .min_len = sizeof(struct tc_service_curve),
4986 },
4987 };
4988 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4989
4990 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4991 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4992 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4993 return EPROTO;
4994 }
4995
4996 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4997 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4998 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4999
5000 if (rsc->m1 != 0 || rsc->d != 0 ||
5001 fsc->m1 != 0 || fsc->d != 0 ||
5002 usc->m1 != 0 || usc->d != 0) {
5003 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
5004 "Non-linear service curves are not supported.");
5005 return EPROTO;
5006 }
5007
5008 if (rsc->m2 != fsc->m2) {
5009 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
5010                      "Real-time service curves are not supported.");
5011 return EPROTO;
5012 }
5013
5014 if (rsc->m2 > usc->m2) {
5015 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
5016 "Min-rate service curve is greater than "
5017 "the max-rate service curve.");
5018 return EPROTO;
5019 }
5020
5021 class->min_rate = fsc->m2;
5022 class->max_rate = usc->m2;
5023 return 0;
5024 }
5025
5026 static int
5027 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
5028 struct hfsc_class *options,
5029 struct netdev_queue_stats *stats)
5030 {
5031 int error;
5032 unsigned int handle;
5033 struct nlattr *nl_options;
5034
5035 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
5036 if (error) {
5037 return error;
5038 }
5039
5040 if (queue_id) {
5041 unsigned int major, minor;
5042
5043 major = tc_get_major(handle);
5044 minor = tc_get_minor(handle);
5045 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
5046 *queue_id = minor - 1;
5047 } else {
5048 return EPROTO;
5049 }
5050 }
5051
5052 if (options) {
5053 error = hfsc_parse_tca_options__(nl_options, options);
5054 }
5055
5056 return error;
5057 }
5058
5059 static int
5060 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
5061 unsigned int parent, struct hfsc_class *options,
5062 struct netdev_queue_stats *stats)
5063 {
5064 int error;
5065 struct ofpbuf *reply;
5066
5067 error = tc_query_class(netdev, handle, parent, &reply);
5068 if (error) {
5069 return error;
5070 }
5071
5072 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
5073 ofpbuf_delete(reply);
5074 return error;
5075 }
5076
5077 static void
5078 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
5079 struct hfsc_class *class)
5080 {
5081 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5082
5083 uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
5084 if (!max_rate) {
5085 enum netdev_features current;
5086
5087 netdev_linux_read_features(netdev);
5088 current = !netdev->get_features_error ? netdev->current : 0;
5089 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
5090 }
5091
5092 class->min_rate = max_rate;
5093 class->max_rate = max_rate;
5094 }
5095
5096 static int
5097 hfsc_parse_class_details__(struct netdev *netdev,
5098 const struct smap *details,
5099                            struct hfsc_class *class)
5100 {
5101 const struct hfsc *hfsc;
5102 uint32_t min_rate, max_rate;
5103
5104 hfsc = hfsc_get__(netdev);
5105
5106 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
5107 min_rate = MAX(min_rate, 1);
5108 min_rate = MIN(min_rate, hfsc->max_rate);
5109
5110 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
5111 max_rate = MAX(max_rate, min_rate);
5112 max_rate = MIN(max_rate, hfsc->max_rate);
5113
5114 class->min_rate = min_rate;
5115 class->max_rate = max_rate;
5116
5117 return 0;
5118 }
5119
5120 /* Create an HFSC qdisc.
5121 *
5122 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
5123 static int
5124 hfsc_setup_qdisc__(struct netdev *netdev)
5125 {
5126 struct tcmsg *tcmsg;
5127 struct ofpbuf request;
5128 struct tc_hfsc_qopt opt;
5129
5130 tc_del_qdisc(netdev);
5131
5132 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
5133 NLM_F_EXCL | NLM_F_CREATE, &request);
5134
5135 if (!tcmsg) {
5136 return ENODEV;
5137 }
5138
5139 tcmsg->tcm_handle = tc_make_handle(1, 0);
5140 tcmsg->tcm_parent = TC_H_ROOT;
5141
5142 memset(&opt, 0, sizeof opt);
5143 opt.defcls = 1;
5144
5145 nl_msg_put_string(&request, TCA_KIND, "hfsc");
5146 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
5147
5148 return tc_transact(&request, NULL);
5149 }
5150
5151 /* Create an HFSC class.
5152 *
5153 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
5154 * sc rate <min_rate> ul rate <max_rate>" */
5155 static int
5156 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
5157 unsigned int parent, struct hfsc_class *class)
5158 {
5159 int error;
5160 size_t opt_offset;
5161 struct tcmsg *tcmsg;
5162 struct ofpbuf request;
5163 struct tc_service_curve min, max;
5164
5165 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
5166 &request);
5167
5168 if (!tcmsg) {
5169 return ENODEV;
5170 }
5171
5172 tcmsg->tcm_handle = handle;
5173 tcmsg->tcm_parent = parent;
5174
5175 min.m1 = 0;
5176 min.d = 0;
5177 min.m2 = class->min_rate;
5178
5179 max.m1 = 0;
5180 max.d = 0;
5181 max.m2 = class->max_rate;
5182
5183 nl_msg_put_string(&request, TCA_KIND, "hfsc");
5184 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
5185 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
5186 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
5187 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
5188 nl_msg_end_nested(&request, opt_offset);
5189
5190 error = tc_transact(&request, NULL);
5191 if (error) {
5192 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
5193 "min-rate %ubps, max-rate %ubps (%s)",
5194 netdev_get_name(netdev),
5195 tc_get_major(handle), tc_get_minor(handle),
5196 tc_get_major(parent), tc_get_minor(parent),
5197 class->min_rate, class->max_rate, ovs_strerror(error));
5198 }
5199
5200 return error;
5201 }
5202
5203 static int
5204 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
5205 {
5206 int error;
5207 struct hfsc_class class;
5208
5209 error = hfsc_setup_qdisc__(netdev);
5210
5211 if (error) {
5212 return error;
5213 }
5214
5215 hfsc_parse_qdisc_details__(netdev, details, &class);
5216 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5217 tc_make_handle(1, 0), &class);
5218
5219 if (error) {
5220 return error;
5221 }
5222
5223 hfsc_install__(netdev, class.max_rate);
5224 return 0;
5225 }
5226
5227 static int
5228 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5229 {
5230 struct ofpbuf msg;
5231 struct queue_dump_state state;
5232 struct hfsc_class hc;
5233
5234 hc.max_rate = 0;
5235 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
5236 hfsc_install__(netdev, hc.max_rate);
5237
5238 if (!start_queue_dump(netdev, &state)) {
5239 return ENODEV;
5240 }
5241
5242 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
5243 unsigned int queue_id;
5244
5245 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
5246 hfsc_update_queue__(netdev, queue_id, &hc);
5247 }
5248 }
5249
5250 finish_queue_dump(&state);
5251 return 0;
5252 }
5253
5254 static void
5255 hfsc_tc_destroy(struct tc *tc)
5256 {
5257 struct hfsc *hfsc;
5258 struct hfsc_class *hc, *next;
5259
5260 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
5261
5262 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
5263 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5264 free(hc);
5265 }
5266
5267 tc_destroy(tc);
5268 free(hfsc);
5269 }
5270
5271 static int
5272 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
5273 {
5274 const struct hfsc *hfsc;
5275 hfsc = hfsc_get__(netdev);
5276 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
5277 return 0;
5278 }
5279
5280 static int
5281 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
5282 {
5283 int error;
5284 struct hfsc_class class;
5285
5286 hfsc_parse_qdisc_details__(netdev, details, &class);
5287 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5288 tc_make_handle(1, 0), &class);
5289
5290 if (!error) {
5291 hfsc_get__(netdev)->max_rate = class.max_rate;
5292 }
5293
5294 return error;
5295 }
5296
5297 static int
5298 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
5299 const struct tc_queue *queue, struct smap *details)
5300 {
5301 const struct hfsc_class *hc;
5302
5303 hc = hfsc_class_cast__(queue);
5304 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
5305 if (hc->min_rate != hc->max_rate) {
5306 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
5307 }
5308 return 0;
5309 }
5310
5311 static int
5312 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
5313 const struct smap *details)
5314 {
5315 int error;
5316 struct hfsc_class class;
5317
5318 error = hfsc_parse_class_details__(netdev, details, &class);
5319 if (error) {
5320 return error;
5321 }
5322
5323 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
5324 tc_make_handle(1, 0xfffe), &class);
5325 if (error) {
5326 return error;
5327 }
5328
5329 hfsc_update_queue__(netdev, queue_id, &class);
5330 return 0;
5331 }
5332
5333 static int
5334 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
5335 {
5336 int error;
5337 struct hfsc *hfsc;
5338 struct hfsc_class *hc;
5339
5340 hc = hfsc_class_cast__(queue);
5341 hfsc = hfsc_get__(netdev);
5342
5343 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
5344 if (!error) {
5345 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5346 free(hc);
5347 }
5348 return error;
5349 }
5350
5351 static int
5352 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
5353 struct netdev_queue_stats *stats)
5354 {
5355 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
5356 tc_make_handle(1, 0xfffe), NULL, stats);
5357 }
5358
5359 static int
5360 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
5361 const struct ofpbuf *nlmsg,
5362 netdev_dump_queue_stats_cb *cb, void *aux)
5363 {
5364 struct netdev_queue_stats stats;
5365 unsigned int handle, major, minor;
5366 int error;
5367
5368 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
5369 if (error) {
5370 return error;
5371 }
5372
5373 major = tc_get_major(handle);
5374 minor = tc_get_minor(handle);
5375 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
5376 (*cb)(minor - 1, &stats, aux);
5377 }
5378 return 0;
5379 }
5380
5381 static const struct tc_ops tc_ops_hfsc = {
5382 .linux_name = "hfsc",
5383 .ovs_name = "linux-hfsc",
5384     .n_queues = HFSC_N_QUEUES,
5385 .tc_install = hfsc_tc_install,
5386 .tc_load = hfsc_tc_load,
5387 .tc_destroy = hfsc_tc_destroy,
5388 .qdisc_get = hfsc_qdisc_get,
5389 .qdisc_set = hfsc_qdisc_set,
5390 .class_get = hfsc_class_get,
5391 .class_set = hfsc_class_set,
5392 .class_delete = hfsc_class_delete,
5393 .class_get_stats = hfsc_class_get_stats,
5394 .class_dump_stats = hfsc_class_dump_stats,
5395 };
5396 \f
5397 /* "linux-noop" traffic control class. */
5398
5399 static void
5400 noop_install__(struct netdev *netdev_)
5401 {
5402 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5403 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
5404
5405 netdev->tc = CONST_CAST(struct tc *, &tc);
5406 }
5407
5408 static int
5409 noop_tc_install(struct netdev *netdev,
5410 const struct smap *details OVS_UNUSED)
5411 {
5412 noop_install__(netdev);
5413 return 0;
5414 }
5415
5416 static int
5417 noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5418 {
5419 noop_install__(netdev);
5420 return 0;
5421 }
5422
5423 static const struct tc_ops tc_ops_noop = {
5424     .ovs_name = "linux-noop",
5425 .tc_install = noop_tc_install,
5426 .tc_load = noop_tc_load,
5427 };
5428 \f
5429 /* "linux-default" traffic control class.
5430 *
5431 * This class represents the default, unnamed Linux qdisc. It corresponds to
5432 * the "" (empty string) QoS type in the OVS database. */
5433
5434 static void
5435 default_install__(struct netdev *netdev_)
5436 {
5437 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5438 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
5439
5440 /* Nothing but a tc class implementation is allowed to write to a tc. This
5441 * class never does that, so we can legitimately use a const tc object. */
5442 netdev->tc = CONST_CAST(struct tc *, &tc);
5443 }
5444
5445 static int
5446 default_tc_install(struct netdev *netdev,
5447 const struct smap *details OVS_UNUSED)
5448 {
5449 default_install__(netdev);
5450 return 0;
5451 }
5452
5453 static int
5454 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5455 {
5456 default_install__(netdev);
5457 return 0;
5458 }
5459
5460 static const struct tc_ops tc_ops_default = {
5461     .ovs_name = "",
5462 .tc_install = default_tc_install,
5463 .tc_load = default_tc_load,
5464 };
5465 \f
5466 /* "linux-other" traffic control class.
5467  *
5468  * Represents any qdisc OVS does not otherwise recognize; read-only. */
5469
5470 static int
5471 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
5472 {
5473 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5474 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
5475
5476 /* Nothing but a tc class implementation is allowed to write to a tc. This
5477 * class never does that, so we can legitimately use a const tc object. */
5478 netdev->tc = CONST_CAST(struct tc *, &tc);
5479 return 0;
5480 }
5481
5482 static const struct tc_ops tc_ops_other = {
5483 .ovs_name = "linux-other",
5484 .tc_load = other_tc_load,
5485 };
5486 \f
5487 /* Traffic control. */
5488
5489 /* Number of kernel "tc" ticks per second. */
5490 static double ticks_per_s;
5491
5492 /* Number of kernel "jiffies" per second. This is used for the purpose of
5493 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
5494 * one jiffy's worth of data.
5495 *
5496 * There are two possibilities here:
5497 *
5498 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
5499 * approximate range of 100 to 1024. That means that we really need to
5500 * make sure that the qdisc can buffer that much data.
5501 *
5502 * - 'buffer_hz' is an absurdly large number. That means that the kernel
5503 * has finely granular timers and there's no need to fudge additional room
5504 * for buffers. (There's no extra effort needed to implement that: the
5505 * large 'buffer_hz' is used as a divisor, so practically any number will
5506 * come out as 0 in the division. Small integer results in the case of
5507 * really high dividends won't have any real effect anyhow.)
5508 */
5509 static unsigned int buffer_hz;
5510
5511 static struct tcmsg *
5512 netdev_linux_tc_make_request(const struct netdev *netdev, int type,
5513 unsigned int flags, struct ofpbuf *request)
5514 {
5515 int ifindex;
5516 int error;
5517
5518 error = get_ifindex(netdev, &ifindex);
5519 if (error) {
5520 return NULL;
5521 }
5522
5523 return tc_make_request(ifindex, type, flags, request);
5524 }
5525
5526 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
5527 * of 'kbits_burst'.
5528 *
5529 * This function is equivalent to running:
5530 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
5531 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
5532 * mtu 65535 drop
5533 *
5534 * The configuration and stats may be seen with the following command:
5535 * /sbin/tc -s filter show dev <devname> parent ffff:
5536 *
5537 * Returns 0 if successful, otherwise a positive errno value.
5538 */
5539 static int
5540 tc_add_policer(struct netdev *netdev,
5541 uint32_t kbits_rate, uint32_t kbits_burst)
5542 {
5543 struct tc_police tc_police;
5544 struct ofpbuf request;
5545 struct tcmsg *tcmsg;
5546 size_t basic_offset;
5547 size_t police_offset;
5548 int error;
5549 int mtu = 65535;
5550
5551 memset(&tc_police, 0, sizeof tc_police);
5552 tc_police.action = TC_POLICE_SHOT;
5553 tc_police.mtu = mtu;
5554 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
5555
5556 /* The following appears wrong in one way: In networking a kilobit is
5557 * usually 1000 bits but this uses 1024 bits.
5558 *
5559      * However if you "fix" that problem then "tc filter show ..." shows
5560 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
5561 * 1,000,000 bits, whereas this actually ends up doing the right thing from
5562 * tc's point of view. Whatever. */
5563 tc_police.burst = tc_bytes_to_ticks(
5564 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
5565
5566 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
5567 NLM_F_EXCL | NLM_F_CREATE, &request);
5568 if (!tcmsg) {
5569 return ENODEV;
5570 }
5571 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
5572 tcmsg->tcm_info = tc_make_handle(49,
5573 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
5574
5575 nl_msg_put_string(&request, TCA_KIND, "basic");
5576 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
5577 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
5578 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
5579 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
5580 nl_msg_end_nested(&request, police_offset);
5581 nl_msg_end_nested(&request, basic_offset);
5582
5583 error = tc_transact(&request, NULL);
5584 if (error) {
5585 return error;
5586 }
5587
5588 return 0;
5589 }
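/* Usage sketch: OVS ends up in tc_add_policer() for ingress policing
 * configured on an interface, e.g. (illustrative; "eth0" is hypothetical;
 * rate in kbit/s, burst in kbits):
 *
 *     ovs-vsctl set interface eth0 ingress_policing_rate=1000 \
 *         ingress_policing_burst=100
 */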
5590
5591 static void
5592 read_psched(void)
5593 {
5594 /* The values in psched are not individually very meaningful, but they are
5595 * important. The tables below show some values seen in the wild.
5596 *
5597 * Some notes:
5598 *
5599 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
5600 * (Before that, there are hints that it was 1000000000.)
5601 *
5602 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
5603 * above.
5604 *
5605 * /proc/net/psched
5606 * -----------------------------------
5607 * [1] 000c8000 000f4240 000f4240 00000064
5608 * [2] 000003e8 00000400 000f4240 3b9aca00
5609 * [3] 000003e8 00000400 000f4240 3b9aca00
5610 * [4] 000003e8 00000400 000f4240 00000064
5611 * [5] 000003e8 00000040 000f4240 3b9aca00
5612 * [6] 000003e8 00000040 000f4240 000000f9
5613 *
5614 * a b c d ticks_per_s buffer_hz
5615 * ------- --------- ---------- ------------- ----------- -------------
5616 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
5617 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5618 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5619 * [4] 1,000 1,024 1,000,000 100 976,562 100
5620 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
5621 * [6] 1,000 64 1,000,000 249 15,625,000 249
5622 *
5623 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
5624 * [2] 2.6.26-1-686-bigmem from Debian lenny
5625 * [3] 2.6.26-2-sparc64 from Debian lenny
5626 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
5627 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
5628 * [6] 2.6.34 from kernel.org on KVM
5629 */
5630 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5631 static const char fn[] = "/proc/net/psched";
5632 unsigned int a, b, c, d;
5633 FILE *stream;
5634
5635 if (!ovsthread_once_start(&once)) {
5636 return;
5637 }
5638
5639 ticks_per_s = 1.0;
5640 buffer_hz = 100;
5641
5642 stream = fopen(fn, "r");
5643 if (!stream) {
5644 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
5645 goto exit;
5646 }
5647
5648 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
5649 VLOG_WARN("%s: read failed", fn);
5650 fclose(stream);
5651 goto exit;
5652 }
5653 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
5654 fclose(stream);
5655
5656 if (!a || !b || !c) {
5657 VLOG_WARN("%s: invalid scheduler parameters", fn);
5658 goto exit;
5659 }
5660
5661 ticks_per_s = (double) a * c / b;
5662 if (c == 1000000) {
5663 buffer_hz = d;
5664 } else {
5665 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
5666 fn, a, b, c, d);
5667 }
5668 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
5669
5670 exit:
5671 ovsthread_once_done(&once);
5672 }
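/* A worked example of the computation above, using row [2] from the table:
 * reading "000003e8 00000400 000f4240 3b9aca00" from /proc/net/psched gives
 * a = 1000, b = 1024, c = 1000000, and d = 1000000000, so
 *
 *     ticks_per_s = (double) a * c / b = 1000.0 * 1000000 / 1024 ~= 976562
 *
 * and, since c == 1000000, buffer_hz is taken directly from d. */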
5673
5674 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5675 * rate of 'rate' bytes per second. */
5676 static unsigned int
5677 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
5678 {
5679 read_psched();
5680 return ((unsigned long long int) rate * ticks) / ticks_per_s;
5681 }
5682
5683 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
5684 * rate of 'rate' bytes per second. */
5685 static unsigned int
5686 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
5687 {
5688 read_psched();
5689 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
5690 }
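/* Some illustrative numbers for the two conversions above, assuming the
 * common ticks_per_s value of 976562 from read_psched(): at a rate of
 * 125000 bytes/s (1 Mbit/s), a 1500-byte packet costs
 *
 *     tc_bytes_to_ticks(125000, 1500) = 976562 * 1500 / 125000 ~= 11718 ticks
 *
 * and converting back, tc_ticks_to_bytes(125000, 11718) ~= 1499 bytes, the
 * lost byte being an artifact of integer truncation. */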
5691
5692 /* Returns the number of bytes that need to be reserved for qdisc buffering at
5693 * a transmission rate of 'rate' bytes per second. */
5694 static unsigned int
5695 tc_buffer_per_jiffy(unsigned int rate)
5696 {
5697 read_psched();
5698 return rate / buffer_hz;
5699 }
5700
5701 static uint32_t
5702 tc_time_to_ticks(uint32_t time)
5702 {
5703 read_psched();
5704 return time * (ticks_per_s / 1000000);
5705 }
5706
5707 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5708 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5709 * extracts 'msg''s TCA_OPTIONS attribute into '*options' if it is present or
5710 * stores NULL into it if it is absent.
5711 *
5712 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5713 * 'msg'.
5714 *
5715 * Returns 0 if successful, otherwise a positive errno value. */
5716 static int
5717 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
5718 struct nlattr **options)
5719 {
5720 static const struct nl_policy tca_policy[] = {
5721 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
5722 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
5723 };
5724 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5725
5726 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5727 tca_policy, ta, ARRAY_SIZE(ta))) {
5728 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
5729 goto error;
5730 }
5731
5732 if (kind) {
5733 *kind = nl_attr_get_string(ta[TCA_KIND]);
5734 }
5735
5736 if (options) {
5737 *options = ta[TCA_OPTIONS];
5738 }
5739
5740 return 0;
5741
5742 error:
5743 if (kind) {
5744 *kind = NULL;
5745 }
5746 if (options) {
5747 *options = NULL;
5748 }
5749 return EPROTO;
5750 }
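/* A minimal sketch of a caller (error handling elided; 'reply' is assumed to
 * be an RTM_NEWQDISC reply obtained via tc_transact()):
 *
 *     const char *kind;
 *     struct nlattr *options;
 *
 *     if (!tc_parse_qdisc(reply, &kind, &options)) {
 *         VLOG_DBG("qdisc is %s (%s options)",
 *                  kind, options ? "with" : "without");
 *     }
 */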
5751
5752 /* Given Netlink 'msg' that describes a class, extracts its handle (whose
5753 * minor number is the queue ID) into '*handlep', its TCA_OPTIONS attribute
5754 * into '*options', and its queue statistics into '*stats'. Any of the output
5755 * arguments may be null.
5756 *
5757 * Returns 0 if successful, otherwise a positive errno value. */
5758 static int
5759 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
5760 struct nlattr **options, struct netdev_queue_stats *stats)
5761 {
5762 static const struct nl_policy tca_policy[] = {
5763 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
5764 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
5765 };
5766 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5767
5768 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5769 tca_policy, ta, ARRAY_SIZE(ta))) {
5770 VLOG_WARN_RL(&rl, "failed to parse class message");
5771 goto error;
5772 }
5773
5774 if (handlep) {
5775 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
5776 *handlep = tc->tcm_handle;
5777 }
5778
5779 if (options) {
5780 *options = ta[TCA_OPTIONS];
5781 }
5782
5783 if (stats) {
5784 const struct gnet_stats_queue *gsq;
5785 struct gnet_stats_basic gsb;
5786
5787 static const struct nl_policy stats_policy[] = {
5788 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
5789 .min_len = sizeof gsb },
5790 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
5791 .min_len = sizeof *gsq },
5792 };
5793 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
5794
5795 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
5796 sa, ARRAY_SIZE(sa))) {
5797 VLOG_WARN_RL(&rl, "failed to parse class stats");
5798 goto error;
5799 }
5800
5801 /* Alignment issues screw up the length of struct gnet_stats_basic on
5802 * some arch/bitsize combinations. Newer versions of Linux have a
5803 * struct gnet_stats_basic_packed, but we can't depend on that. The
5804 * easiest thing to do is just to make a copy. */
5805 memset(&gsb, 0, sizeof gsb);
5806 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5807 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5808 stats->tx_bytes = gsb.bytes;
5809 stats->tx_packets = gsb.packets;
5810
5811 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5812 stats->tx_errors = gsq->drops;
5813 }
5814
5815 return 0;
5816
5817 error:
5818 if (options) {
5819 *options = NULL;
5820 }
5821 if (stats) {
5822 memset(stats, 0, sizeof *stats);
5823 }
5824 return EPROTO;
5825 }
5826
5827 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5828 * on 'netdev'. */
5829 static int
5830 tc_query_class(const struct netdev *netdev,
5831 unsigned int handle, unsigned int parent,
5832 struct ofpbuf **replyp)
5833 {
5834 struct ofpbuf request;
5835 struct tcmsg *tcmsg;
5836 int error;
5837
5838 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
5839 &request);
5840 if (!tcmsg) {
5841 return ENODEV;
5842 }
5843 tcmsg->tcm_handle = handle;
5844 tcmsg->tcm_parent = parent;
5845
5846 error = tc_transact(&request, replyp);
5847 if (error) {
5848 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5849 netdev_get_name(netdev),
5850 tc_get_major(handle), tc_get_minor(handle),
5851 tc_get_major(parent), tc_get_minor(parent),
5852 ovs_strerror(error));
5853 }
5854 return error;
5855 }
5856
5857 /* Equivalent to "tc class del dev <name> handle <handle>". */
5858 static int
5859 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5860 {
5861 struct ofpbuf request;
5862 struct tcmsg *tcmsg;
5863 int error;
5864
5865 tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5866 if (!tcmsg) {
5867 return ENODEV;
5868 }
5869 tcmsg->tcm_handle = handle;
5870 tcmsg->tcm_parent = 0;
5871
5872 error = tc_transact(&request, NULL);
5873 if (error) {
5874 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5875 netdev_get_name(netdev),
5876 tc_get_major(handle), tc_get_minor(handle),
5877 ovs_strerror(error));
5878 }
5879 return error;
5880 }
5881
5882 /* Equivalent to "tc qdisc del dev <name> root". */
5883 static int
5884 tc_del_qdisc(struct netdev *netdev_)
5885 {
5886 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5887 struct ofpbuf request;
5888 struct tcmsg *tcmsg;
5889 int error;
5890
5891 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5892 if (!tcmsg) {
5893 return ENODEV;
5894 }
5895 tcmsg->tcm_handle = tc_make_handle(1, 0);
5896 tcmsg->tcm_parent = TC_H_ROOT;
5897
5898 error = tc_transact(&request, NULL);
5899 if (error == EINVAL) {
5900 /* EINVAL probably means that the default qdisc was in use, in which
5901 * case we've accomplished our purpose. */
5902 error = 0;
5903 }
5904 if (!error && netdev->tc) {
5905 if (netdev->tc->ops->tc_destroy) {
5906 netdev->tc->ops->tc_destroy(netdev->tc);
5907 }
5908 netdev->tc = NULL;
5909 }
5910 return error;
5911 }
5912
5913 static bool
5914 getqdisc_is_safe(void)
5915 {
5916 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5917 static bool safe = false;
5918
5919 if (ovsthread_once_start(&once)) {
5920 struct utsname utsname;
5921 int major, minor;
5922
5923 if (uname(&utsname) == -1) {
5924 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5925 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5926 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5927 } else if (major < 2 || (major == 2 && minor < 35)) {
5928 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5929 utsname.release);
5930 } else {
5931 safe = true;
5932 }
5933 ovsthread_once_done(&once);
5934 }
5935 return safe;
5936 }
5937
5938 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5939 * kernel to determine what they are. Returns 0 if successful, otherwise a
5940 * positive errno value. */
5941 static int
5942 tc_query_qdisc(const struct netdev *netdev_)
5943 {
5944 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5945 struct ofpbuf request, *qdisc;
5946 const struct tc_ops *ops;
5947 struct tcmsg *tcmsg;
5948 int load_error;
5949 int error;
5950
5951 if (netdev->tc) {
5952 return 0;
5953 }
5954
5955 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5956 * commit 53b0f08 "net_sched: Fix qdisc_notify()", that is, any kernel before
5957 * 2.6.35 that does not have that fix backported.
5958 *
5959 * To avoid the OOPS, we must not make a request that would attempt to dump
5960 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5961 * few others. There are a few ways that I can see to do this, but most of
5962 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5963 * technique chosen here is to assume that any non-default qdisc that we
5964 * create will have a class with handle 1:0. The built-in qdiscs only have
5965 * a class with handle 0:0.
5966 *
5967 * On Linux 2.6.35+ we use the straightforward method because it allows us
5968 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5969 * in such a case we get no response at all from the kernel (!) if a
5970 * builtin qdisc is in use (which is later caught by "!error &&
5971 * !qdisc->size"). */
5972 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
5973 &request);
5974 if (!tcmsg) {
5975 return ENODEV;
5976 }
5977 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5978 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
5979
5980 /* Figure out what tc class to instantiate. */
5981 error = tc_transact(&request, &qdisc);
5982 if (!error && qdisc->size) {
5983 const char *kind;
5984
5985 error = tc_parse_qdisc(qdisc, &kind, NULL);
5986 if (error) {
5987 ops = &tc_ops_other;
5988 } else {
5989 ops = tc_lookup_linux_name(kind);
5990 if (!ops) {
5991 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
5992 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
5993
5994 ops = &tc_ops_other;
5995 }
5996 }
5997 } else if ((!error && !qdisc->size) || error == ENOENT) {
5998 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5999 * set up by some other entity that doesn't have a handle 1:0. We will
6000 * assume that it's the system default qdisc. */
6001 ops = &tc_ops_default;
6002 error = 0;
6003 } else {
6004 /* Who knows? Maybe the device got deleted. */
6005 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
6006 netdev_get_name(netdev_), ovs_strerror(error));
6007 ops = &tc_ops_other;
6008 }
6009
6010 /* Instantiate it. */
6011 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
6012 ovs_assert((load_error == 0) == (netdev->tc != NULL));
6013 ofpbuf_delete(qdisc);
6014
6015 return error ? error : load_error;
6016 }
6017
6018 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
6019 approximate the time to transmit packets of various lengths. For an MTU of
6020 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
6021 represents two possible packet lengths; for an MTU of 513 through 1024, four
6022 possible lengths; and so on.
6023
6024 Returns, for the specified 'mtu', the number of bits that packet lengths
6025 need to be shifted right to fit within such a 256-entry table. */
6026 static int
6027 tc_calc_cell_log(unsigned int mtu)
6028 {
6029 int cell_log;
6030
6031 if (!mtu) {
6032 mtu = ETH_PAYLOAD_MAX;
6033 }
6034 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
6035
6036 for (cell_log = 0; mtu >= 256; cell_log++) {
6037 mtu >>= 1;
6038 }
6039
6040 return cell_log;
6041 }
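/* For example, with the standard Ethernet MTU of 1500, the frame length used
 * is 1500 + ETH_HEADER_LEN + VLAN_HEADER_LEN = 1518 and the loop shifts
 * 1518 -> 759 -> 379 -> 189 before dropping below 256, so tc_calc_cell_log()
 * returns 3: each table cell then covers 8 possible packet lengths and the
 * table as a whole spans frames up to 256 << 3 = 2048 bytes. */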
6042
6043 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
6044 * of 'mtu'. */
6045 static void
6046 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
6047 {
6048 memset(rate, 0, sizeof *rate);
6049 rate->cell_log = tc_calc_cell_log(mtu);
6050 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
6051 /* rate->cell_align = 0; */ /* distro headers. */
6052 rate->mpu = ETH_TOTAL_MIN;
6053 rate->rate = Bps;
6054 }
6055
6056 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
6057 * attribute of the specified "type".
6058 *
6059 * See tc_calc_cell_log() above for a description of "rtab"s. */
6060 void
6061 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
6062 {
6063 uint32_t *rtab;
6064 unsigned int i;
6065
6066 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
6067 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
6068 unsigned packet_size = (i + 1) << rate->cell_log;
6069 if (packet_size < rate->mpu) {
6070 packet_size = rate->mpu;
6071 }
6072 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
6073 }
6074 }
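/* Continuing the cell_log = 3 example above: rtab[i] holds the transmit time,
 * in ticks, of a packet of (i + 1) << 3 bytes (clamped below to the MPU), so
 * the kernel prices a 1518-byte frame with rtab[1518 >> 3] = rtab[189], an
 * entry that was filled for a 190 << 3 = 1520-byte packet. */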
6075
6076 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
6077 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
6078 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
6079 * 0 is fine.) */
6080 static int
6081 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
6082 {
6083 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
6084 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
6085 }
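/* Illustrative numbers, again assuming buffer_hz = 100 and ticks_per_s =
 * 976562: for Bps = 125000 (1 Mbit/s), mtu = 1500, and no user-requested
 * burst, min_burst = 125000 / 100 + 1500 = 2750 bytes, which
 * tc_bytes_to_ticks() converts to 976562 * 2750 / 125000 ~= 21484 ticks. */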
6086 \f
6087 /* Linux-only functions declared in netdev-linux.h */
6088
6089 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
6090 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
6091 int
6092 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
6093 const char *flag_name, bool enable)
6094 {
6095 const char *netdev_name = netdev_get_name(netdev);
6096 struct ethtool_value evalue;
6097 uint32_t new_flags;
6098 int error;
6099
6100 COVERAGE_INC(netdev_get_ethtool);
6101 memset(&evalue, 0, sizeof evalue);
6102 error = netdev_linux_do_ethtool(netdev_name,
6103 (struct ethtool_cmd *)&evalue,
6104 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
6105 if (error) {
6106 return error;
6107 }
6108
6109 COVERAGE_INC(netdev_set_ethtool);
6110 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
6111 if (new_flags == evalue.data) {
6112 return 0;
6113 }
6114 evalue.data = new_flags;
6115 error = netdev_linux_do_ethtool(netdev_name,
6116 (struct ethtool_cmd *)&evalue,
6117 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
6118 if (error) {
6119 return error;
6120 }
6121
6122 COVERAGE_INC(netdev_get_ethtool);
6123 memset(&evalue, 0, sizeof evalue);
6124 error = netdev_linux_do_ethtool(netdev_name,
6125 (struct ethtool_cmd *)&evalue,
6126 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
6127 if (error) {
6128 return error;
6129 }
6130
6131 if (new_flags != evalue.data) {
6132 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
6133 "device %s failed", enable ? "enable" : "disable",
6134 flag_name, netdev_name);
6135 return EOPNOTSUPP;
6136 }
6137
6138 return 0;
6139 }
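/* A minimal usage sketch (the flag and device are illustrative): to turn off
 * large receive offload on the device underlying 'netdev', a caller could do
 *
 *     error = netdev_linux_ethtool_set_flag(netdev, ETH_FLAG_LRO,
 *                                           "ETH_FLAG_LRO", false);
 *
 * where ETH_FLAG_LRO comes from <linux/ethtool.h>. */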
6140 \f
6141 /* Utility functions. */
6142
6143 /* Copies 'src' into 'dst', performing format conversion in the process. */
6144 static void
6145 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
6146 const struct rtnl_link_stats *src)
6147 {
6148 dst->rx_packets = src->rx_packets;
6149 dst->tx_packets = src->tx_packets;
6150 dst->rx_bytes = src->rx_bytes;
6151 dst->tx_bytes = src->tx_bytes;
6152 dst->rx_errors = src->rx_errors;
6153 dst->tx_errors = src->tx_errors;
6154 dst->rx_dropped = src->rx_dropped;
6155 dst->tx_dropped = src->tx_dropped;
6156 dst->multicast = src->multicast;
6157 dst->collisions = src->collisions;
6158 dst->rx_length_errors = src->rx_length_errors;
6159 dst->rx_over_errors = src->rx_over_errors;
6160 dst->rx_crc_errors = src->rx_crc_errors;
6161 dst->rx_frame_errors = src->rx_frame_errors;
6162 dst->rx_fifo_errors = src->rx_fifo_errors;
6163 dst->rx_missed_errors = src->rx_missed_errors;
6164 dst->tx_aborted_errors = src->tx_aborted_errors;
6165 dst->tx_carrier_errors = src->tx_carrier_errors;
6166 dst->tx_fifo_errors = src->tx_fifo_errors;
6167 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
6168 dst->tx_window_errors = src->tx_window_errors;
6169 }
6170
6171 /* Copies 'src' into 'dst', performing format conversion in the process. */
6172 static void
6173 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
6174 const struct rtnl_link_stats64 *src)
6175 {
6176 dst->rx_packets = src->rx_packets;
6177 dst->tx_packets = src->tx_packets;
6178 dst->rx_bytes = src->rx_bytes;
6179 dst->tx_bytes = src->tx_bytes;
6180 dst->rx_errors = src->rx_errors;
6181 dst->tx_errors = src->tx_errors;
6182 dst->rx_dropped = src->rx_dropped;
6183 dst->tx_dropped = src->tx_dropped;
6184 dst->multicast = src->multicast;
6185 dst->collisions = src->collisions;
6186 dst->rx_length_errors = src->rx_length_errors;
6187 dst->rx_over_errors = src->rx_over_errors;
6188 dst->rx_crc_errors = src->rx_crc_errors;
6189 dst->rx_frame_errors = src->rx_frame_errors;
6190 dst->rx_fifo_errors = src->rx_fifo_errors;
6191 dst->rx_missed_errors = src->rx_missed_errors;
6192 dst->tx_aborted_errors = src->tx_aborted_errors;
6193 dst->tx_carrier_errors = src->tx_carrier_errors;
6194 dst->tx_fifo_errors = src->tx_fifo_errors;
6195 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
6196 dst->tx_window_errors = src->tx_window_errors;
6197 }
6198
6199 int
6200 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
6201 {
6202 struct ofpbuf request;
6203 struct ofpbuf *reply;
6204 int error;
6205
6206 /* Mark all counters as unavailable by default; all-ones means "unsupported". */
6207 memset(stats, 0xFF, sizeof(struct netdev_stats));
6208
6209 ofpbuf_init(&request, 0);
6210 nl_msg_put_nlmsghdr(&request,
6211 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
6212 RTM_GETLINK, NLM_F_REQUEST);
6213 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
6214 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
6215 error = nl_transact(NETLINK_ROUTE, &request, &reply);
6216 ofpbuf_uninit(&request);
6217 if (error) {
6218 return error;
6219 }
6220
6221 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
6222 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
6223 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
6224 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
6225 error = 0;
6226 } else {
6227 a = nl_attr_find(reply, 0, IFLA_STATS);
6228 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
6229 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
6230 error = 0;
6231 } else {
6232 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
6233 error = EPROTO;
6234 }
6235 }
6236 } else {
6237 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
6238 error = EPROTO;
6239 }
6240
6242 ofpbuf_delete(reply);
6243 return error;
6244 }
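/* The counters fetched above are the same ones shown by, for example,
 * "ip -s link show dev eth0" (device name illustrative).  IFLA_STATS64 is
 * preferred because the 32-bit IFLA_STATS counters wrap quickly on fast
 * links. */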
6245
6246 static int
6247 get_flags(const struct netdev *dev, unsigned int *flags)
6248 {
6249 struct ifreq ifr;
6250 int error;
6251
6252 *flags = 0;
6253 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
6254 if (!error) {
6255 *flags = ifr.ifr_flags;
6256 }
6257 return error;
6258 }
6259
6260 static int
6261 set_flags(const char *name, unsigned int flags)
6262 {
6263 struct ifreq ifr;
6264
6265 ifr.ifr_flags = flags;
6266 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
6267 }
6268
6269 int
6270 linux_get_ifindex(const char *netdev_name)
6271 {
6272 struct ifreq ifr;
6273 int error;
6274
6275 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6276 COVERAGE_INC(netdev_get_ifindex);
6277
6278 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
6279 if (error) {
6280 /* ENODEV probably means that a vif disappeared asynchronously and
6281 * hasn't been removed from the database yet, so reduce the log level
6282 * to INFO for that case. */
6283 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
6284 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
6285 netdev_name, ovs_strerror(error));
6286 return -error;
6287 }
6288 return ifr.ifr_ifindex;
6289 }
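/* Note the return convention above: a nonnegative result is the ifindex and a
 * negative one is a negated errno value.  A minimal caller sketch, with the
 * device name purely illustrative:
 *
 *     int ifindex = linux_get_ifindex("eth0");
 *     if (ifindex < 0) {
 *         int error = -ifindex;
 *         ...handle 'error' as a positive errno value...
 *     }
 */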
6290
6291 static int
6292 get_ifindex(const struct netdev *netdev_, int *ifindexp)
6293 {
6294 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6295
6296 if (!(netdev->cache_valid & VALID_IFINDEX)) {
6297 netdev_linux_update_via_netlink(netdev);
6298 }
6299
6300 if (!(netdev->cache_valid & VALID_IFINDEX)) {
6301 /* Fall back to ioctl if netlink fails. */
6302 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
6303
6304 if (ifindex < 0) {
6305 netdev->get_ifindex_error = -ifindex;
6306 netdev->ifindex = 0;
6307 } else {
6308 netdev->get_ifindex_error = 0;
6309 netdev->ifindex = ifindex;
6310 }
6311 netdev->cache_valid |= VALID_IFINDEX;
6312 }
6313
6314 *ifindexp = netdev->ifindex;
6315 return netdev->get_ifindex_error;
6316 }
6317
6318 static int
6319 netdev_linux_update_via_netlink(struct netdev_linux *netdev)
6320 {
6321 struct ofpbuf request;
6322 struct ofpbuf *reply;
6323 struct rtnetlink_change chg;
6324 struct rtnetlink_change *change = &chg;
6325 int error;
6326
6327 ofpbuf_init(&request, 0);
6328 nl_msg_put_nlmsghdr(&request,
6329 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ) +
6330 NL_A_U32_SIZE, RTM_GETLINK, NLM_F_REQUEST);
6331 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
6332
6333 /* The correct identifiers for a Linux device are netnsid and ifindex, but
6334 * the ifindex changes when the port is moved to another network namespace,
6335 * whereas it is the interface name that is statically stored in ovsdb. */
6336 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
6337 if (netdev_linux_netnsid_is_remote(netdev)) {
6338 nl_msg_put_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
6339 }
6340 error = nl_transact(NETLINK_ROUTE, &request, &reply);
6341 ofpbuf_uninit(&request);
6342 if (error) {
6343 ofpbuf_delete(reply);
6344 return error;
6345 }
6346
6347 if (rtnetlink_parse(reply, change)
6348 && change->nlmsg_type == RTM_NEWLINK) {
6349 bool changed = false;
6350 error = 0;
6351
6352 /* Update netdev from rtnl msg and increment its seq if needed. */
6353 if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
6354 netdev->carrier_resets++;
6355 changed = true;
6356 }
6357 if (change->ifi_flags != netdev->ifi_flags) {
6358 netdev->ifi_flags = change->ifi_flags;
6359 changed = true;
6360 }
6361 if (change->mtu && change->mtu != netdev->mtu) {
6362 netdev->mtu = change->mtu;
6363 netdev->cache_valid |= VALID_MTU;
6364 netdev->netdev_mtu_error = 0;
6365 changed = true;
6366 }
6367 if (!eth_addr_is_zero(change->mac)
6368 && !eth_addr_equals(change->mac, netdev->etheraddr)) {
6369 netdev->etheraddr = change->mac;
6370 netdev->cache_valid |= VALID_ETHERADDR;
6371 netdev->ether_addr_error = 0;
6372 changed = true;
6373 }
6374 if (change->if_index != netdev->ifindex) {
6375 netdev->ifindex = change->if_index;
6376 netdev->cache_valid |= VALID_IFINDEX;
6377 netdev->get_ifindex_error = 0;
6378 changed = true;
6379 }
6380 if (change->master && netdev_linux_kind_is_lag(change->master)) {
6381 netdev->is_lag_master = true;
6382 }
6383 if (changed) {
6384 netdev_change_seq_changed(&netdev->up);
6385 }
6386 } else {
6387 error = EINVAL;
6388 }
6389
6390 ofpbuf_delete(reply);
6391 return error;
6392 }
6393
6394 static int
6395 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
6396 {
6397 struct ifreq ifr;
6398 int hwaddr_family;
6399 int error;
6400
6401 memset(&ifr, 0, sizeof ifr);
6402 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6403 COVERAGE_INC(netdev_get_hwaddr);
6404 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
6405 if (error) {
6406 /* ENODEV probably means that a vif disappeared asynchronously and
6407 * hasn't been removed from the database yet, so reduce the log level
6408 * to INFO for that case. */
6409 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
6410 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
6411 netdev_name, ovs_strerror(error));
6412 return error;
6413 }
6414 hwaddr_family = ifr.ifr_hwaddr.sa_family;
6415 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
6416 hwaddr_family != ARPHRD_NONE) {
6417 VLOG_INFO("%s device has unknown hardware address family %d",
6418 netdev_name, hwaddr_family);
6419 return EINVAL;
6420 }
6421 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
6422 return 0;
6423 }
6424
6425 static int
6426 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
6427 {
6428 struct ifreq ifr;
6429 int error;
6430
6431 memset(&ifr, 0, sizeof ifr);
6432 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6433 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
6434 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
6435 COVERAGE_INC(netdev_set_hwaddr);
6436 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
6437 if (error) {
6438 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
6439 netdev_name, ovs_strerror(error));
6440 }
6441 return error;
6442 }
6443
6444 static int
6445 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
6446 int cmd, const char *cmd_name)
6447 {
6448 struct ifreq ifr;
6449 int error;
6450
6451 memset(&ifr, 0, sizeof ifr);
6452 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
6453 ifr.ifr_data = (caddr_t) ecmd;
6454
6455 ecmd->cmd = cmd;
6456 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
6457 if (error) {
6458 if (error != EOPNOTSUPP) {
6459 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
6460 "failed: %s", cmd_name, name, ovs_strerror(error));
6461 } else {
6462 /* The device doesn't support this operation. That's pretty
6463 * common, so there's no point in logging anything. */
6464 }
6465 }
6466 return error;
6467 }
6468
6469 /* Returns an AF_PACKET raw socket or a negative errno value. */
6470 static int
6471 af_packet_sock(void)
6472 {
6473 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
6474 static int sock;
6475
6476 if (ovsthread_once_start(&once)) {
6477 sock = socket(AF_PACKET, SOCK_RAW, 0);
6478 if (sock >= 0) {
6479 int error = set_nonblocking(sock);
6480 if (error) {
6481 close(sock);
6482 sock = -error;
6483 } else if (userspace_tso_enabled()) {
6484 int val = 1;
6485 error = setsockopt(sock, SOL_PACKET, PACKET_VNET_HDR, &val,
6486 sizeof val);
6487 if (error) {
6488 error = errno;
6489 VLOG_ERR("failed to enable vnet hdr in raw socket: %s",
6490 ovs_strerror(errno));
6491 close(sock);
6492 sock = -error;
6493 }
6494 }
6495 } else {
6496 sock = -errno;
6497 VLOG_ERR("failed to create packet socket: %s",
6498 ovs_strerror(errno));
6499 }
6500 ovsthread_once_done(&once);
6501 }
6502
6503 return sock;
6504 }
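/* af_packet_sock() uses the same negative-errno convention, so a typical
 * caller looks like this sketch:
 *
 *     int sock = af_packet_sock();
 *     if (sock < 0) {
 *         return -sock;    ...recover the positive errno value...
 *     }
 *     ...use 'sock' with sendmsg() etc....
 */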
6505
6506 static int
6507 netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
6508 {
6509 struct eth_header *eth_hdr;
6510 ovs_be16 eth_type;
6511 int l2_len;
6512
6513 eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
6514 if (!eth_hdr) {
6515 return -EINVAL;
6516 }
6517
6518 l2_len = ETH_HEADER_LEN;
6519 eth_type = eth_hdr->eth_type;
6520 if (eth_type_vlan(eth_type)) {
6521 struct vlan_header *vlan = dp_packet_at(b, l2_len, VLAN_HEADER_LEN);
6522
6523 if (!vlan) {
6524 return -EINVAL;
6525 }
6526
6527 eth_type = vlan->vlan_next_type;
6528 l2_len += VLAN_HEADER_LEN;
6529 }
6530
6531 if (eth_type == htons(ETH_TYPE_IP)) {
6532 struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN);
6533
6534 if (!ip_hdr) {
6535 return -EINVAL;
6536 }
6537
6538 *l4proto = ip_hdr->ip_proto;
6539 dp_packet_hwol_set_tx_ipv4(b);
6540 } else if (eth_type == htons(ETH_TYPE_IPV6)) {
6541 struct ovs_16aligned_ip6_hdr *nh6;
6542
6543 nh6 = dp_packet_at(b, l2_len, IPV6_HEADER_LEN);
6544 if (!nh6) {
6545 return -EINVAL;
6546 }
6547
6548 *l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
6549 dp_packet_hwol_set_tx_ipv6(b);
6550 }
6551
6552 return 0;
6553 }
6554
6555 static int
6556 netdev_linux_parse_vnet_hdr(struct dp_packet *b)
6557 {
6558 struct virtio_net_hdr *vnet = dp_packet_pull(b, sizeof *vnet);
6559 uint16_t l4proto = 0;
6560
6561 if (OVS_UNLIKELY(!vnet)) {
6562 return -EINVAL;
6563 }
6564
6565 if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) {
6566 return 0;
6567 }
6568
6569 if (netdev_linux_parse_l2(b, &l4proto)) {
6570 return -EINVAL;
6571 }
6572
6573 if (vnet->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
6574 if (l4proto == IPPROTO_TCP) {
6575 dp_packet_hwol_set_csum_tcp(b);
6576 } else if (l4proto == IPPROTO_UDP) {
6577 dp_packet_hwol_set_csum_udp(b);
6578 } else if (l4proto == IPPROTO_SCTP) {
6579 dp_packet_hwol_set_csum_sctp(b);
6580 }
6581 }
6582
6583 if (l4proto && vnet->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
6584 uint8_t allowed_mask = VIRTIO_NET_HDR_GSO_TCPV4
6585 | VIRTIO_NET_HDR_GSO_TCPV6
6586 | VIRTIO_NET_HDR_GSO_UDP;
6587 uint8_t type = vnet->gso_type & allowed_mask;
6588
6589 if (type == VIRTIO_NET_HDR_GSO_TCPV4
6590 || type == VIRTIO_NET_HDR_GSO_TCPV6) {
6591 dp_packet_hwol_set_tcp_seg(b);
6592 }
6593 }
6594
6595 return 0;
6596 }
6597
6598 static void
6599 netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu)
6600 {
6601 struct virtio_net_hdr *vnet = dp_packet_push_zeros(b, sizeof *vnet);
6602
6603 if (dp_packet_hwol_is_tso(b)) {
6604 uint16_t hdr_len = ((char *)dp_packet_l4(b) - (char *)dp_packet_eth(b))
6605 + TCP_HEADER_LEN;
6606
6607 vnet->hdr_len = (OVS_FORCE __virtio16)hdr_len;
6608 vnet->gso_size = (OVS_FORCE __virtio16)(mtu - hdr_len);
6609 if (dp_packet_hwol_is_ipv4(b)) {
6610 vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
6611 } else {
6612 vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
6613 }
6615 } else {
6616 vnet->gso_type = VIRTIO_NET_HDR_GSO_NONE;
6617 }
6618
6619 if (dp_packet_hwol_l4_mask(b)) {
6620 vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
6621 vnet->csum_start = (OVS_FORCE __virtio16)((char *)dp_packet_l4(b)
6622 - (char *)dp_packet_eth(b));
6623
6624 if (dp_packet_hwol_l4_is_tcp(b)) {
6625 vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
6626 struct tcp_header, tcp_csum);
6627 } else if (dp_packet_hwol_l4_is_udp(b)) {
6628 vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
6629 struct udp_header, udp_csum);
6630 } else if (dp_packet_hwol_l4_is_sctp(b)) {
6631 vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
6632 struct sctp_header, sctp_csum);
6633 } else {
6634 VLOG_WARN_RL(&rl, "Unsupported L4 protocol");
6635 }
6636 }
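/* A worked example of the header built above, for a TSO IPv4 TCP packet with
 * mtu = 1500 and no IP or TCP options (note that TCP_HEADER_LEN covers only
 * the fixed 20-byte TCP header, so TCP options are not accounted for here):
 *
 *     hdr_len     = 14 (Ethernet) + 20 (IPv4) + 20 (TCP) = 54
 *     gso_size    = 1500 - 54 = 1446
 *     gso_type    = VIRTIO_NET_HDR_GSO_TCPV4
 *     csum_start  = 34 (offset of the TCP header from the Ethernet header)
 *     csum_offset = 16 (offset of tcp_csum within the TCP header)
 *     flags       = VIRTIO_NET_HDR_F_NEEDS_CSUM, since TSO packets also carry
 *                   an L4 checksum offload mark
 */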
6637 }