/* Source: mirror of Open vSwitch (mirror_ovs.git), lib/netdev-linux.c.
 * Scraped near commit "netdev-offload-tc: Use single 'once' variable for
 * probing tc features". */
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20 #include "netdev-linux-private.h"
21
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <sys/types.h>
25 #include <netinet/in.h>
26 #include <arpa/inet.h>
27 #include <inttypes.h>
28 #include <math.h>
29 #include <linux/filter.h>
30 #include <linux/gen_stats.h>
31 #include <linux/if_ether.h>
32 #include <linux/if_packet.h>
33 #include <linux/if_tun.h>
34 #include <linux/types.h>
35 #include <linux/ethtool.h>
36 #include <linux/mii.h>
37 #include <linux/rtnetlink.h>
38 #include <linux/sockios.h>
39 #include <linux/virtio_net.h>
40 #include <sys/ioctl.h>
41 #include <sys/socket.h>
42 #include <sys/uio.h>
43 #include <sys/utsname.h>
44 #include <net/if.h>
45 #include <net/if_arp.h>
46 #include <net/route.h>
47 #include <poll.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <unistd.h>
51
52 #include "coverage.h"
53 #include "dp-packet.h"
54 #include "dpif-netlink.h"
55 #include "dpif-netdev.h"
56 #include "openvswitch/dynamic-string.h"
57 #include "fatal-signal.h"
58 #include "hash.h"
59 #include "openvswitch/hmap.h"
60 #include "netdev-afxdp.h"
61 #include "netdev-provider.h"
62 #include "netdev-vport.h"
63 #include "netlink-notifier.h"
64 #include "netlink-socket.h"
65 #include "netlink.h"
66 #include "netnsid.h"
67 #include "openvswitch/ofpbuf.h"
68 #include "openflow/openflow.h"
69 #include "ovs-atomic.h"
70 #include "ovs-numa.h"
71 #include "packets.h"
72 #include "openvswitch/poll-loop.h"
73 #include "rtnetlink.h"
74 #include "openvswitch/shash.h"
75 #include "socket-util.h"
76 #include "sset.h"
77 #include "tc.h"
78 #include "timer.h"
79 #include "unaligned.h"
80 #include "openvswitch/vlog.h"
81 #include "userspace-tso.h"
82 #include "util.h"
83
84 VLOG_DEFINE_THIS_MODULE(netdev_linux);
85
86 COVERAGE_DEFINE(netdev_set_policing);
87 COVERAGE_DEFINE(netdev_arp_lookup);
88 COVERAGE_DEFINE(netdev_get_ifindex);
89 COVERAGE_DEFINE(netdev_get_hwaddr);
90 COVERAGE_DEFINE(netdev_set_hwaddr);
91 COVERAGE_DEFINE(netdev_get_ethtool);
92 COVERAGE_DEFINE(netdev_set_ethtool);
93
94 \f
95 #ifndef IFLA_IF_NETNSID
96 #define IFLA_IF_NETNSID 0x45
97 #endif
98 /* These were introduced in Linux 2.6.14, so they might be missing if we have
99 * old headers. */
100 #ifndef ADVERTISED_Pause
101 #define ADVERTISED_Pause (1 << 13)
102 #endif
103 #ifndef ADVERTISED_Asym_Pause
104 #define ADVERTISED_Asym_Pause (1 << 14)
105 #endif
106
107 /* These were introduced in Linux 2.6.24, so they might be missing if we
108 * have old headers. */
109 #ifndef ETHTOOL_GFLAGS
110 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
111 #endif
112 #ifndef ETHTOOL_SFLAGS
113 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
114 #endif
115
116 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
117 * headers. */
118 #ifndef TC_RTAB_SIZE
119 #define TC_RTAB_SIZE 1024
120 #endif
121
122 /* Linux 2.6.21 introduced struct tpacket_auxdata.
123 * Linux 2.6.27 added the tp_vlan_tci member.
124 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
125 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
126 * TP_STATUS_VLAN_TPID_VALID.
127 *
128 * With all this churn it's easiest to unconditionally define a replacement
129 * structure that has everything we want.
130 */
131 #ifndef PACKET_AUXDATA
132 #define PACKET_AUXDATA 8
133 #endif
134 #ifndef TP_STATUS_VLAN_VALID
135 #define TP_STATUS_VLAN_VALID (1 << 4)
136 #endif
137 #ifndef TP_STATUS_VLAN_TPID_VALID
138 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
139 #endif
/* Replacement for the kernel's 'struct tpacket_auxdata' (the PACKET_AUXDATA
 * control-message payload); see the history note above for why it is defined
 * unconditionally. */
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;     /* TP_STATUS_* bits, e.g. TP_STATUS_VLAN_VALID. */
    uint32_t tp_len;        /* Original packet length. */
    uint32_t tp_snaplen;    /* Captured (snapshot) length. */
    uint16_t tp_mac;        /* Offset of link-layer header in the buffer. */
    uint16_t tp_net;        /* Offset of network-layer header in the buffer. */
    uint16_t tp_vlan_tci;   /* VLAN TCI; valid if TP_STATUS_VLAN_VALID. */
    uint16_t tp_vlan_tpid;  /* VLAN TPID; valid if TP_STATUS_VLAN_TPID_VALID. */
};
151
152 /* Linux 2.6.27 introduced ethtool_cmd_speed
153 *
154 * To avoid revisiting problems reported with using configure to detect
155 * compatibility (see report at
156 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
157 * unconditionally replace ethtool_cmd_speed. */
158 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Returns the full 32-bit link speed encoded in 'ep', combining the split
 * 16-bit 'speed' (low half) and 'speed_hi' (high half) fields. */
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    uint32_t hi = ep->speed_hi;
    uint32_t lo = ep->speed;

    return (hi << 16) | lo;
}
163
164 /* Linux 2.6.30 introduced supported and advertised flags for
165 * 1G base KX, and 10G base KX4, KR and R. */
166 #ifndef SUPPORTED_1000baseKX_Full
167 #define SUPPORTED_1000baseKX_Full (1 << 17)
168 #define SUPPORTED_10000baseKX4_Full (1 << 18)
169 #define SUPPORTED_10000baseKR_Full (1 << 19)
170 #define SUPPORTED_10000baseR_FEC (1 << 20)
171 #define ADVERTISED_1000baseKX_Full (1 << 17)
172 #define ADVERTISED_10000baseKX4_Full (1 << 18)
173 #define ADVERTISED_10000baseKR_Full (1 << 19)
174 #define ADVERTISED_10000baseR_FEC (1 << 20)
175 #endif
176
177 /* Linux 3.5 introduced supported and advertised flags for
178 * 40G base KR4, CR4, SR4 and LR4. */
179 #ifndef SUPPORTED_40000baseKR4_Full
180 #define SUPPORTED_40000baseKR4_Full (1 << 23)
181 #define SUPPORTED_40000baseCR4_Full (1 << 24)
182 #define SUPPORTED_40000baseSR4_Full (1 << 25)
183 #define SUPPORTED_40000baseLR4_Full (1 << 26)
184 #define ADVERTISED_40000baseKR4_Full (1 << 23)
185 #define ADVERTISED_40000baseCR4_Full (1 << 24)
186 #define ADVERTISED_40000baseSR4_Full (1 << 25)
187 #define ADVERTISED_40000baseLR4_Full (1 << 26)
188 #endif
189
190 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
191 *
192 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
193 * 2.6.32-431.29.2.el6.x86_64 (see report at
194 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
195 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
196 * unconditionally define a replacement. */
197 #ifndef IFLA_STATS64
198 #define IFLA_STATS64 23
199 #endif
#define rtnl_link_stats64 rpl_rtnl_link_stats64
/* Layout of the IFLA_STATS64 rtnetlink attribute payload. */
struct rtnl_link_stats64 {
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Detailed rx_errors. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Detailed tx_errors. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    /* Compression counters. */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
229
230 /* Linux 3.19 introduced virtio_types.h. It might be missing
231 * if we are using old kernel. */
232 #ifndef HAVE_VIRTIO_TYPES
233 typedef __u16 __bitwise__ __virtio16;
234 typedef __u32 __bitwise__ __virtio32;
235 typedef __u64 __bitwise__ __virtio64;
236 #endif
237
/* Bits for the 'cache_valid' bitmap in 'struct netdev_linux': each set bit
 * records that the correspondingly named cached value is up to date.  Bits
 * are cleared to invalidate caches (see netdev_linux_changed()). */
enum {
    VALID_IFINDEX = 1 << 0,
    VALID_ETHERADDR = 1 << 1,
    VALID_IN = 1 << 2,              /* Cached IP addresses; clearing this also
                                     * flushes the global address list. */
    VALID_MTU = 1 << 3,
    VALID_POLICING = 1 << 4,
    VALID_VPORT_STAT_ERROR = 1 << 5,
    VALID_DRVINFO = 1 << 6,
    VALID_FEATURES = 1 << 7,
    VALID_NUMA_ID = 1 << 8,
};
249
250 /* Use one for the packet buffer and another for the aux buffer to receive
251 * TSO packets. */
252 #define IOV_STD_SIZE 1
253 #define IOV_TSO_SIZE 2
254
/* Indexes into the receive iovec array: the packet buffer itself, and the
 * auxiliary buffer used when receiving TSO packets (see IOV_TSO_SIZE). */
enum {
    IOV_PACKET = 0,
    IOV_AUXBUF = 1,
};
259 \f
/* A LAG (bond/team) member interface whose TC ingress traffic is bound to
 * the shared block of its primary OVS device. */
struct linux_lag_member {
    uint32_t block_id;          /* TC ingress block shared with the primary. */
    struct shash_node *node;    /* This member's node in 'lag_shash'. */
};
264
265 /* Protects 'lag_shash' and the mutable members of struct linux_lag_member. */
266 static struct ovs_mutex lag_mutex = OVS_MUTEX_INITIALIZER;
267
268 /* All members whose LAG primary interfaces are OVS network devices. */
269 static struct shash lag_shash OVS_GUARDED_BY(lag_mutex)
270 = SHASH_INITIALIZER(&lag_shash);
271
272 /* Traffic control. */
273
/* An instance of a traffic control class.  Always associated with a particular
 * network device.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc {
    const struct tc_ops *ops;   /* Behavior of this TC class. */
    struct hmap queues;         /* Contains "struct tc_queue"s.
                                 * Read by generic TC layer.
                                 * Written only by TC implementation. */
};

/* Static initializer equivalent to tc_init(TC, OPS). */
#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
287
/* One traffic control queue.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc_queue {
    struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
    unsigned int queue_id;      /* OpenFlow queue ID. */
    long long int created;      /* Time queue was created, in msecs. */
};
297
/* A particular kind of traffic control.  Each implementation generally maps to
 * one particular Linux qdisc class.
 *
 * The functions below return 0 if successful or a positive errno value on
 * failure, except where otherwise noted.  All of them must be provided, except
 * where otherwise noted. */
struct tc_ops {
    /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
     * This is null for tc_ops_default and tc_ops_other, for which there are no
     * appropriate values. */
    const char *linux_name;

    /* Name used in OVS database, e.g. "linux-htb".  Must be nonnull. */
    const char *ovs_name;

    /* Number of supported OpenFlow queues, 0 for qdiscs that have no
     * queues.  The queues are numbered 0 through n_queues - 1. */
    unsigned int n_queues;

    /* Called to install this TC class on 'netdev'.  The implementation should
     * make the Netlink calls required to set up 'netdev' with the right qdisc
     * and configure it according to 'details'.  The implementation may assume
     * that the current qdisc is the default; that is, there is no need for it
     * to delete the current qdisc before installing itself.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'.
     *
     * (This function is null for tc_ops_other, which cannot be installed.  For
     * other TC classes it should always be nonnull.) */
    int (*tc_install)(struct netdev *netdev, const struct smap *details);

    /* Called when the netdev code determines (through a Netlink query) that
     * this TC class's qdisc is installed on 'netdev', but we didn't install
     * it ourselves and so don't know any of the details.
     *
     * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
     * 'netdev'.  The TCA_KIND attribute of 'nlmsg' is 'linux_name'.  The
     * implementation should parse the other attributes of 'nlmsg' as
     * necessary to determine its configuration.  If necessary it should also
     * use Netlink queries to determine the configuration of queues on
     * 'netdev'.
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'. */
    int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);

    /* Destroys the data structures allocated by the implementation as part of
     * 'tc'.  (This includes destroying 'tc->queues' by calling
     * tc_destroy(tc).
     *
     * The implementation should not need to perform any Netlink calls.  If
     * desirable, the caller is responsible for deconfiguring the kernel qdisc.
     * (But it may not be desirable.)
     *
     * This function may be null if 'tc' is trivial. */
    void (*tc_destroy)(struct tc *tc);

    /* Retrieves details of 'netdev->tc' configuration into 'details'.
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the configuration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_get)(const struct netdev *netdev, struct smap *details);

    /* Reconfigures 'netdev->tc' according to 'details', performing any
     * required Netlink calls to complete the reconfiguration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_set)(struct netdev *, const struct smap *details);

    /* Retrieves details of 'queue' on 'netdev->tc' into 'details'.  'queue' is
     * one of the 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the queue configuration.
     *
     * This function may be null if 'tc' does not have queues ('n_queues' is
     * 0). */
    int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
                     struct smap *details);

    /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
     * 'details', performing any required Netlink calls to complete the
     * reconfiguration.  The caller ensures that 'queue_id' is less than
     * 'n_queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' does not have queues or its queues are
     * not configurable. */
    int (*class_set)(struct netdev *, unsigned int queue_id,
                     const struct smap *details);

    /* Deletes 'queue' from 'netdev->tc'.  'queue' is one of the 'struct
     * tc_queue's within 'netdev->tc->queues'.
     *
     * This function may be null if 'tc' does not have queues or its queues
     * cannot be deleted. */
    int (*class_delete)(struct netdev *, struct tc_queue *queue);

    /* Obtains stats for 'queue' from 'netdev->tc'.  'queue' is one of the
     * 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * On success, initializes '*stats'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_get_stats)(const struct netdev *netdev,
                           const struct tc_queue *queue,
                           struct netdev_queue_stats *stats);

    /* Extracts queue stats from 'nlmsg', which is a response to a
     * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_dump_stats)(const struct netdev *netdev,
                            const struct ofpbuf *nlmsg,
                            netdev_dump_queue_stats_cb *cb, void *aux);
};
442
/* Initializes the generic part of 'tc' to use 'ops'.  TC implementations
 * call this from their subclass constructors. */
static void
tc_init(struct tc *tc, const struct tc_ops *ops)
{
    tc->ops = ops;
    hmap_init(&tc->queues);
}
449
/* Destroys the generic part of 'tc' (the 'queues' hmap).  Does not free 'tc'
 * itself; the TC implementation owns the enclosing structure. */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
455
456 static const struct tc_ops tc_ops_htb;
457 static const struct tc_ops tc_ops_hfsc;
458 static const struct tc_ops tc_ops_codel;
459 static const struct tc_ops tc_ops_fqcodel;
460 static const struct tc_ops tc_ops_sfq;
461 static const struct tc_ops tc_ops_netem;
462 static const struct tc_ops tc_ops_default;
463 static const struct tc_ops tc_ops_noop;
464 static const struct tc_ops tc_ops_other;
465
466 static const struct tc_ops *const tcs[] = {
467 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
468 &tc_ops_hfsc, /* Hierarchical fair service curve. */
469 &tc_ops_codel, /* Controlled delay */
470 &tc_ops_fqcodel, /* Fair queue controlled delay */
471 &tc_ops_sfq, /* Stochastic fair queueing */
472 &tc_ops_netem, /* Network Emulator */
473 &tc_ops_noop, /* Non operating qos type. */
474 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
475 &tc_ops_other, /* Some other qdisc. */
476 NULL
477 };
478
479 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
480 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
481 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
482 static uint32_t tc_time_to_ticks(uint32_t time);
483
484 static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
485 int type,
486 unsigned int flags,
487 struct ofpbuf *);
488 static int tc_add_policer(struct netdev *,
489 uint32_t kbits_rate, uint32_t kbits_burst);
490
491 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
492 struct nlattr **options);
493 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
494 struct nlattr **options,
495 struct netdev_queue_stats *);
496 static int tc_query_class(const struct netdev *,
497 unsigned int handle, unsigned int parent,
498 struct ofpbuf **replyp);
499 static int tc_delete_class(const struct netdev *, unsigned int handle);
500
501 static int tc_del_qdisc(struct netdev *netdev);
502 static int tc_query_qdisc(const struct netdev *netdev);
503
504 void
505 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate);
506 static int tc_calc_cell_log(unsigned int mtu);
507 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
508 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
509 \f
510
511 /* This is set pretty low because we probably won't learn anything from the
512 * additional log messages. */
513 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
514
515 /* Polling miimon status for all ports causes performance degradation when
516 * handling a large number of ports. If there are no devices using miimon, then
517 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
518 *
519 * Readers do not depend on this variable synchronizing with the related
520 * changes in the device miimon status, so we can use atomic_count. */
521 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
522
523 static int netdev_linux_parse_vnet_hdr(struct dp_packet *b);
524 static void netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu);
525 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
526 int cmd, const char *cmd_name);
527 static int get_flags(const struct netdev *, unsigned int *flags);
528 static int set_flags(const char *, unsigned int flags);
529 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
530 enum netdev_flags on, enum netdev_flags *old_flagsp)
531 OVS_REQUIRES(netdev->mutex);
532 static int get_ifindex(const struct netdev *, int *ifindexp);
533 static int do_set_addr(struct netdev *netdev,
534 int ioctl_nr, const char *ioctl_name,
535 struct in_addr addr);
536 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
537 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
538 static int af_packet_sock(void);
539 static bool netdev_linux_miimon_enabled(void);
540 static void netdev_linux_miimon_run(void);
541 static void netdev_linux_miimon_wait(void);
542 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
543
544 static bool
545 is_tap_netdev(const struct netdev *netdev)
546 {
547 return netdev_get_class(netdev) == &netdev_tap_class;
548 }
549 \f
/* Queries the datapath vport layer for the network namespace id of 'netdev'
 * and caches it in 'netdev->netnsid'.  Returns 0 on success or a positive
 * errno value on failure. */
static int
netdev_linux_netnsid_update__(struct netdev_linux *netdev)
{
    struct dpif_netlink_vport reply;
    struct ofpbuf *buf;
    int error;

    error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
    if (error) {
        if (error == ENOENT) {
            /* Assume it is local if there is no API (e.g. if the openvswitch
             * kernel module is not loaded). */
            netnsid_set_local(&netdev->netnsid);
        } else {
            netnsid_unset(&netdev->netnsid);
        }
        return error;
    }

    netnsid_set(&netdev->netnsid, reply.netnsid);
    ofpbuf_delete(buf);
    return 0;
}
573
574 static int
575 netdev_linux_netnsid_update(struct netdev_linux *netdev)
576 {
577 if (netnsid_is_unset(netdev->netnsid)) {
578 if (netdev_get_class(&netdev->up) == &netdev_tap_class) {
579 netnsid_set_local(&netdev->netnsid);
580 } else {
581 return netdev_linux_netnsid_update__(netdev);
582 }
583 }
584
585 return 0;
586 }
587
/* Returns true if 'netdev' is in the network namespace identified by 'nsid',
 * refreshing the cached netnsid first if necessary. */
static bool
netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_eq(netdev->netnsid, nsid);
}
594
/* Returns true if 'netdev' lives in a network namespace other than the one
 * this process runs in, refreshing the cached netnsid first if necessary. */
static bool
netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_is_remote(netdev->netnsid);
}
601
602 static int netdev_linux_update_via_netlink(struct netdev_linux *);
603 static void netdev_linux_update(struct netdev_linux *netdev, int,
604 const struct rtnetlink_change *)
605 OVS_REQUIRES(netdev->mutex);
606 static void netdev_linux_changed(struct netdev_linux *netdev,
607 unsigned int ifi_flags, unsigned int mask)
608 OVS_REQUIRES(netdev->mutex);
609
610 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
611 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
612 * if no such socket could be created. */
613 static struct nl_sock *
614 netdev_linux_notify_sock(void)
615 {
616 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
617 static struct nl_sock *sock;
618 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
619 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
620
621 if (ovsthread_once_start(&once)) {
622 int error;
623
624 error = nl_sock_create(NETLINK_ROUTE, &sock);
625 if (!error) {
626 size_t i;
627
628 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
629 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
630 if (error) {
631 nl_sock_destroy(sock);
632 sock = NULL;
633 break;
634 }
635 }
636 }
637 nl_sock_listen_all_nsid(sock, true);
638 ovsthread_once_done(&once);
639 }
640
641 return sock;
642 }
643
/* Returns true if at least one device currently uses miimon link monitoring,
 * i.e. netdev_linux_miimon_run()/netdev_linux_miimon_wait() have work to do.
 * Reads 'miimon_cnt' atomically, so no lock is needed. */
static bool
netdev_linux_miimon_enabled(void)
{
    return atomic_count_get(&miimon_cnt) > 0;
}
649
/* Returns true if 'kind' names a Linux link-aggregation device type
 * ("bond" or "team"), false otherwise. */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    static const char *const lag_kinds[] = { "bond", "team" };

    for (size_t i = 0; i < sizeof lag_kinds / sizeof lag_kinds[0]; i++) {
        if (!strcmp(kind, lag_kinds[i])) {
            return true;
        }
    }

    return false;
}
659
/* Processes a rtnetlink link change for possible LAG membership.  If
 * 'change' reports an interface enslaved to a bond/team whose primary is an
 * OVS Linux netdev, binds the member's TC ingress to the primary's shared
 * block so offloaded flows also match member traffic; if the interface left
 * its LAG, removes that binding.  Caller holds 'lag_mutex'. */
static void
netdev_linux_update_lag(struct rtnetlink_change *change)
    OVS_REQUIRES(lag_mutex)
{
    struct linux_lag_member *lag;

    /* 'change->sub' carries the enslaved-device kind string; only bond/team
     * members are of interest here. */
    if (change->sub && netdev_linux_kind_is_lag(change->sub)) {
        lag = shash_find_data(&lag_shash, change->ifname);

        if (!lag) {
            struct netdev *master_netdev;
            char master_name[IFNAMSIZ];
            uint32_t block_id;
            int error = 0;

            /* Resolve the LAG primary ("master") device by its ifindex. */
            if_indextoname(change->master_ifindex, master_name);
            master_netdev = netdev_from_name(master_name);
            if (!master_netdev) {
                return;
            }

            if (is_netdev_linux_class(master_netdev->netdev_class)) {
                block_id = netdev_get_block_id(master_netdev);
                if (!block_id) {
                    netdev_close(master_netdev);
                    return;
                }

                lag = xmalloc(sizeof *lag);
                lag->block_id = block_id;
                lag->node = shash_add(&lag_shash, change->ifname, lag);

                /* Delete any pre-existing ingress qdisc so the add below
                 * starts from a clean state. */
                tc_add_del_qdisc(change->if_index, false, 0, TC_INGRESS);
                /* LAG master is linux netdev so add member to same block. */
                error = tc_add_del_qdisc(change->if_index, true, block_id,
                                         TC_INGRESS);
                if (error) {
                    VLOG_WARN("failed to bind LAG member %s to "
                              "primary's block", change->ifname);
                    shash_delete(&lag_shash, lag->node);
                    free(lag);
                }
            }

            netdev_close(master_netdev);
        }
    } else if (change->master_ifindex == 0) {
        /* Check if this was a lag member that has been removed. */
        lag = shash_find_data(&lag_shash, change->ifname);

        if (lag) {
            tc_add_del_qdisc(change->if_index, false, lag->block_id,
                             TC_INGRESS);
            shash_delete(&lag_shash, lag->node);
            free(lag);
        }
    }
}
719
720 void
721 netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
722 {
723 struct nl_sock *sock;
724 int error;
725
726 if (netdev_linux_miimon_enabled()) {
727 netdev_linux_miimon_run();
728 }
729
730 sock = netdev_linux_notify_sock();
731 if (!sock) {
732 return;
733 }
734
735 do {
736 uint64_t buf_stub[4096 / 8];
737 int nsid;
738 struct ofpbuf buf;
739
740 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
741 error = nl_sock_recv(sock, &buf, &nsid, false);
742 if (!error) {
743 struct rtnetlink_change change;
744
745 if (rtnetlink_parse(&buf, &change)) {
746 struct netdev *netdev_ = NULL;
747 char dev_name[IFNAMSIZ];
748
749 if (!change.ifname) {
750 change.ifname = if_indextoname(change.if_index, dev_name);
751 }
752
753 if (change.ifname) {
754 netdev_ = netdev_from_name(change.ifname);
755 }
756 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
757 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
758
759 ovs_mutex_lock(&netdev->mutex);
760 netdev_linux_update(netdev, nsid, &change);
761 ovs_mutex_unlock(&netdev->mutex);
762 }
763
764 if (change.ifname &&
765 rtnetlink_type_is_rtnlgrp_link(change.nlmsg_type)) {
766
767 /* Need to try updating the LAG information. */
768 ovs_mutex_lock(&lag_mutex);
769 netdev_linux_update_lag(&change);
770 ovs_mutex_unlock(&lag_mutex);
771 }
772 netdev_close(netdev_);
773 }
774 } else if (error == ENOBUFS) {
775 struct shash device_shash;
776 struct shash_node *node;
777
778 nl_sock_drain(sock);
779
780 shash_init(&device_shash);
781 netdev_get_devices(&netdev_linux_class, &device_shash);
782 SHASH_FOR_EACH (node, &device_shash) {
783 struct netdev *netdev_ = node->data;
784 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
785 unsigned int flags;
786
787 ovs_mutex_lock(&netdev->mutex);
788 get_flags(netdev_, &flags);
789 netdev_linux_changed(netdev, flags, 0);
790 ovs_mutex_unlock(&netdev->mutex);
791
792 netdev_close(netdev_);
793 }
794 shash_destroy(&device_shash);
795 } else if (error != EAGAIN) {
796 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
797 VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
798 ovs_strerror(error));
799 }
800 ofpbuf_uninit(&buf);
801 } while (!error);
802 }
803
804 static void
805 netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
806 {
807 struct nl_sock *sock;
808
809 if (netdev_linux_miimon_enabled()) {
810 netdev_linux_miimon_wait();
811 }
812 sock = netdev_linux_notify_sock();
813 if (sock) {
814 nl_sock_wait(sock, POLLIN);
815 }
816 }
817
/* Records that 'dev' changed: bumps the netdev change sequence, counts a
 * carrier reset if IFF_RUNNING toggled, stores the new interface flags, and
 * invalidates every cached field whose bit is clear in 'mask'.  When the IP
 * address cache is invalidated, the global address list is flushed too. */
static void
netdev_linux_changed(struct netdev_linux *dev,
                     unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(dev->mutex)
{
    netdev_change_seq_changed(&dev->up);

    if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
        dev->carrier_resets++;
    }
    dev->ifi_flags = ifi_flags;

    dev->cache_valid &= mask;
    if (!(mask & VALID_IN)) {
        netdev_get_addrs_list_flush();
    }
}
835
/* Applies a parsed rtnetlink 'change' to 'dev'.
 *
 * RTM_NEWLINK: refreshes the cached flags, MTU, MAC address, ifindex and
 * LAG-primary status from the message.  Other link messages (RTM_DELLINK):
 * marks the device absent.  Address-group messages: invalidates the cached
 * IP addresses. */
static void
netdev_linux_update__(struct netdev_linux *dev,
                      const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, ip addresses, and NUMA id. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN | VALID_NUMA_ID);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;

                /* The mac addr has been changed, report it now. */
                rtnetlink_report_link();
            }

            if (change->primary && netdev_linux_kind_is_lag(change->primary)) {
                dev->is_lag_master = true;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
            dev->present = true;
        } else {
            /* FIXME */
            netdev_linux_changed(dev, change->ifi_flags, 0);
            dev->present = false;
            netnsid_unset(&dev->netnsid);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        OVS_NOT_REACHED();
    }
}
884
/* Applies 'change' to 'dev', but only if the message originated in the same
 * network namespace ('nsid') that 'dev' belongs to; cross-namespace events
 * for unrelated devices with the same name are ignored. */
static void
netdev_linux_update(struct netdev_linux *dev, int nsid,
                    const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (netdev_linux_netnsid_is_eq(dev, nsid)) {
        netdev_linux_update__(dev, change);
    }
}
894
/* netdev_class 'alloc' callback: allocates a zero-initialized
 * 'struct netdev_linux' and returns its embedded 'struct netdev'. */
static struct netdev *
netdev_linux_alloc(void)
{
    struct netdev_linux *netdev = xzalloc(sizeof *netdev);
    return &netdev->up;
}
901
/* Initialization shared by all Linux netdev kinds (system, internal, tap):
 * rejects reserved device names, initializes the mutex and netnsid, and
 * advertises Tx offload flags when userspace TSO is enabled.  Returns 0 on
 * success or EINVAL for a forbidden name. */
static int
netdev_linux_common_construct(struct netdev *netdev_)
{
    /* Prevent any attempt to create (or open) a network device named "default"
     * or "all".  These device names are effectively reserved on Linux because
     * /proc/sys/net/ipv4/conf/ always contains directories by these names.  By
     * itself this wouldn't call for any special treatment, but in practice if
     * a program tries to create devices with these names, it causes the kernel
     * to fire a "new device" notification event even though creation failed,
     * and in turn that causes OVS to wake up and try to create them again,
     * which ends up as a 100% CPU loop. */
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *name = netdev_->name;
    if (!strcmp(name, "default") || !strcmp(name, "all")) {
        static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
        VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
                     name);
        return EINVAL;
    }

    /* The device could be in the same network namespace or in another one. */
    netnsid_unset(&netdev->netnsid);
    ovs_mutex_init(&netdev->mutex);

    if (userspace_tso_enabled()) {
        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
    }

    return 0;
}
936
/* Creates system and internal devices.  Performs the common construction,
 * then snapshots the kernel interface flags; a missing kernel device is
 * fatal (ENODEV) except for "internal" netdevs, which are created in the
 * kernel only later via dpif_port_add(). */
int
netdev_linux_construct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = netdev_linux_common_construct(netdev_);
    if (error) {
        return error;
    }

    error = get_flags(&netdev->up, &netdev->ifi_flags);
    if (error == ENODEV) {
        if (netdev->up.netdev_class != &netdev_internal_class) {
            /* The device does not exist, so don't allow it to be opened. */
            return ENODEV;
        } else {
            /* "Internal" netdevs have to be created as netdev objects before
             * they exist in the kernel, because creating them in the kernel
             * happens by passing a netdev object to dpif_port_add().
             * Therefore, ignore the error. */
        }
    }

    return 0;
}
962
963 /* For most types of netdevs we open the device for each call of
964 * netdev_open(). However, this is not the case with tap devices,
965 * since it is only possible to open the device once. In this
966 * situation we share a single file descriptor, and consequently
967 * buffers, across all readers. Therefore once data is read it will
968 * be unavailable to other reads for tap devices. */
969 static int
970 netdev_linux_construct_tap(struct netdev *netdev_)
971 {
972 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
973 static const char tap_dev[] = "/dev/net/tun";
974 const char *name = netdev_->name;
975 struct ifreq ifr;
976
977 int error = netdev_linux_common_construct(netdev_);
978 if (error) {
979 return error;
980 }
981
982 /* Open tap device. */
983 netdev->tap_fd = open(tap_dev, O_RDWR);
984 if (netdev->tap_fd < 0) {
985 error = errno;
986 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
987 return error;
988 }
989
990 /* Create tap device. */
991 get_flags(&netdev->up, &netdev->ifi_flags);
992 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
993 if (userspace_tso_enabled()) {
994 ifr.ifr_flags |= IFF_VNET_HDR;
995 }
996
997 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
998 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
999 VLOG_WARN("%s: creating tap device failed: %s", name,
1000 ovs_strerror(errno));
1001 error = errno;
1002 goto error_close;
1003 }
1004
1005 /* Make non-blocking. */
1006 error = set_nonblocking(netdev->tap_fd);
1007 if (error) {
1008 goto error_close;
1009 }
1010
1011 if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
1012 VLOG_WARN("%s: creating tap device failed (persist): %s", name,
1013 ovs_strerror(errno));
1014 error = errno;
1015 goto error_close;
1016 }
1017
1018 if (userspace_tso_enabled()) {
1019 /* Old kernels don't support TUNSETOFFLOAD. If TUNSETOFFLOAD is
1020 * available, it will return EINVAL when a flag is unknown.
1021 * Therefore, try enabling offload with no flags to check
1022 * if TUNSETOFFLOAD support is available or not. */
1023 if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, 0) == 0 || errno != EINVAL) {
1024 unsigned long oflags = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6;
1025
1026 if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, oflags) == -1) {
1027 VLOG_WARN("%s: enabling tap offloading failed: %s", name,
1028 ovs_strerror(errno));
1029 error = errno;
1030 goto error_close;
1031 }
1032 }
1033 }
1034
1035 netdev->present = true;
1036 return 0;
1037
1038 error_close:
1039 close(netdev->tap_fd);
1040 return error;
1041 }
1042
1043 static void
1044 netdev_linux_destruct(struct netdev *netdev_)
1045 {
1046 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1047
1048 if (netdev->tc && netdev->tc->ops->tc_destroy) {
1049 netdev->tc->ops->tc_destroy(netdev->tc);
1050 }
1051
1052 if (netdev_get_class(netdev_) == &netdev_tap_class
1053 && netdev->tap_fd >= 0)
1054 {
1055 ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
1056 close(netdev->tap_fd);
1057 }
1058
1059 if (netdev->miimon_interval > 0) {
1060 atomic_count_dec(&miimon_cnt);
1061 }
1062
1063 ovs_mutex_destroy(&netdev->mutex);
1064 }
1065
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    /* Frees the structure allocated by netdev_linux_alloc(). */
    free(netdev_linux_cast(netdev_));
}
1072
1073 static struct netdev_rxq *
1074 netdev_linux_rxq_alloc(void)
1075 {
1076 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
1077 return &rx->up;
1078 }
1079
/* Sets up the receive side of 'rxq_'.  A tap device's queue shares the
 * netdev's single tap fd; any other device gets a dedicated AF_PACKET raw
 * socket bound to the device.  Returns 0 if successful, otherwise a
 * positive errno value. */
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        /* Tap devices can only be opened once, so every queue shares the
         * netdev's fd. */
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
            { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        /* Request VLAN tag info as ancillary (auxdata) cmsgs; the receive
         * path uses it to reinsert the tag into the frame. */
        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* With userspace TSO, each frame is prefixed by a virtio_net_hdr
         * carrying its offload state. */
        if (userspace_tso_enabled()
            && setsockopt(rx->fd, SOL_PACKET, PACKET_VNET_HDR, &val,
                          sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to enable vnet hdr in txq raw socket: %s",
                     netdev_get_name(netdev_), ovs_strerror(errno));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            /* setsockopt() returned -1; fetch the real error code. */
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1176
1177 static void
1178 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
1179 {
1180 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1181 int i;
1182
1183 if (!rx->is_tap) {
1184 close(rx->fd);
1185 }
1186
1187 for (i = 0; i < NETDEV_MAX_BURST; i++) {
1188 dp_packet_delete(rx->aux_bufs[i]);
1189 }
1190 }
1191
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    /* Frees the structure allocated by netdev_linux_rxq_alloc(). */
    free(netdev_rxq_linux_cast(rxq_));
}
1199
1200 static ovs_be16
1201 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
1202 {
1203 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1204 return htons(aux->tp_vlan_tpid);
1205 } else if (double_tagged) {
1206 return htons(ETH_TYPE_VLAN_8021AD);
1207 } else {
1208 return htons(ETH_TYPE_VLAN_8021Q);
1209 }
1210 }
1211
/* Returns true if 'aux' describes a received VLAN tag: either the kernel
 * flagged the auxdata as carrying VLAN info, or the TCI itself is nonzero. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    return (aux->tp_status & TP_STATUS_VLAN_VALID) || aux->tp_vlan_tci;
}
1217
/*
 * Receive packets from raw socket in batch process for better performance,
 * it can receive NETDEV_MAX_BURST packets at most once, the received
 * packets are added into *batch. The return value is 0 or errno.
 *
 * It also uses recvmmsg to reduce the overhead of multiple syscalls.
 */
static int
netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
                                 struct dp_packet_batch *batch)
{
    int iovlen;
    size_t std_len;
    ssize_t retval;
    int virtio_net_hdr_size;
    struct iovec iovs[NETDEV_MAX_BURST][IOV_TSO_SIZE];
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffers[NETDEV_MAX_BURST];
    struct mmsghdr mmsgs[NETDEV_MAX_BURST];
    struct dp_packet *buffers[NETDEV_MAX_BURST];
    int i;

    if (userspace_tso_enabled()) {
        /* Use the buffer from the allocated packet below to receive MTU
         * sized packets and an aux_buf for extra TSO data. */
        iovlen = IOV_TSO_SIZE;
        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
    } else {
        /* Use only the buffer from the allocated packet. */
        iovlen = IOV_STD_SIZE;
        virtio_net_hdr_size = 0;
    }

    /* The length here needs to be accounted in the same way when the
     * aux_buf is allocated so that it can be prepended to TSO buffer. */
    std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
        buffers[i] = dp_packet_new_with_headroom(std_len, DP_NETDEV_HEADROOM);
        iovs[i][IOV_PACKET].iov_base = dp_packet_data(buffers[i]);
        iovs[i][IOV_PACKET].iov_len = std_len;
        if (iovlen == IOV_TSO_SIZE) {
            iovs[i][IOV_AUXBUF].iov_base = dp_packet_data(rx->aux_bufs[i]);
            iovs[i][IOV_AUXBUF].iov_len = dp_packet_tailroom(rx->aux_bufs[i]);
        }

        mmsgs[i].msg_hdr.msg_name = NULL;
        mmsgs[i].msg_hdr.msg_namelen = 0;
        mmsgs[i].msg_hdr.msg_iov = iovs[i];
        mmsgs[i].msg_hdr.msg_iovlen = iovlen;
        /* Ancillary data buffer to receive PACKET_AUXDATA (VLAN info). */
        mmsgs[i].msg_hdr.msg_control = &cmsg_buffers[i];
        mmsgs[i].msg_hdr.msg_controllen = sizeof cmsg_buffers[i];
        mmsgs[i].msg_hdr.msg_flags = 0;
    }

    /* MSG_TRUNC makes msg_len report the full packet length even when the
     * packet did not fit in the supplied iovecs. */
    do {
        retval = recvmmsg(rx->fd, mmsgs, NETDEV_MAX_BURST, MSG_TRUNC, NULL);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        retval = errno;
        for (i = 0; i < NETDEV_MAX_BURST; i++) {
            dp_packet_delete(buffers[i]);
        }

        return retval;
    }

    for (i = 0; i < retval; i++) {
        struct dp_packet *pkt;

        if (mmsgs[i].msg_len < ETH_HEADER_LEN) {
            /* Runt frame: drop it, but keep processing the batch. */
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            dp_packet_delete(buffers[i]);
            netdev->rx_dropped += 1;
            VLOG_WARN_RL(&rl, "%s: Dropped packet: less than ether hdr size",
                         netdev_get_name(netdev_));
            continue;
        }

        if (mmsgs[i].msg_len > std_len) {
            /* Build a single linear TSO packet by prepending the data from
             * std_len buffer to the aux_buf.
             * NOTE(review): assumes msg_len can exceed std_len only when TSO
             * is enabled, i.e. rx->aux_bufs[i] is non-NULL -- confirm. */
            pkt = rx->aux_bufs[i];
            dp_packet_set_size(pkt, mmsgs[i].msg_len - std_len);
            dp_packet_push(pkt, dp_packet_data(buffers[i]), std_len);
            /* The headroom should be the same in buffers[i], pkt and
             * DP_NETDEV_HEADROOM. */
            dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
            dp_packet_delete(buffers[i]);
            /* The aux_buf is consumed; netdev_linux_rxq_recv() allocates a
             * replacement before the next receive. */
            rx->aux_bufs[i] = NULL;
        } else {
            dp_packet_set_size(buffers[i], mmsgs[i].msg_len);
            pkt = buffers[i];
        }

        if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) {
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            /* Unexpected error situation: the virtio header is not present
             * or corrupted. Drop the packet but continue in case next ones
             * are correct. */
            dp_packet_delete(pkt);
            netdev->rx_dropped += 1;
            VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header",
                         netdev_get_name(netdev_));
            continue;
        }

        /* Reinsert the VLAN tag delivered out of band via PACKET_AUXDATA. */
        for (cmsg = CMSG_FIRSTHDR(&mmsgs[i].msg_hdr); cmsg;
             cmsg = CMSG_NXTHDR(&mmsgs[i].msg_hdr, cmsg)) {
            const struct tpacket_auxdata *aux;

            if (cmsg->cmsg_level != SOL_PACKET
                || cmsg->cmsg_type != PACKET_AUXDATA
                || cmsg->cmsg_len <
                   CMSG_LEN(sizeof(struct tpacket_auxdata))) {
                continue;
            }

            aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
            if (auxdata_has_vlan_tci(aux)) {
                struct eth_header *eth;
                bool double_tagged;

                /* If the frame already carries an 802.1Q tag, the auxdata
                 * tag is the outer (S-)tag. */
                eth = dp_packet_data(pkt);
                double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);

                eth_push_vlan(pkt,
                              auxdata_to_vlan_tpid(aux, double_tagged),
                              htons(aux->tp_vlan_tci));
                break;
            }
        }
        dp_packet_batch_add(batch, pkt);
    }

    /* Delete unused buffers. */
    for (; i < NETDEV_MAX_BURST; i++) {
        dp_packet_delete(buffers[i]);
    }

    return 0;
}
1367
/*
 * Receive packets from tap by batch process for better performance,
 * it can receive NETDEV_MAX_BURST packets at most once, the received
 * packets are added into *batch. The return value is 0 or errno.
 */
static int
netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
                                struct dp_packet_batch *batch)
{
    int virtio_net_hdr_size;
    ssize_t retval;
    size_t std_len;
    int iovlen;
    int i;

    if (userspace_tso_enabled()) {
        /* Use the buffer from the allocated packet below to receive MTU
         * sized packets and an aux_buf for extra TSO data. */
        iovlen = IOV_TSO_SIZE;
        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
    } else {
        /* Use only the buffer from the allocated packet. */
        iovlen = IOV_STD_SIZE;
        virtio_net_hdr_size = 0;
    }

    /* The length here needs to be accounted in the same way when the
     * aux_buf is allocated so that it can be prepended to TSO buffer. */
    std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
        struct dp_packet *buffer;
        struct dp_packet *pkt;
        struct iovec iov[IOV_TSO_SIZE];

        /* Assume Ethernet port. No need to set packet_type. */
        buffer = dp_packet_new_with_headroom(std_len, DP_NETDEV_HEADROOM);
        iov[IOV_PACKET].iov_base = dp_packet_data(buffer);
        iov[IOV_PACKET].iov_len = std_len;
        if (iovlen == IOV_TSO_SIZE) {
            iov[IOV_AUXBUF].iov_base = dp_packet_data(rx->aux_bufs[i]);
            iov[IOV_AUXBUF].iov_len = dp_packet_tailroom(rx->aux_bufs[i]);
        }

        do {
            retval = readv(rx->fd, iov, iovlen);
        } while (retval < 0 && errno == EINTR);

        if (retval < 0) {
            /* NOTE(review): errno is re-read at the end of the function,
             * after dp_packet_delete() runs; free() is permitted to modify
             * errno, so consider saving it here -- confirm. */
            dp_packet_delete(buffer);
            break;
        }

        if (retval > std_len) {
            /* Build a single linear TSO packet by prepending the data from
             * std_len buffer to the aux_buf. */
            pkt = rx->aux_bufs[i];
            dp_packet_set_size(pkt, retval - std_len);
            dp_packet_push(pkt, dp_packet_data(buffer), std_len);
            /* The headroom should be the same in 'buffer', pkt and
             * DP_NETDEV_HEADROOM. */
            dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
            dp_packet_delete(buffer);
            /* The aux_buf is consumed; netdev_linux_rxq_recv() allocates a
             * replacement before the next receive. */
            rx->aux_bufs[i] = NULL;
        } else {
            dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
            pkt = buffer;
        }

        if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) {
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            /* Unexpected error situation: the virtio header is not present
             * or corrupted. Drop the packet but continue in case next ones
             * are correct. */
            dp_packet_delete(pkt);
            netdev->rx_dropped += 1;
            VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header",
                         netdev_get_name(netdev_));
            continue;
        }

        dp_packet_batch_add(batch, pkt);
    }

    /* If the very first read failed, report its errno; otherwise return
     * whatever packets were collected before the failure. */
    if ((i == 0) && (retval < 0)) {
        return errno;
    }

    return 0;
}
1459
1460 static int
1461 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
1462 int *qfill)
1463 {
1464 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1465 struct netdev *netdev = rx->up.netdev;
1466 ssize_t retval;
1467 int mtu;
1468
1469 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1470 mtu = ETH_PAYLOAD_MAX;
1471 }
1472
1473 if (userspace_tso_enabled()) {
1474 /* Allocate TSO packets. The packet has enough headroom to store
1475 * a full non-TSO packet. When a TSO packet is received, the data
1476 * from non-TSO buffer (std_len) is prepended to the TSO packet
1477 * (aux_buf). */
1478 size_t std_len = sizeof(struct virtio_net_hdr) + VLAN_ETH_HEADER_LEN
1479 + DP_NETDEV_HEADROOM + mtu;
1480 size_t data_len = LINUX_RXQ_TSO_MAX_LEN - std_len;
1481 for (int i = 0; i < NETDEV_MAX_BURST; i++) {
1482 if (rx->aux_bufs[i]) {
1483 continue;
1484 }
1485
1486 rx->aux_bufs[i] = dp_packet_new_with_headroom(data_len, std_len);
1487 }
1488 }
1489
1490 dp_packet_batch_init(batch);
1491 retval = (rx->is_tap
1492 ? netdev_linux_batch_rxq_recv_tap(rx, mtu, batch)
1493 : netdev_linux_batch_rxq_recv_sock(rx, mtu, batch));
1494
1495 if (retval) {
1496 if (retval != EAGAIN && retval != EMSGSIZE) {
1497 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1498 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1499 }
1500 }
1501
1502 if (qfill) {
1503 *qfill = -ENOTSUP;
1504 }
1505
1506 return retval;
1507 }
1508
1509 static void
1510 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1511 {
1512 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1513 poll_fd_wait(rx->fd, POLLIN);
1514 }
1515
1516 static int
1517 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1518 {
1519 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1520 if (rx->is_tap) {
1521 struct ifreq ifr;
1522 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1523 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1524 if (error) {
1525 return error;
1526 }
1527 drain_fd(rx->fd, ifr.ifr_qlen);
1528 return 0;
1529 } else {
1530 return drain_rcvbuf(rx->fd);
1531 }
1532 }
1533
/* Sends the packets in 'batch' on the AF_PACKET socket 'sock' to the device
 * with interface index 'ifindex'.  If 'tso' is true, a virtio_net_hdr is
 * prepended to each packet before transmission.  Returns 0 if successful,
 * otherwise a positive errno value. */
static int
netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu,
                             struct dp_packet_batch *batch)
{
    const size_t size = dp_packet_batch_size(batch);
    /* We don't bother setting most fields in sockaddr_ll because the
     * kernel ignores them for SOCK_RAW. */
    struct sockaddr_ll sll = { .sll_family = AF_PACKET,
                               .sll_ifindex = ifindex };

    struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
    struct iovec *iov = xmalloc(sizeof(*iov) * size);

    struct dp_packet *packet;
    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        if (tso) {
            netdev_linux_prepend_vnet_hdr(packet, mtu);
        }

        /* One message per packet; all share the same destination 'sll'. */
        iov[i].iov_base = dp_packet_data(packet);
        iov[i].iov_len = dp_packet_size(packet);
        mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
                                            .msg_namelen = sizeof sll,
                                            .msg_iov = &iov[i],
                                            .msg_iovlen = 1 };
    }

    int error = 0;
    for (uint32_t ofs = 0; ofs < size; ) {
        ssize_t retval;
        /* sendmmsg() may transmit fewer than the requested number of
         * messages; resume from the first unsent one until everything is
         * sent or an error occurs. */
        do {
            retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);
        if (error) {
            break;
        }
        ofs += retval;
    }

    free(mmsg);
    free(iov);
    return error;
}
1578
/* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
 * essential, because packets sent to a tap device with an AF_PACKET socket
 * will loop back to be *received* again on the tap device. This doesn't occur
 * on other interface types because we attach a socket filter to the rx
 * socket.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
netdev_linux_tap_batch_send(struct netdev *netdev_, bool tso, int mtu,
                            struct dp_packet_batch *batch)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct dp_packet *packet;

    /* The Linux tap driver returns EIO if the device is not up,
     * so if the device is not up, don't waste time sending it.
     * However, if the device is in another network namespace
     * then OVS can't retrieve the state. In that case, send the
     * packets anyway. */
    if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
        netdev->tx_dropped += dp_packet_batch_size(batch);
        return 0;
    }

    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        size_t size;
        ssize_t retval;
        int error;

        if (tso) {
            /* The tap was opened with IFF_VNET_HDR when TSO is enabled, so
             * each packet must carry a virtio_net_hdr prefix. */
            netdev_linux_prepend_vnet_hdr(packet, mtu);
        }

        size = dp_packet_size(packet);
        do {
            retval = write(netdev->tap_fd, dp_packet_data(packet), size);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);

        if (error) {
            /* The Linux tap driver returns EIO if the device is not up. From
             * the OVS side this is not an error, so we ignore it; otherwise,
             * return the error. */
            if (error != EIO) {
                return error;
            }
        } else if (retval != size) {
            /* Partial write: report EMSGSIZE to the caller. */
            VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
                         "bytes of %"PRIuSIZE") on %s",
                         retval, size, netdev_get_name(netdev_));
            return EMSGSIZE;
        }
    }
    return 0;
}
1632
1633 static int
1634 netdev_linux_get_numa_id__(struct netdev_linux *netdev)
1635 OVS_REQUIRES(netdev->mutex)
1636 {
1637 char *numa_node_path;
1638 const char *name;
1639 int node_id;
1640 FILE *stream;
1641
1642 if (netdev->cache_valid & VALID_NUMA_ID) {
1643 return netdev->numa_id;
1644 }
1645
1646 netdev->numa_id = 0;
1647 netdev->cache_valid |= VALID_NUMA_ID;
1648
1649 if (ovs_numa_get_n_numas() < 2) {
1650 /* No need to check on system with a single NUMA node. */
1651 return 0;
1652 }
1653
1654 name = netdev_get_name(&netdev->up);
1655 if (strpbrk(name, "/\\")) {
1656 VLOG_ERR_RL(&rl, "\"%s\" is not a valid name for a port. "
1657 "A valid name must not include '/' or '\\'."
1658 "Using numa_id 0", name);
1659 return 0;
1660 }
1661
1662 numa_node_path = xasprintf("/sys/class/net/%s/device/numa_node", name);
1663
1664 stream = fopen(numa_node_path, "r");
1665 if (!stream) {
1666 /* Virtual device does not have this info. */
1667 VLOG_INFO_RL(&rl, "%s: Can't open '%s': %s, using numa_id 0",
1668 name, numa_node_path, ovs_strerror(errno));
1669 free(numa_node_path);
1670 return 0;
1671 }
1672
1673 if (fscanf(stream, "%d", &node_id) != 1
1674 || !ovs_numa_numa_id_is_valid(node_id)) {
1675 VLOG_WARN_RL(&rl, "%s: Can't detect NUMA node, using numa_id 0", name);
1676 node_id = 0;
1677 }
1678
1679 netdev->numa_id = node_id;
1680 fclose(stream);
1681 free(numa_node_path);
1682 return node_id;
1683 }
1684
1685 static int OVS_UNUSED
1686 netdev_linux_get_numa_id(const struct netdev *netdev_)
1687 {
1688 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1689 int numa_id;
1690
1691 ovs_mutex_lock(&netdev->mutex);
1692 numa_id = netdev_linux_get_numa_id__(netdev);
1693 ovs_mutex_unlock(&netdev->mutex);
1694
1695 return numa_id;
1696 }
1697
/* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
 * the packet is too big or too small to transmit on the device.
 *
 * The kernel maintains a packet transmission queue, so the caller is not
 * expected to do additional queuing of packets. */
static int
netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
                  struct dp_packet_batch *batch,
                  bool concurrent_txq OVS_UNUSED)
{
    bool tso = userspace_tso_enabled();
    int mtu = ETH_PAYLOAD_MAX;
    int error = 0;
    int sock = 0;

    if (tso) {
        /* netdev_linux_prepend_vnet_hdr() needs the device MTU. */
        netdev_linux_get_mtu__(netdev_linux_cast(netdev_), &mtu);
    }

    if (!is_tap_netdev(netdev_)) {
        /* Devices in another network namespace can't be reached through the
         * local AF_PACKET socket. */
        if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_))) {
            error = EOPNOTSUPP;
            goto free_batch;
        }

        /* af_packet_sock() returns a negative errno value on failure. */
        sock = af_packet_sock();
        if (sock < 0) {
            error = -sock;
            goto free_batch;
        }

        /* netdev_get_ifindex() returns a negative errno value on failure. */
        int ifindex = netdev_get_ifindex(netdev_);
        if (ifindex < 0) {
            error = -ifindex;
            goto free_batch;
        }

        error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, batch);
    } else {
        error = netdev_linux_tap_batch_send(netdev_, tso, mtu, batch);
    }
    if (error) {
        if (error == ENOBUFS) {
            /* The Linux AF_PACKET implementation never blocks waiting
             * for room for packets, instead returning ENOBUFS.
             * Translate this into EAGAIN for the caller. */
            error = EAGAIN;
        } else {
            VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
    }

free_batch:
    /* The batch (and its packets) is consumed on every path. */
    dp_packet_delete_batch(batch, true);
    return error;
}
1757
1758 /* Registers with the poll loop to wake up from the next call to poll_block()
1759 * when the packet transmission queue has sufficient room to transmit a packet
1760 * with netdev_send().
1761 *
1762 * The kernel maintains a packet transmission queue, so the client is not
1763 * expected to do additional queuing of packets. Thus, this function is
1764 * unlikely to ever be used. It is included for completeness. */
1765 static void
1766 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1767 {
1768 if (is_tap_netdev(netdev)) {
1769 /* TAP device always accepts packets.*/
1770 poll_immediate_wake();
1771 }
1772 }
1773
/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
 * otherwise a positive errno value. */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    /* Devices in another network namespace cannot be modified from here. */
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Short-circuit on the cache: return a previously cached error, or
     * succeed immediately if the address already matches. */
    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    if (!error || error == ENODEV) {
        /* Cache the outcome; ENODEV is cached too so later calls can
         * short-circuit. */
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            netdev->etheraddr = mac;
        }
    }

    /* Bring a tap device back up if it was up before. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1818
1819 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1820 static int
1821 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1822 {
1823 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1824 int error;
1825
1826 ovs_mutex_lock(&netdev->mutex);
1827 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1828 netdev_linux_update_via_netlink(netdev);
1829 }
1830
1831 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1832 /* Fall back to ioctl if netlink fails */
1833 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1834 &netdev->etheraddr);
1835 netdev->cache_valid |= VALID_ETHERADDR;
1836 }
1837
1838 error = netdev->ether_addr_error;
1839 if (!error) {
1840 *mac = netdev->etheraddr;
1841 }
1842 ovs_mutex_unlock(&netdev->mutex);
1843
1844 return error;
1845 }
1846
1847 static int
1848 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1849 {
1850 int error;
1851
1852 if (!(netdev->cache_valid & VALID_MTU)) {
1853 netdev_linux_update_via_netlink(netdev);
1854 }
1855
1856 if (!(netdev->cache_valid & VALID_MTU)) {
1857 /* Fall back to ioctl if netlink fails */
1858 struct ifreq ifr;
1859
1860 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1861 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1862 netdev->mtu = ifr.ifr_mtu;
1863 netdev->cache_valid |= VALID_MTU;
1864 }
1865
1866 error = netdev->netdev_mtu_error;
1867 if (!error) {
1868 *mtup = netdev->mtu;
1869 }
1870
1871 return error;
1872 }
1873
1874 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1875 * in bytes, not including the hardware header; thus, this is typically 1500
1876 * bytes for Ethernet devices. */
1877 static int
1878 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1879 {
1880 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1881 int error;
1882
1883 ovs_mutex_lock(&netdev->mutex);
1884 error = netdev_linux_get_mtu__(netdev, mtup);
1885 ovs_mutex_unlock(&netdev->mutex);
1886
1887 return error;
1888 }
1889
/* Sets the maximum size of transmitted (MTU) for given device using linux
 * networking ioctl interface.  Returns 0 if successful, otherwise a positive
 * errno value.
 */
static int
netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ifreq ifr;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    /* Devices in another network namespace cannot be modified from here. */
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

#ifdef HAVE_AF_XDP
    /* AF_XDP has its own constraint on the usable MTU; reject values it
     * cannot support. */
    if (netdev_get_class(netdev_) == &netdev_afxdp_class) {
        error = netdev_afxdp_verify_mtu_size(netdev_, mtu);
        if (error) {
            goto exit;
        }
    }
#endif

    /* Short-circuit on the cache: return a previously cached error, or
     * succeed immediately if the MTU already matches. */
    if (netdev->cache_valid & VALID_MTU) {
        error = netdev->netdev_mtu_error;
        if (error || netdev->mtu == mtu) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_MTU;
    }
    ifr.ifr_mtu = mtu;
    error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
                                SIOCSIFMTU, "SIOCSIFMTU");
    /* Cache the outcome; ENODEV is cached too so later calls can
     * short-circuit. */
    if (!error || error == ENODEV) {
        netdev->netdev_mtu_error = error;
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }
exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1934
1935 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1936 * On failure, returns a negative errno value. */
1937 static int
1938 netdev_linux_get_ifindex(const struct netdev *netdev_)
1939 {
1940 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1941 int ifindex, error;
1942
1943 ovs_mutex_lock(&netdev->mutex);
1944 if (netdev_linux_netnsid_is_remote(netdev)) {
1945 error = EOPNOTSUPP;
1946 goto exit;
1947 }
1948 error = get_ifindex(netdev_, &ifindex);
1949
1950 exit:
1951 ovs_mutex_unlock(&netdev->mutex);
1952 return error ? -error : ifindex;
1953 }
1954
1955 static int
1956 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1957 {
1958 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1959
1960 ovs_mutex_lock(&netdev->mutex);
1961 if (netdev->miimon_interval > 0) {
1962 *carrier = netdev->miimon;
1963 } else {
1964 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1965 }
1966 ovs_mutex_unlock(&netdev->mutex);
1967
1968 return 0;
1969 }
1970
1971 static long long int
1972 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1973 {
1974 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1975 long long int carrier_resets;
1976
1977 ovs_mutex_lock(&netdev->mutex);
1978 carrier_resets = netdev->carrier_resets;
1979 ovs_mutex_unlock(&netdev->mutex);
1980
1981 return carrier_resets;
1982 }
1983
/* Issues MII ioctl 'cmd' ('cmd_name' is used in error reporting) on the
 * device named 'name', copying 'data' into the request and back out of the
 * reply.  Returns 0 if successful, otherwise a positive errno value. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    /* The mii_ioctl_data is copied *over* the ifr_data field (embedded in
     * the ifreq) rather than pointed to by it.
     * NOTE(review): this relies on sizeof *data fitting within the ifreq
     * union starting at ifr_data -- confirm. */
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
1998
/* Queries the link state of device 'name' into '*miimon', first via the MII
 * status register and, if that fails, via an ethtool link query.  Returns 0
 * if successful, otherwise a positive errno value; '*miimon' is false on
 * failure. */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            /* BMSR_LSTATUS is the link-up bit of the basic mode status
             * register. */
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        }
    }
    if (error) {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* ETHTOOL_GLINK fills in an ethtool_value; reinterpret the
             * ethtool_cmd buffer accordingly. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
2041
2042 static int
2043 netdev_linux_set_miimon_interval(struct netdev *netdev_,
2044 long long int interval)
2045 {
2046 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2047
2048 ovs_mutex_lock(&netdev->mutex);
2049 interval = interval > 0 ? MAX(interval, 100) : 0;
2050 if (netdev->miimon_interval != interval) {
2051 if (interval && !netdev->miimon_interval) {
2052 atomic_count_inc(&miimon_cnt);
2053 } else if (!interval && netdev->miimon_interval) {
2054 atomic_count_dec(&miimon_cnt);
2055 }
2056
2057 netdev->miimon_interval = interval;
2058 timer_set_expired(&netdev->miimon_timer);
2059 }
2060 ovs_mutex_unlock(&netdev->mutex);
2061
2062 return 0;
2063 }
2064
/* Polls MII link state for every netdev-linux device whose miimon timer
 * has expired, updating the cached 'miimon' flag and signalling a change
 * via netdev_linux_changed() when the link state flipped.  Called from
 * the class's run() hook. */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    /* Takes a reference on each device; released by netdev_close() below. */
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            /* Any query error leaves 'miimon' false (see
             * netdev_linux_get_miimon()), which is treated as link down. */
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                dev->miimon = miimon;
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            /* Re-arm for the next poll. */
            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
2094
2095 static void
2096 netdev_linux_miimon_wait(void)
2097 {
2098 struct shash device_shash;
2099 struct shash_node *node;
2100
2101 shash_init(&device_shash);
2102 netdev_get_devices(&netdev_linux_class, &device_shash);
2103 SHASH_FOR_EACH (node, &device_shash) {
2104 struct netdev *netdev = node->data;
2105 struct netdev_linux *dev = netdev_linux_cast(netdev);
2106
2107 ovs_mutex_lock(&dev->mutex);
2108 if (dev->miimon_interval > 0) {
2109 timer_wait(&dev->miimon_timer);
2110 }
2111 ovs_mutex_unlock(&dev->mutex);
2112 netdev_close(netdev);
2113 }
2114 shash_destroy(&device_shash);
2115 }
2116
/* Exchanges the values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved = *a;

    *a = *b;
    *b = saved;
}
2124
2125 /* Copies 'src' into 'dst', performing format conversion in the process.
2126 *
2127 * 'src' is allowed to be misaligned. */
2128 static void
2129 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
2130 const struct ovs_vport_stats *src)
2131 {
2132 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
2133 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
2134 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
2135 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
2136 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
2137 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
2138 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
2139 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
2140 dst->multicast = 0;
2141 dst->collisions = 0;
2142 dst->rx_length_errors = 0;
2143 dst->rx_over_errors = 0;
2144 dst->rx_crc_errors = 0;
2145 dst->rx_frame_errors = 0;
2146 dst->rx_fifo_errors = 0;
2147 dst->rx_missed_errors = 0;
2148 dst->tx_aborted_errors = 0;
2149 dst->tx_carrier_errors = 0;
2150 dst->tx_fifo_errors = 0;
2151 dst->tx_heartbeat_errors = 0;
2152 dst->tx_window_errors = 0;
2153 }
2154
2155 static int
2156 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
2157 {
2158 struct dpif_netlink_vport reply;
2159 struct ofpbuf *buf;
2160 int error;
2161
2162 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
2163 if (error) {
2164 return error;
2165 } else if (!reply.stats) {
2166 ofpbuf_delete(buf);
2167 return EOPNOTSUPP;
2168 }
2169
2170 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
2171
2172 ofpbuf_delete(buf);
2173
2174 return 0;
2175 }
2176
2177 static void
2178 get_stats_via_vport(const struct netdev *netdev_,
2179 struct netdev_stats *stats)
2180 {
2181 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2182
2183 if (!netdev->vport_stats_error ||
2184 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
2185 int error;
2186
2187 error = get_stats_via_vport__(netdev_, stats);
2188 if (error && error != ENOENT && error != ENODEV) {
2189 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
2190 "(%s)",
2191 netdev_get_name(netdev_), ovs_strerror(error));
2192 }
2193 netdev->vport_stats_error = error;
2194 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
2195 }
2196 }
2197
/* Retrieves current device stats for 'netdev-linux'.
 *
 * Stats come from two sources: the OVS datapath vport layer
 * (get_stats_via_vport()) and the kernel netdev via netlink
 * (get_stats_via_netlink()).  When both succeed, the vport packet/byte
 * counters are kept and only the detail counters the vport layer does
 * not track are folded in from the kernel. */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; if vport stats succeeded, report those alone
         * and mask the netlink error. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Both sources succeeded: augment the vport counters with the
         * kernel-only detail counters. */
        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
        stats->rx_length_errors += dev_stats.rx_length_errors;
        stats->rx_over_errors += dev_stats.rx_over_errors;
        stats->rx_crc_errors += dev_stats.rx_crc_errors;
        stats->rx_frame_errors += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2236
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal.
 *
 * Same two-source scheme as netdev_linux_get_stats(), except that the
 * kernel's view of a tap/internal device is from the host side, so its
 * rx/tx counters are mirror images of the switch's perspective and must
 * be swapped when used. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; fall back to vport stats alone if available. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer. For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        /* Detail counters are not meaningful after the swap; zero them. */
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped += dev_stats.tx_dropped;
        stats->tx_dropped += dev_stats.rx_dropped;

        stats->rx_errors += dev_stats.tx_errors;
        stats->tx_errors += dev_stats.rx_errors;

        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
    }
    /* Add drops accounted locally in userspace (e.g. send-path failures). */
    stats->tx_dropped += netdev->tx_dropped;
    stats->rx_dropped += netdev->rx_dropped;
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2300
/* Retrieves stats for an internal device purely from the datapath vport
 * layer; there is no kernel-netdev fallback here, so the cached vport
 * error is returned as-is. */
static int
netdev_internal_get_stats(const struct netdev *netdev_,
                          struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = netdev->vport_stats_error;
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2315
/* Populates netdev->supported, ->advertised and ->current from an
 * ETHTOOL_GSET query, translating ethtool bit flags into NETDEV_F_*
 * feature bits, and caches the result (VALID_FEATURES).  The outcome,
 * success or failure, is stored in netdev->get_features_error.
 * Caller holds netdev->mutex. */
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
    struct ethtool_cmd ecmd;
    uint32_t speed;
    int error;

    if (netdev->cache_valid & VALID_FEATURES) {
        /* Already cached; nothing to do. */
        return;
    }

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto out;
    }

    /* Supported features. */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    }
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Half) {
        netdev->supported |= NETDEV_F_100MB_HD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    }
    /* Multiple physical-layer variants map onto one OpenFlow speed bit. */
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
        netdev->supported |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
        netdev->supported |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
        netdev->supported |= NETDEV_F_40GB_FD;
    }
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    }
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    }
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    }
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    }
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    }

    /* Advertised features. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
        netdev->advertised |= NETDEV_F_40GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    }
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    }
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    }
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    }
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    }

    /* Current settings. */
    speed = ethtool_cmd_speed(&ecmd);
    if (speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    } else if (speed == 40000) {
        /* Raw literals here and below -- presumably because SPEED_40000 and
         * friends are missing from older kernel headers; confirm before
         * switching to the named constants. */
        netdev->current = NETDEV_F_40GB_FD;
    } else if (speed == 100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (speed == 1000000) {
        netdev->current = NETDEV_F_1TB_FD;
    } else {
        /* Unknown or unreported speed. */
        netdev->current = 0;
    }

    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
    }

    if (ecmd.autoneg) {
        netdev->current |= NETDEV_F_AUTONEG;
    }

out:
    /* Cache the result -- including failures -- until invalidated. */
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
}
2467
2468 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
2469 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2470 * Returns 0 if successful, otherwise a positive errno value. */
2471 static int
2472 netdev_linux_get_features(const struct netdev *netdev_,
2473 enum netdev_features *current,
2474 enum netdev_features *advertised,
2475 enum netdev_features *supported,
2476 enum netdev_features *peer)
2477 {
2478 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2479 int error;
2480
2481 ovs_mutex_lock(&netdev->mutex);
2482 if (netdev_linux_netnsid_is_remote(netdev)) {
2483 error = EOPNOTSUPP;
2484 goto exit;
2485 }
2486
2487 netdev_linux_read_features(netdev);
2488 if (!netdev->get_features_error) {
2489 *current = netdev->current;
2490 *advertised = netdev->advertised;
2491 *supported = netdev->supported;
2492 *peer = 0; /* XXX */
2493 }
2494 error = netdev->get_features_error;
2495
2496 exit:
2497 ovs_mutex_unlock(&netdev->mutex);
2498 return error;
2499 }
2500
2501 /* Set the features advertised by 'netdev' to 'advertise'. */
2502 static int
2503 netdev_linux_set_advertisements(struct netdev *netdev_,
2504 enum netdev_features advertise)
2505 {
2506 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2507 struct ethtool_cmd ecmd;
2508 int error;
2509
2510 ovs_mutex_lock(&netdev->mutex);
2511
2512 COVERAGE_INC(netdev_get_ethtool);
2513
2514 if (netdev_linux_netnsid_is_remote(netdev)) {
2515 error = EOPNOTSUPP;
2516 goto exit;
2517 }
2518
2519 memset(&ecmd, 0, sizeof ecmd);
2520 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2521 ETHTOOL_GSET, "ETHTOOL_GSET");
2522 if (error) {
2523 goto exit;
2524 }
2525
2526 ecmd.advertising = 0;
2527 if (advertise & NETDEV_F_10MB_HD) {
2528 ecmd.advertising |= ADVERTISED_10baseT_Half;
2529 }
2530 if (advertise & NETDEV_F_10MB_FD) {
2531 ecmd.advertising |= ADVERTISED_10baseT_Full;
2532 }
2533 if (advertise & NETDEV_F_100MB_HD) {
2534 ecmd.advertising |= ADVERTISED_100baseT_Half;
2535 }
2536 if (advertise & NETDEV_F_100MB_FD) {
2537 ecmd.advertising |= ADVERTISED_100baseT_Full;
2538 }
2539 if (advertise & NETDEV_F_1GB_HD) {
2540 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2541 }
2542 if (advertise & NETDEV_F_1GB_FD) {
2543 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2544 }
2545 if (advertise & NETDEV_F_10GB_FD) {
2546 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2547 }
2548 if (advertise & NETDEV_F_COPPER) {
2549 ecmd.advertising |= ADVERTISED_TP;
2550 }
2551 if (advertise & NETDEV_F_FIBER) {
2552 ecmd.advertising |= ADVERTISED_FIBRE;
2553 }
2554 if (advertise & NETDEV_F_AUTONEG) {
2555 ecmd.advertising |= ADVERTISED_Autoneg;
2556 }
2557 if (advertise & NETDEV_F_PAUSE) {
2558 ecmd.advertising |= ADVERTISED_Pause;
2559 }
2560 if (advertise & NETDEV_F_PAUSE_ASYM) {
2561 ecmd.advertising |= ADVERTISED_Asym_Pause;
2562 }
2563 COVERAGE_INC(netdev_set_ethtool);
2564 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2565 ETHTOOL_SSET, "ETHTOOL_SSET");
2566
2567 exit:
2568 ovs_mutex_unlock(&netdev->mutex);
2569 return error;
2570 }
2571
/* Builds a tc police action that drops (TC_POLICE_SHOT) traffic
 * exceeding 'kbits_rate' / 'kbits_burst'.
 *
 * The rate is converted from kbits/s to bytes/s; the burst clamp keeps
 * the kbits->bits multiplication from overflowing 32 bits.
 * NOTE(review): the burst conversion divides by 64, whereas converting
 * kbits*1024 to bytes would divide by 8 -- confirm the extra factor is
 * intentional and consistent with the qdisc-based policer path. */
static struct tc_police
tc_matchall_fill_police(uint32_t kbits_rate, uint32_t kbits_burst)
{
    unsigned int bsize = MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 64;
    unsigned int bps = ((uint64_t) kbits_rate * 1000) / 8;
    struct tc_police police;
    struct tc_ratespec rate;
    int mtu = 65535;

    memset(&rate, 0, sizeof rate);
    rate.rate = bps;
    rate.cell_log = tc_calc_cell_log(mtu);
    rate.mpu = ETH_TOTAL_MIN;

    memset(&police, 0, sizeof police);
    /* Burst is expressed in scheduler ticks at the configured rate. */
    police.burst = tc_bytes_to_ticks(bps, bsize);
    police.action = TC_POLICE_SHOT;
    police.rate = rate;
    police.mtu = mtu;

    return police;
}
2594
/* Appends a "police" action to the netlink message under construction in
 * 'request': the TBF parameters from 'police', the precomputed rate
 * table, and TC_ACT_UNSPEC as the verdict for conforming packets. */
static void
nl_msg_put_act_police(struct ofpbuf *request, struct tc_police police)
{
    size_t offset;

    nl_msg_put_string(request, TCA_ACT_KIND, "police");
    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
    nl_msg_put_unspec(request, TCA_POLICE_TBF, &police, sizeof police);
    tc_put_rtab(request, TCA_POLICE_RATE, &police.rate);
    /* Conforming traffic continues through the classifier chain. */
    nl_msg_put_u32(request, TCA_POLICE_RESULT, TC_ACT_UNSPEC);
    nl_msg_end_nested(request, offset);
}
2607
2608 static int
2609 tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate,
2610 uint32_t kbits_burst)
2611 {
2612 uint16_t eth_type = (OVS_FORCE uint16_t) htons(ETH_P_ALL);
2613 size_t basic_offset, action_offset, inner_offset;
2614 uint16_t prio = TC_RESERVED_PRIORITY_POLICE;
2615 int ifindex, err = 0;
2616 struct tc_police pol_act;
2617 struct ofpbuf request;
2618 struct ofpbuf *reply;
2619 struct tcmsg *tcmsg;
2620 uint32_t handle = 1;
2621
2622 err = get_ifindex(netdev, &ifindex);
2623 if (err) {
2624 return err;
2625 }
2626
2627 tcmsg = tc_make_request(ifindex, RTM_NEWTFILTER, NLM_F_CREATE | NLM_F_ECHO,
2628 &request);
2629 tcmsg->tcm_parent = TC_INGRESS_PARENT;
2630 tcmsg->tcm_info = tc_make_handle(prio, eth_type);
2631 tcmsg->tcm_handle = handle;
2632
2633 pol_act = tc_matchall_fill_police(kbits_rate, kbits_burst);
2634 nl_msg_put_string(&request, TCA_KIND, "matchall");
2635 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2636 action_offset = nl_msg_start_nested(&request, TCA_MATCHALL_ACT);
2637 inner_offset = nl_msg_start_nested(&request, 1);
2638 nl_msg_put_act_police(&request, pol_act);
2639 nl_msg_end_nested(&request, inner_offset);
2640 nl_msg_end_nested(&request, action_offset);
2641 nl_msg_end_nested(&request, basic_offset);
2642
2643 err = tc_transact(&request, &reply);
2644 if (!err) {
2645 struct tcmsg *tc =
2646 ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc);
2647 ofpbuf_delete(reply);
2648 }
2649
2650 return err;
2651 }
2652
2653 static int
2654 tc_del_matchall_policer(struct netdev *netdev)
2655 {
2656 int prio = TC_RESERVED_PRIORITY_POLICE;
2657 uint32_t block_id = 0;
2658 struct tcf_id id;
2659 int ifindex;
2660 int err;
2661
2662 err = get_ifindex(netdev, &ifindex);
2663 if (err) {
2664 return err;
2665 }
2666
2667 id = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS);
2668 err = tc_del_filter(&id);
2669 if (err) {
2670 return err;
2671 }
2672
2673 return 0;
2674 }
2675
/* Attempts to set input rate limiting (policing) policy. Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * Two implementations: when tc flow offload is enabled, a matchall
 * filter with a police action is used; otherwise an ingress qdisc plus
 * policer is (re)installed.  Successful settings are cached
 * (VALID_POLICING) so identical requests become no-ops. */
static int
netdev_linux_set_policing(struct netdev *netdev_,
                          uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int ifindex;
    int error;

    kbits_burst = (!kbits_rate ? 0       /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
                   : kbits_burst);       /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto out;
    }

    if (netdev->cache_valid & VALID_POLICING) {
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kbits_burst == kbits_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        /* Settings changed; invalidate and reprogram below. */
        netdev->cache_valid &= ~VALID_POLICING;
    }

    COVERAGE_INC(netdev_set_policing);

    /* Use matchall for policing when offloadling ovs with tc-flower. */
    if (netdev_is_flow_api_enabled()) {
        /* Delete-then-add; a delete failure is overwritten by the add
         * result when a rate is requested.
         * NOTE(review): this path returns without updating kbits_rate/
         * kbits_burst or VALID_POLICING, so the cache short-circuit above
         * never applies to it -- confirm this is intended. */
        error = tc_del_matchall_policer(netdev_);
        if (kbits_rate) {
            error = tc_add_matchall_policer(netdev_, kbits_rate, kbits_burst);
        }
        ovs_mutex_unlock(&netdev->mutex);
        return error;
    }

    error = get_ifindex(netdev_, &ifindex);
    if (error) {
        goto out;
    }

    /* Remove any existing ingress qdisc. */
    error = tc_add_del_qdisc(ifindex, false, 0, TC_INGRESS);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate) {
        error = tc_add_del_qdisc(ifindex, true, 0, TC_INGRESS);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }
    }

    netdev->kbits_rate = kbits_rate;
    netdev->kbits_burst = kbits_burst;

out:
    /* Cache success and ENODEV (device gone) only; other errors may be
     * transient and should be retried on the next call. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2759
2760 static int
2761 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2762 struct sset *types)
2763 {
2764 const struct tc_ops *const *opsp;
2765 for (opsp = tcs; *opsp != NULL; opsp++) {
2766 const struct tc_ops *ops = *opsp;
2767 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2768 sset_add(types, ops->ovs_name);
2769 }
2770 }
2771 return 0;
2772 }
2773
2774 static const struct tc_ops *
2775 tc_lookup_ovs_name(const char *name)
2776 {
2777 const struct tc_ops *const *opsp;
2778
2779 for (opsp = tcs; *opsp != NULL; opsp++) {
2780 const struct tc_ops *ops = *opsp;
2781 if (!strcmp(name, ops->ovs_name)) {
2782 return ops;
2783 }
2784 }
2785 return NULL;
2786 }
2787
2788 static const struct tc_ops *
2789 tc_lookup_linux_name(const char *name)
2790 {
2791 const struct tc_ops *const *opsp;
2792
2793 for (opsp = tcs; *opsp != NULL; opsp++) {
2794 const struct tc_ops *ops = *opsp;
2795 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2796 return ops;
2797 }
2798 }
2799 return NULL;
2800 }
2801
2802 static struct tc_queue *
2803 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2804 size_t hash)
2805 {
2806 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2807 struct tc_queue *queue;
2808
2809 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2810 if (queue->queue_id == queue_id) {
2811 return queue;
2812 }
2813 }
2814 return NULL;
2815 }
2816
/* Convenience wrapper around tc_find_queue__() that hashes 'queue_id'
 * itself.  Returns the matching queue or NULL. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2822
2823 static int
2824 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2825 const char *type,
2826 struct netdev_qos_capabilities *caps)
2827 {
2828 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2829 if (!ops) {
2830 return EOPNOTSUPP;
2831 }
2832 caps->n_queues = ops->n_queues;
2833 return 0;
2834 }
2835
2836 static int
2837 netdev_linux_get_qos(const struct netdev *netdev_,
2838 const char **typep, struct smap *details)
2839 {
2840 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2841 int error;
2842
2843 ovs_mutex_lock(&netdev->mutex);
2844 if (netdev_linux_netnsid_is_remote(netdev)) {
2845 error = EOPNOTSUPP;
2846 goto exit;
2847 }
2848
2849 error = tc_query_qdisc(netdev_);
2850 if (!error) {
2851 *typep = netdev->tc->ops->ovs_name;
2852 error = (netdev->tc->ops->qdisc_get
2853 ? netdev->tc->ops->qdisc_get(netdev_, details)
2854 : 0);
2855 }
2856
2857 exit:
2858 ovs_mutex_unlock(&netdev->mutex);
2859 return error;
2860 }
2861
/* Configures QoS discipline 'type' with parameters 'details' on
 * 'netdev_'.  If the device already runs the requested discipline, its
 * parameters are updated in place; otherwise the old qdisc is deleted
 * and the new one installed.  Returns 0 on success, otherwise a positive
 * errno value. */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    if (new_ops == &tc_ops_noop) {
        /* The noop discipline needs no device state or locking. */
        return new_ops->tc_install(netdev_, details);
    }

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Refresh our view of the kernel's current qdisc. */
    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same discipline: update parameters in place if supported. */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            goto exit;
        }
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc. */
        error = new_ops->tc_install(netdev_, details);
        /* tc_install must set netdev->tc exactly when it succeeds. */
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2909
2910 static int
2911 netdev_linux_get_queue(const struct netdev *netdev_,
2912 unsigned int queue_id, struct smap *details)
2913 {
2914 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2915 int error;
2916
2917 ovs_mutex_lock(&netdev->mutex);
2918 if (netdev_linux_netnsid_is_remote(netdev)) {
2919 error = EOPNOTSUPP;
2920 goto exit;
2921 }
2922
2923 error = tc_query_qdisc(netdev_);
2924 if (!error) {
2925 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2926 error = (queue
2927 ? netdev->tc->ops->class_get(netdev_, queue, details)
2928 : ENOENT);
2929 }
2930
2931 exit:
2932 ovs_mutex_unlock(&netdev->mutex);
2933 return error;
2934 }
2935
2936 static int
2937 netdev_linux_set_queue(struct netdev *netdev_,
2938 unsigned int queue_id, const struct smap *details)
2939 {
2940 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2941 int error;
2942
2943 ovs_mutex_lock(&netdev->mutex);
2944 if (netdev_linux_netnsid_is_remote(netdev)) {
2945 error = EOPNOTSUPP;
2946 goto exit;
2947 }
2948
2949 error = tc_query_qdisc(netdev_);
2950 if (!error) {
2951 error = (queue_id < netdev->tc->ops->n_queues
2952 && netdev->tc->ops->class_set
2953 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2954 : EINVAL);
2955 }
2956
2957 exit:
2958 ovs_mutex_unlock(&netdev->mutex);
2959 return error;
2960 }
2961
2962 static int
2963 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2964 {
2965 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2966 int error;
2967
2968 ovs_mutex_lock(&netdev->mutex);
2969 if (netdev_linux_netnsid_is_remote(netdev)) {
2970 error = EOPNOTSUPP;
2971 goto exit;
2972 }
2973
2974 error = tc_query_qdisc(netdev_);
2975 if (!error) {
2976 if (netdev->tc->ops->class_delete) {
2977 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2978 error = (queue
2979 ? netdev->tc->ops->class_delete(netdev_, queue)
2980 : ENOENT);
2981 } else {
2982 error = EINVAL;
2983 }
2984 }
2985
2986 exit:
2987 ovs_mutex_unlock(&netdev->mutex);
2988 return error;
2989 }
2990
2991 static int
2992 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2993 unsigned int queue_id,
2994 struct netdev_queue_stats *stats)
2995 {
2996 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2997 int error;
2998
2999 ovs_mutex_lock(&netdev->mutex);
3000 if (netdev_linux_netnsid_is_remote(netdev)) {
3001 error = EOPNOTSUPP;
3002 goto exit;
3003 }
3004
3005 error = tc_query_qdisc(netdev_);
3006 if (!error) {
3007 if (netdev->tc->ops->class_get_stats) {
3008 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
3009 if (queue) {
3010 stats->created = queue->created;
3011 error = netdev->tc->ops->class_get_stats(netdev_, queue,
3012 stats);
3013 } else {
3014 error = ENOENT;
3015 }
3016 } else {
3017 error = EOPNOTSUPP;
3018 }
3019 }
3020
3021 exit:
3022 ovs_mutex_unlock(&netdev->mutex);
3023 return error;
3024 }
3025
/* State for dumping a qdisc's traffic classes over rtnetlink; see
 * start_queue_dump() and finish_queue_dump(). */
struct queue_dump_state {
    struct nl_dump dump;    /* In-progress RTM_GETTCLASS dump. */
    struct ofpbuf buf;      /* Receive buffer for dump replies. */
};
3030
3031 static bool
3032 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
3033 {
3034 struct ofpbuf request;
3035 struct tcmsg *tcmsg;
3036
3037 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
3038 if (!tcmsg) {
3039 return false;
3040 }
3041 tcmsg->tcm_parent = 0;
3042 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
3043 ofpbuf_uninit(&request);
3044
3045 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
3046 return true;
3047 }
3048
/* Releases 'state''s buffer and completes the netlink dump it wraps.
 * Returns 0 if the dump finished cleanly, otherwise a positive errno
 * value from nl_dump_done(). */
static int
finish_queue_dump(struct queue_dump_state *state)
{
    ofpbuf_uninit(&state->buf);
    return nl_dump_done(&state->dump);
}
3055
/* Iterator state for netdev_linux_queue_dump_next(): a snapshot of queue
 * IDs taken under the device mutex at dump-start time. */
struct netdev_linux_queue_state {
    unsigned int *queues;   /* Malloc'd array of queue IDs. */
    size_t cur_queue;       /* Index of the next queue to return. */
    size_t n_queues;        /* Number of elements in 'queues'. */
};
3061
/* Starts a queue dump over 'netdev_', storing iterator state in
 * '*statep' for netdev_linux_queue_dump_next() and eventual release by
 * netdev_linux_queue_dump_done().  The queue IDs are snapshotted here so
 * that iteration tolerates concurrent queue changes.  Returns 0 on
 * success, otherwise a positive errno value (in which case '*statep' is
 * not assigned). */
static int
netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get) {
            struct netdev_linux_queue_state *state;
            struct tc_queue *queue;
            size_t i;

            /* Ownership of 'state' and 'state->queues' passes to the
             * caller via '*statep'. */
            *statep = state = xmalloc(sizeof *state);
            state->n_queues = hmap_count(&netdev->tc->queues);
            state->cur_queue = 0;
            state->queues = xmalloc(state->n_queues * sizeof *state->queues);

            i = 0;
            HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
                state->queues[i++] = queue->queue_id;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3099
/* Advances the dump started by netdev_linux_queue_dump_start(): stores the
 * next queue's ID in '*queue_idp' and its configuration in 'details'.
 * Queues deleted since the snapshot was taken are silently skipped.
 * Returns 0 on success, EOF when the dump is exhausted, or another positive
 * errno value on error. */
static int
netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
                             unsigned int *queue_idp, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_linux_queue_state *state = state_;
    int error = EOF;            /* EOF means "no more queues". */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    while (state->cur_queue < state->n_queues) {
        unsigned int queue_id = state->queues[state->cur_queue++];
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);

        if (queue) {
            *queue_idp = queue_id;
            error = netdev->tc->ops->class_get(netdev_, queue, details);
            break;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3129
3130 static int
3131 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
3132 void *state_)
3133 {
3134 struct netdev_linux_queue_state *state = state_;
3135
3136 free(state->queues);
3137 free(state);
3138 return 0;
3139 }
3140
/* Dumps kernel statistics for every queue on 'netdev_', invoking 'cb' with
 * 'aux' once per class message.  The dump keeps going after a callback
 * error so that remaining queues are still visited; a later error
 * overwrites an earlier one, and the final value is returned (0 on full
 * success). */
static int
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
                              netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct queue_dump_state state;

        if (!netdev->tc->ops->class_dump_stats) {
            error = EOPNOTSUPP;
        } else if (!start_queue_dump(netdev_, &state)) {
            error = ENODEV;
        } else {
            struct ofpbuf msg;
            int retval;

            while (nl_dump_next(&state.dump, &msg, &state.buf)) {
                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
                                                           cb, aux);
                if (retval) {
                    error = retval;
                }
            }

            /* Also report a failure to terminate the netlink dump itself. */
            retval = finish_queue_dump(&state);
            if (retval) {
                error = retval;
            }
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3185
3186 static int
3187 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
3188 struct in_addr netmask)
3189 {
3190 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3191 int error;
3192
3193 ovs_mutex_lock(&netdev->mutex);
3194 if (netdev_linux_netnsid_is_remote(netdev)) {
3195 error = EOPNOTSUPP;
3196 goto exit;
3197 }
3198
3199 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
3200 if (!error) {
3201 if (address.s_addr != INADDR_ANY) {
3202 error = do_set_addr(netdev_, SIOCSIFNETMASK,
3203 "SIOCSIFNETMASK", netmask);
3204 }
3205 }
3206
3207 exit:
3208 ovs_mutex_unlock(&netdev->mutex);
3209 return error;
3210 }
3211
/* Retrieves the addresses assigned to 'netdev_' via netdev_get_addrs(),
 * storing the address and netmask arrays in '*addr' and '*mask' and their
 * count in '*n_cnt'.  Returns 0 on success or a positive errno value
 * (EOPNOTSUPP for devices in a remote network namespace).
 *
 * NOTE(review): the previous comment here described an old single-IPv6
 * interface ('*in6'/in6addr_any) that this function no longer has; for
 * array ownership semantics see netdev_get_addrs(). */
static int
netdev_linux_get_addr_list(const struct netdev *netdev_,
                          struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3234
/* Fills '*sa' with an AF_INET socket address carrying 'addr' and port 0.
 * Any bytes of '*sa' beyond the sockaddr_in payload are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
3247
/* Performs address-setting ioctl 'ioctl_nr' (whose name, for error
 * reporting, is 'ioctl_name') on 'netdev' with 'addr' as the argument.
 * Returns 0 on success or a positive errno value.
 *
 * NOTE(review): 'ifr' is only partially initialized here; presumably
 * af_inet_ifreq_ioctl() fills in ifr_name from the netdev name — confirm
 * against its definition. */
static int
do_set_addr(struct netdev *netdev,
            int ioctl_nr, const char *ioctl_name, struct in_addr addr)
{
    struct ifreq ifr;

    make_in4_sockaddr(&ifr.ifr_addr, addr);
    return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
                               ioctl_name);
}
3258
/* Adds 'router' as a default IP gateway (destination and mask INADDR_ANY)
 * via the SIOCADDRT ioctl.  Returns 0 on success or a positive errno value,
 * which is also logged.  The 'netdev' argument is unused: the route is
 * global, not bound to a device here. */
static int
netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
{
    struct in_addr any = { INADDR_ANY };
    struct rtentry rt;
    int error;

    memset(&rt, 0, sizeof rt);
    make_in4_sockaddr(&rt.rt_dst, any);
    make_in4_sockaddr(&rt.rt_gateway, router);
    make_in4_sockaddr(&rt.rt_genmask, any);
    rt.rt_flags = RTF_UP | RTF_GATEWAY;
    error = af_inet_ioctl(SIOCADDRT, &rt);
    if (error) {
        VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
    }
    return error;
}
3278
/* Looks up the IPv4 next hop for 'host' by scanning /proc/net/route.  On
 * success, stores the gateway address in '*next_hop' (0 if the host is
 * directly reachable), stores a malloc'd copy of the output interface name
 * in '*netdev_name' (caller frees), and returns 0.  Returns a positive
 * errno value on failure, in particular ENXIO when no route matches. */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        /* Line 1 is the column-header line; routes start at line 2. */
        if (++ln >= 2) {
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                        fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* The output of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need any endian conversions
             * here. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    fclose(stream);
    return ENXIO;
}
3338
/* get_status implementation for Linux devices: reports driver name,
 * driver version, and firmware version from the ETHTOOL_GDRVINFO ioctl.
 * The result is cached in 'netdev->drvinfo' (VALID_DRVINFO bit) so the
 * ioctl runs at most once per device. */
static int
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* netdev_linux_do_ethtool() takes an ethtool_cmd; 'drvinfo' is
         * passed through that interface via this cast. */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        cmd,
                                        ETHTOOL_GDRVINFO,
                                        "ETHTOOL_GDRVINFO");
        if (!error) {
            netdev->cache_valid |= VALID_DRVINFO;
        }
    }

    if (!error) {
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
        smap_add(smap, "driver_version", netdev->drvinfo.version);
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
3369
/* get_status implementation for internal devices: there is no physical
 * driver to query, so report only the "openvswitch" pseudo-driver name. */
static int
netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
                           struct smap *smap)
{
    smap_add(smap, "driver_name", "openvswitch");
    return 0;
}
3377
/* Returns the tc block id to use for 'netdev_': its ifindex if the device
 * is a LAG master, otherwise 0 (meaning "no shared block"). */
static uint32_t
netdev_linux_get_block_id(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    uint32_t block_id = 0;

    ovs_mutex_lock(&netdev->mutex);
    /* Ensure the linux netdev has had its fields populated. */
    if (!(netdev->cache_valid & VALID_IFINDEX)) {
        netdev_linux_update_via_netlink(netdev);
    }

    /* Only assigning block ids to linux netdevs that are LAG masters. */
    if (netdev->is_lag_master) {
        block_id = netdev->ifindex;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return block_id;
}
3398
/* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
 * returns 0.  Otherwise, it returns a positive errno value; in particular,
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
static int
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, struct eth_addr *mac)
{
    struct arpreq r;
    struct sockaddr_in sin;
    int retval;

    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    sin.sin_port = 0;
    memcpy(&r.arp_pa, &sin, sizeof sin);
    r.arp_ha.sa_family = ARPHRD_ETHER;
    r.arp_flags = 0;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
    if (!retval) {
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO ("no such entry") is an expected outcome; only log other
         * failures. */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
    }
    return retval;
}
3431
3432 static unsigned int
3433 nd_to_iff_flags(enum netdev_flags nd)
3434 {
3435 unsigned int iff = 0;
3436 if (nd & NETDEV_UP) {
3437 iff |= IFF_UP;
3438 }
3439 if (nd & NETDEV_PROMISC) {
3440 iff |= IFF_PROMISC;
3441 }
3442 if (nd & NETDEV_LOOPBACK) {
3443 iff |= IFF_LOOPBACK;
3444 }
3445 return iff;
3446 }
3447
3448 static int
3449 iff_to_nd_flags(unsigned int iff)
3450 {
3451 enum netdev_flags nd = 0;
3452 if (iff & IFF_UP) {
3453 nd |= NETDEV_UP;
3454 }
3455 if (iff & IFF_PROMISC) {
3456 nd |= NETDEV_PROMISC;
3457 }
3458 if (iff & IFF_LOOPBACK) {
3459 nd |= NETDEV_LOOPBACK;
3460 }
3461 return nd;
3462 }
3463
/* Clears flags 'off' and sets flags 'on' on 'netdev', storing the previous
 * flags in '*old_flagsp'.  The ioctl is skipped when no bits would change.
 * Returns 0 on success or the positive errno value from set_flags(). */
static int
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
{
    unsigned int old_flags, new_flags;
    int error = 0;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* Re-read the kernel's view of the flags into the cache even if
         * set_flags() failed; the get_flags() result is ignored here
         * (best-effort refresh). */
        get_flags(&netdev->up, &netdev->ifi_flags);
    }

    return error;
}
3482
/* update_flags implementation for Linux devices.  With 'on' or 'off'
 * nonzero, applies the change; otherwise only reads the current flags into
 * '*old_flagsp'. */
static int
netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
                          enum netdev_flags on, enum netdev_flags *old_flagsp)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (on || off) {
        /* Changing flags over netlink isn't supported yet. */
        if (netdev_linux_netnsid_is_remote(netdev)) {
            error = EOPNOTSUPP;
            goto exit;
        }
        error = update_flags(netdev, off, on, old_flagsp);
    } else {
        /* Try reading flags over netlink, or fall back to ioctl. */
        if (!netdev_linux_update_via_netlink(netdev)) {
            *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
        } else {
            /* 'on' and 'off' are both zero here, so this only reads. */
            error = update_flags(netdev, off, on, old_flagsp);
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3511
/* Netdev ops shared by all Linux-backed netdev classes ("system", "tap",
 * "internal", and the AF_XDP variants).  Each class layers its own
 * construct/destruct, stats, and rx/tx hooks on top of this common core. */
#define NETDEV_LINUX_CLASS_COMMON                               \
    .run = netdev_linux_run,                                    \
    .wait = netdev_linux_wait,                                  \
    .alloc = netdev_linux_alloc,                                \
    .dealloc = netdev_linux_dealloc,                            \
    .send_wait = netdev_linux_send_wait,                        \
    .set_etheraddr = netdev_linux_set_etheraddr,                \
    .get_etheraddr = netdev_linux_get_etheraddr,                \
    .get_mtu = netdev_linux_get_mtu,                            \
    .set_mtu = netdev_linux_set_mtu,                            \
    .get_ifindex = netdev_linux_get_ifindex,                    \
    .get_carrier = netdev_linux_get_carrier,                    \
    .get_carrier_resets = netdev_linux_get_carrier_resets,      \
    .set_miimon_interval = netdev_linux_set_miimon_interval,    \
    .set_advertisements = netdev_linux_set_advertisements,      \
    .set_policing = netdev_linux_set_policing,                  \
    .get_qos_types = netdev_linux_get_qos_types,                \
    .get_qos_capabilities = netdev_linux_get_qos_capabilities,  \
    .get_qos = netdev_linux_get_qos,                            \
    .set_qos = netdev_linux_set_qos,                            \
    .get_queue = netdev_linux_get_queue,                        \
    .set_queue = netdev_linux_set_queue,                        \
    .delete_queue = netdev_linux_delete_queue,                  \
    .get_queue_stats = netdev_linux_get_queue_stats,            \
    .queue_dump_start = netdev_linux_queue_dump_start,          \
    .queue_dump_next = netdev_linux_queue_dump_next,            \
    .queue_dump_done = netdev_linux_queue_dump_done,            \
    .dump_queue_stats = netdev_linux_dump_queue_stats,          \
    .set_in4 = netdev_linux_set_in4,                            \
    .get_addr_list = netdev_linux_get_addr_list,                \
    .add_router = netdev_linux_add_router,                      \
    .get_next_hop = netdev_linux_get_next_hop,                  \
    .arp_lookup = netdev_linux_arp_lookup,                      \
    .update_flags = netdev_linux_update_flags,                  \
    .rxq_alloc = netdev_linux_rxq_alloc,                        \
    .rxq_dealloc = netdev_linux_rxq_dealloc,                    \
    .rxq_wait = netdev_linux_rxq_wait,                          \
    .rxq_drain = netdev_linux_rxq_drain
3550
/* "system" devices: ordinary kernel network interfaces. */
const struct netdev_class netdev_linux_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "system",
    .is_pmd = false,
    .construct = netdev_linux_construct,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_linux_get_stats,
    .get_features = netdev_linux_get_features,
    .get_status = netdev_linux_get_status,
    .get_block_id = netdev_linux_get_block_id,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
3566
/* "tap" devices: userspace TUN/TAP interfaces; only construct and stats
 * collection differ from the "system" class. */
const struct netdev_class netdev_tap_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "tap",
    .is_pmd = false,
    .construct = netdev_linux_construct_tap,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_tap_get_stats,
    .get_features = netdev_linux_get_features,
    .get_status = netdev_linux_get_status,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
3581
/* "internal" devices: OVS-created ports; no physical features or ethtool
 * status to report. */
const struct netdev_class netdev_internal_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "internal",
    .is_pmd = false,
    .construct = netdev_linux_construct,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_internal_get_stats,
    .get_status = netdev_internal_get_status,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
3595
#ifdef HAVE_AF_XDP
/* Ops shared by the PMD and non-PMD AF_XDP classes; datapath hooks come
 * from netdev-afxdp.c, everything else from the common Linux core. */
#define NETDEV_AFXDP_CLASS_COMMON                               \
    .init = netdev_afxdp_init,                                  \
    .construct = netdev_afxdp_construct,                        \
    .destruct = netdev_afxdp_destruct,                          \
    .get_stats = netdev_afxdp_get_stats,                        \
    .get_custom_stats = netdev_afxdp_get_custom_stats,          \
    .get_status = netdev_linux_get_status,                      \
    .set_config = netdev_afxdp_set_config,                      \
    .get_config = netdev_afxdp_get_config,                      \
    .reconfigure = netdev_afxdp_reconfigure,                    \
    .get_numa_id = netdev_linux_get_numa_id,                    \
    .send = netdev_afxdp_batch_send,                            \
    .rxq_construct = netdev_afxdp_rxq_construct,                \
    .rxq_destruct = netdev_afxdp_rxq_destruct,                  \
    .rxq_recv = netdev_afxdp_rxq_recv

/* AF_XDP device polled by a PMD thread. */
const struct netdev_class netdev_afxdp_class = {
    NETDEV_LINUX_CLASS_COMMON,
    NETDEV_AFXDP_CLASS_COMMON,
    .type = "afxdp",
    .is_pmd = true,
};

/* AF_XDP device driven by the main thread (no dedicated PMD). */
const struct netdev_class netdev_afxdp_nonpmd_class = {
    NETDEV_LINUX_CLASS_COMMON,
    NETDEV_AFXDP_CLASS_COMMON,
    .type = "afxdp-nonpmd",
    .is_pmd = false,
};
#endif
3627 \f
3628
/* CoDel qdiscs have no per-class queues. */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET   1
#define TCA_CODEL_LIMIT    2
#define TCA_CODEL_INTERVAL 3

/* Userspace mirror of a codel qdisc configuration; values are passed
 * straight to/from the kernel via TCA_CODEL_* attributes. */
struct codel {
    struct tc tc;
    uint32_t target;        /* TCA_CODEL_TARGET. */
    uint32_t limit;         /* TCA_CODEL_LIMIT. */
    uint32_t interval;      /* TCA_CODEL_INTERVAL. */
};
3645
3646 static struct codel *
3647 codel_get__(const struct netdev *netdev_)
3648 {
3649 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3650 return CONTAINER_OF(netdev->tc, struct codel, tc);
3651 }
3652
3653 static void
3654 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3655 uint32_t interval)
3656 {
3657 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3658 struct codel *codel;
3659
3660 codel = xmalloc(sizeof *codel);
3661 tc_init(&codel->tc, &tc_ops_codel);
3662 codel->target = target;
3663 codel->limit = limit;
3664 codel->interval = interval;
3665
3666 netdev->tc = &codel->tc;
3667 }
3668
/* Replaces the root qdisc on 'netdev' with a codel qdisc configured with
 * 'target', 'limit', and 'interval', substituting defaults (5000, 10240,
 * 100000) for any zero argument.  Returns 0 on success or a positive errno
 * value. */
static int
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                    uint32_t interval)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval;
    int error;

    /* Remove any existing root qdisc before installing ours. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;

    nl_msg_put_string(&request, TCA_KIND, "codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval,
                     error, ovs_strerror(error));
    }
    return error;
}
3710
3711 static void
3712 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3713 const struct smap *details, struct codel *codel)
3714 {
3715 codel->target = smap_get_ullong(details, "target", 0);
3716 codel->limit = smap_get_ullong(details, "limit", 0);
3717 codel->interval = smap_get_ullong(details, "interval", 0);
3718
3719 if (!codel->target) {
3720 codel->target = 5000;
3721 }
3722 if (!codel->limit) {
3723 codel->limit = 10240;
3724 }
3725 if (!codel->interval) {
3726 codel->interval = 100000;
3727 }
3728 }
3729
3730 static int
3731 codel_tc_install(struct netdev *netdev, const struct smap *details)
3732 {
3733 int error;
3734 struct codel codel;
3735
3736 codel_parse_qdisc_details__(netdev, details, &codel);
3737 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3738 codel.interval);
3739 if (!error) {
3740 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3741 }
3742 return error;
3743 }
3744
/* Extracts codel parameters from nested netlink attributes 'nl_options'
 * into 'codel'.  Returns 0 on success, EPROTO if the attributes do not
 * match the expected policy. */
static int
codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
{
    static const struct nl_policy tca_codel_policy[] = {
        [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
        [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
        [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];

    if (!nl_parse_nested(nl_options, tca_codel_policy,
                         attrs, ARRAY_SIZE(tca_codel_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
        return EPROTO;
    }

    codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
    codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
    codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
    return 0;
}
3767
3768 static int
3769 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3770 {
3771 struct nlattr *nlattr;
3772 const char * kind;
3773 int error;
3774 struct codel codel;
3775
3776 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3777 if (error != 0) {
3778 return error;
3779 }
3780
3781 error = codel_parse_tca_options__(nlattr, &codel);
3782 if (error != 0) {
3783 return error;
3784 }
3785
3786 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3787 return 0;
3788 }
3789
3790
3791 static void
3792 codel_tc_destroy(struct tc *tc)
3793 {
3794 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3795 tc_destroy(tc);
3796 free(codel);
3797 }
3798
3799 static int
3800 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3801 {
3802 const struct codel *codel = codel_get__(netdev);
3803 smap_add_format(details, "target", "%u", codel->target);
3804 smap_add_format(details, "limit", "%u", codel->limit);
3805 smap_add_format(details, "interval", "%u", codel->interval);
3806 return 0;
3807 }
3808
3809 static int
3810 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3811 {
3812 struct codel codel;
3813
3814 codel_parse_qdisc_details__(netdev, details, &codel);
3815 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3816 codel_get__(netdev)->target = codel.target;
3817 codel_get__(netdev)->limit = codel.limit;
3818 codel_get__(netdev)->interval = codel.interval;
3819 return 0;
3820 }
3821
/* tc_ops for the classless codel qdisc ("linux-codel" in the database). */
static const struct tc_ops tc_ops_codel = {
    .linux_name = "codel",
    .ovs_name = "linux-codel",
    .n_queues = CODEL_N_QUEUES,
    .tc_install = codel_tc_install,
    .tc_load = codel_tc_load,
    .tc_destroy = codel_tc_destroy,
    .qdisc_get = codel_qdisc_get,
    .qdisc_set = codel_qdisc_set,
};
3832 \f
/* FQ-CoDel traffic control class. */

/* FQ-CoDel qdiscs have no per-class queues. */
#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET   1
#define TCA_FQ_CODEL_LIMIT    2
#define TCA_FQ_CODEL_INTERVAL 3
#define TCA_FQ_CODEL_ECN      4
#define TCA_FQ_CODEL_FLOWS    5
#define TCA_FQ_CODEL_QUANTUM  6

/* Userspace mirror of an fq_codel qdisc configuration; values map 1:1 to
 * the TCA_FQ_CODEL_* netlink attributes. */
struct fqcodel {
    struct tc tc;
    uint32_t target;        /* TCA_FQ_CODEL_TARGET. */
    uint32_t limit;         /* TCA_FQ_CODEL_LIMIT. */
    uint32_t interval;      /* TCA_FQ_CODEL_INTERVAL. */
    uint32_t flows;         /* TCA_FQ_CODEL_FLOWS. */
    uint32_t quantum;       /* TCA_FQ_CODEL_QUANTUM. */
};
3856
3857 static struct fqcodel *
3858 fqcodel_get__(const struct netdev *netdev_)
3859 {
3860 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3861 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3862 }
3863
3864 static void
3865 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3866 uint32_t interval, uint32_t flows, uint32_t quantum)
3867 {
3868 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3869 struct fqcodel *fqcodel;
3870
3871 fqcodel = xmalloc(sizeof *fqcodel);
3872 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3873 fqcodel->target = target;
3874 fqcodel->limit = limit;
3875 fqcodel->interval = interval;
3876 fqcodel->flows = flows;
3877 fqcodel->quantum = quantum;
3878
3879 netdev->tc = &fqcodel->tc;
3880 }
3881
/* Replaces the root qdisc on 'netdev' with an fq_codel qdisc, substituting
 * defaults (target 5000, limit 10240, interval 100000, flows 1024, quantum
 * 1514) for any zero argument.  Returns 0 on success or a positive errno
 * value.
 *
 * NOTE(review): the interval fallback here (100000) differs from the
 * 1000000 default applied by fqcodel_parse_qdisc_details__(); since callers
 * parse details first, this fallback is normally dead code — confirm which
 * default is actually intended. */
static int
fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                      uint32_t interval, uint32_t flows, uint32_t quantum)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval, oflows, oquantum;
    int error;

    /* Remove any existing root qdisc before installing ours. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;
    oflows = flows ? flows : 1024;
    oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
                                            not mtu */

    nl_msg_put_string(&request, TCA_KIND, "fq_codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval, oflows, oquantum,
                     error, ovs_strerror(error));
    }
    return error;
}
3928
/* Parses fq_codel parameters from 'details' into 'fqcodel', applying
 * defaults when a key is absent or explicitly zero.
 *
 * NOTE(review): the interval default here (1000000) disagrees with the
 * 100000 fallback in fqcodel_setup_qdisc__() and with the kernel's 100 ms
 * fq_codel default; this value wins in practice because callers parse
 * before setup — confirm whether 1000000 is intentional. */
static void
fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
                              const struct smap *details, struct fqcodel *fqcodel)
{
    fqcodel->target = smap_get_ullong(details, "target", 0);
    fqcodel->limit = smap_get_ullong(details, "limit", 0);
    fqcodel->interval = smap_get_ullong(details, "interval", 0);
    fqcodel->flows = smap_get_ullong(details, "flows", 0);
    fqcodel->quantum = smap_get_ullong(details, "quantum", 0);

    if (!fqcodel->target) {
        fqcodel->target = 5000;
    }
    if (!fqcodel->limit) {
        fqcodel->limit = 10240;
    }
    if (!fqcodel->interval) {
        fqcodel->interval = 1000000;
    }
    if (!fqcodel->flows) {
        fqcodel->flows = 1024;
    }
    if (!fqcodel->quantum) {
        fqcodel->quantum = 1514;
    }
}
3955
3956 static int
3957 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3958 {
3959 int error;
3960 struct fqcodel fqcodel;
3961
3962 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3963 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3964 fqcodel.interval, fqcodel.flows,
3965 fqcodel.quantum);
3966 if (!error) {
3967 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3968 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3969 }
3970 return error;
3971 }
3972
3973 static int
3974 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3975 {
3976 static const struct nl_policy tca_fqcodel_policy[] = {
3977 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3978 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3979 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3980 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3981 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3982 };
3983
3984 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3985
3986 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3987 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3988 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3989 return EPROTO;
3990 }
3991
3992 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3993 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3994 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3995 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3996 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3997 return 0;
3998 }
3999
4000 static int
4001 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4002 {
4003 struct nlattr *nlattr;
4004 const char * kind;
4005 int error;
4006 struct fqcodel fqcodel;
4007
4008 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4009 if (error != 0) {
4010 return error;
4011 }
4012
4013 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
4014 if (error != 0) {
4015 return error;
4016 }
4017
4018 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
4019 fqcodel.flows, fqcodel.quantum);
4020 return 0;
4021 }
4022
4023 static void
4024 fqcodel_tc_destroy(struct tc *tc)
4025 {
4026 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
4027 tc_destroy(tc);
4028 free(fqcodel);
4029 }
4030
4031 static int
4032 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
4033 {
4034 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
4035 smap_add_format(details, "target", "%u", fqcodel->target);
4036 smap_add_format(details, "limit", "%u", fqcodel->limit);
4037 smap_add_format(details, "interval", "%u", fqcodel->interval);
4038 smap_add_format(details, "flows", "%u", fqcodel->flows);
4039 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
4040 return 0;
4041 }
4042
4043 static int
4044 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
4045 {
4046 struct fqcodel fqcodel;
4047
4048 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
4049 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
4050 fqcodel.flows, fqcodel.quantum);
4051 fqcodel_get__(netdev)->target = fqcodel.target;
4052 fqcodel_get__(netdev)->limit = fqcodel.limit;
4053 fqcodel_get__(netdev)->interval = fqcodel.interval;
4054 fqcodel_get__(netdev)->flows = fqcodel.flows;
4055 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
4056 return 0;
4057 }
4058
/* tc_ops for the classless fq_codel qdisc ("linux-fq_codel" in the
 * database). */
static const struct tc_ops tc_ops_fqcodel = {
    .linux_name = "fq_codel",
    .ovs_name = "linux-fq_codel",
    .n_queues = FQCODEL_N_QUEUES,
    .tc_install = fqcodel_tc_install,
    .tc_load = fqcodel_tc_load,
    .tc_destroy = fqcodel_tc_destroy,
    .qdisc_get = fqcodel_qdisc_get,
    .qdisc_set = fqcodel_qdisc_set,
};
4069 \f
/* SFQ traffic control class. */

/* SFQ qdiscs have no per-class queues. */
#define SFQ_N_QUEUES 0x0000

/* Userspace mirror of an SFQ qdisc configuration; values map to
 * struct tc_sfq_qopt fields. */
struct sfq {
    struct tc tc;
    uint32_t quantum;       /* tc_sfq_qopt.quantum. */
    uint32_t perturb;       /* tc_sfq_qopt.perturb_period. */
};
4079
4080 static struct sfq *
4081 sfq_get__(const struct netdev *netdev_)
4082 {
4083 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4084 return CONTAINER_OF(netdev->tc, struct sfq, tc);
4085 }
4086
4087 static void
4088 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
4089 {
4090 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4091 struct sfq *sfq;
4092
4093 sfq = xmalloc(sizeof *sfq);
4094 tc_init(&sfq->tc, &tc_ops_sfq);
4095 sfq->perturb = perturb;
4096 sfq->quantum = quantum;
4097
4098 netdev->tc = &sfq->tc;
4099 }
4100
/* Replaces the root qdisc on 'netdev' with an SFQ qdisc.  A zero 'quantum'
 * falls back to the device MTU or, if the MTU cannot be read, to 0 (letting
 * the kernel choose its default); a zero 'perturb' falls back to 10.
 * Returns 0 on success or a positive errno value. */
static int
sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
{
    struct tc_sfq_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int mtu;
    int mtu_error, error;
    mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);

    /* Remove any existing root qdisc before installing ours. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    if (!quantum) {
        if (!mtu_error) {
            opt.quantum = mtu; /* if we cannot find mtu, use default */
        }
    } else {
        opt.quantum = quantum;
    }

    if (!perturb) {
        opt.perturb_period = 10;
    } else {
        opt.perturb_period = perturb;
    }

    nl_msg_put_string(&request, TCA_KIND, "sfq");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "quantum %u, perturb %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.quantum, opt.perturb_period,
                     error, ovs_strerror(error));
    }
    return error;
}
4149
4150 static void
4151 sfq_parse_qdisc_details__(struct netdev *netdev,
4152 const struct smap *details, struct sfq *sfq)
4153 {
4154 sfq->perturb = smap_get_ullong(details, "perturb", 0);
4155 sfq->quantum = smap_get_ullong(details, "quantum", 0);
4156
4157 if (!sfq->perturb) {
4158 sfq->perturb = 10;
4159 }
4160
4161 if (!sfq->quantum) {
4162 int mtu;
4163 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
4164 sfq->quantum = mtu;
4165 } else {
4166 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
4167 "device without mtu");
4168 }
4169 }
4170 }
4171
4172 static int
4173 sfq_tc_install(struct netdev *netdev, const struct smap *details)
4174 {
4175 int error;
4176 struct sfq sfq;
4177
4178 sfq_parse_qdisc_details__(netdev, details, &sfq);
4179 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
4180 if (!error) {
4181 sfq_install__(netdev, sfq.quantum, sfq.perturb);
4182 }
4183 return error;
4184 }
4185
4186 static int
4187 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4188 {
4189 const struct tc_sfq_qopt *sfq;
4190 struct nlattr *nlattr;
4191 const char * kind;
4192 int error;
4193
4194 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4195 if (error == 0) {
4196 sfq = nl_attr_get(nlattr);
4197 sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
4198 return 0;
4199 }
4200
4201 return error;
4202 }
4203
4204 static void
4205 sfq_tc_destroy(struct tc *tc)
4206 {
4207 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
4208 tc_destroy(tc);
4209 free(sfq);
4210 }
4211
4212 static int
4213 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
4214 {
4215 const struct sfq *sfq = sfq_get__(netdev);
4216 smap_add_format(details, "quantum", "%u", sfq->quantum);
4217 smap_add_format(details, "perturb", "%u", sfq->perturb);
4218 return 0;
4219 }
4220
/* Updates the locally cached sfq configuration of 'netdev' from 'details'.
 *
 * NOTE(review): sfq_install__() allocates a fresh record and repoints
 * netdev->tc without freeing the previous one, so the old record appears to
 * leak here; also, unlike sfq_tc_install(), the kernel qdisc itself is not
 * re-programmed.  Both look intentional-or-historical — confirm upstream
 * before changing.  The trailing assignments restate values install already
 * stored. */
static int
sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
{
    struct sfq sfq;

    sfq_parse_qdisc_details__(netdev, details, &sfq);
    sfq_install__(netdev, sfq.quantum, sfq.perturb);
    sfq_get__(netdev)->quantum = sfq.quantum;
    sfq_get__(netdev)->perturb = sfq.perturb;
    return 0;
}
4232
/* Hooks wiring the generic tc framework to the "linux-sfq" implementation
 * above.  SFQ offers no configurable classes, so no class_* callbacks are
 * supplied. */
static const struct tc_ops tc_ops_sfq = {
    .linux_name = "sfq",            /* Kernel qdisc name. */
    .ovs_name = "linux-sfq",        /* QoS type in the OVS database. */
    .n_queues = SFQ_N_QUEUES,
    .tc_install = sfq_tc_install,
    .tc_load = sfq_tc_load,
    .tc_destroy = sfq_tc_destroy,
    .qdisc_get = sfq_qdisc_get,
    .qdisc_set = sfq_qdisc_set,
};
4243 \f
4244 /* netem traffic control class. */
4245
/* In-memory state for a "linux-netem" qdisc installed on a netdev. */
struct netem {
    struct tc tc;       /* Generic traffic-control base record. */
    uint32_t latency;   /* Added delay; converted to ticks when installed. */
    uint32_t limit;     /* Queue limit (0 = default of 1000). */
    uint32_t loss;      /* Drop probability as a percentage, 0-100. */
};
4252
4253 static struct netem *
4254 netem_get__(const struct netdev *netdev_)
4255 {
4256 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4257 return CONTAINER_OF(netdev->tc, struct netem, tc);
4258 }
4259
4260 static void
4261 netem_install__(struct netdev *netdev_, uint32_t latency,
4262 uint32_t limit, uint32_t loss)
4263 {
4264 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4265 struct netem *netem;
4266
4267 netem = xmalloc(sizeof *netem);
4268 tc_init(&netem->tc, &tc_ops_netem);
4269 netem->latency = latency;
4270 netem->limit = limit;
4271 netem->loss = loss;
4272
4273 netdev->tc = &netem->tc;
4274 }
4275
/* Installs a root "netem" qdisc on 'netdev', replacing any existing root
 * qdisc.
 *
 * 'latency' is passed through tc_time_to_ticks(); 'limit' of 0 becomes 1000;
 * 'loss' is a percentage (values above 100 are rejected with EINVAL) scaled
 * onto the kernel's 0..UINT32_MAX probability range.  Returns 0 on success,
 * otherwise a positive errno value. */
static int
netem_setup_qdisc__(struct netdev *netdev, uint32_t latency,
                    uint32_t limit, uint32_t loss)
{
    struct tc_netem_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);

    if (!limit) {
        opt.limit = 1000;
    } else {
        opt.limit = limit;
    }

    if (loss) {
        if (loss > 100) {
            VLOG_WARN_RL(&rl,
                         "loss should be a percentage value between 0 to 100, "
                         "loss was %u", loss);
            return EINVAL;
        }
        /* Map percent onto the full 32-bit probability space. */
        opt.loss = floor(UINT32_MAX * (loss / 100.0));
    }

    opt.latency = tc_time_to_ticks(latency);

    nl_msg_put_string(&request, TCA_KIND, "netem");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "latency %u, limit %u, loss %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.latency, opt.limit, opt.loss,
                     error, ovs_strerror(error));
    }
    return error;
}
4328
4329 static void
4330 netem_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
4331 const struct smap *details, struct netem *netem)
4332 {
4333 netem->latency = smap_get_ullong(details, "latency", 0);
4334 netem->limit = smap_get_ullong(details, "limit", 0);
4335 netem->loss = smap_get_ullong(details, "loss", 0);
4336
4337 if (!netem->limit) {
4338 netem->limit = 1000;
4339 }
4340 }
4341
4342 static int
4343 netem_tc_install(struct netdev *netdev, const struct smap *details)
4344 {
4345 int error;
4346 struct netem netem;
4347
4348 netem_parse_qdisc_details__(netdev, details, &netem);
4349 error = netem_setup_qdisc__(netdev, netem.latency,
4350 netem.limit, netem.loss);
4351 if (!error) {
4352 netem_install__(netdev, netem.latency, netem.limit, netem.loss);
4353 }
4354 return error;
4355 }
4356
4357 static int
4358 netem_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4359 {
4360 const struct tc_netem_qopt *netem;
4361 struct nlattr *nlattr;
4362 const char *kind;
4363 int error;
4364
4365 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4366 if (error == 0) {
4367 netem = nl_attr_get(nlattr);
4368 netem_install__(netdev, netem->latency, netem->limit, netem->loss);
4369 return 0;
4370 }
4371
4372 return error;
4373 }
4374
4375 static void
4376 netem_tc_destroy(struct tc *tc)
4377 {
4378 struct netem *netem = CONTAINER_OF(tc, struct netem, tc);
4379 tc_destroy(tc);
4380 free(netem);
4381 }
4382
4383 static int
4384 netem_qdisc_get(const struct netdev *netdev, struct smap *details)
4385 {
4386 const struct netem *netem = netem_get__(netdev);
4387 smap_add_format(details, "latency", "%u", netem->latency);
4388 smap_add_format(details, "limit", "%u", netem->limit);
4389 smap_add_format(details, "loss", "%u", netem->loss);
4390 return 0;
4391 }
4392
/* Updates the locally cached netem configuration of 'netdev' from 'details'.
 *
 * NOTE(review): mirrors sfq_qdisc_set() — netem_install__() repoints
 * netdev->tc at a freshly allocated record without freeing the old one
 * (apparent leak), the kernel qdisc is not re-programmed, and the trailing
 * assignments restate values install already stored.  Confirm intent before
 * changing. */
static int
netem_qdisc_set(struct netdev *netdev, const struct smap *details)
{
    struct netem netem;

    netem_parse_qdisc_details__(netdev, details, &netem);
    netem_install__(netdev, netem.latency, netem.limit, netem.loss);
    netem_get__(netdev)->latency = netem.latency;
    netem_get__(netdev)->limit = netem.limit;
    netem_get__(netdev)->loss = netem.loss;
    return 0;
}
4405
/* Hooks wiring the generic tc framework to the "linux-netem" implementation
 * above.  netem offers no configurable classes, so no class_* callbacks are
 * supplied. */
static const struct tc_ops tc_ops_netem = {
    .linux_name = "netem",          /* Kernel qdisc name. */
    .ovs_name = "linux-netem",      /* QoS type in the OVS database. */
    .n_queues = 0,
    .tc_install = netem_tc_install,
    .tc_load = netem_tc_load,
    .tc_destroy = netem_tc_destroy,
    .qdisc_get = netem_qdisc_get,
    .qdisc_set = netem_qdisc_set,
};
4416 \f
4417 /* HTB traffic control class. */
4418
/* Number of queue (class) slots the HTB implementation exposes. */
#define HTB_N_QUEUES 0xf000
/* Kernel r2q divisor: quantum = rate / HTB_RATE2QUANTUM unless overridden. */
#define HTB_RATE2QUANTUM 10

/* In-memory state for a "linux-htb" qdisc installed on a netdev. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};

/* One HTB class, i.e. one OVS queue. */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
4434
4435 static struct htb *
4436 htb_get__(const struct netdev *netdev_)
4437 {
4438 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4439 return CONTAINER_OF(netdev->tc, struct htb, tc);
4440 }
4441
4442 static void
4443 htb_install__(struct netdev *netdev_, uint64_t max_rate)
4444 {
4445 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4446 struct htb *htb;
4447
4448 htb = xmalloc(sizeof *htb);
4449 tc_init(&htb->tc, &tc_ops_htb);
4450 htb->max_rate = max_rate;
4451
4452 netdev->tc = &htb->tc;
4453 }
4454
/* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 * Returns 0 on success, otherwise a positive errno value. */
static int
htb_setup_qdisc__(struct netdev *netdev)
{
    size_t opt_offset;
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    /* Unclassified traffic falls into class 1:1 ("default 1"). */
    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = HTB_RATE2QUANTUM;
    opt.version = 3;
    opt.defcls = 1;

    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    return tc_transact(&request, NULL);
}
4489
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * Requires a readable device MTU (rate tables and the quantum depend on it).
 * Returns 0 on success, otherwise a positive errno value. */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
        opt.quantum = mtu;
    }
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    /* Rate tables translate packet sizes to transmission times. */
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
4549
/* Parses Netlink attributes in 'nl_options' for HTB parameters and stores
 * them into 'class'.  The values correspond to the linux-htb queue details
 * given in the vswitch database documentation.
 *
 * Returns 0 on success, EPROTO if the attributes cannot be parsed. */
static int
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
{
    static const struct nl_policy tca_htb_policy[] = {
        [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
                            .min_len = sizeof(struct tc_htb_opt) },
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
    const struct tc_htb_opt *htb;

    if (!nl_parse_nested(nl_options, tca_htb_policy,
                         attrs, ARRAY_SIZE(tca_htb_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HTB class options");
        return EPROTO;
    }

    htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
    class->min_rate = htb->rate.rate;
    class->max_rate = htb->ceil.rate;
    /* Recover the burst in bytes from the kernel's buffer (in ticks). */
    class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
    class->priority = htb->prio;
    return 0;
}
4578
/* Parses an HTB class message in 'tcmsg'.  Any of 'queue_id', 'options' and
 * 'stats' may be NULL to skip that output.  Queue IDs map from class minor
 * numbers under major 1 (queue N is class 1:N+1).
 *
 * Returns 0 on success, otherwise a positive errno value. */
static int
htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                  struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct nlattr *nl_options;
    unsigned int handle;
    int error;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (!error && queue_id) {
        unsigned int major = tc_get_major(handle);
        unsigned int minor = tc_get_minor(handle);
        if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
            *queue_id = minor - 1;
        } else {
            /* Not one of our classes. */
            error = EPROTO;
        }
    }
    if (!error && options) {
        error = htb_parse_tca_options__(nl_options, options);
    }
    return error;
}
4603
/* Fills 'hc' with root-class parameters taken from 'details'.  "max-rate" is
 * given in bits/s and stored in bytes/s; when absent it falls back to the
 * device's advertised link speed (defaulting to 100 Mbps when that is
 * unknown).  min_rate is pinned to max_rate for the root class. */
static void
htb_parse_qdisc_details__(struct netdev *netdev_,
                          const struct smap *details, struct htb_class *hc)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
    if (!hc->max_rate) {
        enum netdev_features current;

        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }
    hc->min_rate = hc->max_rate;
    hc->burst = 0;
    hc->priority = 0;
}
4622
/* Fills 'hc' with per-class parameters from 'details', clamping them into
 * ranges HTB can act on.  All *-rate keys are in bits/s; stored values are
 * bytes/s.  Requires a readable device MTU.
 *
 * Returns 0 on success, otherwise a positive errno value. */
static int
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
{
    const struct htb *htb = htb_get__(netdev);
    int mtu, error;
    unsigned long long int max_rate_bit;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate */
    max_rate_bit = smap_get_ullong(details, "max-rate", 0);
    hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = smap_get_ullong(details, "burst", 0) / 8;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority */
    hc->priority = smap_get_ullong(details, "priority", 0);

    return 0;
}
4667
4668 static int
4669 htb_query_class__(const struct netdev *netdev, unsigned int handle,
4670 unsigned int parent, struct htb_class *options,
4671 struct netdev_queue_stats *stats)
4672 {
4673 struct ofpbuf *reply;
4674 int error;
4675
4676 error = tc_query_class(netdev, handle, parent, &reply);
4677 if (!error) {
4678 error = htb_parse_tcmsg__(reply, NULL, options, stats);
4679 ofpbuf_delete(reply);
4680 }
4681 return error;
4682 }
4683
4684 static int
4685 htb_tc_install(struct netdev *netdev, const struct smap *details)
4686 {
4687 int error;
4688
4689 error = htb_setup_qdisc__(netdev);
4690 if (!error) {
4691 struct htb_class hc;
4692
4693 htb_parse_qdisc_details__(netdev, details, &hc);
4694 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4695 tc_make_handle(1, 0), &hc);
4696 if (!error) {
4697 htb_install__(netdev, hc.max_rate);
4698 }
4699 }
4700 return error;
4701 }
4702
4703 static struct htb_class *
4704 htb_class_cast__(const struct tc_queue *queue)
4705 {
4706 return CONTAINER_OF(queue, struct htb_class, tc_queue);
4707 }
4708
/* Creates or updates the locally cached record for queue 'queue_id' of
 * 'netdev', copying the parameters from 'hc'. */
static void
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
                   const struct htb_class *hc)
{
    struct htb *htb = htb_get__(netdev);
    size_t hash = hash_int(queue_id, 0);
    struct tc_queue *queue;
    struct htb_class *hcp;

    queue = tc_find_queue__(netdev, queue_id, hash);
    if (queue) {
        hcp = htb_class_cast__(queue);
    } else {
        /* First sighting of this queue: allocate and index it. */
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
    }

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
    hcp->burst = hc->burst;
    hcp->priority = hc->priority;
}
4734
/* Rebuilds local HTB state from the kernel: reads the default class for the
 * top rate, then dumps all classes to populate the queue map.
 *
 * Returns 0 on success, ENODEV if the class dump cannot be started. */
static int
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct htb_class hc;

    /* Get qdisc options. */
    hc.max_rate = 0;    /* In case the query below fails. */
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    htb_install__(netdev, hc.max_rate);

    /* Get queues. */
    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Unparseable classes are silently skipped. */
        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            htb_update_queue__(netdev, queue_id, &hc);
        }
    }
    finish_queue_dump(&state);

    return 0;
}
4762
4763 static void
4764 htb_tc_destroy(struct tc *tc)
4765 {
4766 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4767 struct htb_class *hc;
4768
4769 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
4770 free(hc);
4771 }
4772 tc_destroy(tc);
4773 free(htb);
4774 }
4775
4776 static int
4777 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
4778 {
4779 const struct htb *htb = htb_get__(netdev);
4780 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
4781 return 0;
4782 }
4783
4784 static int
4785 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
4786 {
4787 struct htb_class hc;
4788 int error;
4789
4790 htb_parse_qdisc_details__(netdev, details, &hc);
4791 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4792 tc_make_handle(1, 0), &hc);
4793 if (!error) {
4794 htb_get__(netdev)->max_rate = hc.max_rate;
4795 }
4796 return error;
4797 }
4798
4799 static int
4800 htb_class_get(const struct netdev *netdev OVS_UNUSED,
4801 const struct tc_queue *queue, struct smap *details)
4802 {
4803 const struct htb_class *hc = htb_class_cast__(queue);
4804
4805 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4806 if (hc->min_rate != hc->max_rate) {
4807 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4808 }
4809 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
4810 if (hc->priority) {
4811 smap_add_format(details, "priority", "%u", hc->priority);
4812 }
4813 return 0;
4814 }
4815
4816 static int
4817 htb_class_set(struct netdev *netdev, unsigned int queue_id,
4818 const struct smap *details)
4819 {
4820 struct htb_class hc;
4821 int error;
4822
4823 error = htb_parse_class_details__(netdev, details, &hc);
4824 if (error) {
4825 return error;
4826 }
4827
4828 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4829 tc_make_handle(1, 0xfffe), &hc);
4830 if (error) {
4831 return error;
4832 }
4833
4834 htb_update_queue__(netdev, queue_id, &hc);
4835 return 0;
4836 }
4837
4838 static int
4839 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
4840 {
4841 struct htb_class *hc = htb_class_cast__(queue);
4842 struct htb *htb = htb_get__(netdev);
4843 int error;
4844
4845 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4846 if (!error) {
4847 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
4848 free(hc);
4849 }
4850 return error;
4851 }
4852
4853 static int
4854 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4855 struct netdev_queue_stats *stats)
4856 {
4857 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4858 tc_make_handle(1, 0xfffe), NULL, stats);
4859 }
4860
/* Parses one class message from a dump and, when it belongs to one of our
 * queues (major 1, minor within range), reports its stats through 'cb'. */
static int
htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
                     const struct ofpbuf *nlmsg,
                     netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_queue_stats stats;
    unsigned int handle, major, minor;
    int error;

    error = tc_parse_class(nlmsg, &handle, NULL, &stats);
    if (error) {
        return error;
    }

    major = tc_get_major(handle);
    minor = tc_get_minor(handle);
    if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
        /* Class 1:N corresponds to queue N-1. */
        (*cb)(minor - 1, &stats, aux);
    }
    return 0;
}
4882
/* Hooks wiring the generic tc framework to the "linux-htb" implementation
 * above, including the per-queue class callbacks. */
static const struct tc_ops tc_ops_htb = {
    .linux_name = "htb",            /* Kernel qdisc name. */
    .ovs_name = "linux-htb",        /* QoS type in the OVS database. */
    .n_queues = HTB_N_QUEUES,
    .tc_install = htb_tc_install,
    .tc_load = htb_tc_load,
    .tc_destroy = htb_tc_destroy,
    .qdisc_get = htb_qdisc_get,
    .qdisc_set = htb_qdisc_set,
    .class_get = htb_class_get,
    .class_set = htb_class_set,
    .class_delete = htb_class_delete,
    .class_get_stats = htb_class_get_stats,
    .class_dump_stats = htb_class_dump_stats
};
4898 \f
4899 /* "linux-hfsc" traffic control class. */
4900
/* Number of queue (class) slots the HFSC implementation exposes. */
#define HFSC_N_QUEUES 0xf000

/* In-memory state for a "linux-hfsc" qdisc installed on a netdev. */
struct hfsc {
    struct tc tc;
    uint32_t max_rate;      /* In bytes/s. */
};

/* One HFSC class, i.e. one OVS queue. */
struct hfsc_class {
    struct tc_queue tc_queue;
    uint32_t min_rate;      /* In bytes/s. */
    uint32_t max_rate;      /* In bytes/s. */
};
4913
4914 static struct hfsc *
4915 hfsc_get__(const struct netdev *netdev_)
4916 {
4917 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4918 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
4919 }
4920
4921 static struct hfsc_class *
4922 hfsc_class_cast__(const struct tc_queue *queue)
4923 {
4924 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4925 }
4926
4927 static void
4928 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4929 {
4930 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4931 struct hfsc *hfsc;
4932
4933 hfsc = xmalloc(sizeof *hfsc);
4934 tc_init(&hfsc->tc, &tc_ops_hfsc);
4935 hfsc->max_rate = max_rate;
4936 netdev->tc = &hfsc->tc;
4937 }
4938
/* Creates or updates the locally cached record for queue 'queue_id' of
 * 'netdev', copying the rates from 'hc'. */
static void
hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
                    const struct hfsc_class *hc)
{
    size_t hash;
    struct hfsc *hfsc;
    struct hfsc_class *hcp;
    struct tc_queue *queue;

    hfsc = hfsc_get__(netdev);
    hash = hash_int(queue_id, 0);

    queue = tc_find_queue__(netdev, queue_id, hash);
    if (queue) {
        hcp = hfsc_class_cast__(queue);
    } else {
        /* First sighting of this queue: allocate and index it. */
        hcp             = xmalloc(sizeof *hcp);
        queue           = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created  = time_msec();
        hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
    }

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
}
4965
/* Parses Netlink attributes in 'nl_options' for HFSC class parameters and
 * stores them into 'class'.  Only the restricted shape OVS itself installs
 * is accepted: linear service curves (m1 == d == 0) where the real-time and
 * link-share curves agree and do not exceed the upper-limit curve.
 *
 * Returns 0 on success, EPROTO otherwise. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    /* A nonzero m1 or d means a two-slope (non-linear) curve. */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
5024
/* Parses an HFSC class message in 'tcmsg'.  Any of 'queue_id', 'options' and
 * 'stats' may be NULL to skip that output.  Queue IDs map from class minor
 * numbers under major 1 (queue N is class 1:N+1).
 *
 * Returns 0 on success, otherwise a positive errno value. */
static int
hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                   struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    int error;
    unsigned int handle;
    struct nlattr *nl_options;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (error) {
        return error;
    }

    if (queue_id) {
        unsigned int major, minor;

        major = tc_get_major(handle);
        minor = tc_get_minor(handle);
        if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
            *queue_id = minor - 1;
        } else {
            /* Not one of our classes. */
            return EPROTO;
        }
    }

    if (options) {
        error = hfsc_parse_tca_options__(nl_options, options);
    }

    return error;
}
5057
5058 static int
5059 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
5060 unsigned int parent, struct hfsc_class *options,
5061 struct netdev_queue_stats *stats)
5062 {
5063 int error;
5064 struct ofpbuf *reply;
5065
5066 error = tc_query_class(netdev, handle, parent, &reply);
5067 if (error) {
5068 return error;
5069 }
5070
5071 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
5072 ofpbuf_delete(reply);
5073 return error;
5074 }
5075
5076 static void
5077 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
5078 struct hfsc_class *class)
5079 {
5080 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5081
5082 uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
5083 if (!max_rate) {
5084 enum netdev_features current;
5085
5086 netdev_linux_read_features(netdev);
5087 current = !netdev->get_features_error ? netdev->current : 0;
5088 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
5089 }
5090
5091 class->min_rate = max_rate;
5092 class->max_rate = max_rate;
5093 }
5094
5095 static int
5096 hfsc_parse_class_details__(struct netdev *netdev,
5097 const struct smap *details,
5098 struct hfsc_class * class)
5099 {
5100 const struct hfsc *hfsc;
5101 uint32_t min_rate, max_rate;
5102
5103 hfsc = hfsc_get__(netdev);
5104
5105 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
5106 min_rate = MAX(min_rate, 1);
5107 min_rate = MIN(min_rate, hfsc->max_rate);
5108
5109 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
5110 max_rate = MAX(max_rate, min_rate);
5111 max_rate = MIN(max_rate, hfsc->max_rate);
5112
5113 class->min_rate = min_rate;
5114 class->max_rate = max_rate;
5115
5116 return 0;
5117 }
5118
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1".
 * Returns 0 on success, otherwise a positive errno value. */
static int
hfsc_setup_qdisc__(struct netdev * netdev)
{
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Unclassified traffic falls into class 1:1 ("default 1"). */
    memset(&opt, 0, sizeof opt);
    opt.defcls = 1;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    return tc_transact(&request, NULL);
}
5149
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>"
 *
 * Linear curves only (m1 = d = 0); 'min' serves as both the real-time and
 * link-share curve, 'max' as the upper limit.  Returns 0 on success,
 * otherwise a positive errno value. */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    min.m1 = 0;
    min.d = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d = 0;
    max.m2 = class->max_rate;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
5201
5202 static int
5203 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
5204 {
5205 int error;
5206 struct hfsc_class class;
5207
5208 error = hfsc_setup_qdisc__(netdev);
5209
5210 if (error) {
5211 return error;
5212 }
5213
5214 hfsc_parse_qdisc_details__(netdev, details, &class);
5215 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5216 tc_make_handle(1, 0), &class);
5217
5218 if (error) {
5219 return error;
5220 }
5221
5222 hfsc_install__(netdev, class.max_rate);
5223 return 0;
5224 }
5225
/* Rebuilds local HFSC state from the kernel: reads the default class for the
 * top rate, then dumps all classes to populate the queue map.
 *
 * Returns 0 on success, ENODEV if the class dump cannot be started. */
static int
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct hfsc_class hc;

    hc.max_rate = 0;    /* In case the query below fails. */
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    hfsc_install__(netdev, hc.max_rate);

    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }

    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Unparseable classes are silently skipped. */
        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            hfsc_update_queue__(netdev, queue_id, &hc);
        }
    }

    finish_queue_dump(&state);
    return 0;
}
5252
5253 static void
5254 hfsc_tc_destroy(struct tc *tc)
5255 {
5256 struct hfsc *hfsc;
5257 struct hfsc_class *hc, *next;
5258
5259 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
5260
5261 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
5262 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5263 free(hc);
5264 }
5265
5266 tc_destroy(tc);
5267 free(hfsc);
5268 }
5269
5270 static int
5271 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
5272 {
5273 const struct hfsc *hfsc;
5274 hfsc = hfsc_get__(netdev);
5275 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
5276 return 0;
5277 }
5278
5279 static int
5280 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
5281 {
5282 int error;
5283 struct hfsc_class class;
5284
5285 hfsc_parse_qdisc_details__(netdev, details, &class);
5286 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5287 tc_make_handle(1, 0), &class);
5288
5289 if (!error) {
5290 hfsc_get__(netdev)->max_rate = class.max_rate;
5291 }
5292
5293 return error;
5294 }
5295
5296 static int
5297 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
5298 const struct tc_queue *queue, struct smap *details)
5299 {
5300 const struct hfsc_class *hc;
5301
5302 hc = hfsc_class_cast__(queue);
5303 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
5304 if (hc->min_rate != hc->max_rate) {
5305 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
5306 }
5307 return 0;
5308 }
5309
5310 static int
5311 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
5312 const struct smap *details)
5313 {
5314 int error;
5315 struct hfsc_class class;
5316
5317 error = hfsc_parse_class_details__(netdev, details, &class);
5318 if (error) {
5319 return error;
5320 }
5321
5322 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
5323 tc_make_handle(1, 0xfffe), &class);
5324 if (error) {
5325 return error;
5326 }
5327
5328 hfsc_update_queue__(netdev, queue_id, &class);
5329 return 0;
5330 }
5331
5332 static int
5333 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
5334 {
5335 int error;
5336 struct hfsc *hfsc;
5337 struct hfsc_class *hc;
5338
5339 hc = hfsc_class_cast__(queue);
5340 hfsc = hfsc_get__(netdev);
5341
5342 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
5343 if (!error) {
5344 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5345 free(hc);
5346 }
5347 return error;
5348 }
5349
/* Retrieves kernel statistics for 'queue' into 'stats'.  Queue N maps to
 * kernel class 1:N+1 under the root class 1:0xfffe.  Returns 0 if
 * successful, otherwise a positive errno value. */
static int
hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
                     struct netdev_queue_stats *stats)
{
    return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
                              tc_make_handle(1, 0xfffe), NULL, stats);
}
5357
5358 static int
5359 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
5360 const struct ofpbuf *nlmsg,
5361 netdev_dump_queue_stats_cb *cb, void *aux)
5362 {
5363 struct netdev_queue_stats stats;
5364 unsigned int handle, major, minor;
5365 int error;
5366
5367 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
5368 if (error) {
5369 return error;
5370 }
5371
5372 major = tc_get_major(handle);
5373 minor = tc_get_minor(handle);
5374 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
5375 (*cb)(minor - 1, &stats, aux);
5376 }
5377 return 0;
5378 }
5379
/* tc ops for the Linux HFSC (Hierarchical Fair Service Curve) qdisc, exposed
 * in the OVS database as QoS type "linux-hfsc". */
static const struct tc_ops tc_ops_hfsc = {
    .linux_name = "hfsc",
    .ovs_name = "linux-hfsc",
    .n_queues = HFSC_N_QUEUES, /* n_queues */
    .tc_install = hfsc_tc_install,
    .tc_load = hfsc_tc_load,
    .tc_destroy = hfsc_tc_destroy,
    .qdisc_get = hfsc_qdisc_get,
    .qdisc_set = hfsc_qdisc_set,
    .class_get = hfsc_class_get,
    .class_set = hfsc_class_set,
    .class_delete = hfsc_class_delete,
    .class_get_stats = hfsc_class_get_stats,
    .class_dump_stats = hfsc_class_dump_stats,
};
5395 \f
5396 /* "linux-noop" traffic control class. */
5397
/* Attaches a shared, immutable tc object to 'netdev_'.
 *
 * Nothing but a tc class implementation is allowed to write to a tc, and this
 * class never does, so a single const tc object can be shared by all netdevs.
 *
 * NOTE(review): the initializer uses 'tc_ops_default' rather than
 * 'tc_ops_noop' — this matches what the code has always done, but confirm it
 * is intentional before relying on tc->ops here. */
static void
noop_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    netdev->tc = CONST_CAST(struct tc *, &tc);
}
5406
5407 static int
5408 noop_tc_install(struct netdev *netdev,
5409 const struct smap *details OVS_UNUSED)
5410 {
5411 noop_install__(netdev);
5412 return 0;
5413 }
5414
5415 static int
5416 noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5417 {
5418 noop_install__(netdev);
5419 return 0;
5420 }
5421
/* tc ops for "linux-noop": OVS neither installs nor queries kernel traffic
 * control for the device.  No linux_name, so it never matches a kernel
 * qdisc by name. */
static const struct tc_ops tc_ops_noop = {
    .ovs_name = "linux-noop", /* ovs_name */
    .tc_install = noop_tc_install,
    .tc_load = noop_tc_load,
};
5427 \f
5428 /* "linux-default" traffic control class.
5429 *
5430 * This class represents the default, unnamed Linux qdisc. It corresponds to
5431 * the "" (empty string) QoS type in the OVS database. */
5432
/* Attaches the singleton tc representing the default, unnamed Linux qdisc to
 * 'netdev_'. */
static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
5443
5444 static int
5445 default_tc_install(struct netdev *netdev,
5446 const struct smap *details OVS_UNUSED)
5447 {
5448 default_install__(netdev);
5449 return 0;
5450 }
5451
5452 static int
5453 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5454 {
5455 default_install__(netdev);
5456 return 0;
5457 }
5458
/* tc ops for the default, unnamed Linux qdisc; corresponds to the ""
 * (empty string) QoS type in the OVS database. */
static const struct tc_ops tc_ops_default = {
    .ovs_name = "", /* ovs_name */
    .tc_install = default_tc_install,
    .tc_load = default_tc_load,
};
5464 \f
5465 /* "linux-other" traffic control class.
5466 *
5467 * */
5468
/* tc_load callback for "linux-other": attaches a shared, immutable tc for a
 * qdisc that OVS recognizes but does not manage.  Always returns 0. */
static int
other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
    return 0;
}
5480
/* tc ops for "linux-other".  Only tc_load is provided, so this kind of qdisc
 * can be recognized but not installed or reconfigured through OVS. */
static const struct tc_ops tc_ops_other = {
    .ovs_name = "linux-other",
    .tc_load = other_tc_load,
};
5485 \f
5486 /* Traffic control. */
5487
/* Number of kernel "tc" ticks per second.  Initialized, together with
 * 'buffer_hz' below, on first use by read_psched(). */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
5509
5510 static struct tcmsg *
5511 netdev_linux_tc_make_request(const struct netdev *netdev, int type,
5512 unsigned int flags, struct ofpbuf *request)
5513 {
5514 int ifindex;
5515 int error;
5516
5517 error = get_ifindex(netdev, &ifindex);
5518 if (error) {
5519 return NULL;
5520 }
5521
5522 return tc_make_request(ifindex, type, flags, request);
5523 }
5524
5525 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
5526 * of 'kbits_burst'.
5527 *
5528 * This function is equivalent to running:
5529 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
5530 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
5531 * mtu 65535 drop
5532 *
5533 * The configuration and stats may be seen with the following command:
5534 * /sbin/tc -s filter show dev <devname> parent ffff:
5535 *
5536 * Returns 0 if successful, otherwise a positive errno value.
5537 */
5538 static int
5539 tc_add_policer(struct netdev *netdev,
5540 uint32_t kbits_rate, uint32_t kbits_burst)
5541 {
5542 struct tc_police tc_police;
5543 struct ofpbuf request;
5544 struct tcmsg *tcmsg;
5545 size_t basic_offset;
5546 size_t police_offset;
5547 int error;
5548 int mtu = 65535;
5549
5550 memset(&tc_police, 0, sizeof tc_police);
5551 tc_police.action = TC_POLICE_SHOT;
5552 tc_police.mtu = mtu;
5553 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
5554
5555 /* The following appears wrong in one way: In networking a kilobit is
5556 * usually 1000 bits but this uses 1024 bits.
5557 *
5558 * However if you "fix" those problems then "tc filter show ..." shows
5559 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
5560 * 1,000,000 bits, whereas this actually ends up doing the right thing from
5561 * tc's point of view. Whatever. */
5562 tc_police.burst = tc_bytes_to_ticks(
5563 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
5564
5565 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
5566 NLM_F_EXCL | NLM_F_CREATE, &request);
5567 if (!tcmsg) {
5568 return ENODEV;
5569 }
5570 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
5571 tcmsg->tcm_info = tc_make_handle(49,
5572 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
5573
5574 nl_msg_put_string(&request, TCA_KIND, "basic");
5575 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
5576 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
5577 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
5578 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
5579 nl_msg_end_nested(&request, police_offset);
5580 nl_msg_end_nested(&request, basic_offset);
5581
5582 error = tc_transact(&request, NULL);
5583 if (error) {
5584 return error;
5585 }
5586
5587 return 0;
5588 }
5589
/* Reads /proc/net/psched to initialize 'ticks_per_s' and 'buffer_hz'.  The
 * body runs at most once (guarded by ovsthread_once); on any failure the
 * conservative defaults ticks_per_s = 1.0 and buffer_hz = 100 remain in
 * effect. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *   (Before that, there are hints that it was 1000000000.)
     *
     * - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *   above.
     *
     *                        /proc/net/psched
     *     -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d  ticks_per_s     buffer_hz
     *     ------- --------- ---------- ------------- -----------  -------------
     * [1] 819,200 1,000,000  1,000,000           100      819,200           100
     * [2]   1,000     1,024  1,000,000 1,000,000,000      976,562 1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000      976,562 1,000,000,000
     * [4]   1,000     1,024  1,000,000           100      976,562           100
     * [5]   1,000        64  1,000,000 1,000,000,000   15,625,000 1,000,000,000
     * [6]   1,000        64  1,000,000           249   15,625,000           249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    if (!ovsthread_once_start(&once)) {
        return;
    }

    /* Defaults, kept if anything below fails. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    /* Zero values would cause divisions by zero below. */
    if (!a || !b || !c) {
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
5672
5673 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5674 * rate of 'rate' bytes per second. */
5675 static unsigned int
5676 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
5677 {
5678 read_psched();
5679 return (rate * ticks) / ticks_per_s;
5680 }
5681
5682 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
5683 * rate of 'rate' bytes per second. */
5684 static unsigned int
5685 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
5686 {
5687 read_psched();
5688 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
5689 }
5690
5691 /* Returns the number of bytes that need to be reserved for qdisc buffering at
5692 * a transmission rate of 'rate' bytes per second. */
5693 static unsigned int
5694 tc_buffer_per_jiffy(unsigned int rate)
5695 {
5696 read_psched();
5697 return rate / buffer_hz;
5698 }
5699
5700 static uint32_t
5701 tc_time_to_ticks(uint32_t time) {
5702 read_psched();
5703 return time * (ticks_per_s / 1000000);
5704 }
5705
5706 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5707 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5708 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5709 * stores NULL into it if it is absent.
5710 *
5711 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5712 * 'msg'.
5713 *
5714 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
               struct nlattr **options)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    /* The attributes follow the netlink header and the fixed struct tcmsg. */
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");
        goto error;
    }

    if (kind) {
        *kind = nl_attr_get_string(ta[TCA_KIND]);
    }

    if (options) {
        /* TCA_OPTIONS is optional, so this may store NULL. */
        *options = ta[TCA_OPTIONS];
    }

    return 0;

error:
    /* Null out the outputs so callers cannot use stale pointers. */
    if (kind) {
        *kind = NULL;
    }
    if (options) {
        *options = NULL;
    }
    return EPROTO;
}
5750
5751 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5752 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5753 * into '*options', and its queue statistics into '*stats'. Any of the output
5754 * arguments may be null.
5755 *
5756 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        /* The class handle lives in the fixed tcmsg header, not in an
         * attribute. */
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    /* '*handlep' is deliberately left as-is here; only the other outputs are
     * cleared. */
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
5825
/* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev'.  On success, the kernel's reply is stored in '*replyp'.
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_query_class(const struct netdev *netdev,
               unsigned int handle, unsigned int parent,
               struct ofpbuf **replyp)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        /* The device's ifindex could not be determined. */
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    error = tc_transact(&request, replyp);
    if (error) {
        VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     ovs_strerror(error));
    }
    return error;
}
5855
/* Equivalent to "tc class del dev <name> handle <handle>".  Returns 0 if
 * successful, otherwise a positive errno value. */
static int
tc_delete_class(const struct netdev *netdev, unsigned int handle)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
    if (!tcmsg) {
        /* The device's ifindex could not be determined. */
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = 0;

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     ovs_strerror(error));
    }
    return error;
}
5880
/* Equivalent to "tc qdisc del dev <name> root".  On success, also destroys
 * and clears 'netdev_''s in-memory tc, if any. */
static int
tc_del_qdisc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    error = tc_transact(&request, NULL);
    if (error == EINVAL) {
        /* EINVAL probably means that the default qdisc was in use, in which
         * case we've accomplished our purpose. */
        error = 0;
    }
    if (!error && netdev->tc) {
        /* Drop the cached tc so that the next tc_query_qdisc() re-probes. */
        if (netdev->tc->ops->tc_destroy) {
            netdev->tc->ops->tc_destroy(netdev->tc);
        }
        netdev->tc = NULL;
    }
    return error;
}
5911
/* Returns true if the running kernel can handle the straightforward
 * RTM_GETQDISC request (Linux 2.6.35 and later), false for older kernels
 * where that request can OOPS (see the comment in tc_query_qdisc()).  The
 * kernel version is probed once and the result cached. */
static bool
getqdisc_is_safe(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static bool safe = false;

    if (ovsthread_once_start(&once)) {
        struct utsname utsname;
        int major, minor;

        /* On any probe failure 'safe' stays false (the conservative
         * choice). */
        if (uname(&utsname) == -1) {
            VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
        } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
            VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
        } else if (major < 2 || (major == 2 && minor < 35)) {
            VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
                      utsname.release);
        } else {
            safe = true;
        }
        ovsthread_once_done(&once);
    }
    return safe;
}
5936
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value. */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    if (netdev->tc) {
        /* Already known; nothing to do. */
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
6016
6017 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
6018 approximate the time to transmit packets of various lengths. For an MTU of
6019 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
6020 represents two possible packet lengths; for a MTU of 513 through 1024, four
6021 possible lengths; and so on.
6022
6023 Returns, for the specified 'mtu', the number of bits that packet lengths
6024 need to be shifted right to fit within such a 256-entry table. */
6025 static int
6026 tc_calc_cell_log(unsigned int mtu)
6027 {
6028 int cell_log;
6029
6030 if (!mtu) {
6031 mtu = ETH_PAYLOAD_MAX;
6032 }
6033 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
6034
6035 for (cell_log = 0; mtu >= 256; cell_log++) {
6036 mtu >>= 1;
6037 }
6038
6039 return cell_log;
6040 }
6041
/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'. */
static void
tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
{
    memset(rate, 0, sizeof *rate);
    rate->cell_log = tc_calc_cell_log(mtu);
    /* rate->overhead = 0; */           /* New in 2.6.24, not yet in some */
    /* rate->cell_align = 0; */         /* distro headers. */
    rate->mpu = ETH_TOTAL_MIN;          /* Minimum packet unit. */
    rate->rate = Bps;
}
6054
6055 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
6056 * attribute of the specified "type".
6057 *
6058 * See tc_calc_cell_log() above for a description of "rtab"s. */
6059 void
6060 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
6061 {
6062 uint32_t *rtab;
6063 unsigned int i;
6064
6065 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
6066 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
6067 unsigned packet_size = (i + 1) << rate->cell_log;
6068 if (packet_size < rate->mpu) {
6069 packet_size = rate->mpu;
6070 }
6071 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
6072 }
6073 }
6074
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Never use less than one jiffy's worth of data plus one MTU. */
    uint64_t burst = tc_buffer_per_jiffy(Bps) + mtu;

    if (burst_bytes > burst) {
        burst = burst_bytes;
    }
    return tc_bytes_to_ticks(Bps, burst);
}
6085 \f
6086 /* Linux-only functions declared in netdev-linux.h */
6087
/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'.  If
 * 'enable' is true, the bit is set.  Otherwise, it is cleared.  Returns 0 on
 * success, a positive errno value on ioctl failure, or EOPNOTSUPP if the
 * device accepted the change but a re-read shows it did not take effect. */
int
netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
                              const char *flag_name, bool enable)
{
    const char *netdev_name = netdev_get_name(netdev);
    struct ethtool_value evalue;
    uint32_t new_flags;
    int error;

    /* Read the current flags. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    /* Compute the new flags word; skip the set ioctl if nothing changes. */
    COVERAGE_INC(netdev_set_ethtool);
    new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
    if (new_flags == evalue.data) {
        return 0;
    }
    evalue.data = new_flags;
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
    if (error) {
        return error;
    }

    /* Read the flags back to verify the change took effect (presumably some
     * drivers accept ETHTOOL_SFLAGS but silently ignore unsupported bits —
     * hence the re-check below). */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    if (new_flags != evalue.data) {
        VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
                     "device %s failed", enable ? "enable" : "disable",
                     flag_name, netdev_name);
        return EOPNOTSUPP;
    }

    return 0;
}
6139 \f
6140 /* Utility functions. */
6141
/* Copies 'src' into 'dst', performing format conversion in the process.  The
 * rtnl_link_stats counters are 32-bit; each assignment widens into the
 * corresponding netdev_stats field. */
static void
netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
                                  const struct rtnl_link_stats *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
6169
/* Copies 'src' into 'dst', performing format conversion in the process.
 * This is a field-for-field copy of the kernel's 64-bit counters. */
static void
netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
                                    const struct rtnl_link_stats64 *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
6197
/* Retrieves 'netdev_''s statistics via an RTM_GETLINK request, preferring the
 * 64-bit IFLA_STATS64 counters and falling back to the 32-bit IFLA_STATS.
 * Returns 0 if successful, otherwise a positive errno value. */
int
get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    int error;

    /* Start with every counter set to all-ones; counters not overwritten from
     * the kernel reply keep that value (presumably netdev's "statistic
     * unavailable" convention — confirm against netdev.h). */
    memset(stats, 0xFF, sizeof(struct netdev_stats));

    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
                        RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        return error;
    }

    if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
        /* Prefer the 64-bit counters when the kernel provides them. */
        const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
        if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
            netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
            error = 0;
        } else {
            a = nl_attr_find(reply, 0, IFLA_STATS);
            if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
                netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
                error = 0;
            } else {
                VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
                error = EPROTO;
            }
        }
    } else {
        VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
        error = EPROTO;
    }


    ofpbuf_delete(reply);
    return error;
}
6244
6245 static int
6246 get_flags(const struct netdev *dev, unsigned int *flags)
6247 {
6248 struct ifreq ifr;
6249 int error;
6250
6251 *flags = 0;
6252 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
6253 if (!error) {
6254 *flags = ifr.ifr_flags;
6255 }
6256 return error;
6257 }
6258
6259 static int
6260 set_flags(const char *name, unsigned int flags)
6261 {
6262 struct ifreq ifr;
6263
6264 ifr.ifr_flags = flags;
6265 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
6266 }
6267
6268 int
6269 linux_get_ifindex(const char *netdev_name)
6270 {
6271 struct ifreq ifr;
6272 int error;
6273
6274 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6275 COVERAGE_INC(netdev_get_ifindex);
6276
6277 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
6278 if (error) {
6279 /* ENODEV probably means that a vif disappeared asynchronously and
6280 * hasn't been removed from the database yet, so reduce the log level
6281 * to INFO for that case. */
6282 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
6283 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
6284 netdev_name, ovs_strerror(error));
6285 return -error;
6286 }
6287 return ifr.ifr_ifindex;
6288 }
6289
6290 static int
6291 get_ifindex(const struct netdev *netdev_, int *ifindexp)
6292 {
6293 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6294
6295 if (!(netdev->cache_valid & VALID_IFINDEX)) {
6296 netdev_linux_update_via_netlink(netdev);
6297 }
6298
6299 if (!(netdev->cache_valid & VALID_IFINDEX)) {
6300 /* Fall back to ioctl if netlink fails */
6301 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
6302
6303 if (ifindex < 0) {
6304 netdev->get_ifindex_error = -ifindex;
6305 netdev->ifindex = 0;
6306 } else {
6307 netdev->get_ifindex_error = 0;
6308 netdev->ifindex = ifindex;
6309 }
6310 netdev->cache_valid |= VALID_IFINDEX;
6311 }
6312
6313 *ifindexp = netdev->ifindex;
6314 return netdev->get_ifindex_error;
6315 }
6316
/* Refreshes the cached link state of 'netdev' (flags, MTU, MAC address,
 * ifindex, LAG-master status) from a single RTM_GETLINK query, and bumps
 * the netdev's change sequence number if anything visible changed.
 *
 * Returns 0 on success, a positive errno value if the netlink transaction
 * fails, or EINVAL if the reply cannot be parsed as an RTM_NEWLINK. */
static int
netdev_linux_update_via_netlink(struct netdev_linux *netdev)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct rtnetlink_change chg;
    struct rtnetlink_change *change = &chg;
    int error;

    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ) +
                        NL_A_U32_SIZE, RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));

    /* The correct identifiers for a Linux device are netnsid and ifindex,
     * but ifindex changes as the port is moved to another network namespace
     * and the interface name statically stored in ovsdb. */
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
    if (netdev_linux_netnsid_is_remote(netdev)) {
        nl_msg_put_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
    }
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        ofpbuf_delete(reply);
        return error;
    }

    if (rtnetlink_parse(reply, change)
        && change->nlmsg_type == RTM_NEWLINK) {
        bool changed = false;
        error = 0;

        /* Update netdev from rtnl msg and increment its seq if needed. */

        /* A toggle of IFF_RUNNING is counted as a carrier reset. */
        if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
            netdev->carrier_resets++;
            changed = true;
        }
        if (change->ifi_flags != netdev->ifi_flags) {
            netdev->ifi_flags = change->ifi_flags;
            changed = true;
        }
        /* An MTU of 0 means the kernel did not report one; keep the cache. */
        if (change->mtu && change->mtu != netdev->mtu) {
            netdev->mtu = change->mtu;
            netdev->cache_valid |= VALID_MTU;
            netdev->netdev_mtu_error = 0;
            changed = true;
        }
        /* Likewise, an all-zero MAC is treated as "not reported". */
        if (!eth_addr_is_zero(change->mac)
            && !eth_addr_equals(change->mac, netdev->etheraddr)) {
            netdev->etheraddr = change->mac;
            netdev->cache_valid |= VALID_ETHERADDR;
            netdev->ether_addr_error = 0;
            changed = true;
        }
        if (change->if_index != netdev->ifindex) {
            netdev->ifindex = change->if_index;
            netdev->cache_valid |= VALID_IFINDEX;
            netdev->get_ifindex_error = 0;
            changed = true;
        }
        /* NOTE(review): 'is_lag_master' is set here when the reported kind
         * is a LAG kind, but it is never cleared by this function once set;
         * confirm that is intentional. */
        if (change->primary && netdev_linux_kind_is_lag(change->primary)) {
            netdev->is_lag_master = true;
        }
        if (changed) {
            netdev_change_seq_changed(&netdev->up);
        }
    } else {
        error = EINVAL;
    }

    ofpbuf_delete(reply);
    return error;
}
6392
6393 static int
6394 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
6395 {
6396 struct ifreq ifr;
6397 int hwaddr_family;
6398 int error;
6399
6400 memset(&ifr, 0, sizeof ifr);
6401 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6402 COVERAGE_INC(netdev_get_hwaddr);
6403 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
6404 if (error) {
6405 /* ENODEV probably means that a vif disappeared asynchronously and
6406 * hasn't been removed from the database yet, so reduce the log level
6407 * to INFO for that case. */
6408 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
6409 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
6410 netdev_name, ovs_strerror(error));
6411 return error;
6412 }
6413 hwaddr_family = ifr.ifr_hwaddr.sa_family;
6414 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
6415 hwaddr_family != ARPHRD_NONE) {
6416 VLOG_INFO("%s device has unknown hardware address family %d",
6417 netdev_name, hwaddr_family);
6418 return EINVAL;
6419 }
6420 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
6421 return 0;
6422 }
6423
6424 static int
6425 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
6426 {
6427 struct ifreq ifr;
6428 int error;
6429
6430 memset(&ifr, 0, sizeof ifr);
6431 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6432 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
6433 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
6434 COVERAGE_INC(netdev_set_hwaddr);
6435 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
6436 if (error) {
6437 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
6438 netdev_name, ovs_strerror(error));
6439 }
6440 return error;
6441 }
6442
6443 static int
6444 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
6445 int cmd, const char *cmd_name)
6446 {
6447 struct ifreq ifr;
6448 int error;
6449
6450 memset(&ifr, 0, sizeof ifr);
6451 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
6452 ifr.ifr_data = (caddr_t) ecmd;
6453
6454 ecmd->cmd = cmd;
6455 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
6456 if (error) {
6457 if (error != EOPNOTSUPP) {
6458 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
6459 "failed: %s", cmd_name, name, ovs_strerror(error));
6460 } else {
6461 /* The device doesn't support this operation. That's pretty
6462 * common, so there's no point in logging anything. */
6463 }
6464 }
6465 return error;
6466 }
6467
6468 /* Returns an AF_PACKET raw socket or a negative errno value. */
6469 static int
6470 af_packet_sock(void)
6471 {
6472 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
6473 static int sock;
6474
6475 if (ovsthread_once_start(&once)) {
6476 sock = socket(AF_PACKET, SOCK_RAW, 0);
6477 if (sock >= 0) {
6478 int error = set_nonblocking(sock);
6479 if (error) {
6480 close(sock);
6481 sock = -error;
6482 } else if (userspace_tso_enabled()) {
6483 int val = 1;
6484 error = setsockopt(sock, SOL_PACKET, PACKET_VNET_HDR, &val,
6485 sizeof val);
6486 if (error) {
6487 error = errno;
6488 VLOG_ERR("failed to enable vnet hdr in raw socket: %s",
6489 ovs_strerror(errno));
6490 close(sock);
6491 sock = -error;
6492 }
6493 }
6494 } else {
6495 sock = -errno;
6496 VLOG_ERR("failed to create packet socket: %s",
6497 ovs_strerror(errno));
6498 }
6499 ovsthread_once_done(&once);
6500 }
6501
6502 return sock;
6503 }
6504
6505 static int
6506 netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
6507 {
6508 struct eth_header *eth_hdr;
6509 ovs_be16 eth_type;
6510 int l2_len;
6511
6512 eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
6513 if (!eth_hdr) {
6514 return -EINVAL;
6515 }
6516
6517 l2_len = ETH_HEADER_LEN;
6518 eth_type = eth_hdr->eth_type;
6519 if (eth_type_vlan(eth_type)) {
6520 struct vlan_header *vlan = dp_packet_at(b, l2_len, VLAN_HEADER_LEN);
6521
6522 if (!vlan) {
6523 return -EINVAL;
6524 }
6525
6526 eth_type = vlan->vlan_next_type;
6527 l2_len += VLAN_HEADER_LEN;
6528 }
6529
6530 if (eth_type == htons(ETH_TYPE_IP)) {
6531 struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN);
6532
6533 if (!ip_hdr) {
6534 return -EINVAL;
6535 }
6536
6537 *l4proto = ip_hdr->ip_proto;
6538 dp_packet_hwol_set_tx_ipv4(b);
6539 } else if (eth_type == htons(ETH_TYPE_IPV6)) {
6540 struct ovs_16aligned_ip6_hdr *nh6;
6541
6542 nh6 = dp_packet_at(b, l2_len, IPV6_HEADER_LEN);
6543 if (!nh6) {
6544 return -EINVAL;
6545 }
6546
6547 *l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
6548 dp_packet_hwol_set_tx_ipv6(b);
6549 }
6550
6551 return 0;
6552 }
6553
/* Pulls the virtio-net header off the front of 'b' and translates its
 * checksum and GSO requests into the corresponding dp-packet HWOL flags.
 *
 * Returns 0 on success, -EINVAL if the virtio-net header or the packet's
 * L2/L3 headers cannot be parsed. */
static int
netdev_linux_parse_vnet_hdr(struct dp_packet *b)
{
    struct virtio_net_hdr *vnet = dp_packet_pull(b, sizeof *vnet);
    uint16_t l4proto = 0;

    if (OVS_UNLIKELY(!vnet)) {
        return -EINVAL;
    }

    /* Fast path: no offloads requested at all. */
    if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) {
        return 0;
    }

    if (netdev_linux_parse_l2(b, &l4proto)) {
        return -EINVAL;
    }

    /* 'flags' is a bitmask, so test the NEEDS_CSUM bit rather than comparing
     * for equality: the kernel may set other bits (e.g.
     * VIRTIO_NET_HDR_F_DATA_VALID) at the same time, and the old equality
     * test silently dropped the checksum request in that case. */
    if (vnet->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
        if (l4proto == IPPROTO_TCP) {
            dp_packet_hwol_set_csum_tcp(b);
        } else if (l4proto == IPPROTO_UDP) {
            dp_packet_hwol_set_csum_udp(b);
        } else if (l4proto == IPPROTO_SCTP) {
            dp_packet_hwol_set_csum_sctp(b);
        }
    }

    if (l4proto && vnet->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
        uint8_t allowed_mask = VIRTIO_NET_HDR_GSO_TCPV4
                                | VIRTIO_NET_HDR_GSO_TCPV6
                                | VIRTIO_NET_HDR_GSO_UDP;
        uint8_t type = vnet->gso_type & allowed_mask;

        if (type == VIRTIO_NET_HDR_GSO_TCPV4
            || type == VIRTIO_NET_HDR_GSO_TCPV6) {
            dp_packet_hwol_set_tcp_seg(b);
        }
    }

    return 0;
}
6596
6597 static void
6598 netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu)
6599 {
6600 struct virtio_net_hdr *vnet = dp_packet_push_zeros(b, sizeof *vnet);
6601
6602 if (dp_packet_hwol_is_tso(b)) {
6603 uint16_t hdr_len = ((char *)dp_packet_l4(b) - (char *)dp_packet_eth(b))
6604 + TCP_HEADER_LEN;
6605
6606 vnet->hdr_len = (OVS_FORCE __virtio16)hdr_len;
6607 vnet->gso_size = (OVS_FORCE __virtio16)(mtu - hdr_len);
6608 if (dp_packet_hwol_is_ipv4(b)) {
6609 vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
6610 } else {
6611 vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
6612 }
6613
6614 } else {
6615 vnet->flags = VIRTIO_NET_HDR_GSO_NONE;
6616 }
6617
6618 if (dp_packet_hwol_l4_mask(b)) {
6619 vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
6620 vnet->csum_start = (OVS_FORCE __virtio16)((char *)dp_packet_l4(b)
6621 - (char *)dp_packet_eth(b));
6622
6623 if (dp_packet_hwol_l4_is_tcp(b)) {
6624 vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
6625 struct tcp_header, tcp_csum);
6626 } else if (dp_packet_hwol_l4_is_udp(b)) {
6627 vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
6628 struct udp_header, udp_csum);
6629 } else if (dp_packet_hwol_l4_is_sctp(b)) {
6630 vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
6631 struct sctp_header, sctp_csum);
6632 } else {
6633 VLOG_WARN_RL(&rl, "Unsupported L4 protocol");
6634 }
6635 }
6636 }