]> git.proxmox.com Git - mirror_ovs.git/blob - lib/netdev-linux.c
25d037cb643fbba06457070b1a761e28768d3ee3
[mirror_ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <sys/types.h>
24 #include <netinet/in.h>
25 #include <arpa/inet.h>
26 #include <inttypes.h>
27 #include <linux/filter.h>
28 #include <linux/gen_stats.h>
29 #include <linux/if_ether.h>
30 #include <linux/if_tun.h>
31 #include <linux/types.h>
32 #include <linux/ethtool.h>
33 #include <linux/mii.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/ioctl.h>
37 #include <sys/socket.h>
38 #include <sys/utsname.h>
39 #include <netpacket/packet.h>
40 #include <net/if.h>
41 #include <net/if_arp.h>
42 #include <net/route.h>
43 #include <poll.h>
44 #include <stdlib.h>
45 #include <string.h>
46 #include <unistd.h>
47
48 #include "coverage.h"
49 #include "dp-packet.h"
50 #include "dpif-netlink.h"
51 #include "dpif-netdev.h"
52 #include "openvswitch/dynamic-string.h"
53 #include "fatal-signal.h"
54 #include "hash.h"
55 #include "openvswitch/hmap.h"
56 #include "netdev-provider.h"
57 #include "netdev-tc-offloads.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
61 #include "netlink.h"
62 #include "netnsid.h"
63 #include "openvswitch/ofpbuf.h"
64 #include "openflow/openflow.h"
65 #include "ovs-atomic.h"
66 #include "packets.h"
67 #include "openvswitch/poll-loop.h"
68 #include "rtnetlink.h"
69 #include "openvswitch/shash.h"
70 #include "socket-util.h"
71 #include "sset.h"
72 #include "tc.h"
73 #include "timer.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
76 #include "util.h"
77
78 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79
80 COVERAGE_DEFINE(netdev_set_policing);
81 COVERAGE_DEFINE(netdev_arp_lookup);
82 COVERAGE_DEFINE(netdev_get_ifindex);
83 COVERAGE_DEFINE(netdev_get_hwaddr);
84 COVERAGE_DEFINE(netdev_set_hwaddr);
85 COVERAGE_DEFINE(netdev_get_ethtool);
86 COVERAGE_DEFINE(netdev_set_ethtool);
87
88 \f
89 #ifndef IFLA_IF_NETNSID
90 #define IFLA_IF_NETNSID 0x45
91 #endif
92 /* These were introduced in Linux 2.6.14, so they might be missing if we have
93 * old headers. */
94 #ifndef ADVERTISED_Pause
95 #define ADVERTISED_Pause (1 << 13)
96 #endif
97 #ifndef ADVERTISED_Asym_Pause
98 #define ADVERTISED_Asym_Pause (1 << 14)
99 #endif
100
101 /* These were introduced in Linux 2.6.24, so they might be missing if we
102 * have old headers. */
103 #ifndef ETHTOOL_GFLAGS
104 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
105 #endif
106 #ifndef ETHTOOL_SFLAGS
107 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
108 #endif
109
110 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
111 * headers. */
112 #ifndef TC_RTAB_SIZE
113 #define TC_RTAB_SIZE 1024
114 #endif
115
116 /* Linux 2.6.21 introduced struct tpacket_auxdata.
117 * Linux 2.6.27 added the tp_vlan_tci member.
118 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
119 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
120 * TP_STATUS_VLAN_TPID_VALID.
121 *
122 * With all this churn it's easiest to unconditionally define a replacement
123 * structure that has everything we want.
124 */
125 #ifndef PACKET_AUXDATA
126 #define PACKET_AUXDATA 8
127 #endif
128 #ifndef TP_STATUS_VLAN_VALID
129 #define TP_STATUS_VLAN_VALID (1 << 4)
130 #endif
131 #ifndef TP_STATUS_VLAN_TPID_VALID
132 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
133 #endif
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
/* Replacement for the kernel's struct tpacket_auxdata; see the comment above
 * for the kernel-version churn that makes an unconditional replacement the
 * simplest option.  The layout must match the kernel structure, since the
 * kernel fills it in for PACKET_AUXDATA control messages. */
struct tpacket_auxdata {
    uint32_t tp_status;      /* TP_STATUS_* flags. */
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;    /* Valid if TP_STATUS_VLAN_VALID (Linux 3.0+). */
    uint16_t tp_vlan_tpid;   /* Valid if TP_STATUS_VLAN_TPID_VALID (3.13+). */
};
145
/* Linux 2.6.27 introduced ethtool_cmd_speed
 *
 * To avoid revisiting problems reported with using configure to detect
 * compatibility (see report at
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Returns the link speed encoded in 'ep', combining the low 16 bits in
 * 'speed' with the high bits in 'speed_hi'. */
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    uint32_t hi = ep->speed_hi;
    uint32_t lo = ep->speed;

    return (hi << 16) | lo;
}
157
158 /* Linux 2.6.30 introduced supported and advertised flags for
159 * 1G base KX, and 10G base KX4, KR and R. */
160 #ifndef SUPPORTED_1000baseKX_Full
161 #define SUPPORTED_1000baseKX_Full (1 << 17)
162 #define SUPPORTED_10000baseKX4_Full (1 << 18)
163 #define SUPPORTED_10000baseKR_Full (1 << 19)
164 #define SUPPORTED_10000baseR_FEC (1 << 20)
165 #define ADVERTISED_1000baseKX_Full (1 << 17)
166 #define ADVERTISED_10000baseKX4_Full (1 << 18)
167 #define ADVERTISED_10000baseKR_Full (1 << 19)
168 #define ADVERTISED_10000baseR_FEC (1 << 20)
169 #endif
170
171 /* Linux 3.5 introduced supported and advertised flags for
172 * 40G base KR4, CR4, SR4 and LR4. */
173 #ifndef SUPPORTED_40000baseKR4_Full
174 #define SUPPORTED_40000baseKR4_Full (1 << 23)
175 #define SUPPORTED_40000baseCR4_Full (1 << 24)
176 #define SUPPORTED_40000baseSR4_Full (1 << 25)
177 #define SUPPORTED_40000baseLR4_Full (1 << 26)
178 #define ADVERTISED_40000baseKR4_Full (1 << 23)
179 #define ADVERTISED_40000baseCR4_Full (1 << 24)
180 #define ADVERTISED_40000baseSR4_Full (1 << 25)
181 #define ADVERTISED_40000baseLR4_Full (1 << 26)
182 #endif
183
184 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
185 *
186 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
187 * 2.6.32-431.29.2.el6.x86_64 (see report at
188 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
189 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
190 * unconditionally define a replacement. */
191 #ifndef IFLA_STATS64
192 #define IFLA_STATS64 23
193 #endif
194 #define rtnl_link_stats64 rpl_rtnl_link_stats64
195 struct rtnl_link_stats64 {
196 uint64_t rx_packets;
197 uint64_t tx_packets;
198 uint64_t rx_bytes;
199 uint64_t tx_bytes;
200 uint64_t rx_errors;
201 uint64_t tx_errors;
202 uint64_t rx_dropped;
203 uint64_t tx_dropped;
204 uint64_t multicast;
205 uint64_t collisions;
206
207 uint64_t rx_length_errors;
208 uint64_t rx_over_errors;
209 uint64_t rx_crc_errors;
210 uint64_t rx_frame_errors;
211 uint64_t rx_fifo_errors;
212 uint64_t rx_missed_errors;
213
214 uint64_t tx_aborted_errors;
215 uint64_t tx_carrier_errors;
216 uint64_t tx_fifo_errors;
217 uint64_t tx_heartbeat_errors;
218 uint64_t tx_window_errors;
219
220 uint64_t rx_compressed;
221 uint64_t tx_compressed;
222 };
223
/* Bits for 'cache_valid' in struct netdev_linux: each bit, when set, means
 * the corresponding cached field(s) are up to date.  netdev_linux_changed()
 * clears bits to invalidate cached state. */
enum {
    VALID_IFINDEX = 1 << 0,
    VALID_ETHERADDR = 1 << 1,
    VALID_IN = 1 << 2,              /* Cached IPv4/IPv6 addresses. */
    VALID_MTU = 1 << 3,
    VALID_POLICING = 1 << 4,
    VALID_VPORT_STAT_ERROR = 1 << 5,
    VALID_DRVINFO = 1 << 6,
    VALID_FEATURES = 1 << 7,
};
234 \f
/* State kept for a LAG (bond/team) slave netdev that was bound to its
 * master's TC ingress block.  See netdev_linux_update_lag(). */
struct linux_lag_slave {
    uint32_t block_id;         /* TC ingress block shared with the master. */
    struct shash_node *node;   /* This slave's node in 'lag_shash'. */
};
239
240 /* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
241 static struct ovs_mutex lag_mutex = OVS_MUTEX_INITIALIZER;
242
243 /* All slaves whose LAG masters are network devices in OvS. */
244 static struct shash lag_shash OVS_GUARDED_BY(lag_mutex)
245 = SHASH_INITIALIZER(&lag_shash);
246
247 /* Traffic control. */
248
249 /* An instance of a traffic control class. Always associated with a particular
250 * network device.
251 *
252 * Each TC implementation subclasses this with whatever additional data it
253 * needs. */
254 struct tc {
255 const struct tc_ops *ops;
256 struct hmap queues; /* Contains "struct tc_queue"s.
257 * Read by generic TC layer.
258 * Written only by TC implementation. */
259 };
260
261 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
262
263 /* One traffic control queue.
264 *
265 * Each TC implementation subclasses this with whatever additional data it
266 * needs. */
267 struct tc_queue {
268 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
269 unsigned int queue_id; /* OpenFlow queue ID. */
270 long long int created; /* Time queue was created, in msecs. */
271 };
272
273 /* A particular kind of traffic control. Each implementation generally maps to
274 * one particular Linux qdisc class.
275 *
276 * The functions below return 0 if successful or a positive errno value on
277 * failure, except where otherwise noted. All of them must be provided, except
278 * where otherwise noted. */
279 struct tc_ops {
280 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
281 * This is null for tc_ops_default and tc_ops_other, for which there are no
282 * appropriate values. */
283 const char *linux_name;
284
285 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
286 const char *ovs_name;
287
288 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
289 * queues. The queues are numbered 0 through n_queues - 1. */
290 unsigned int n_queues;
291
292 /* Called to install this TC class on 'netdev'. The implementation should
293 * make the Netlink calls required to set up 'netdev' with the right qdisc
294 * and configure it according to 'details'. The implementation may assume
295 * that the current qdisc is the default; that is, there is no need for it
296 * to delete the current qdisc before installing itself.
297 *
298 * The contents of 'details' should be documented as valid for 'ovs_name'
299 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
300 * (which is built as ovs-vswitchd.conf.db(8)).
301 *
302 * This function must return 0 if and only if it sets 'netdev->tc' to an
303 * initialized 'struct tc'.
304 *
305 * (This function is null for tc_ops_other, which cannot be installed. For
306 * other TC classes it should always be nonnull.) */
307 int (*tc_install)(struct netdev *netdev, const struct smap *details);
308
309 /* Called when the netdev code determines (through a Netlink query) that
310 * this TC class's qdisc is installed on 'netdev', but we didn't install
311 * it ourselves and so don't know any of the details.
312 *
313 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
314 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
315 * implementation should parse the other attributes of 'nlmsg' as
316 * necessary to determine its configuration. If necessary it should also
317 * use Netlink queries to determine the configuration of queues on
318 * 'netdev'.
319 *
320 * This function must return 0 if and only if it sets 'netdev->tc' to an
321 * initialized 'struct tc'. */
322 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
323
324 /* Destroys the data structures allocated by the implementation as part of
325 * 'tc'. (This includes destroying 'tc->queues' by calling
326 * tc_destroy(tc).
327 *
328 * The implementation should not need to perform any Netlink calls. If
329 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
330 * (But it may not be desirable.)
331 *
332 * This function may be null if 'tc' is trivial. */
333 void (*tc_destroy)(struct tc *tc);
334
335 /* Retrieves details of 'netdev->tc' configuration into 'details'.
336 *
337 * The implementation should not need to perform any Netlink calls, because
338 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
339 * cached the configuration.
340 *
341 * The contents of 'details' should be documented as valid for 'ovs_name'
342 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
343 * (which is built as ovs-vswitchd.conf.db(8)).
344 *
345 * This function may be null if 'tc' is not configurable.
346 */
347 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
348
349 /* Reconfigures 'netdev->tc' according to 'details', performing any
350 * required Netlink calls to complete the reconfiguration.
351 *
352 * The contents of 'details' should be documented as valid for 'ovs_name'
353 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
354 * (which is built as ovs-vswitchd.conf.db(8)).
355 *
356 * This function may be null if 'tc' is not configurable.
357 */
358 int (*qdisc_set)(struct netdev *, const struct smap *details);
359
360 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
361 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
362 *
363 * The contents of 'details' should be documented as valid for 'ovs_name'
364 * in the "other_config" column in the "Queue" table in
365 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
366 *
367 * The implementation should not need to perform any Netlink calls, because
368 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
369 * cached the queue configuration.
370 *
371 * This function may be null if 'tc' does not have queues ('n_queues' is
372 * 0). */
373 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
374 struct smap *details);
375
376 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
377 * 'details', perfoming any required Netlink calls to complete the
378 * reconfiguration. The caller ensures that 'queue_id' is less than
379 * 'n_queues'.
380 *
381 * The contents of 'details' should be documented as valid for 'ovs_name'
382 * in the "other_config" column in the "Queue" table in
383 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
384 *
385 * This function may be null if 'tc' does not have queues or its queues are
386 * not configurable. */
387 int (*class_set)(struct netdev *, unsigned int queue_id,
388 const struct smap *details);
389
390 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
391 * tc_queue's within 'netdev->tc->queues'.
392 *
393 * This function may be null if 'tc' does not have queues or its queues
394 * cannot be deleted. */
395 int (*class_delete)(struct netdev *, struct tc_queue *queue);
396
397 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
398 * 'struct tc_queue's within 'netdev->tc->queues'.
399 *
400 * On success, initializes '*stats'.
401 *
402 * This function may be null if 'tc' does not have queues or if it cannot
403 * report queue statistics. */
404 int (*class_get_stats)(const struct netdev *netdev,
405 const struct tc_queue *queue,
406 struct netdev_queue_stats *stats);
407
408 /* Extracts queue stats from 'nlmsg', which is a response to a
409 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
410 *
411 * This function may be null if 'tc' does not have queues or if it cannot
412 * report queue statistics. */
413 int (*class_dump_stats)(const struct netdev *netdev,
414 const struct ofpbuf *nlmsg,
415 netdev_dump_queue_stats_cb *cb, void *aux);
416 };
417
/* Initializes the generic part of TC instance 'tc', binding it to
 * implementation 'ops' and starting with an empty queue map. */
static void
tc_init(struct tc *tc, const struct tc_ops *ops)
{
    tc->ops = ops;
    hmap_init(&tc->queues);
}
424
/* Destroys the generic part of 'tc' (the 'queues' hmap itself).  The TC
 * implementation remains responsible for freeing the individual queue
 * elements and the enclosing structure. */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
430
431 static const struct tc_ops tc_ops_htb;
432 static const struct tc_ops tc_ops_hfsc;
433 static const struct tc_ops tc_ops_codel;
434 static const struct tc_ops tc_ops_fqcodel;
435 static const struct tc_ops tc_ops_sfq;
436 static const struct tc_ops tc_ops_default;
437 static const struct tc_ops tc_ops_noop;
438 static const struct tc_ops tc_ops_other;
439
440 static const struct tc_ops *const tcs[] = {
441 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
442 &tc_ops_hfsc, /* Hierarchical fair service curve. */
443 &tc_ops_codel, /* Controlled delay */
444 &tc_ops_fqcodel, /* Fair queue controlled delay */
445 &tc_ops_sfq, /* Stochastic fair queueing */
446 &tc_ops_noop, /* Non operating qos type. */
447 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
448 &tc_ops_other, /* Some other qdisc. */
449 NULL
450 };
451
452 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
453 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
454 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
455
456 static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
457 int type,
458 unsigned int flags,
459 struct ofpbuf *);
460 static int tc_add_policer(struct netdev *,
461 uint32_t kbits_rate, uint32_t kbits_burst);
462
463 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
464 struct nlattr **options);
465 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
466 struct nlattr **options,
467 struct netdev_queue_stats *);
468 static int tc_query_class(const struct netdev *,
469 unsigned int handle, unsigned int parent,
470 struct ofpbuf **replyp);
471 static int tc_delete_class(const struct netdev *, unsigned int handle);
472
473 static int tc_del_qdisc(struct netdev *netdev);
474 static int tc_query_qdisc(const struct netdev *netdev);
475
476 static int tc_calc_cell_log(unsigned int mtu);
477 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
478 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
479 const struct tc_ratespec *rate);
480 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
481 \f
482 struct netdev_linux {
483 struct netdev up;
484
485 /* Protects all members below. */
486 struct ovs_mutex mutex;
487
488 unsigned int cache_valid;
489
490 bool miimon; /* Link status of last poll. */
491 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
492 struct timer miimon_timer;
493
494 int netnsid; /* Network namespace ID. */
495 /* The following are figured out "on demand" only. They are only valid
496 * when the corresponding VALID_* bit in 'cache_valid' is set. */
497 int ifindex;
498 struct eth_addr etheraddr;
499 int mtu;
500 unsigned int ifi_flags;
501 long long int carrier_resets;
502 uint32_t kbits_rate; /* Policing data. */
503 uint32_t kbits_burst;
504 int vport_stats_error; /* Cached error code from vport_get_stats().
505 0 or an errno value. */
506 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
507 int ether_addr_error; /* Cached error code from set/get etheraddr. */
508 int netdev_policing_error; /* Cached error code from set policing. */
509 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
510 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
511
512 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
513 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
514 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
515
516 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
517 struct tc *tc;
518
519 /* For devices of class netdev_tap_class only. */
520 int tap_fd;
521 bool present; /* If the device is present in the namespace */
522 uint64_t tx_dropped; /* tap device can drop if the iface is down */
523
524 /* LAG information. */
525 bool is_lag_master; /* True if the netdev is a LAG master. */
526 };
527
/* Receive queue for a Linux netdev. */
struct netdev_rxq_linux {
    struct netdev_rxq up;
    bool is_tap;    /* True for devices of class netdev_tap_class. */
    int fd;         /* Fd packets are received from.  For taps, presumably the
                     * single shared device fd (see comment near the end of
                     * this file) -- confirm against the rx implementation. */
};
533
534 /* This is set pretty low because we probably won't learn anything from the
535 * additional log messages. */
536 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
537
538 /* Polling miimon status for all ports causes performance degradation when
539 * handling a large number of ports. If there are no devices using miimon, then
540 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
541 *
542 * Readers do not depend on this variable synchronizing with the related
543 * changes in the device miimon status, so we can use atomic_count. */
544 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
545
546 static void netdev_linux_run(const struct netdev_class *);
547
548 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
549 int cmd, const char *cmd_name);
550 static int get_flags(const struct netdev *, unsigned int *flags);
551 static int set_flags(const char *, unsigned int flags);
552 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
553 enum netdev_flags on, enum netdev_flags *old_flagsp)
554 OVS_REQUIRES(netdev->mutex);
555 static int get_ifindex(const struct netdev *, int *ifindexp);
556 static int do_set_addr(struct netdev *netdev,
557 int ioctl_nr, const char *ioctl_name,
558 struct in_addr addr);
559 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
560 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
561 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
562 static int af_packet_sock(void);
563 static bool netdev_linux_miimon_enabled(void);
564 static void netdev_linux_miimon_run(void);
565 static void netdev_linux_miimon_wait(void);
566 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
567
/* Returns true if 'netdev_class' is one of the Linux-backed classes defined
 * in this file, which all share netdev_linux_run() as their 'run' hook. */
static bool
is_netdev_linux_class(const struct netdev_class *netdev_class)
{
    return netdev_class->run == netdev_linux_run;
}
573
/* Returns true if 'netdev' is a tap device (class netdev_tap_class). */
static bool
is_tap_netdev(const struct netdev *netdev)
{
    return netdev_get_class(netdev) == &netdev_tap_class;
}
579
/* Converts 'netdev' to its enclosing netdev_linux.  Asserts that 'netdev'
 * really belongs to one of the Linux netdev classes in this file. */
static struct netdev_linux *
netdev_linux_cast(const struct netdev *netdev)
{
    ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));

    return CONTAINER_OF(netdev, struct netdev_linux, up);
}
587
/* Converts 'rx' to its enclosing netdev_rxq_linux.  Asserts that the owning
 * netdev belongs to one of the Linux netdev classes in this file. */
static struct netdev_rxq_linux *
netdev_rxq_linux_cast(const struct netdev_rxq *rx)
{
    ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
    return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
}
594 \f
595 static int
596 netdev_linux_netnsid_update__(struct netdev_linux *netdev)
597 {
598 struct dpif_netlink_vport reply;
599 struct ofpbuf *buf;
600 int error;
601
602 error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
603 if (error) {
604 if (error == ENOENT) {
605 /* Assume it is local if there is no API (e.g. if the openvswitch
606 * kernel module is not loaded). */
607 netnsid_set_local(&netdev->netnsid);
608 } else {
609 netnsid_unset(&netdev->netnsid);
610 }
611 return error;
612 }
613
614 netnsid_set(&netdev->netnsid, reply.netnsid);
615 ofpbuf_delete(buf);
616 return 0;
617 }
618
619 static int
620 netdev_linux_netnsid_update(struct netdev_linux *netdev)
621 {
622 if (netnsid_is_unset(netdev->netnsid)) {
623 if (netdev_get_class(&netdev->up) == &netdev_tap_class) {
624 netnsid_set_local(&netdev->netnsid);
625 } else {
626 return netdev_linux_netnsid_update__(netdev);
627 }
628 }
629
630 return 0;
631 }
632
/* Returns true if 'netdev' lives in the network namespace identified by
 * 'nsid', refreshing the cached namespace ID first if necessary. */
static bool
netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_eq(netdev->netnsid, nsid);
}
639
/* Returns true if 'netdev' lives in a network namespace other than the local
 * one, refreshing the cached namespace ID first if necessary. */
static bool
netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_is_remote(netdev->netnsid);
}
646
647 static int netdev_linux_update_via_netlink(struct netdev_linux *);
648 static void netdev_linux_update(struct netdev_linux *netdev, int,
649 const struct rtnetlink_change *)
650 OVS_REQUIRES(netdev->mutex);
651 static void netdev_linux_changed(struct netdev_linux *netdev,
652 unsigned int ifi_flags, unsigned int mask)
653 OVS_REQUIRES(netdev->mutex);
654
655 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
656 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
657 * if no such socket could be created. */
658 static struct nl_sock *
659 netdev_linux_notify_sock(void)
660 {
661 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
662 static struct nl_sock *sock;
663 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
664 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
665
666 if (ovsthread_once_start(&once)) {
667 int error;
668
669 error = nl_sock_create(NETLINK_ROUTE, &sock);
670 if (!error) {
671 size_t i;
672
673 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
674 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
675 if (error) {
676 nl_sock_destroy(sock);
677 sock = NULL;
678 break;
679 }
680 }
681 }
682 nl_sock_listen_all_nsid(sock, true);
683 ovsthread_once_done(&once);
684 }
685
686 return sock;
687 }
688
/* Returns true if at least one netdev is currently using miimon polling
 * (tracked by the global 'miimon_cnt'). */
static bool
netdev_linux_miimon_enabled(void)
{
    return atomic_count_get(&miimon_cnt) > 0;
}
694
/* Returns true if rtnetlink kind string 'kind' names a Linux link
 * aggregation master type ("bond" or "team"). */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    return !strcmp(kind, "bond") || !strcmp(kind, "team");
}
704
705 static void
706 netdev_linux_update_lag(struct rtnetlink_change *change)
707 OVS_REQUIRES(lag_mutex)
708 {
709 struct linux_lag_slave *lag;
710
711 if (!rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
712 return;
713 }
714
715 if (change->slave && netdev_linux_kind_is_lag(change->slave)) {
716 lag = shash_find_data(&lag_shash, change->ifname);
717
718 if (!lag) {
719 struct netdev *master_netdev;
720 char master_name[IFNAMSIZ];
721 uint32_t block_id;
722 int error = 0;
723
724 if_indextoname(change->master_ifindex, master_name);
725 master_netdev = netdev_from_name(master_name);
726 if (!master_netdev) {
727 return;
728 }
729
730 if (is_netdev_linux_class(master_netdev->netdev_class)) {
731 block_id = netdev_get_block_id(master_netdev);
732 if (!block_id) {
733 netdev_close(master_netdev);
734 return;
735 }
736
737 lag = xmalloc(sizeof *lag);
738 lag->block_id = block_id;
739 lag->node = shash_add(&lag_shash, change->ifname, lag);
740
741 /* LAG master is linux netdev so add slave to same block. */
742 error = tc_add_del_ingress_qdisc(change->if_index, true,
743 block_id);
744 if (error) {
745 VLOG_WARN("failed to bind LAG slave to master's block");
746 shash_delete(&lag_shash, lag->node);
747 free(lag);
748 }
749 }
750
751 netdev_close(master_netdev);
752 }
753 } else if (change->master_ifindex == 0) {
754 /* Check if this was a lag slave that has been freed. */
755 lag = shash_find_data(&lag_shash, change->ifname);
756
757 if (lag) {
758 tc_add_del_ingress_qdisc(change->if_index, false,
759 lag->block_id);
760 shash_delete(&lag_shash, lag->node);
761 free(lag);
762 }
763 }
764 }
765
/* netdev_class 'run' callback shared by the Linux classes: runs miimon
 * polling when enabled and drains the shared rtnetlink notification socket,
 * applying each parsed change to the matching netdev (or to LAG slave state
 * when the device is not an OvS netdev). */
static void
netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
{
    struct nl_sock *sock;
    int error;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_run();
    }

    sock = netdev_linux_notify_sock();
    if (!sock) {
        return;
    }

    /* Loop until the socket is drained (EAGAIN) or a hard error occurs. */
    do {
        uint64_t buf_stub[4096 / 8];
        int nsid;
        struct ofpbuf buf;

        ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
        error = nl_sock_recv(sock, &buf, &nsid, false);
        if (!error) {
            struct rtnetlink_change change;

            if (rtnetlink_parse(&buf, &change)) {
                struct netdev *netdev_ = NULL;
                char dev_name[IFNAMSIZ];

                /* The message may carry only an ifindex; resolve a name so
                 * the device can be looked up. */
                if (!change.ifname) {
                    change.ifname = if_indextoname(change.if_index, dev_name);
                }

                if (change.ifname) {
                    netdev_ = netdev_from_name(change.ifname);
                }
                if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
                    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

                    ovs_mutex_lock(&netdev->mutex);
                    netdev_linux_update(netdev, nsid, &change);
                    ovs_mutex_unlock(&netdev->mutex);
                }
                else if (!netdev_ && change.ifname) {
                    /* Netdev is not present in OvS but its master could be. */
                    ovs_mutex_lock(&lag_mutex);
                    netdev_linux_update_lag(&change);
                    ovs_mutex_unlock(&lag_mutex);
                }
                netdev_close(netdev_);
            }
        } else if (error == ENOBUFS) {
            /* The kernel receive buffer overflowed and notifications were
             * lost: drain the socket and refresh the flags of every device.
             * NOTE(review): only netdev_linux_class devices are refreshed
             * here, not tap/internal ones -- confirm this is intended. */
            struct shash device_shash;
            struct shash_node *node;

            nl_sock_drain(sock);

            shash_init(&device_shash);
            netdev_get_devices(&netdev_linux_class, &device_shash);
            SHASH_FOR_EACH (node, &device_shash) {
                struct netdev *netdev_ = node->data;
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
                unsigned int flags;

                ovs_mutex_lock(&netdev->mutex);
                get_flags(netdev_, &flags);
                netdev_linux_changed(netdev, flags, 0);
                ovs_mutex_unlock(&netdev->mutex);

                netdev_close(netdev_);
            }
            shash_destroy(&device_shash);
        } else if (error != EAGAIN) {
            static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
            VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
                         ovs_strerror(error));
        }
        ofpbuf_uninit(&buf);
    } while (!error);
}
846
847 static void
848 netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
849 {
850 struct nl_sock *sock;
851
852 if (netdev_linux_miimon_enabled()) {
853 netdev_linux_miimon_wait();
854 }
855 sock = netdev_linux_notify_sock();
856 if (sock) {
857 nl_sock_wait(sock, POLLIN);
858 }
859 }
860
/* Records that 'dev' changed: bumps its change sequence, counts a carrier
 * reset when IFF_RUNNING toggled, stores the new 'ifi_flags', and keeps only
 * the cached state whose VALID_* bits are set in 'mask' (pass 0 to
 * invalidate everything). */
static void
netdev_linux_changed(struct netdev_linux *dev,
                     unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(dev->mutex)
{
    netdev_change_seq_changed(&dev->up);

    /* Each IFF_RUNNING transition counts as one carrier reset. */
    if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
        dev->carrier_resets++;
    }
    dev->ifi_flags = ifi_flags;

    dev->cache_valid &= mask;
    /* If cached addresses were invalidated, flush the global address list
     * cache too. */
    if (!(mask & VALID_IN)) {
        netdev_get_addrs_list_flush();
    }
}
878
/* Applies parsed rtnetlink message 'change' to the cached state of 'dev'.
 * Link messages update MTU, MAC, LAG-master status, and ifindex; address
 * messages invalidate the cached addresses. */
static void
netdev_linux_update__(struct netdev_linux *dev,
                      const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, and ip addresses. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;

                /* The mac addr has been changed, report it now. */
                rtnetlink_report_link();
            }

            if (change->master && netdev_linux_kind_is_lag(change->master)) {
                dev->is_lag_master = true;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
            dev->present = true;
        } else {
            /* FIXME */
            /* Link removed: invalidate all cached state, mark the device as
             * absent, and forget its namespace. */
            netdev_linux_changed(dev, change->ifi_flags, 0);
            dev->present = false;
            netnsid_unset(&dev->netnsid);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        OVS_NOT_REACHED();
    }
}
927
/* Applies rtnetlink 'change' to 'dev', but only if the message originated in
 * the same network namespace ('nsid') as the device. */
static void
netdev_linux_update(struct netdev_linux *dev, int nsid,
                    const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (netdev_linux_netnsid_is_eq(dev, nsid)) {
        netdev_linux_update__(dev, change);
    }
}
937
/* netdev_class 'alloc' callback: allocates a zero-initialized netdev_linux
 * and returns its embedded generic netdev. */
static struct netdev *
netdev_linux_alloc(void)
{
    struct netdev_linux *netdev = xzalloc(sizeof *netdev);
    return &netdev->up;
}
944
945 static int
946 netdev_linux_common_construct(struct netdev *netdev_)
947 {
948 /* Prevent any attempt to create (or open) a network device named "default"
949 * or "all". These device names are effectively reserved on Linux because
950 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
951 * itself this wouldn't call for any special treatment, but in practice if
952 * a program tries to create devices with these names, it causes the kernel
953 * to fire a "new device" notification event even though creation failed,
954 * and in turn that causes OVS to wake up and try to create them again,
955 * which ends up as a 100% CPU loop. */
956 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
957 const char *name = netdev_->name;
958 if (!strcmp(name, "default") || !strcmp(name, "all")) {
959 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
960 VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
961 name);
962 return EINVAL;
963 }
964
965 /* The device could be in the same network namespace or in another one. */
966 netnsid_unset(&netdev->netnsid);
967 ovs_mutex_init(&netdev->mutex);
968 return 0;
969 }
970
971 /* Creates system and internal devices. */
972 static int
973 netdev_linux_construct(struct netdev *netdev_)
974 {
975 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
976 int error = netdev_linux_common_construct(netdev_);
977 if (error) {
978 return error;
979 }
980
981 error = get_flags(&netdev->up, &netdev->ifi_flags);
982 if (error == ENODEV) {
983 if (netdev->up.netdev_class != &netdev_internal_class) {
984 /* The device does not exist, so don't allow it to be opened. */
985 return ENODEV;
986 } else {
987 /* "Internal" netdevs have to be created as netdev objects before
988 * they exist in the kernel, because creating them in the kernel
989 * happens by passing a netdev object to dpif_port_add().
990 * Therefore, ignore the error. */
991 }
992 }
993
994 return 0;
995 }
996
997 /* For most types of netdevs we open the device for each call of
998 * netdev_open(). However, this is not the case with tap devices,
999 * since it is only possible to open the device once. In this
1000 * situation we share a single file descriptor, and consequently
1001 * buffers, across all readers. Therefore once data is read it will
1002 * be unavailable to other reads for tap devices. */
1003 static int
1004 netdev_linux_construct_tap(struct netdev *netdev_)
1005 {
1006 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1007 static const char tap_dev[] = "/dev/net/tun";
1008 const char *name = netdev_->name;
1009 struct ifreq ifr;
1010
1011 int error = netdev_linux_common_construct(netdev_);
1012 if (error) {
1013 return error;
1014 }
1015
1016 /* Open tap device. */
1017 netdev->tap_fd = open(tap_dev, O_RDWR);
1018 if (netdev->tap_fd < 0) {
1019 error = errno;
1020 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
1021 return error;
1022 }
1023
1024 /* Create tap device. */
1025 get_flags(&netdev->up, &netdev->ifi_flags);
1026 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
1027 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
1028 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
1029 VLOG_WARN("%s: creating tap device failed: %s", name,
1030 ovs_strerror(errno));
1031 error = errno;
1032 goto error_close;
1033 }
1034
1035 /* Make non-blocking. */
1036 error = set_nonblocking(netdev->tap_fd);
1037 if (error) {
1038 goto error_close;
1039 }
1040
1041 if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
1042 VLOG_WARN("%s: creating tap device failed (persist): %s", name,
1043 ovs_strerror(errno));
1044 error = errno;
1045 goto error_close;
1046 }
1047
1048 netdev->present = true;
1049 return 0;
1050
1051 error_close:
1052 close(netdev->tap_fd);
1053 return error;
1054 }
1055
1056 static void
1057 netdev_linux_destruct(struct netdev *netdev_)
1058 {
1059 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1060
1061 if (netdev->tc && netdev->tc->ops->tc_destroy) {
1062 netdev->tc->ops->tc_destroy(netdev->tc);
1063 }
1064
1065 if (netdev_get_class(netdev_) == &netdev_tap_class
1066 && netdev->tap_fd >= 0)
1067 {
1068 ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
1069 close(netdev->tap_fd);
1070 }
1071
1072 if (netdev->miimon_interval > 0) {
1073 atomic_count_dec(&miimon_cnt);
1074 }
1075
1076 ovs_mutex_destroy(&netdev->mutex);
1077 }
1078
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    /* Release the storage obtained in netdev_linux_alloc(). */
    free(netdev_linux_cast(netdev_));
}
1085
1086 static struct netdev_rxq *
1087 netdev_linux_rxq_alloc(void)
1088 {
1089 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
1090 return &rx->up;
1091 }
1092
/* Sets up the receive side of 'rxq_'.  For tap devices this just borrows
 * the netdev's shared tap fd; for everything else it creates and configures
 * a dedicated AF_PACKET socket bound to the device.  Returns 0 on success,
 * otherwise a positive errno value. */
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        /* All tap rx queues share the single fd opened at construction
         * time (see netdev_linux_construct_tap()). */
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* Result of tcpdump -dd inbound */
        /* Classic BPF program that accepts only inbound frames: it checks
         * the special "packet type" pseudo-offset and returns 0 (drop) for
         * PACKET_OUTGOING (4), 65535 (accept) otherwise.  This keeps our
         * own transmissions from being received back. */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
            { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        /* Request PACKET_AUXDATA control messages so that stripped VLAN
         * tags can be recovered in netdev_linux_rxq_recv_sock(). */
        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets. */
        /* Note: setsockopt() returns -1 or 0, so a nonzero 'error' here
         * only signals failure; the real code is then taken from errno. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    /* Only reached on the non-tap path; close the socket if it was
     * created. */
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1180
1181 static void
1182 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
1183 {
1184 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1185
1186 if (!rx->is_tap) {
1187 close(rx->fd);
1188 }
1189 }
1190
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    /* Release the storage obtained in netdev_linux_rxq_alloc(). */
    free(netdev_rxq_linux_cast(rxq_));
}
1198
1199 static ovs_be16
1200 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
1201 {
1202 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1203 return htons(aux->tp_vlan_tpid);
1204 } else if (double_tagged) {
1205 return htons(ETH_TYPE_VLAN_8021AD);
1206 } else {
1207 return htons(ETH_TYPE_VLAN_8021Q);
1208 }
1209 }
1210
1211 static bool
1212 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
1213 {
1214 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
1215 }
1216
/* Receives one frame from packet socket 'fd' into 'buffer'.
 *
 * The kernel strips a VLAN tag from received frames and reports it through
 * a PACKET_AUXDATA control message instead, so this function re-inserts the
 * tag into the packet data when one is present.  Returns 0 on success,
 * otherwise a positive errno value: EMSGSIZE if the frame was truncated,
 * EINVAL if a tagged frame is shorter than an Ethernet header. */
static int
netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
{
    size_t size;
    ssize_t retval;
    struct iovec iov;
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffer;
    struct msghdr msgh;

    /* Reserve headroom for a single VLAN tag */
    dp_packet_reserve(buffer, VLAN_HEADER_LEN);
    size = dp_packet_tailroom(buffer);

    iov.iov_base = dp_packet_data(buffer);
    iov.iov_len = size;
    msgh.msg_name = NULL;
    msgh.msg_namelen = 0;
    msgh.msg_iov = &iov;
    msgh.msg_iovlen = 1;
    msgh.msg_control = &cmsg_buffer;
    msgh.msg_controllen = sizeof cmsg_buffer;
    msgh.msg_flags = 0;

    /* Retry reads that are interrupted by a signal. */
    do {
        retval = recvmsg(fd, &msgh, MSG_TRUNC);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    } else if (retval > size) {
        /* With MSG_TRUNC, recvmsg() returns the frame's full length, so a
         * value larger than 'size' means the frame did not fit. */
        return EMSGSIZE;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);

    /* Walk the control messages looking for PACKET_AUXDATA. */
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
        const struct tpacket_auxdata *aux;

        if (cmsg->cmsg_level != SOL_PACKET
            || cmsg->cmsg_type != PACKET_AUXDATA
            || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
            continue;
        }

        aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
        if (auxdata_has_vlan_tci(aux)) {
            struct eth_header *eth;
            bool double_tagged;

            /* A tagged frame must at least hold an Ethernet header. */
            if (retval < ETH_HEADER_LEN) {
                return EINVAL;
            }

            eth = dp_packet_data(buffer);
            /* If the payload already begins with an 802.1Q tag, the frame
             * was double tagged and the stripped (outer) tag is 802.1ad. */
            double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);

            eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
                          htons(aux->tp_vlan_tci));
            break;
        }
    }

    return 0;
}
1285
/* Reads one packet from tap fd 'fd' into 'buffer'.  Returns 0 on success,
 * otherwise a positive errno value. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    ssize_t n;

    /* Retry reads interrupted by a signal. */
    do {
        n = read(fd, dp_packet_data(buffer), dp_packet_tailroom(buffer));
    } while (n < 0 && errno == EINTR);

    if (n < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n);
    return 0;
}
1303
1304 static int
1305 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
1306 int *qfill)
1307 {
1308 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1309 struct netdev *netdev = rx->up.netdev;
1310 struct dp_packet *buffer;
1311 ssize_t retval;
1312 int mtu;
1313
1314 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1315 mtu = ETH_PAYLOAD_MAX;
1316 }
1317
1318 /* Assume Ethernet port. No need to set packet_type. */
1319 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1320 DP_NETDEV_HEADROOM);
1321 retval = (rx->is_tap
1322 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1323 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1324
1325 if (retval) {
1326 if (retval != EAGAIN && retval != EMSGSIZE) {
1327 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1328 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1329 }
1330 dp_packet_delete(buffer);
1331 } else {
1332 dp_packet_batch_init_packet(batch, buffer);
1333 }
1334
1335 if (qfill) {
1336 *qfill = -ENOTSUP;
1337 }
1338
1339 return retval;
1340 }
1341
1342 static void
1343 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1344 {
1345 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1346 poll_fd_wait(rx->fd, POLLIN);
1347 }
1348
1349 static int
1350 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1351 {
1352 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1353 if (rx->is_tap) {
1354 struct ifreq ifr;
1355 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1356 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1357 if (error) {
1358 return error;
1359 }
1360 drain_fd(rx->fd, ifr.ifr_qlen);
1361 return 0;
1362 } else {
1363 return drain_rcvbuf(rx->fd);
1364 }
1365 }
1366
1367 static int
1368 netdev_linux_sock_batch_send(int sock, int ifindex,
1369 struct dp_packet_batch *batch)
1370 {
1371 const size_t size = dp_packet_batch_size(batch);
1372 /* We don't bother setting most fields in sockaddr_ll because the
1373 * kernel ignores them for SOCK_RAW. */
1374 struct sockaddr_ll sll = { .sll_family = AF_PACKET,
1375 .sll_ifindex = ifindex };
1376
1377 struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
1378 struct iovec *iov = xmalloc(sizeof(*iov) * size);
1379
1380 struct dp_packet *packet;
1381 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
1382 iov[i].iov_base = dp_packet_data(packet);
1383 iov[i].iov_len = dp_packet_size(packet);
1384 mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
1385 .msg_namelen = sizeof sll,
1386 .msg_iov = &iov[i],
1387 .msg_iovlen = 1 };
1388 }
1389
1390 int error = 0;
1391 for (uint32_t ofs = 0; ofs < size; ) {
1392 ssize_t retval;
1393 do {
1394 retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
1395 error = retval < 0 ? errno : 0;
1396 } while (error == EINTR);
1397 if (error) {
1398 break;
1399 }
1400 ofs += retval;
1401 }
1402
1403 free(mmsg);
1404 free(iov);
1405 return error;
1406 }
1407
1408 /* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1409 * essential, because packets sent to a tap device with an AF_PACKET socket
1410 * will loop back to be *received* again on the tap device. This doesn't occur
1411 * on other interface types because we attach a socket filter to the rx
1412 * socket. */
1413 static int
1414 netdev_linux_tap_batch_send(struct netdev *netdev_,
1415 struct dp_packet_batch *batch)
1416 {
1417 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1418 struct dp_packet *packet;
1419
1420 /* The Linux tap driver returns EIO if the device is not up,
1421 * so if the device is not up, don't waste time sending it.
1422 * However, if the device is in another network namespace
1423 * then OVS can't retrieve the state. In that case, send the
1424 * packets anyway. */
1425 if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
1426 netdev->tx_dropped += dp_packet_batch_size(batch);
1427 return 0;
1428 }
1429
1430 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
1431 size_t size = dp_packet_size(packet);
1432 ssize_t retval;
1433 int error;
1434
1435 do {
1436 retval = write(netdev->tap_fd, dp_packet_data(packet), size);
1437 error = retval < 0 ? errno : 0;
1438 } while (error == EINTR);
1439
1440 if (error) {
1441 /* The Linux tap driver returns EIO if the device is not up. From
1442 * the OVS side this is not an error, so we ignore it; otherwise,
1443 * return the erro. */
1444 if (error != EIO) {
1445 return error;
1446 }
1447 } else if (retval != size) {
1448 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
1449 "bytes of %"PRIuSIZE") on %s",
1450 retval, size, netdev_get_name(netdev_));
1451 return EMSGSIZE;
1452 }
1453 }
1454 return 0;
1455 }
1456
/* Sends 'batch' on 'netdev'.  Returns 0 if successful, otherwise a positive
 * errno value.  Returns EAGAIN without blocking if the packet cannot be queued
 * immediately.  Returns EMSGSIZE if a partial packet was transmitted or if
 * the packet is too big or too small to transmit on the device.
 *
 * The kernel maintains a packet transmission queue, so the caller is not
 * expected to do additional queuing of packets.
 *
 * The batch is always consumed (freed), even on error. */
static int
netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
                  struct dp_packet_batch *batch,
                  bool concurrent_txq OVS_UNUSED)
{
    int error = 0;
    int sock = 0;

    if (!is_tap_netdev(netdev_)) {
        /* Non-tap devices transmit through a shared AF_PACKET socket. */
        if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_))) {
            /* Sending to a device in another network namespace is not
             * supported. */
            error = EOPNOTSUPP;
            goto free_batch;
        }

        /* af_packet_sock() yields a negative errno value on failure. */
        sock = af_packet_sock();
        if (sock < 0) {
            error = -sock;
            goto free_batch;
        }

        /* netdev_get_ifindex() yields a negative errno value on failure. */
        int ifindex = netdev_get_ifindex(netdev_);
        if (ifindex < 0) {
            error = -ifindex;
            goto free_batch;
        }

        error = netdev_linux_sock_batch_send(sock, ifindex, batch);
    } else {
        /* Tap devices must be written through their tap fd; see the comment
         * on netdev_linux_tap_batch_send(). */
        error = netdev_linux_tap_batch_send(netdev_, batch);
    }
    if (error) {
        if (error == ENOBUFS) {
            /* The Linux AF_PACKET implementation never blocks waiting
             * for room for packets, instead returning ENOBUFS.
             * Translate this into EAGAIN for the caller. */
            error = EAGAIN;
        } else {
            VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
    }

free_batch:
    dp_packet_delete_batch(batch, true);
    return error;
}
1510
1511 /* Registers with the poll loop to wake up from the next call to poll_block()
1512 * when the packet transmission queue has sufficient room to transmit a packet
1513 * with netdev_send().
1514 *
1515 * The kernel maintains a packet transmission queue, so the client is not
1516 * expected to do additional queuing of packets. Thus, this function is
1517 * unlikely to ever be used. It is included for completeness. */
1518 static void
1519 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1520 {
1521 if (is_tap_netdev(netdev)) {
1522 /* TAP device always accepts packets.*/
1523 poll_immediate_wake();
1524 }
1525 }
1526
/* Attempts to set 'netdev''s MAC address to 'mac'.  Returns 0 if successful,
 * otherwise a positive errno value. */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        /* Changing the MAC of a device in another network namespace is not
         * supported. */
        error = EOPNOTSUPP;
        goto exit;
    }

    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        /* Nothing to do if the cached lookup failed or the address already
         * matches the requested one. */
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    /* Cache the outcome.  ENODEV is cached too so repeated attempts on a
     * missing device do not re-issue the request. */
    if (!error || error == ENODEV) {
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            netdev->etheraddr = mac;
        }
    }

    /* Bring a tap device back up if it was up before. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1571
1572 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1573 static int
1574 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1575 {
1576 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1577 int error;
1578
1579 ovs_mutex_lock(&netdev->mutex);
1580 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1581 netdev_linux_update_via_netlink(netdev);
1582 }
1583
1584 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1585 /* Fall back to ioctl if netlink fails */
1586 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1587 &netdev->etheraddr);
1588 netdev->cache_valid |= VALID_ETHERADDR;
1589 }
1590
1591 error = netdev->ether_addr_error;
1592 if (!error) {
1593 *mac = netdev->etheraddr;
1594 }
1595 ovs_mutex_unlock(&netdev->mutex);
1596
1597 return error;
1598 }
1599
1600 static int
1601 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1602 {
1603 int error;
1604
1605 if (!(netdev->cache_valid & VALID_MTU)) {
1606 netdev_linux_update_via_netlink(netdev);
1607 }
1608
1609 if (!(netdev->cache_valid & VALID_MTU)) {
1610 /* Fall back to ioctl if netlink fails */
1611 struct ifreq ifr;
1612
1613 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1614 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1615 netdev->mtu = ifr.ifr_mtu;
1616 netdev->cache_valid |= VALID_MTU;
1617 }
1618
1619 error = netdev->netdev_mtu_error;
1620 if (!error) {
1621 *mtup = netdev->mtu;
1622 }
1623
1624 return error;
1625 }
1626
1627 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1628 * in bytes, not including the hardware header; thus, this is typically 1500
1629 * bytes for Ethernet devices. */
1630 static int
1631 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1632 {
1633 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1634 int error;
1635
1636 ovs_mutex_lock(&netdev->mutex);
1637 error = netdev_linux_get_mtu__(netdev, mtup);
1638 ovs_mutex_unlock(&netdev->mutex);
1639
1640 return error;
1641 }
1642
/* Sets the maximum size of transmitted (MTU) for given device using linux
 * networking ioctl interface.
 */
static int
netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ifreq ifr;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        /* Devices in another network namespace cannot be configured here. */
        error = EOPNOTSUPP;
        goto exit;
    }

    if (netdev->cache_valid & VALID_MTU) {
        error = netdev->netdev_mtu_error;
        /* Skip the ioctl if the last lookup failed or the MTU already
         * matches the requested value. */
        if (error || netdev->mtu == mtu) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_MTU;
    }
    ifr.ifr_mtu = mtu;
    error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
                                SIOCSIFMTU, "SIOCSIFMTU");
    /* Cache the result.  ENODEV is cached as well so that a missing device
     * is not retried on every call. */
    if (!error || error == ENODEV) {
        netdev->netdev_mtu_error = error;
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }
exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1678
1679 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1680 * On failure, returns a negative errno value. */
1681 static int
1682 netdev_linux_get_ifindex(const struct netdev *netdev_)
1683 {
1684 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1685 int ifindex, error;
1686
1687 ovs_mutex_lock(&netdev->mutex);
1688 if (netdev_linux_netnsid_is_remote(netdev)) {
1689 error = EOPNOTSUPP;
1690 goto exit;
1691 }
1692 error = get_ifindex(netdev_, &ifindex);
1693
1694 exit:
1695 ovs_mutex_unlock(&netdev->mutex);
1696 return error ? -error : ifindex;
1697 }
1698
1699 static int
1700 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1701 {
1702 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1703
1704 ovs_mutex_lock(&netdev->mutex);
1705 if (netdev->miimon_interval > 0) {
1706 *carrier = netdev->miimon;
1707 } else {
1708 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1709 }
1710 ovs_mutex_unlock(&netdev->mutex);
1711
1712 return 0;
1713 }
1714
1715 static long long int
1716 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1717 {
1718 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1719 long long int carrier_resets;
1720
1721 ovs_mutex_lock(&netdev->mutex);
1722 carrier_resets = netdev->carrier_resets;
1723 ovs_mutex_unlock(&netdev->mutex);
1724
1725 return carrier_resets;
1726 }
1727
/* Issues MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name',
 * using 'data' as both request and response.
 *
 * The MII ioctls pass 'struct mii_ioctl_data' inline in the ifr_data area
 * of 'struct ifreq' rather than through a pointer, hence the memcpy()s into
 * and out of 'ifr'.  Returns 0 on success, otherwise a positive errno
 * value. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    /* Copy the request into the ifreq's data area. */
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    /* Copy the (possibly updated) response back out. */
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
1742
/* Determines link status for device 'name' and stores it in '*miimon'.
 *
 * Tries MII first (SIOCGMIIPHY to find the PHY, then SIOCGMIIREG to read
 * the BMSR link-status bit); if that fails, falls back to ETHTOOL_GLINK.
 * Returns 0 on success, otherwise a positive errno value from the last
 * method attempted; '*miimon' is false unless a query succeeded. */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            /* BMSR_LSTATUS set means the link is up. */
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        }
    }
    if (error) {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* ETHTOOL_GLINK responds with a 'struct ethtool_value'; 'ecmd'
             * merely served as a large-enough buffer for the ioctl. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
1785
1786 static int
1787 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1788 long long int interval)
1789 {
1790 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1791
1792 ovs_mutex_lock(&netdev->mutex);
1793 interval = interval > 0 ? MAX(interval, 100) : 0;
1794 if (netdev->miimon_interval != interval) {
1795 if (interval && !netdev->miimon_interval) {
1796 atomic_count_inc(&miimon_cnt);
1797 } else if (!interval && netdev->miimon_interval) {
1798 atomic_count_dec(&miimon_cnt);
1799 }
1800
1801 netdev->miimon_interval = interval;
1802 timer_set_expired(&netdev->miimon_timer);
1803 }
1804 ovs_mutex_unlock(&netdev->mutex);
1805
1806 return 0;
1807 }
1808
1809 static void
1810 netdev_linux_miimon_run(void)
1811 {
1812 struct shash device_shash;
1813 struct shash_node *node;
1814
1815 shash_init(&device_shash);
1816 netdev_get_devices(&netdev_linux_class, &device_shash);
1817 SHASH_FOR_EACH (node, &device_shash) {
1818 struct netdev *netdev = node->data;
1819 struct netdev_linux *dev = netdev_linux_cast(netdev);
1820 bool miimon;
1821
1822 ovs_mutex_lock(&dev->mutex);
1823 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1824 netdev_linux_get_miimon(dev->up.name, &miimon);
1825 if (miimon != dev->miimon) {
1826 dev->miimon = miimon;
1827 netdev_linux_changed(dev, dev->ifi_flags, 0);
1828 }
1829
1830 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1831 }
1832 ovs_mutex_unlock(&dev->mutex);
1833 netdev_close(netdev);
1834 }
1835
1836 shash_destroy(&device_shash);
1837 }
1838
1839 static void
1840 netdev_linux_miimon_wait(void)
1841 {
1842 struct shash device_shash;
1843 struct shash_node *node;
1844
1845 shash_init(&device_shash);
1846 netdev_get_devices(&netdev_linux_class, &device_shash);
1847 SHASH_FOR_EACH (node, &device_shash) {
1848 struct netdev *netdev = node->data;
1849 struct netdev_linux *dev = netdev_linux_cast(netdev);
1850
1851 ovs_mutex_lock(&dev->mutex);
1852 if (dev->miimon_interval > 0) {
1853 timer_wait(&dev->miimon_timer);
1854 }
1855 ovs_mutex_unlock(&dev->mutex);
1856 netdev_close(netdev);
1857 }
1858 shash_destroy(&device_shash);
1859 }
1860
/* Exchanges the values pointed to by 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved_a = *a;

    *a = *b;
    *b = saved_a;
}
1868
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is allowed to be misaligned. */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct ovs_vport_stats *src)
{
    /* get_32aligned_u64() reads each 64-bit counter safely even when 'src'
     * is only 32-bit aligned. */
    dst->rx_packets = get_32aligned_u64(&src->rx_packets);
    dst->tx_packets = get_32aligned_u64(&src->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&src->rx_errors);
    dst->tx_errors = get_32aligned_u64(&src->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
    /* 'struct ovs_vport_stats' carries none of the finer-grained counters
     * below, so they are reported as zero. */
    dst->multicast = 0;
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
}
1898
1899 static int
1900 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1901 {
1902 struct dpif_netlink_vport reply;
1903 struct ofpbuf *buf;
1904 int error;
1905
1906 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1907 if (error) {
1908 return error;
1909 } else if (!reply.stats) {
1910 ofpbuf_delete(buf);
1911 return EOPNOTSUPP;
1912 }
1913
1914 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1915
1916 ofpbuf_delete(buf);
1917
1918 return 0;
1919 }
1920
1921 static void
1922 get_stats_via_vport(const struct netdev *netdev_,
1923 struct netdev_stats *stats)
1924 {
1925 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1926
1927 if (!netdev->vport_stats_error ||
1928 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1929 int error;
1930
1931 error = get_stats_via_vport__(netdev_, stats);
1932 if (error && error != ENOENT && error != ENODEV) {
1933 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1934 "(%s)",
1935 netdev_get_name(netdev_), ovs_strerror(error));
1936 }
1937 netdev->vport_stats_error = error;
1938 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1939 }
1940 }
1941
/* Retrieves current device stats for 'netdev-linux'.
 *
 * Combines two sources: vport-layer statistics (which track the switch's
 * view) and kernel netdev statistics via netlink.  When both are available,
 * kernel packet/byte counts replace the vport's, and kernel error counters
 * are added on top. */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    /* Fill '*stats' from the vport layer (best effort; failure is recorded
     * in netdev->vport_stats_error). */
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; if the vport stats succeeded, '*stats' is already
         * usable, so report success. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Use kernel netdev's packet and byte counts since vport's counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO are
         * enabled. */
        stats->rx_packets = dev_stats.rx_packets;
        stats->rx_bytes = dev_stats.rx_bytes;
        stats->tx_packets = dev_stats.tx_packets;
        stats->tx_bytes = dev_stats.tx_bytes;

        stats->rx_errors += dev_stats.rx_errors;
        stats->tx_errors += dev_stats.tx_errors;
        stats->rx_dropped += dev_stats.rx_dropped;
        stats->tx_dropped += dev_stats.tx_dropped;
        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
        stats->rx_length_errors += dev_stats.rx_length_errors;
        stats->rx_over_errors += dev_stats.rx_over_errors;
        stats->rx_crc_errors += dev_stats.rx_crc_errors;
        stats->rx_frame_errors += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1992
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    /* Fill '*stats' from the vport layer (best effort; failure is recorded
     * in netdev->vport_stats_error). */
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; if the vport stats succeeded, '*stats' is already
         * usable, so report success. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer.  For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        /* The fine-grained error counters below are direction-specific and
         * do not survive the rx/tx swap meaningfully, so zero them. */
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled.  Note the deliberate rx/tx swap, as above. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped += dev_stats.tx_dropped;
        stats->tx_dropped += dev_stats.rx_dropped;

        stats->rx_errors += dev_stats.tx_errors;
        stats->tx_errors += dev_stats.rx_errors;

        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
    }
    /* Add the packets OVS itself dropped for tap devices that were down
     * (see netdev_linux_tap_batch_send()). */
    stats->tx_dropped += netdev->tx_dropped;
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2055
2056 static int
2057 netdev_internal_get_stats(const struct netdev *netdev_,
2058 struct netdev_stats *stats)
2059 {
2060 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2061 int error;
2062
2063 ovs_mutex_lock(&netdev->mutex);
2064 get_stats_via_vport(netdev_, stats);
2065 error = netdev->vport_stats_error;
2066 ovs_mutex_unlock(&netdev->mutex);
2067
2068 return error;
2069 }
2070
/* Queries link features via the ETHTOOL_GSET ioctl and caches the decoded
 * results in 'netdev->supported', 'netdev->advertised' and 'netdev->current'
 * as NETDEV_F_* bitmaps.  No-op when VALID_FEATURES is already cached.  Any
 * ioctl error is latched in 'netdev->get_features_error' for the caller. */
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
    struct ethtool_cmd ecmd;
    uint32_t speed;
    int error;

    if (netdev->cache_valid & VALID_FEATURES) {
        return;
    }

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto out;
    }

    /* Supported features: translate ethtool SUPPORTED_* bits into the
     * OVS-neutral NETDEV_F_* bitmap. */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    }
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Half) {
        netdev->supported |= NETDEV_F_100MB_HD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    }
    /* Several physical-layer variants collapse into one OVS speed/duplex
     * feature bit. */
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
        netdev->supported |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
        netdev->supported |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
        netdev->supported |= NETDEV_F_40GB_FD;
    }
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    }
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    }
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    }
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    }
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    }

    /* Advertised features: same mapping, for the ADVERTISED_* bits. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
        netdev->advertised |= NETDEV_F_40GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    }
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    }
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    }
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    }
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    }

    /* Current settings. */
    speed = ethtool_cmd_speed(&ecmd);
    if (speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    } else if (speed == 40000) {
        /* Literal values here instead of SPEED_40000 etc. -- presumably for
         * compatibility with older ethtool headers that lack those macros;
         * NOTE(review): confirm before changing to the named constants. */
        netdev->current = NETDEV_F_40GB_FD;
    } else if (speed == 100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (speed == 1000000) {
        netdev->current = NETDEV_F_1TB_FD;
    } else {
        netdev->current = 0;
    }

    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
    }

    if (ecmd.autoneg) {
        netdev->current |= NETDEV_F_AUTONEG;
    }

out:
    /* Cache even on error so we do not hammer a failing ioctl; the error is
     * replayed from get_features_error on later queries. */
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
}
2222
2223 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
2224 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2225 * Returns 0 if successful, otherwise a positive errno value. */
2226 static int
2227 netdev_linux_get_features(const struct netdev *netdev_,
2228 enum netdev_features *current,
2229 enum netdev_features *advertised,
2230 enum netdev_features *supported,
2231 enum netdev_features *peer)
2232 {
2233 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2234 int error;
2235
2236 ovs_mutex_lock(&netdev->mutex);
2237 if (netdev_linux_netnsid_is_remote(netdev)) {
2238 error = EOPNOTSUPP;
2239 goto exit;
2240 }
2241
2242 netdev_linux_read_features(netdev);
2243 if (!netdev->get_features_error) {
2244 *current = netdev->current;
2245 *advertised = netdev->advertised;
2246 *supported = netdev->supported;
2247 *peer = 0; /* XXX */
2248 }
2249 error = netdev->get_features_error;
2250
2251 exit:
2252 ovs_mutex_unlock(&netdev->mutex);
2253 return error;
2254 }
2255
2256 /* Set the features advertised by 'netdev' to 'advertise'. */
2257 static int
2258 netdev_linux_set_advertisements(struct netdev *netdev_,
2259 enum netdev_features advertise)
2260 {
2261 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2262 struct ethtool_cmd ecmd;
2263 int error;
2264
2265 ovs_mutex_lock(&netdev->mutex);
2266
2267 COVERAGE_INC(netdev_get_ethtool);
2268
2269 if (netdev_linux_netnsid_is_remote(netdev)) {
2270 error = EOPNOTSUPP;
2271 goto exit;
2272 }
2273
2274 memset(&ecmd, 0, sizeof ecmd);
2275 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2276 ETHTOOL_GSET, "ETHTOOL_GSET");
2277 if (error) {
2278 goto exit;
2279 }
2280
2281 ecmd.advertising = 0;
2282 if (advertise & NETDEV_F_10MB_HD) {
2283 ecmd.advertising |= ADVERTISED_10baseT_Half;
2284 }
2285 if (advertise & NETDEV_F_10MB_FD) {
2286 ecmd.advertising |= ADVERTISED_10baseT_Full;
2287 }
2288 if (advertise & NETDEV_F_100MB_HD) {
2289 ecmd.advertising |= ADVERTISED_100baseT_Half;
2290 }
2291 if (advertise & NETDEV_F_100MB_FD) {
2292 ecmd.advertising |= ADVERTISED_100baseT_Full;
2293 }
2294 if (advertise & NETDEV_F_1GB_HD) {
2295 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2296 }
2297 if (advertise & NETDEV_F_1GB_FD) {
2298 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2299 }
2300 if (advertise & NETDEV_F_10GB_FD) {
2301 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2302 }
2303 if (advertise & NETDEV_F_COPPER) {
2304 ecmd.advertising |= ADVERTISED_TP;
2305 }
2306 if (advertise & NETDEV_F_FIBER) {
2307 ecmd.advertising |= ADVERTISED_FIBRE;
2308 }
2309 if (advertise & NETDEV_F_AUTONEG) {
2310 ecmd.advertising |= ADVERTISED_Autoneg;
2311 }
2312 if (advertise & NETDEV_F_PAUSE) {
2313 ecmd.advertising |= ADVERTISED_Pause;
2314 }
2315 if (advertise & NETDEV_F_PAUSE_ASYM) {
2316 ecmd.advertising |= ADVERTISED_Asym_Pause;
2317 }
2318 COVERAGE_INC(netdev_set_ethtool);
2319 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2320 ETHTOOL_SSET, "ETHTOOL_SSET");
2321
2322 exit:
2323 ovs_mutex_unlock(&netdev->mutex);
2324 return error;
2325 }
2326
/* Attempts to set input rate limiting (policing) policy.  Returns 0 if
 * successful, otherwise a positive errno value.  Implemented by installing a
 * tc ingress qdisc with a policer action; a rate of 0 removes policing. */
static int
netdev_linux_set_policing(struct netdev *netdev_,
                          uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int ifindex;
    int error;

    /* The ingress qdisc used for policing conflicts with hardware flow
     * offload, so refuse the combination outright. */
    if (netdev_is_flow_api_enabled()) {
        if (kbits_rate) {
            VLOG_WARN_RL(&rl, "%s: policing with offload isn't supported",
                         netdev_name);
        }
        return EOPNOTSUPP;
    }

    kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
                   : kbits_burst); /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto out;
    }

    if (netdev->cache_valid & VALID_POLICING) {
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kbits_burst == kbits_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        netdev->cache_valid &= ~VALID_POLICING;
    }

    error = get_ifindex(netdev_, &ifindex);
    if (error) {
        goto out;
    }

    COVERAGE_INC(netdev_set_policing);
    /* Remove any existing ingress qdisc. */
    error = tc_add_del_ingress_qdisc(ifindex, false, 0);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate) {
        /* Install a fresh ingress qdisc, then attach the policer to it. */
        error = tc_add_del_ingress_qdisc(ifindex, true, 0);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }
    }

    netdev->kbits_rate = kbits_rate;
    netdev->kbits_burst = kbits_burst;

out:
    /* Cache the outcome (including ENODEV for a vanished device) so repeated
     * calls with unchanged parameters can return early above. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2407
2408 static int
2409 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2410 struct sset *types)
2411 {
2412 const struct tc_ops *const *opsp;
2413 for (opsp = tcs; *opsp != NULL; opsp++) {
2414 const struct tc_ops *ops = *opsp;
2415 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2416 sset_add(types, ops->ovs_name);
2417 }
2418 }
2419 return 0;
2420 }
2421
2422 static const struct tc_ops *
2423 tc_lookup_ovs_name(const char *name)
2424 {
2425 const struct tc_ops *const *opsp;
2426
2427 for (opsp = tcs; *opsp != NULL; opsp++) {
2428 const struct tc_ops *ops = *opsp;
2429 if (!strcmp(name, ops->ovs_name)) {
2430 return ops;
2431 }
2432 }
2433 return NULL;
2434 }
2435
2436 static const struct tc_ops *
2437 tc_lookup_linux_name(const char *name)
2438 {
2439 const struct tc_ops *const *opsp;
2440
2441 for (opsp = tcs; *opsp != NULL; opsp++) {
2442 const struct tc_ops *ops = *opsp;
2443 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2444 return ops;
2445 }
2446 }
2447 return NULL;
2448 }
2449
/* Returns the queue with id 'queue_id' in 'netdev_''s cached queue hmap, or
 * NULL if there is none.  'hash' must be the hmap hash of 'queue_id' (see
 * tc_find_queue()). */
static struct tc_queue *
tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
                size_t hash)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct tc_queue *queue;

    HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
        if (queue->queue_id == queue_id) {
            return queue;
        }
    }
    return NULL;
}
2464
2465 static struct tc_queue *
2466 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2467 {
2468 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
2469 }
2470
2471 static int
2472 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2473 const char *type,
2474 struct netdev_qos_capabilities *caps)
2475 {
2476 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2477 if (!ops) {
2478 return EOPNOTSUPP;
2479 }
2480 caps->n_queues = ops->n_queues;
2481 return 0;
2482 }
2483
2484 static int
2485 netdev_linux_get_qos(const struct netdev *netdev_,
2486 const char **typep, struct smap *details)
2487 {
2488 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2489 int error;
2490
2491 ovs_mutex_lock(&netdev->mutex);
2492 if (netdev_linux_netnsid_is_remote(netdev)) {
2493 error = EOPNOTSUPP;
2494 goto exit;
2495 }
2496
2497 error = tc_query_qdisc(netdev_);
2498 if (!error) {
2499 *typep = netdev->tc->ops->ovs_name;
2500 error = (netdev->tc->ops->qdisc_get
2501 ? netdev->tc->ops->qdisc_get(netdev_, details)
2502 : 0);
2503 }
2504
2505 exit:
2506 ovs_mutex_unlock(&netdev->mutex);
2507 return error;
2508 }
2509
/* Replaces 'netdev_''s QoS configuration with discipline 'type' configured
 * from 'details'.  If the discipline type is unchanged, reconfigures it in
 * place; otherwise deletes the old qdisc and installs a new one.  Returns 0
 * on success, otherwise a positive errno value. */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    /* The noop discipline touches no kernel state, so it can be installed
     * without taking the mutex or querying the current qdisc. */
    if (new_ops == &tc_ops_noop) {
        return new_ops->tc_install(netdev_, details);
    }

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same discipline: reconfigure in place if supported. */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            goto exit;
        }
        /* tc_del_qdisc() must have cleared the cached tc state. */
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc. */
        error = new_ops->tc_install(netdev_, details);
        /* Success and a populated tc state must go hand in hand. */
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2557
2558 static int
2559 netdev_linux_get_queue(const struct netdev *netdev_,
2560 unsigned int queue_id, struct smap *details)
2561 {
2562 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2563 int error;
2564
2565 ovs_mutex_lock(&netdev->mutex);
2566 if (netdev_linux_netnsid_is_remote(netdev)) {
2567 error = EOPNOTSUPP;
2568 goto exit;
2569 }
2570
2571 error = tc_query_qdisc(netdev_);
2572 if (!error) {
2573 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2574 error = (queue
2575 ? netdev->tc->ops->class_get(netdev_, queue, details)
2576 : ENOENT);
2577 }
2578
2579 exit:
2580 ovs_mutex_unlock(&netdev->mutex);
2581 return error;
2582 }
2583
2584 static int
2585 netdev_linux_set_queue(struct netdev *netdev_,
2586 unsigned int queue_id, const struct smap *details)
2587 {
2588 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2589 int error;
2590
2591 ovs_mutex_lock(&netdev->mutex);
2592 if (netdev_linux_netnsid_is_remote(netdev)) {
2593 error = EOPNOTSUPP;
2594 goto exit;
2595 }
2596
2597 error = tc_query_qdisc(netdev_);
2598 if (!error) {
2599 error = (queue_id < netdev->tc->ops->n_queues
2600 && netdev->tc->ops->class_set
2601 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2602 : EINVAL);
2603 }
2604
2605 exit:
2606 ovs_mutex_unlock(&netdev->mutex);
2607 return error;
2608 }
2609
2610 static int
2611 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2612 {
2613 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2614 int error;
2615
2616 ovs_mutex_lock(&netdev->mutex);
2617 if (netdev_linux_netnsid_is_remote(netdev)) {
2618 error = EOPNOTSUPP;
2619 goto exit;
2620 }
2621
2622 error = tc_query_qdisc(netdev_);
2623 if (!error) {
2624 if (netdev->tc->ops->class_delete) {
2625 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2626 error = (queue
2627 ? netdev->tc->ops->class_delete(netdev_, queue)
2628 : ENOENT);
2629 } else {
2630 error = EINVAL;
2631 }
2632 }
2633
2634 exit:
2635 ovs_mutex_unlock(&netdev->mutex);
2636 return error;
2637 }
2638
/* Retrieves stats for queue 'queue_id' on 'netdev_' into 'stats'.  Returns 0
 * on success, ENOENT if the queue does not exist, EOPNOTSUPP if the current
 * discipline cannot report per-class stats, or another positive errno. */
static int
netdev_linux_get_queue_stats(const struct netdev *netdev_,
                             unsigned int queue_id,
                             struct netdev_queue_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get_stats) {
            const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
            if (queue) {
                /* Creation time lives in the cached queue, not the kernel. */
                stats->created = queue->created;
                error = netdev->tc->ops->class_get_stats(netdev_, queue,
                                                         stats);
            } else {
                error = ENOENT;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2673
/* State for a netlink dump of a netdev's traffic classes; created by
 * start_queue_dump() and released by finish_queue_dump(). */
struct queue_dump_state {
    struct nl_dump dump;    /* In-progress RTM_GETTCLASS dump session. */
    struct ofpbuf buf;      /* Reusable buffer for dump replies. */
};
2678
/* Begins a netlink RTM_GETTCLASS dump of 'netdev''s traffic classes into
 * 'state'.  Returns true on success, false if the request could not be
 * composed.  On success the caller must eventually call finish_queue_dump()
 * to release 'state'. */
static bool
start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
    if (!tcmsg) {
        return false;
    }
    tcmsg->tcm_parent = 0;  /* Parent 0: presumably dumps all classes of the
                             * device -- NOTE(review): confirm. */
    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
    ofpbuf_uninit(&request);

    ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
    return true;
}
2696
2697 static int
2698 finish_queue_dump(struct queue_dump_state *state)
2699 {
2700 ofpbuf_uninit(&state->buf);
2701 return nl_dump_done(&state->dump);
2702 }
2703
/* Iterator state for the queue dump API: a snapshot of queue ids taken at
 * dump start, walked one element at a time by queue_dump_next(). */
struct netdev_linux_queue_state {
    unsigned int *queues;   /* Malloc'd array of snapshotted queue ids. */
    size_t cur_queue;       /* Index of the next id to visit. */
    size_t n_queues;        /* Number of elements in 'queues'. */
};
2709
/* Starts a queue dump on 'netdev_': snapshots the ids of all currently known
 * queues into a freshly allocated netdev_linux_queue_state and stores it in
 * '*statep'.  Returns 0 on success, otherwise a positive errno value (in
 * which case '*statep' is not set). */
static int
netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get) {
            struct netdev_linux_queue_state *state;
            struct tc_queue *queue;
            size_t i;

            /* Snapshot the ids so iteration stays stable even if queues are
             * added or removed between dump_next() calls. */
            *statep = state = xmalloc(sizeof *state);
            state->n_queues = hmap_count(&netdev->tc->queues);
            state->cur_queue = 0;
            state->queues = xmalloc(state->n_queues * sizeof *state->queues);

            i = 0;
            HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
                state->queues[i++] = queue->queue_id;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2747
/* Advances the queue dump in 'state_': finds the next snapshotted queue id
 * that still exists, stores it in '*queue_idp' and its configuration in
 * 'details'.  Returns 0 on success, EOF when the snapshot is exhausted, or
 * another positive errno value. */
static int
netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
                             unsigned int *queue_idp, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_linux_queue_state *state = state_;
    int error = EOF;    /* EOF signals "no more queues" to the caller. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Skip queue ids that have been deleted since the snapshot was taken. */
    while (state->cur_queue < state->n_queues) {
        unsigned int queue_id = state->queues[state->cur_queue++];
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);

        if (queue) {
            *queue_idp = queue_id;
            error = netdev->tc->ops->class_get(netdev_, queue, details);
            break;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2777
2778 static int
2779 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2780 void *state_)
2781 {
2782 struct netdev_linux_queue_state *state = state_;
2783
2784 free(state->queues);
2785 free(state);
2786 return 0;
2787 }
2788
/* Invokes 'cb' with 'aux' for the stats of each of 'netdev_''s queues, using
 * a netlink class dump.  Returns 0 on success, otherwise a positive errno
 * value; per-message callback errors do not stop the dump, the last such
 * error is reported. */
static int
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
                              netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct queue_dump_state state;

        if (!netdev->tc->ops->class_dump_stats) {
            error = EOPNOTSUPP;
        } else if (!start_queue_dump(netdev_, &state)) {
            error = ENODEV;
        } else {
            struct ofpbuf msg;
            int retval;

            /* Process every reply even if one fails; remember the failure. */
            while (nl_dump_next(&state.dump, &msg, &state.buf)) {
                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
                                                           cb, aux);
                if (retval) {
                    error = retval;
                }
            }

            retval = finish_queue_dump(&state);
            if (retval) {
                error = retval;
            }
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2833
2834 static int
2835 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2836 struct in_addr netmask)
2837 {
2838 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2839 int error;
2840
2841 ovs_mutex_lock(&netdev->mutex);
2842 if (netdev_linux_netnsid_is_remote(netdev)) {
2843 error = EOPNOTSUPP;
2844 goto exit;
2845 }
2846
2847 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2848 if (!error) {
2849 if (address.s_addr != INADDR_ANY) {
2850 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2851 "SIOCSIFNETMASK", netmask);
2852 }
2853 }
2854
2855 exit:
2856 ovs_mutex_unlock(&netdev->mutex);
2857 return error;
2858 }
2859
/* Retrieves the addresses assigned to 'netdev_': stores arrays of addresses
 * and netmasks in '*addr' and '*mask' and the element count in '*n_cnt' (see
 * netdev_get_addrs() for allocation/ownership details).  Returns 0 if
 * successful, otherwise a positive errno value. */
static int
netdev_linux_get_addr_list(const struct netdev *netdev_,
                          struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2882
/* Initializes '*sa' as an AF_INET sockaddr carrying 'addr' with port 0. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2895
2896 static int
2897 do_set_addr(struct netdev *netdev,
2898 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2899 {
2900 struct ifreq ifr;
2901
2902 make_in4_sockaddr(&ifr.ifr_addr, addr);
2903 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2904 ioctl_name);
2905 }
2906
/* Adds 'router' as a default IP gateway via the SIOCADDRT ioctl.  Returns 0
 * on success, otherwise a positive errno value (also logged). */
static int
netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
{
    struct in_addr any = { INADDR_ANY };
    struct rtentry rt;
    int error;

    /* Destination 0.0.0.0/0 with RTF_GATEWAY makes this the default route. */
    memset(&rt, 0, sizeof rt);
    make_in4_sockaddr(&rt.rt_dst, any);
    make_in4_sockaddr(&rt.rt_gateway, router);
    make_in4_sockaddr(&rt.rt_genmask, any);
    rt.rt_flags = RTF_UP | RTF_GATEWAY;
    error = af_inet_ioctl(SIOCADDRT, &rt);
    if (error) {
        VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
    }
    return error;
}
2926
/* Determines how to reach 'host' by parsing the kernel routing table in
 * /proc/net/route.  On a match, stores the gateway (or 0.0.0.0 if the host
 * is directly reachable) in '*next_hop' and the malloc'd outgoing interface
 * name in '*netdev_name', then returns 0.  Returns ENXIO if no route
 * matches, or another positive errno value on error. */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        /* Skip the header line (line 1). */
        if (++ln >= 2) {
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                             fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* The output of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need any endian
             * conversions here. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    fclose(stream);
    return ENXIO;
}
2986
/* Fills 'smap' with driver name/version and firmware version obtained from
 * the ETHTOOL_GDRVINFO ioctl; the result is cached under VALID_DRVINFO.
 * Returns 0 on success, otherwise a positive errno value. */
static int
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* netdev_linux_do_ethtool() takes a struct ethtool_cmd *, so the
         * drvinfo buffer is passed through that pointer type. */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        cmd,
                                        ETHTOOL_GDRVINFO,
                                        "ETHTOOL_GDRVINFO");
        if (!error) {
            netdev->cache_valid |= VALID_DRVINFO;
        }
    }

    if (!error) {
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
        smap_add(smap, "driver_version", netdev->drvinfo.version);
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
3017
3018 static int
3019 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
3020 struct smap *smap)
3021 {
3022 smap_add(smap, "driver_name", "openvswitch");
3023 return 0;
3024 }
3025
/* Returns the tc block id for 'netdev_', or 0 if it has none.  Block ids are
 * only assigned to netdevs that are LAG masters, and the id used is the
 * device's ifindex. */
static uint32_t
netdev_linux_get_block_id(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    uint32_t block_id = 0;

    ovs_mutex_lock(&netdev->mutex);
    /* Ensure the linux netdev has had its fields populated. */
    if (!(netdev->cache_valid & VALID_IFINDEX)) {
        netdev_linux_update_via_netlink(netdev);
    }

    /* Only assigning block ids to linux netdevs that are LAG masters. */
    if (netdev->is_lag_master) {
        block_id = netdev->ifindex;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return block_id;
}
3046
/* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
 * returns 0.  Otherwise, it returns a positive errno value; in particular,
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
static int
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, struct eth_addr *mac)
{
    struct arpreq r;
    struct sockaddr_in sin;
    int retval;

    /* Build the SIOCGARP request: protocol address 'ip', hardware type
     * Ethernet, scoped to this device. */
    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    sin.sin_port = 0;
    memcpy(&r.arp_pa, &sin, sizeof sin);
    r.arp_ha.sa_family = ARPHRD_ETHER;
    r.arp_flags = 0;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
    if (!retval) {
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO just means "no entry", which is not worth logging. */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
    }
    return retval;
}
3079
3080 static unsigned int
3081 nd_to_iff_flags(enum netdev_flags nd)
3082 {
3083 unsigned int iff = 0;
3084 if (nd & NETDEV_UP) {
3085 iff |= IFF_UP;
3086 }
3087 if (nd & NETDEV_PROMISC) {
3088 iff |= IFF_PROMISC;
3089 }
3090 if (nd & NETDEV_LOOPBACK) {
3091 iff |= IFF_LOOPBACK;
3092 }
3093 return iff;
3094 }
3095
3096 static int
3097 iff_to_nd_flags(unsigned int iff)
3098 {
3099 enum netdev_flags nd = 0;
3100 if (iff & IFF_UP) {
3101 nd |= NETDEV_UP;
3102 }
3103 if (iff & IFF_PROMISC) {
3104 nd |= NETDEV_PROMISC;
3105 }
3106 if (iff & IFF_LOOPBACK) {
3107 nd |= NETDEV_LOOPBACK;
3108 }
3109 return nd;
3110 }
3111
/* Clears the flags in 'off' and sets the flags in 'on' for 'netdev', storing
 * the previous flags in '*old_flagsp'.  Caller must hold netdev->mutex.
 * Returns 0 on success, otherwise a positive errno value from set_flags(). */
static int
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
{
    unsigned int old_flags, new_flags;
    int error = 0;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* Refresh the cached flags; the get_flags() return value is ignored,
         * presumably because this refresh is best-effort -- NOTE(review):
         * confirm that a failure here is acceptable. */
        get_flags(&netdev->up, &netdev->ifi_flags);
    }

    return error;
}
3130
/* netdev_class 'update_flags' implementation: reads the current flags into
 * '*old_flagsp' and, if 'on' or 'off' is nonzero, applies the requested
 * changes.  Returns 0 on success, otherwise a positive errno value. */
static int
netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
                          enum netdev_flags on, enum netdev_flags *old_flagsp)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (on || off) {
        /* Changing flags over netlink isn't supported yet. */
        if (netdev_linux_netnsid_is_remote(netdev)) {
            error = EOPNOTSUPP;
            goto exit;
        }
        error = update_flags(netdev, off, on, old_flagsp);
    } else {
        /* Try reading flags over netlink, or fall back to ioctl. */
        if (!netdev_linux_update_via_netlink(netdev)) {
            *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
        } else {
            /* With off == on == 0, update_flags() only reads the flags. */
            error = update_flags(netdev, off, on, old_flagsp);
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3159
/* netdev_class callbacks shared by all of the Linux netdev classes defined
 * in this file ("system", "tap", and "internal").  Each class adds its own
 * construct/stats/status callbacks on top of this common set. */
#define NETDEV_LINUX_CLASS_COMMON                               \
    .run = netdev_linux_run,                                    \
    .wait = netdev_linux_wait,                                  \
    .alloc = netdev_linux_alloc,                                \
    .destruct = netdev_linux_destruct,                          \
    .dealloc = netdev_linux_dealloc,                            \
    .send = netdev_linux_send,                                  \
    .send_wait = netdev_linux_send_wait,                        \
    .set_etheraddr = netdev_linux_set_etheraddr,                \
    .get_etheraddr = netdev_linux_get_etheraddr,                \
    .get_mtu = netdev_linux_get_mtu,                            \
    .set_mtu = netdev_linux_set_mtu,                            \
    .get_ifindex = netdev_linux_get_ifindex,                    \
    .get_carrier = netdev_linux_get_carrier,                    \
    .get_carrier_resets = netdev_linux_get_carrier_resets,      \
    .set_miimon_interval = netdev_linux_set_miimon_interval,    \
    .set_advertisements = netdev_linux_set_advertisements,      \
    .set_policing = netdev_linux_set_policing,                  \
    .get_qos_types = netdev_linux_get_qos_types,                \
    .get_qos_capabilities = netdev_linux_get_qos_capabilities,  \
    .get_qos = netdev_linux_get_qos,                            \
    .set_qos = netdev_linux_set_qos,                            \
    .get_queue = netdev_linux_get_queue,                        \
    .set_queue = netdev_linux_set_queue,                        \
    .delete_queue = netdev_linux_delete_queue,                  \
    .get_queue_stats = netdev_linux_get_queue_stats,            \
    .queue_dump_start = netdev_linux_queue_dump_start,          \
    .queue_dump_next = netdev_linux_queue_dump_next,            \
    .queue_dump_done = netdev_linux_queue_dump_done,            \
    .dump_queue_stats = netdev_linux_dump_queue_stats,          \
    .set_in4 = netdev_linux_set_in4,                            \
    .get_addr_list = netdev_linux_get_addr_list,                \
    .add_router = netdev_linux_add_router,                      \
    .get_next_hop = netdev_linux_get_next_hop,                  \
    .arp_lookup = netdev_linux_arp_lookup,                      \
    .update_flags = netdev_linux_update_flags,                  \
    .rxq_alloc = netdev_linux_rxq_alloc,                        \
    .rxq_construct = netdev_linux_rxq_construct,                \
    .rxq_destruct = netdev_linux_rxq_destruct,                  \
    .rxq_dealloc = netdev_linux_rxq_dealloc,                    \
    .rxq_recv = netdev_linux_rxq_recv,                          \
    .rxq_wait = netdev_linux_rxq_wait,                          \
    .rxq_drain = netdev_linux_rxq_drain
3203
/* The "system" netdev class: ordinary Linux network devices, with hardware
 * flow offload support and real device statistics/features. */
const struct netdev_class netdev_linux_class = {
    NETDEV_LINUX_CLASS_COMMON,
    LINUX_FLOW_OFFLOAD_API,
    .type = "system",
    .construct = netdev_linux_construct,
    .get_stats = netdev_linux_get_stats,
    .get_features = netdev_linux_get_features,
    .get_status = netdev_linux_get_status,
    .get_block_id = netdev_linux_get_block_id
};
3214
/* The "tap" netdev class: userspace tun/tap devices; uses tap-specific
 * stats collection and a tap-specific constructor. */
const struct netdev_class netdev_tap_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "tap",
    .construct = netdev_linux_construct_tap,
    .get_stats = netdev_tap_get_stats,
    .get_features = netdev_linux_get_features,
    .get_status = netdev_linux_get_status,
};
3223
/* The "internal" netdev class: OVS-internal ports; shares the system
 * constructor but uses internal-specific stats and status reporting. */
const struct netdev_class netdev_internal_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "internal",
    .construct = netdev_linux_construct,
    .get_stats = netdev_internal_get_stats,
    .get_status = netdev_internal_get_status,
};
3231 \f
3232
/* CoDel is a classless qdisc, so it exposes no configurable queues. */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET 1
#define TCA_CODEL_LIMIT 2
#define TCA_CODEL_INTERVAL 3

/* Cached configuration of a CoDel qdisc installed on a netdev. */
struct codel {
    struct tc tc;           /* Base class; must be first. */
    uint32_t target;        /* Sent as TCA_CODEL_TARGET; 0 -> default 5000. */
    uint32_t limit;         /* Sent as TCA_CODEL_LIMIT; 0 -> default 10240. */
    uint32_t interval;      /* Sent as TCA_CODEL_INTERVAL; 0 -> default
                             * 100000. */
};
3249
3250 static struct codel *
3251 codel_get__(const struct netdev *netdev_)
3252 {
3253 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3254 return CONTAINER_OF(netdev->tc, struct codel, tc);
3255 }
3256
3257 static void
3258 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3259 uint32_t interval)
3260 {
3261 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3262 struct codel *codel;
3263
3264 codel = xmalloc(sizeof *codel);
3265 tc_init(&codel->tc, &tc_ops_codel);
3266 codel->target = target;
3267 codel->limit = limit;
3268 codel->interval = interval;
3269
3270 netdev->tc = &codel->tc;
3271 }
3272
/* Replaces 'netdev''s root qdisc with a CoDel qdisc, equivalent to
 * "tc qdisc add dev <dev> root handle 1: codel target ... limit ...
 * interval ...".  A zero parameter selects that parameter's default.
 * Returns 0 on success or a positive errno value. */
static int
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                    uint32_t interval)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval;
    int error;

    /* Remove any existing qdisc first; RTM_NEWQDISC below uses NLM_F_EXCL. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Substitute defaults for unset (zero) parameters. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;

    nl_msg_put_string(&request, TCA_KIND, "codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval,
                     error, ovs_strerror(error));
    }
    return error;
}
3314
3315 static void
3316 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3317 const struct smap *details, struct codel *codel)
3318 {
3319 codel->target = smap_get_ullong(details, "target", 0);
3320 codel->limit = smap_get_ullong(details, "limit", 0);
3321 codel->interval = smap_get_ullong(details, "interval", 0);
3322
3323 if (!codel->target) {
3324 codel->target = 5000;
3325 }
3326 if (!codel->limit) {
3327 codel->limit = 10240;
3328 }
3329 if (!codel->interval) {
3330 codel->interval = 100000;
3331 }
3332 }
3333
3334 static int
3335 codel_tc_install(struct netdev *netdev, const struct smap *details)
3336 {
3337 int error;
3338 struct codel codel;
3339
3340 codel_parse_qdisc_details__(netdev, details, &codel);
3341 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3342 codel.interval);
3343 if (!error) {
3344 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3345 }
3346 return error;
3347 }
3348
3349 static int
3350 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3351 {
3352 static const struct nl_policy tca_codel_policy[] = {
3353 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3354 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3355 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3356 };
3357
3358 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3359
3360 if (!nl_parse_nested(nl_options, tca_codel_policy,
3361 attrs, ARRAY_SIZE(tca_codel_policy))) {
3362 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3363 return EPROTO;
3364 }
3365
3366 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3367 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3368 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
3369 return 0;
3370 }
3371
3372 static int
3373 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3374 {
3375 struct nlattr *nlattr;
3376 const char * kind;
3377 int error;
3378 struct codel codel;
3379
3380 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3381 if (error != 0) {
3382 return error;
3383 }
3384
3385 error = codel_parse_tca_options__(nlattr, &codel);
3386 if (error != 0) {
3387 return error;
3388 }
3389
3390 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3391 return 0;
3392 }
3393
3394
3395 static void
3396 codel_tc_destroy(struct tc *tc)
3397 {
3398 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3399 tc_destroy(tc);
3400 free(codel);
3401 }
3402
3403 static int
3404 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3405 {
3406 const struct codel *codel = codel_get__(netdev);
3407 smap_add_format(details, "target", "%u", codel->target);
3408 smap_add_format(details, "limit", "%u", codel->limit);
3409 smap_add_format(details, "interval", "%u", codel->interval);
3410 return 0;
3411 }
3412
/* tc_ops 'qdisc_set' callback: updates the cached CoDel configuration of
 * 'netdev' from 'details'.
 *
 * NOTE(review): codel_install__() allocates a fresh struct codel and makes
 * it 'netdev->tc' without freeing the previous one, which looks like a
 * leak; it also does not re-program the kernel qdisc the way
 * codel_tc_install() does, and the three assignments below are redundant
 * with what codel_install__() already stored -- confirm intent. */
static int
codel_qdisc_set(struct netdev *netdev, const struct smap *details)
{
    struct codel codel;

    codel_parse_qdisc_details__(netdev, details, &codel);
    codel_install__(netdev, codel.target, codel.limit, codel.interval);
    codel_get__(netdev)->target = codel.target;
    codel_get__(netdev)->limit = codel.limit;
    codel_get__(netdev)->interval = codel.interval;
    return 0;
}
3425
/* tc implementation of the "linux-codel" OVS QoS type, backed by the Linux
 * "codel" qdisc. */
static const struct tc_ops tc_ops_codel = {
    .linux_name = "codel",
    .ovs_name = "linux-codel",
    .n_queues = CODEL_N_QUEUES,
    .tc_install = codel_tc_install,
    .tc_load = codel_tc_load,
    .tc_destroy = codel_tc_destroy,
    .qdisc_get = codel_qdisc_get,
    .qdisc_set = codel_qdisc_set,
};
3436 \f
/* FQ-CoDel traffic control class. */

/* FQ-CoDel is a classless qdisc, so it exposes no configurable queues. */
#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET 1
#define TCA_FQ_CODEL_LIMIT 2
#define TCA_FQ_CODEL_INTERVAL 3
#define TCA_FQ_CODEL_ECN 4
#define TCA_FQ_CODEL_FLOWS 5
#define TCA_FQ_CODEL_QUANTUM 6

/* Cached configuration of an fq_codel qdisc installed on a netdev. */
struct fqcodel {
    struct tc tc;           /* Base class; must be first. */
    uint32_t target;        /* Sent as TCA_FQ_CODEL_TARGET. */
    uint32_t limit;         /* Sent as TCA_FQ_CODEL_LIMIT. */
    uint32_t interval;      /* Sent as TCA_FQ_CODEL_INTERVAL. */
    uint32_t flows;         /* Sent as TCA_FQ_CODEL_FLOWS. */
    uint32_t quantum;       /* Sent as TCA_FQ_CODEL_QUANTUM. */
};
3460
3461 static struct fqcodel *
3462 fqcodel_get__(const struct netdev *netdev_)
3463 {
3464 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3465 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3466 }
3467
3468 static void
3469 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3470 uint32_t interval, uint32_t flows, uint32_t quantum)
3471 {
3472 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3473 struct fqcodel *fqcodel;
3474
3475 fqcodel = xmalloc(sizeof *fqcodel);
3476 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3477 fqcodel->target = target;
3478 fqcodel->limit = limit;
3479 fqcodel->interval = interval;
3480 fqcodel->flows = flows;
3481 fqcodel->quantum = quantum;
3482
3483 netdev->tc = &fqcodel->tc;
3484 }
3485
/* Replaces 'netdev''s root qdisc with an fq_codel qdisc, equivalent to
 * "tc qdisc add dev <dev> root handle 1: fq_codel ...".  A zero parameter
 * selects that parameter's default.  Returns 0 on success or a positive
 * errno value. */
static int
fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                      uint32_t interval, uint32_t flows, uint32_t quantum)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval, oflows, oquantum;
    int error;

    /* Remove any existing qdisc first; RTM_NEWQDISC below uses NLM_F_EXCL. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Substitute defaults for unset (zero) parameters. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;
    oflows = flows ? flows : 1024;
    oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
                                            not mtu */

    nl_msg_put_string(&request, TCA_KIND, "fq_codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval, oflows, oquantum,
                     error, ovs_strerror(error));
    }
    return error;
}
3532
3533 static void
3534 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3535 const struct smap *details, struct fqcodel *fqcodel)
3536 {
3537 fqcodel->target = smap_get_ullong(details, "target", 0);
3538 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3539 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3540 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3541 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3542
3543 if (!fqcodel->target) {
3544 fqcodel->target = 5000;
3545 }
3546 if (!fqcodel->limit) {
3547 fqcodel->limit = 10240;
3548 }
3549 if (!fqcodel->interval) {
3550 fqcodel->interval = 1000000;
3551 }
3552 if (!fqcodel->flows) {
3553 fqcodel->flows = 1024;
3554 }
3555 if (!fqcodel->quantum) {
3556 fqcodel->quantum = 1514;
3557 }
3558 }
3559
3560 static int
3561 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3562 {
3563 int error;
3564 struct fqcodel fqcodel;
3565
3566 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3567 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3568 fqcodel.interval, fqcodel.flows,
3569 fqcodel.quantum);
3570 if (!error) {
3571 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3572 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3573 }
3574 return error;
3575 }
3576
3577 static int
3578 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3579 {
3580 static const struct nl_policy tca_fqcodel_policy[] = {
3581 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3582 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3583 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3584 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3585 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3586 };
3587
3588 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3589
3590 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3591 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3592 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3593 return EPROTO;
3594 }
3595
3596 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3597 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3598 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3599 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3600 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3601 return 0;
3602 }
3603
3604 static int
3605 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3606 {
3607 struct nlattr *nlattr;
3608 const char * kind;
3609 int error;
3610 struct fqcodel fqcodel;
3611
3612 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3613 if (error != 0) {
3614 return error;
3615 }
3616
3617 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3618 if (error != 0) {
3619 return error;
3620 }
3621
3622 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3623 fqcodel.flows, fqcodel.quantum);
3624 return 0;
3625 }
3626
3627 static void
3628 fqcodel_tc_destroy(struct tc *tc)
3629 {
3630 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3631 tc_destroy(tc);
3632 free(fqcodel);
3633 }
3634
3635 static int
3636 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3637 {
3638 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3639 smap_add_format(details, "target", "%u", fqcodel->target);
3640 smap_add_format(details, "limit", "%u", fqcodel->limit);
3641 smap_add_format(details, "interval", "%u", fqcodel->interval);
3642 smap_add_format(details, "flows", "%u", fqcodel->flows);
3643 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3644 return 0;
3645 }
3646
/* tc_ops 'qdisc_set' callback: updates the cached fq_codel configuration
 * of 'netdev' from 'details'.
 *
 * NOTE(review): fqcodel_install__() allocates a fresh struct fqcodel and
 * makes it 'netdev->tc' without freeing the previous one, which looks like
 * a leak; it also does not re-program the kernel qdisc the way
 * fqcodel_tc_install() does, and the assignments below are redundant with
 * what fqcodel_install__() already stored -- confirm intent. */
static int
fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
{
    struct fqcodel fqcodel;

    fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
    fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
                      fqcodel.flows, fqcodel.quantum);
    fqcodel_get__(netdev)->target = fqcodel.target;
    fqcodel_get__(netdev)->limit = fqcodel.limit;
    fqcodel_get__(netdev)->interval = fqcodel.interval;
    fqcodel_get__(netdev)->flows = fqcodel.flows;
    fqcodel_get__(netdev)->quantum = fqcodel.quantum;
    return 0;
}
3662
/* tc implementation of the "linux-fq_codel" OVS QoS type, backed by the
 * Linux "fq_codel" qdisc. */
static const struct tc_ops tc_ops_fqcodel = {
    .linux_name = "fq_codel",
    .ovs_name = "linux-fq_codel",
    .n_queues = FQCODEL_N_QUEUES,
    .tc_install = fqcodel_tc_install,
    .tc_load = fqcodel_tc_load,
    .tc_destroy = fqcodel_tc_destroy,
    .qdisc_get = fqcodel_qdisc_get,
    .qdisc_set = fqcodel_qdisc_set,
};
3673 \f
/* SFQ traffic control class. */

/* SFQ is a classless qdisc, so it exposes no configurable queues. */
#define SFQ_N_QUEUES 0x0000

/* Cached configuration of an SFQ qdisc installed on a netdev. */
struct sfq {
    struct tc tc;           /* Base class; must be first. */
    uint32_t quantum;       /* Bytes a flow may dequeue per round; 0 means
                             * "derive from the device MTU". */
    uint32_t perturb;       /* Hash perturbation period; 0 -> default 10. */
};
3683
3684 static struct sfq *
3685 sfq_get__(const struct netdev *netdev_)
3686 {
3687 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3688 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3689 }
3690
3691 static void
3692 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3693 {
3694 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3695 struct sfq *sfq;
3696
3697 sfq = xmalloc(sizeof *sfq);
3698 tc_init(&sfq->tc, &tc_ops_sfq);
3699 sfq->perturb = perturb;
3700 sfq->quantum = quantum;
3701
3702 netdev->tc = &sfq->tc;
3703 }
3704
/* Replaces 'netdev''s root qdisc with an SFQ qdisc, equivalent to
 * "tc qdisc add dev <dev> root handle 1: sfq quantum ... perturb ...".
 * A zero 'quantum' falls back to the device MTU (or the kernel default if
 * the MTU cannot be read); a zero 'perturb' falls back to 10.  Returns 0
 * on success or a positive errno value. */
static int
sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
{
    struct tc_sfq_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int mtu;
    int mtu_error, error;
    /* Fetch the MTU up front; the error (if any) only matters when
     * 'quantum' is zero below. */
    mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);

    /* Remove any existing qdisc first; RTM_NEWQDISC below uses NLM_F_EXCL. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    if (!quantum) {
        if (!mtu_error) {
            opt.quantum = mtu; /* if we cannot find mtu, use default */
        }
    } else {
        opt.quantum = quantum;
    }

    if (!perturb) {
        opt.perturb_period = 10;
    } else {
        opt.perturb_period = perturb;
    }

    nl_msg_put_string(&request, TCA_KIND, "sfq");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "quantum %u, perturb %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.quantum, opt.perturb_period,
                     error, ovs_strerror(error));
    }
    return error;
}
3753
3754 static void
3755 sfq_parse_qdisc_details__(struct netdev *netdev,
3756 const struct smap *details, struct sfq *sfq)
3757 {
3758 sfq->perturb = smap_get_ullong(details, "perturb", 0);
3759 sfq->quantum = smap_get_ullong(details, "quantum", 0);
3760
3761 if (!sfq->perturb) {
3762 sfq->perturb = 10;
3763 }
3764
3765 if (!sfq->quantum) {
3766 int mtu;
3767 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
3768 sfq->quantum = mtu;
3769 } else {
3770 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3771 "device without mtu");
3772 }
3773 }
3774 }
3775
3776 static int
3777 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3778 {
3779 int error;
3780 struct sfq sfq;
3781
3782 sfq_parse_qdisc_details__(netdev, details, &sfq);
3783 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3784 if (!error) {
3785 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3786 }
3787 return error;
3788 }
3789
3790 static int
3791 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3792 {
3793 const struct tc_sfq_qopt *sfq;
3794 struct nlattr *nlattr;
3795 const char * kind;
3796 int error;
3797
3798 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3799 if (error == 0) {
3800 sfq = nl_attr_get(nlattr);
3801 sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
3802 return 0;
3803 }
3804
3805 return error;
3806 }
3807
3808 static void
3809 sfq_tc_destroy(struct tc *tc)
3810 {
3811 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3812 tc_destroy(tc);
3813 free(sfq);
3814 }
3815
3816 static int
3817 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3818 {
3819 const struct sfq *sfq = sfq_get__(netdev);
3820 smap_add_format(details, "quantum", "%u", sfq->quantum);
3821 smap_add_format(details, "perturb", "%u", sfq->perturb);
3822 return 0;
3823 }
3824
/* tc_ops 'qdisc_set' callback: updates the cached SFQ configuration of
 * 'netdev' from 'details'.
 *
 * NOTE(review): sfq_install__() allocates a fresh struct sfq and makes it
 * 'netdev->tc' without freeing the previous one, which looks like a leak;
 * it also does not re-program the kernel qdisc the way sfq_tc_install()
 * does, and the two assignments below are redundant with what
 * sfq_install__() already stored -- confirm intent. */
static int
sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
{
    struct sfq sfq;

    sfq_parse_qdisc_details__(netdev, details, &sfq);
    sfq_install__(netdev, sfq.quantum, sfq.perturb);
    sfq_get__(netdev)->quantum = sfq.quantum;
    sfq_get__(netdev)->perturb = sfq.perturb;
    return 0;
}
3836
/* tc implementation of the "linux-sfq" OVS QoS type, backed by the Linux
 * "sfq" qdisc. */
static const struct tc_ops tc_ops_sfq = {
    .linux_name = "sfq",
    .ovs_name = "linux-sfq",
    .n_queues = SFQ_N_QUEUES,
    .tc_install = sfq_tc_install,
    .tc_load = sfq_tc_load,
    .tc_destroy = sfq_tc_destroy,
    .qdisc_get = sfq_qdisc_get,
    .qdisc_set = sfq_qdisc_set,
};
3847 \f
/* HTB traffic control class. */

#define HTB_N_QUEUES 0xf000     /* Maximum number of HTB queues (classes). */
#define HTB_RATE2QUANTUM 10     /* r2q value programmed into the qdisc. */

/* Cached configuration of an HTB qdisc installed on a netdev. */
struct htb {
    struct tc tc;               /* Base class; must be first. */
    unsigned int max_rate;      /* In bytes/s. */
};

/* One HTB class (queue) within the qdisc. */
struct htb_class {
    struct tc_queue tc_queue;   /* Base class; must be first. */
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
3865
3866 static struct htb *
3867 htb_get__(const struct netdev *netdev_)
3868 {
3869 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3870 return CONTAINER_OF(netdev->tc, struct htb, tc);
3871 }
3872
/* Allocates an HTB record with root rate 'max_rate' (bytes/s) and makes it
 * 'netdev_''s current traffic-control state.
 *
 * NOTE(review): 'max_rate' arrives as uint64_t but struct htb's max_rate
 * field is unsigned int, so values above UINT_MAX are silently truncated
 * here -- confirm whether rates that large need to be supported. */
static void
htb_install__(struct netdev *netdev_, uint64_t max_rate)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct htb *htb;

    htb = xmalloc(sizeof *htb);
    tc_init(&htb->tc, &tc_ops_htb);
    htb->max_rate = max_rate;

    netdev->tc = &htb->tc;
}
3885
/* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 * Returns 0 on success or a positive errno value. */
static int
htb_setup_qdisc__(struct netdev *netdev)
{
    size_t opt_offset;
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    /* Remove any existing qdisc first; RTM_NEWQDISC below uses NLM_F_EXCL. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = HTB_RATE2QUANTUM;
    opt.version = 3;
    opt.defcls = 1;             /* Unclassified traffic goes to class 1:1. */

    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    return tc_transact(&request, NULL);
}
3920
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * Requires the device MTU to compute token-bucket parameters; returns a
 * positive errno value if the MTU is unavailable or the netlink transaction
 * fails, 0 on success. */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
        opt.quantum = mtu;
    }
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    /* Rate tables let the kernel translate packet sizes to token costs. */
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
3980
3981 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3982 * description of them into 'details'. The description complies with the
3983 * specification given in the vswitch database documentation for linux-htb
3984 * queue details. */
3985 static int
3986 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3987 {
3988 static const struct nl_policy tca_htb_policy[] = {
3989 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3990 .min_len = sizeof(struct tc_htb_opt) },
3991 };
3992
3993 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3994 const struct tc_htb_opt *htb;
3995
3996 if (!nl_parse_nested(nl_options, tca_htb_policy,
3997 attrs, ARRAY_SIZE(tca_htb_policy))) {
3998 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3999 return EPROTO;
4000 }
4001
4002 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
4003 class->min_rate = htb->rate.rate;
4004 class->max_rate = htb->ceil.rate;
4005 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
4006 class->priority = htb->prio;
4007 return 0;
4008 }
4009
4010 static int
4011 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4012 struct htb_class *options,
4013 struct netdev_queue_stats *stats)
4014 {
4015 struct nlattr *nl_options;
4016 unsigned int handle;
4017 int error;
4018
4019 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4020 if (!error && queue_id) {
4021 unsigned int major = tc_get_major(handle);
4022 unsigned int minor = tc_get_minor(handle);
4023 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4024 *queue_id = minor - 1;
4025 } else {
4026 error = EPROTO;
4027 }
4028 }
4029 if (!error && options) {
4030 error = htb_parse_tca_options__(nl_options, options);
4031 }
4032 return error;
4033 }
4034
/* Fills '*hc' with root-class parameters derived from 'details'.  The
 * "max-rate" detail is in bits/s, hence the division by 8 to get bytes/s;
 * when unset, the link speed (or 100 Mbps if unknown) is used instead. */
static void
htb_parse_qdisc_details__(struct netdev *netdev_,
                          const struct smap *details, struct htb_class *hc)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
    if (!hc->max_rate) {
        enum netdev_features current;

        /* Fall back to the device's current link speed. */
        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }
    /* The root class passes everything at the full rate. */
    hc->min_rate = hc->max_rate;
    hc->burst = 0;
    hc->priority = 0;
}
4053
/* Fills '*hc' with per-queue parameters from 'details', clamping each one
 * to the valid range.  Rates in 'details' are in bits/s and are converted
 * to bytes/s here.  Returns a positive errno value if the device MTU is
 * unavailable, 0 on success. */
static int
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
{
    const struct htb *htb = htb_get__(netdev);
    int mtu, error;
    unsigned long long int max_rate_bit;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate */
    max_rate_bit = smap_get_ullong(details, "max-rate", 0);
    hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = smap_get_ullong(details, "burst", 0) / 8;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority */
    hc->priority = smap_get_ullong(details, "priority", 0);

    return 0;
}
4098
4099 static int
4100 htb_query_class__(const struct netdev *netdev, unsigned int handle,
4101 unsigned int parent, struct htb_class *options,
4102 struct netdev_queue_stats *stats)
4103 {
4104 struct ofpbuf *reply;
4105 int error;
4106
4107 error = tc_query_class(netdev, handle, parent, &reply);
4108 if (!error) {
4109 error = htb_parse_tcmsg__(reply, NULL, options, stats);
4110 ofpbuf_delete(reply);
4111 }
4112 return error;
4113 }
4114
4115 static int
4116 htb_tc_install(struct netdev *netdev, const struct smap *details)
4117 {
4118 int error;
4119
4120 error = htb_setup_qdisc__(netdev);
4121 if (!error) {
4122 struct htb_class hc;
4123
4124 htb_parse_qdisc_details__(netdev, details, &hc);
4125 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4126 tc_make_handle(1, 0), &hc);
4127 if (!error) {
4128 htb_install__(netdev, hc.max_rate);
4129 }
4130 }
4131 return error;
4132 }
4133
4134 static struct htb_class *
4135 htb_class_cast__(const struct tc_queue *queue)
4136 {
4137 return CONTAINER_OF(queue, struct htb_class, tc_queue);
4138 }
4139
/* Creates or updates the cached HTB class for 'queue_id' on 'netdev',
 * copying the parameters from 'hc'.  A class that is not yet in the queue
 * hmap is allocated and inserted first. */
static void
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
                   const struct htb_class *hc)
{
    struct htb *htb = htb_get__(netdev);
    size_t hash = hash_int(queue_id, 0);
    struct tc_queue *queue;
    struct htb_class *hcp;

    queue = tc_find_queue__(netdev, queue_id, hash);
    if (queue) {
        hcp = htb_class_cast__(queue);
    } else {
        /* First sighting of this queue: allocate and register it. */
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
    }

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
    hcp->burst = hc->burst;
    hcp->priority = hc->priority;
}
4165
/* tc_ops 'tc_load' callback: reconstructs the cached HTB state for
 * 'netdev' by querying the kernel for the root class and dumping all
 * queues.  Returns 0 on success or ENODEV if the queue dump cannot start. */
static int
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct htb_class hc;

    /* Get qdisc options.  A query failure leaves hc.max_rate at 0; the
     * error itself is deliberately ignored. */
    hc.max_rate = 0;
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    htb_install__(netdev, hc.max_rate);

    /* Get queues. */
    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Skip classes that do not parse as valid HTB queues. */
        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            htb_update_queue__(netdev, queue_id, &hc);
        }
    }
    finish_queue_dump(&state);

    return 0;
}
4193
4194 static void
4195 htb_tc_destroy(struct tc *tc)
4196 {
4197 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4198 struct htb_class *hc;
4199
4200 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
4201 free(hc);
4202 }
4203 tc_destroy(tc);
4204 free(htb);
4205 }
4206
4207 static int
4208 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
4209 {
4210 const struct htb *htb = htb_get__(netdev);
4211 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
4212 return 0;
4213 }
4214
4215 static int
4216 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
4217 {
4218 struct htb_class hc;
4219 int error;
4220
4221 htb_parse_qdisc_details__(netdev, details, &hc);
4222 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4223 tc_make_handle(1, 0), &hc);
4224 if (!error) {
4225 htb_get__(netdev)->max_rate = hc.max_rate;
4226 }
4227 return error;
4228 }
4229
4230 static int
4231 htb_class_get(const struct netdev *netdev OVS_UNUSED,
4232 const struct tc_queue *queue, struct smap *details)
4233 {
4234 const struct htb_class *hc = htb_class_cast__(queue);
4235
4236 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4237 if (hc->min_rate != hc->max_rate) {
4238 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4239 }
4240 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
4241 if (hc->priority) {
4242 smap_add_format(details, "priority", "%u", hc->priority);
4243 }
4244 return 0;
4245 }
4246
4247 static int
4248 htb_class_set(struct netdev *netdev, unsigned int queue_id,
4249 const struct smap *details)
4250 {
4251 struct htb_class hc;
4252 int error;
4253
4254 error = htb_parse_class_details__(netdev, details, &hc);
4255 if (error) {
4256 return error;
4257 }
4258
4259 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4260 tc_make_handle(1, 0xfffe), &hc);
4261 if (error) {
4262 return error;
4263 }
4264
4265 htb_update_queue__(netdev, queue_id, &hc);
4266 return 0;
4267 }
4268
4269 static int
4270 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
4271 {
4272 struct htb_class *hc = htb_class_cast__(queue);
4273 struct htb *htb = htb_get__(netdev);
4274 int error;
4275
4276 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4277 if (!error) {
4278 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
4279 free(hc);
4280 }
4281 return error;
4282 }
4283
4284 static int
4285 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4286 struct netdev_queue_stats *stats)
4287 {
4288 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4289 tc_make_handle(1, 0xfffe), NULL, stats);
4290 }
4291
4292 static int
4293 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4294 const struct ofpbuf *nlmsg,
4295 netdev_dump_queue_stats_cb *cb, void *aux)
4296 {
4297 struct netdev_queue_stats stats;
4298 unsigned int handle, major, minor;
4299 int error;
4300
4301 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4302 if (error) {
4303 return error;
4304 }
4305
4306 major = tc_get_major(handle);
4307 minor = tc_get_minor(handle);
4308 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4309 (*cb)(minor - 1, &stats, aux);
4310 }
4311 return 0;
4312 }
4313
/* HTB-backed "linux-htb" QoS implementation. */
static const struct tc_ops tc_ops_htb = {
    .linux_name = "htb",
    .ovs_name = "linux-htb",
    .n_queues = HTB_N_QUEUES,
    .tc_install = htb_tc_install,
    .tc_load = htb_tc_load,
    .tc_destroy = htb_tc_destroy,
    .qdisc_get = htb_qdisc_get,
    .qdisc_set = htb_qdisc_set,
    .class_get = htb_class_get,
    .class_set = htb_class_set,
    .class_delete = htb_class_delete,
    .class_get_stats = htb_class_get_stats,
    .class_dump_stats = htb_class_dump_stats
};
4329 \f
/* "linux-hfsc" traffic control class. */

#define HFSC_N_QUEUES 0xf000

/* Per-netdev state for the HFSC qdisc. */
struct hfsc {
    struct tc tc;
    uint32_t max_rate;          /* In bytes/s (details carry bits/s). */
};

/* Cached configuration of one HFSC class (one OVS queue). */
struct hfsc_class {
    struct tc_queue tc_queue;
    uint32_t min_rate;          /* In bytes/s. */
    uint32_t max_rate;          /* In bytes/s. */
};
4344
/* Returns the 'struct hfsc' embedding 'netdev_''s current tc.  Only
 * meaningful while the netdev's QoS is "linux-hfsc". */
static struct hfsc *
hfsc_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct hfsc, tc);
}
4351
/* Returns the 'struct hfsc_class' in which 'queue' is embedded. */
static struct hfsc_class *
hfsc_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
}
4357
4358 static void
4359 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4360 {
4361 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4362 struct hfsc *hfsc;
4363
4364 hfsc = xmalloc(sizeof *hfsc);
4365 tc_init(&hfsc->tc, &tc_ops_hfsc);
4366 hfsc->max_rate = max_rate;
4367 netdev->tc = &hfsc->tc;
4368 }
4369
4370 static void
4371 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4372 const struct hfsc_class *hc)
4373 {
4374 size_t hash;
4375 struct hfsc *hfsc;
4376 struct hfsc_class *hcp;
4377 struct tc_queue *queue;
4378
4379 hfsc = hfsc_get__(netdev);
4380 hash = hash_int(queue_id, 0);
4381
4382 queue = tc_find_queue__(netdev, queue_id, hash);
4383 if (queue) {
4384 hcp = hfsc_class_cast__(queue);
4385 } else {
4386 hcp = xmalloc(sizeof *hcp);
4387 queue = &hcp->tc_queue;
4388 queue->queue_id = queue_id;
4389 queue->created = time_msec();
4390 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4391 }
4392
4393 hcp->min_rate = hc->min_rate;
4394 hcp->max_rate = hc->max_rate;
4395 }
4396
/* Parses the TCA_OPTIONS payload 'nl_options' of an HFSC class message into
 * 'class'.  Returns 0 on success, EPROTO if the options are malformed or
 * describe a configuration that OVS cannot represent.
 *
 * OVS only programs linear service curves (m1 == 0, d == 0) with the
 * real-time and link-share curves equal (see hfsc_setup_class__()), so
 * anything else is rejected here. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    /* A nonzero m1 or d would mean a two-slope (nonlinear) curve. */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    /* OVS always sets RSC == FSC; a class where they differ carries an
     * independent real-time curve we did not create. */
    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    /* m2 is the steady-state rate, in bytes/s. */
    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
4455
/* Parses a Netlink HFSC class message 'tcmsg'.  On success, stores the
 * 0-based queue number into '*queue_id', the class configuration into
 * '*options', and queue statistics into '*stats' (each output may be null).
 * Returns 0 on success, otherwise a positive errno value. */
static int
hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                   struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    int error;
    unsigned int handle;
    struct nlattr *nl_options;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (error) {
        return error;
    }

    if (queue_id) {
        unsigned int major, minor;

        major = tc_get_major(handle);
        minor = tc_get_minor(handle);
        /* OVS maps queue N to class handle 1:N+1; anything else was not
         * created by OVS. */
        if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
            *queue_id = minor - 1;
        } else {
            return EPROTO;
        }
    }

    if (options) {
        error = hfsc_parse_tca_options__(nl_options, options);
    }

    return error;
}
4488
4489 static int
4490 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4491 unsigned int parent, struct hfsc_class *options,
4492 struct netdev_queue_stats *stats)
4493 {
4494 int error;
4495 struct ofpbuf *reply;
4496
4497 error = tc_query_class(netdev, handle, parent, &reply);
4498 if (error) {
4499 return error;
4500 }
4501
4502 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4503 ofpbuf_delete(reply);
4504 return error;
4505 }
4506
4507 static void
4508 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4509 struct hfsc_class *class)
4510 {
4511 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4512
4513 uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
4514 if (!max_rate) {
4515 enum netdev_features current;
4516
4517 netdev_linux_read_features(netdev);
4518 current = !netdev->get_features_error ? netdev->current : 0;
4519 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4520 }
4521
4522 class->min_rate = max_rate;
4523 class->max_rate = max_rate;
4524 }
4525
4526 static int
4527 hfsc_parse_class_details__(struct netdev *netdev,
4528 const struct smap *details,
4529 struct hfsc_class * class)
4530 {
4531 const struct hfsc *hfsc;
4532 uint32_t min_rate, max_rate;
4533
4534 hfsc = hfsc_get__(netdev);
4535
4536 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4537 min_rate = MAX(min_rate, 1);
4538 min_rate = MIN(min_rate, hfsc->max_rate);
4539
4540 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
4541 max_rate = MAX(max_rate, min_rate);
4542 max_rate = MIN(max_rate, hfsc->max_rate);
4543
4544 class->min_rate = min_rate;
4545 class->max_rate = max_rate;
4546
4547 return 0;
4548 }
4549
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
static int
hfsc_setup_qdisc__(struct netdev * netdev)
{
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    /* Remove any existing root qdisc first; errors are ignored. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* defcls = 1: unclassified traffic goes to class 1:1. */
    memset(&opt, 0, sizeof opt);
    opt.defcls = 1;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    return tc_transact(&request, NULL);
}
4580
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>" */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Linear service curves only: m1 = 0, d = 0, steady rate in m2
     * (bytes/s).  hfsc_parse_tca_options__() relies on this shape when
     * reading the configuration back. */
    min.m1 = 0;
    min.d = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d = 0;
    max.m2 = class->max_rate;

    /* RSC and FSC are both set to the min-rate curve ("sc rate ...");
     * USC is the upper limit ("ul rate ..."). */
    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
4632
4633 static int
4634 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4635 {
4636 int error;
4637 struct hfsc_class class;
4638
4639 error = hfsc_setup_qdisc__(netdev);
4640
4641 if (error) {
4642 return error;
4643 }
4644
4645 hfsc_parse_qdisc_details__(netdev, details, &class);
4646 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4647 tc_make_handle(1, 0), &class);
4648
4649 if (error) {
4650 return error;
4651 }
4652
4653 hfsc_install__(netdev, class.max_rate);
4654 return 0;
4655 }
4656
/* Loads the kernel's existing HFSC configuration on 'netdev' into the local
 * cache: reads the default class to recover the qdisc's max-rate, then dumps
 * all classes to rebuild the queue table.  Returns 0 if successful,
 * otherwise a positive errno value. */
static int
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct hfsc_class hc;

    /* The query's error is deliberately ignored: on failure hc.max_rate
     * stays 0. */
    hc.max_rate = 0;
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    hfsc_install__(netdev, hc.max_rate);

    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }

    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Classes that do not look like OVS-created HFSC classes are
         * skipped (hfsc_parse_tcmsg__ returns nonzero for them). */
        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            hfsc_update_queue__(netdev, queue_id, &hc);
        }
    }

    finish_queue_dump(&state);
    return 0;
}
4683
4684 static void
4685 hfsc_tc_destroy(struct tc *tc)
4686 {
4687 struct hfsc *hfsc;
4688 struct hfsc_class *hc, *next;
4689
4690 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4691
4692 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4693 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4694 free(hc);
4695 }
4696
4697 tc_destroy(tc);
4698 free(hfsc);
4699 }
4700
4701 static int
4702 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4703 {
4704 const struct hfsc *hfsc;
4705 hfsc = hfsc_get__(netdev);
4706 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
4707 return 0;
4708 }
4709
4710 static int
4711 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4712 {
4713 int error;
4714 struct hfsc_class class;
4715
4716 hfsc_parse_qdisc_details__(netdev, details, &class);
4717 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4718 tc_make_handle(1, 0), &class);
4719
4720 if (!error) {
4721 hfsc_get__(netdev)->max_rate = class.max_rate;
4722 }
4723
4724 return error;
4725 }
4726
4727 static int
4728 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4729 const struct tc_queue *queue, struct smap *details)
4730 {
4731 const struct hfsc_class *hc;
4732
4733 hc = hfsc_class_cast__(queue);
4734 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4735 if (hc->min_rate != hc->max_rate) {
4736 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4737 }
4738 return 0;
4739 }
4740
4741 static int
4742 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4743 const struct smap *details)
4744 {
4745 int error;
4746 struct hfsc_class class;
4747
4748 error = hfsc_parse_class_details__(netdev, details, &class);
4749 if (error) {
4750 return error;
4751 }
4752
4753 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4754 tc_make_handle(1, 0xfffe), &class);
4755 if (error) {
4756 return error;
4757 }
4758
4759 hfsc_update_queue__(netdev, queue_id, &class);
4760 return 0;
4761 }
4762
4763 static int
4764 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4765 {
4766 int error;
4767 struct hfsc *hfsc;
4768 struct hfsc_class *hc;
4769
4770 hc = hfsc_class_cast__(queue);
4771 hfsc = hfsc_get__(netdev);
4772
4773 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4774 if (!error) {
4775 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4776 free(hc);
4777 }
4778 return error;
4779 }
4780
4781 static int
4782 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4783 struct netdev_queue_stats *stats)
4784 {
4785 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4786 tc_make_handle(1, 0xfffe), NULL, stats);
4787 }
4788
4789 static int
4790 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4791 const struct ofpbuf *nlmsg,
4792 netdev_dump_queue_stats_cb *cb, void *aux)
4793 {
4794 struct netdev_queue_stats stats;
4795 unsigned int handle, major, minor;
4796 int error;
4797
4798 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4799 if (error) {
4800 return error;
4801 }
4802
4803 major = tc_get_major(handle);
4804 minor = tc_get_minor(handle);
4805 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4806 (*cb)(minor - 1, &stats, aux);
4807 }
4808 return 0;
4809 }
4810
/* HFSC-backed "linux-hfsc" QoS implementation. */
static const struct tc_ops tc_ops_hfsc = {
    .linux_name = "hfsc",
    .ovs_name = "linux-hfsc",
    .n_queues = HFSC_N_QUEUES,
    .tc_install = hfsc_tc_install,
    .tc_load = hfsc_tc_load,
    .tc_destroy = hfsc_tc_destroy,
    .qdisc_get = hfsc_qdisc_get,
    .qdisc_set = hfsc_qdisc_set,
    .class_get = hfsc_class_get,
    .class_set = hfsc_class_set,
    .class_delete = hfsc_class_delete,
    .class_get_stats = hfsc_class_get_stats,
    .class_dump_stats = hfsc_class_dump_stats,
};
4826 \f
4827 /* "linux-noop" traffic control class. */
4828
/* Points 'netdev_' at a shared, immutable tc instance, so no per-netdev
 * state is allocated and nothing is programmed into the kernel.
 *
 * NOTE(review): the initializer references tc_ops_default rather than
 * tc_ops_noop — possibly intentional so the object behaves like the default
 * class; confirm before changing. */
static void
noop_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    netdev->tc = CONST_CAST(struct tc *, &tc);
}
4837
/* Installs the "linux-noop" QoS type: records the shared tc object and makes
 * no kernel changes.  Always returns 0. */
static int
noop_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
4845
/* Loads "linux-noop" state: same as install, nothing to read back. */
static int
noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
4852
/* "linux-noop": OVS leaves the device's qdisc entirely alone (no linux_name,
 * so it never matches a kernel qdisc on load). */
static const struct tc_ops tc_ops_noop = {
    .ovs_name = "linux-noop",
    .tc_install = noop_tc_install,
    .tc_load = noop_tc_load,
};
4858 \f
4859 /* "linux-default" traffic control class.
4860 *
4861 * This class represents the default, unnamed Linux qdisc. It corresponds to
4862 * the "" (empty string) QoS type in the OVS database. */
4863
/* Points 'netdev_' at a shared, immutable tc instance for the default
 * (unnamed) qdisc. */
static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
4874
/* Installs the default ("") QoS type: records the shared tc object and makes
 * no kernel changes.  Always returns 0. */
static int
default_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
4882
/* Loads default ("") state: same as install, nothing to read back. */
static int
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
4889
/* The default, unnamed Linux qdisc; corresponds to the "" QoS type in the
 * OVS database. */
static const struct tc_ops tc_ops_default = {
    .ovs_name = "",
    .tc_install = default_tc_install,
    .tc_load = default_tc_load,
};
4895 \f
4896 /* "linux-other" traffic control class.
4897 *
4898 * */
4899
/* Loads "linux-other" state for a qdisc OVS does not recognize: records a
 * shared, immutable tc object and always returns 0. */
static int
other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
    return 0;
}
4911
/* "linux-other": load-only fallback for unrecognized qdiscs (no tc_install,
 * so it cannot be configured, only observed). */
static const struct tc_ops tc_ops_other = {
    .ovs_name = "linux-other",
    .tc_load = other_tc_load,
};
4916 \f
4917 /* Traffic control. */
4918
4919 /* Number of kernel "tc" ticks per second. */
4920 static double ticks_per_s;
4921
4922 /* Number of kernel "jiffies" per second. This is used for the purpose of
4923 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4924 * one jiffy's worth of data.
4925 *
4926 * There are two possibilities here:
4927 *
4928 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4929 * approximate range of 100 to 1024. That means that we really need to
4930 * make sure that the qdisc can buffer that much data.
4931 *
4932 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4933 * has finely granular timers and there's no need to fudge additional room
4934 * for buffers. (There's no extra effort needed to implement that: the
4935 * large 'buffer_hz' is used as a divisor, so practically any number will
4936 * come out as 0 in the division. Small integer results in the case of
4937 * really high dividends won't have any real effect anyhow.)
4938 */
4939 static unsigned int buffer_hz;
4940
4941 static struct tcmsg *
4942 netdev_linux_tc_make_request(const struct netdev *netdev, int type,
4943 unsigned int flags, struct ofpbuf *request)
4944 {
4945 int ifindex;
4946 int error;
4947
4948 error = get_ifindex(netdev, &ifindex);
4949 if (error) {
4950 return NULL;
4951 }
4952
4953 return tc_make_request(ifindex, type, flags, request);
4954 }
4955
4956 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4957 * of 'kbits_burst'.
4958 *
4959 * This function is equivalent to running:
4960 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4961 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4962 * mtu 65535 drop
4963 *
4964 * The configuration and stats may be seen with the following command:
4965 * /sbin/tc -s filter show dev <devname> parent ffff:
4966 *
4967 * Returns 0 if successful, otherwise a positive errno value.
4968 */
4969 static int
4970 tc_add_policer(struct netdev *netdev,
4971 uint32_t kbits_rate, uint32_t kbits_burst)
4972 {
4973 struct tc_police tc_police;
4974 struct ofpbuf request;
4975 struct tcmsg *tcmsg;
4976 size_t basic_offset;
4977 size_t police_offset;
4978 int error;
4979 int mtu = 65535;
4980
4981 memset(&tc_police, 0, sizeof tc_police);
4982 tc_police.action = TC_POLICE_SHOT;
4983 tc_police.mtu = mtu;
4984 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4985
4986 /* The following appears wrong in one way: In networking a kilobit is
4987 * usually 1000 bits but this uses 1024 bits.
4988 *
4989 * However if you "fix" those problems then "tc filter show ..." shows
4990 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4991 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4992 * tc's point of view. Whatever. */
4993 tc_police.burst = tc_bytes_to_ticks(
4994 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
4995
4996 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
4997 NLM_F_EXCL | NLM_F_CREATE, &request);
4998 if (!tcmsg) {
4999 return ENODEV;
5000 }
5001 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
5002 tcmsg->tcm_info = tc_make_handle(49,
5003 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
5004
5005 nl_msg_put_string(&request, TCA_KIND, "basic");
5006 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
5007 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
5008 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
5009 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
5010 nl_msg_end_nested(&request, police_offset);
5011 nl_msg_end_nested(&request, basic_offset);
5012
5013 error = tc_transact(&request, NULL);
5014 if (error) {
5015 return error;
5016 }
5017
5018 return 0;
5019 }
5020
/* Initializes 'ticks_per_s' and 'buffer_hz' from /proc/net/psched, exactly
 * once per process.  On any failure it leaves the conservative defaults
 * ticks_per_s = 1.0 and buffer_hz = 100 in place. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *   (Before that, there are hints that it was 1000000000.)
     *
     * - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *   above.
     *
     *                         /proc/net/psched
     *     -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d ticks_per_s     buffer_hz
     *     ------- --------- ---------- ------------- ----------- -------------
     * [1] 819,200 1,000,000  1,000,000           100     819,200           100
     * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [4]   1,000     1,024  1,000,000           100     976,562           100
     * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
     * [6]   1,000        64  1,000,000           249  15,625,000           249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    if (!ovsthread_once_start(&once)) {
        return;
    }

    /* Defaults, kept if anything below fails. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    if (!a || !b || !c) {
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
5103
5104 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5105 * rate of 'rate' bytes per second. */
5106 static unsigned int
5107 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
5108 {
5109 read_psched();
5110 return (rate * ticks) / ticks_per_s;
5111 }
5112
/* Returns the number of ticks that it would take to transmit 'size' bytes at a
 * rate of 'rate' bytes per second.  Returns 0 when 'rate' is 0 to avoid
 * dividing by zero. */
static unsigned int
tc_bytes_to_ticks(unsigned int rate, unsigned int size)
{
    read_psched();
    /* The cast widens the product before dividing so it cannot wrap. */
    return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
}
5121
/* Returns the number of bytes that need to be reserved for qdisc buffering at
 * a transmission rate of 'rate' bytes per second.  (With a very large
 * 'buffer_hz' this comes out as 0; see the comment on 'buffer_hz' above.) */
static unsigned int
tc_buffer_per_jiffy(unsigned int rate)
{
    read_psched();
    return rate / buffer_hz;
}
5130
/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
 * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
 * stores NULL into it if it is absent.
 *
 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
 * On failure both outputs (when nonnull) are set to NULL.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
               struct nlattr **options)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    /* Attributes follow the struct tcmsg header inside the netlink msg. */
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");
        goto error;
    }

    if (kind) {
        *kind = nl_attr_get_string(ta[TCA_KIND]);
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    return 0;

error:
    if (kind) {
        *kind = NULL;
    }
    if (options) {
        *options = NULL;
    }
    return EPROTO;
}
5175
/* Given Netlink 'msg' that describes a class, extracts the class's full
 * handle (major:minor, as stored in tcm_handle) into '*handlep', its
 * TCA_OPTIONS attribute into '*options', and its queue statistics into
 * '*stats'.  Any of the output arguments may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    /* Attributes follow the struct tcmsg header inside the netlink msg. */
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
5250
/* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev'.  On success stores the kernel's echoed reply in '*replyp'
 * (owned by the caller).  Returns 0 if successful, otherwise a positive
 * errno value. */
static int
tc_query_class(const struct netdev *netdev,
               unsigned int handle, unsigned int parent,
               struct ofpbuf **replyp)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    /* NLM_F_ECHO makes the kernel send the class back in the reply. */
    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    error = tc_transact(&request, replyp);
    if (error) {
        VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     ovs_strerror(error));
    }
    return error;
}
5280
5281 /* Equivalent to "tc class del dev <name> handle <handle>". */
5282 static int
5283 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5284 {
5285 struct ofpbuf request;
5286 struct tcmsg *tcmsg;
5287 int error;
5288
5289 tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5290 if (!tcmsg) {
5291 return ENODEV;
5292 }
5293 tcmsg->tcm_handle = handle;
5294 tcmsg->tcm_parent = 0;
5295
5296 error = tc_transact(&request, NULL);
5297 if (error) {
5298 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5299 netdev_get_name(netdev),
5300 tc_get_major(handle), tc_get_minor(handle),
5301 ovs_strerror(error));
5302 }
5303 return error;
5304 }
5305
5306 /* Equivalent to "tc qdisc del dev <name> root". */
5307 static int
5308 tc_del_qdisc(struct netdev *netdev_)
5309 {
5310 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5311 struct ofpbuf request;
5312 struct tcmsg *tcmsg;
5313 int error;
5314
5315 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5316 if (!tcmsg) {
5317 return ENODEV;
5318 }
5319 tcmsg->tcm_handle = tc_make_handle(1, 0);
5320 tcmsg->tcm_parent = TC_H_ROOT;
5321
5322 error = tc_transact(&request, NULL);
5323 if (error == EINVAL) {
5324 /* EINVAL probably means that the default qdisc was in use, in which
5325 * case we've accomplished our purpose. */
5326 error = 0;
5327 }
5328 if (!error && netdev->tc) {
5329 if (netdev->tc->ops->tc_destroy) {
5330 netdev->tc->ops->tc_destroy(netdev->tc);
5331 }
5332 netdev->tc = NULL;
5333 }
5334 return error;
5335 }
5336
5337 static bool
5338 getqdisc_is_safe(void)
5339 {
5340 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5341 static bool safe = false;
5342
5343 if (ovsthread_once_start(&once)) {
5344 struct utsname utsname;
5345 int major, minor;
5346
5347 if (uname(&utsname) == -1) {
5348 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5349 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5350 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5351 } else if (major < 2 || (major == 2 && minor < 35)) {
5352 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5353 utsname.release);
5354 } else {
5355 safe = true;
5356 }
5357 ovsthread_once_done(&once);
5358 }
5359 return safe;
5360 }
5361
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value.  On return, 'netdev->tc' is non-null exactly when
 * the qdisc state was successfully instantiated. */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    /* Qdisc state already cached; nothing to do. */
    if (netdev->tc) {
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* Safe kernels: dump the root qdisc (handle 0:0, parent TC_H_ROOT).
     * Unsafe kernels: probe for our own handle 1:0 instead, per above. */
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            /* Unparseable reply: treat the qdisc as present but unmanaged. */
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it.  tc_load is expected to set netdev->tc on success. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
5441
5442 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5443 approximate the time to transmit packets of various lengths. For an MTU of
5444 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5445 represents two possible packet lengths; for a MTU of 513 through 1024, four
5446 possible lengths; and so on.
5447
5448 Returns, for the specified 'mtu', the number of bits that packet lengths
5449 need to be shifted right to fit within such a 256-entry table. */
5450 static int
5451 tc_calc_cell_log(unsigned int mtu)
5452 {
5453 int cell_log;
5454
5455 if (!mtu) {
5456 mtu = ETH_PAYLOAD_MAX;
5457 }
5458 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5459
5460 for (cell_log = 0; mtu >= 256; cell_log++) {
5461 mtu >>= 1;
5462 }
5463
5464 return cell_log;
5465 }
5466
5467 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5468 * of 'mtu'. */
5469 static void
5470 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5471 {
5472 memset(rate, 0, sizeof *rate);
5473 rate->cell_log = tc_calc_cell_log(mtu);
5474 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5475 /* rate->cell_align = 0; */ /* distro headers. */
5476 rate->mpu = ETH_TOTAL_MIN;
5477 rate->rate = Bps;
5478 }
5479
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    size_t n = TC_RTAB_SIZE / sizeof rtab[0];
    size_t i;

    for (i = 0; i < n; i++) {
        /* Entry 'i' covers packets of size (i + 1) << cell_log, clamped up
         * to the minimum packet unit. */
        unsigned int size = (i + 1) << rate->cell_log;
        rtab[i] = tc_bytes_to_ticks(rate->rate, MAX(size, rate->mpu));
    }
}
5499
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* The burst must at least cover one jiffy's worth of traffic plus one
     * full packet. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;

    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
5510 \f
5511 /* Linux-only functions declared in netdev-linux.h */
5512
5513 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5514 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5515 int
5516 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5517 const char *flag_name, bool enable)
5518 {
5519 const char *netdev_name = netdev_get_name(netdev);
5520 struct ethtool_value evalue;
5521 uint32_t new_flags;
5522 int error;
5523
5524 COVERAGE_INC(netdev_get_ethtool);
5525 memset(&evalue, 0, sizeof evalue);
5526 error = netdev_linux_do_ethtool(netdev_name,
5527 (struct ethtool_cmd *)&evalue,
5528 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5529 if (error) {
5530 return error;
5531 }
5532
5533 COVERAGE_INC(netdev_set_ethtool);
5534 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5535 if (new_flags == evalue.data) {
5536 return 0;
5537 }
5538 evalue.data = new_flags;
5539 error = netdev_linux_do_ethtool(netdev_name,
5540 (struct ethtool_cmd *)&evalue,
5541 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5542 if (error) {
5543 return error;
5544 }
5545
5546 COVERAGE_INC(netdev_get_ethtool);
5547 memset(&evalue, 0, sizeof evalue);
5548 error = netdev_linux_do_ethtool(netdev_name,
5549 (struct ethtool_cmd *)&evalue,
5550 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5551 if (error) {
5552 return error;
5553 }
5554
5555 if (new_flags != evalue.data) {
5556 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5557 "device %s failed", enable ? "enable" : "disable",
5558 flag_name, netdev_name);
5559 return EOPNOTSUPP;
5560 }
5561
5562 return 0;
5563 }
5564 \f
5565 /* Utility functions. */
5566
5567 /* Copies 'src' into 'dst', performing format conversion in the process. */
5568 static void
5569 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5570 const struct rtnl_link_stats *src)
5571 {
5572 dst->rx_packets = src->rx_packets;
5573 dst->tx_packets = src->tx_packets;
5574 dst->rx_bytes = src->rx_bytes;
5575 dst->tx_bytes = src->tx_bytes;
5576 dst->rx_errors = src->rx_errors;
5577 dst->tx_errors = src->tx_errors;
5578 dst->rx_dropped = src->rx_dropped;
5579 dst->tx_dropped = src->tx_dropped;
5580 dst->multicast = src->multicast;
5581 dst->collisions = src->collisions;
5582 dst->rx_length_errors = src->rx_length_errors;
5583 dst->rx_over_errors = src->rx_over_errors;
5584 dst->rx_crc_errors = src->rx_crc_errors;
5585 dst->rx_frame_errors = src->rx_frame_errors;
5586 dst->rx_fifo_errors = src->rx_fifo_errors;
5587 dst->rx_missed_errors = src->rx_missed_errors;
5588 dst->tx_aborted_errors = src->tx_aborted_errors;
5589 dst->tx_carrier_errors = src->tx_carrier_errors;
5590 dst->tx_fifo_errors = src->tx_fifo_errors;
5591 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5592 dst->tx_window_errors = src->tx_window_errors;
5593 }
5594
5595 /* Copies 'src' into 'dst', performing format conversion in the process. */
5596 static void
5597 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5598 const struct rtnl_link_stats64 *src)
5599 {
5600 dst->rx_packets = src->rx_packets;
5601 dst->tx_packets = src->tx_packets;
5602 dst->rx_bytes = src->rx_bytes;
5603 dst->tx_bytes = src->tx_bytes;
5604 dst->rx_errors = src->rx_errors;
5605 dst->tx_errors = src->tx_errors;
5606 dst->rx_dropped = src->rx_dropped;
5607 dst->tx_dropped = src->tx_dropped;
5608 dst->multicast = src->multicast;
5609 dst->collisions = src->collisions;
5610 dst->rx_length_errors = src->rx_length_errors;
5611 dst->rx_over_errors = src->rx_over_errors;
5612 dst->rx_crc_errors = src->rx_crc_errors;
5613 dst->rx_frame_errors = src->rx_frame_errors;
5614 dst->rx_fifo_errors = src->rx_fifo_errors;
5615 dst->rx_missed_errors = src->rx_missed_errors;
5616 dst->tx_aborted_errors = src->tx_aborted_errors;
5617 dst->tx_carrier_errors = src->tx_carrier_errors;
5618 dst->tx_fifo_errors = src->tx_fifo_errors;
5619 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5620 dst->tx_window_errors = src->tx_window_errors;
5621 }
5622
5623 static int
5624 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5625 {
5626 struct ofpbuf request;
5627 struct ofpbuf *reply;
5628 int error;
5629
5630 /* Filtering all counters by default */
5631 memset(stats, 0xFF, sizeof(struct netdev_stats));
5632
5633 ofpbuf_init(&request, 0);
5634 nl_msg_put_nlmsghdr(&request,
5635 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5636 RTM_GETLINK, NLM_F_REQUEST);
5637 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5638 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5639 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5640 ofpbuf_uninit(&request);
5641 if (error) {
5642 return error;
5643 }
5644
5645 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5646 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5647 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5648 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5649 error = 0;
5650 } else {
5651 a = nl_attr_find(reply, 0, IFLA_STATS);
5652 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5653 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5654 error = 0;
5655 } else {
5656 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5657 error = EPROTO;
5658 }
5659 }
5660 } else {
5661 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5662 error = EPROTO;
5663 }
5664
5665
5666 ofpbuf_delete(reply);
5667 return error;
5668 }
5669
5670 static int
5671 get_flags(const struct netdev *dev, unsigned int *flags)
5672 {
5673 struct ifreq ifr;
5674 int error;
5675
5676 *flags = 0;
5677 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5678 if (!error) {
5679 *flags = ifr.ifr_flags;
5680 }
5681 return error;
5682 }
5683
5684 static int
5685 set_flags(const char *name, unsigned int flags)
5686 {
5687 struct ifreq ifr;
5688
5689 ifr.ifr_flags = flags;
5690 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5691 }
5692
5693 int
5694 linux_get_ifindex(const char *netdev_name)
5695 {
5696 struct ifreq ifr;
5697 int error;
5698
5699 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5700 COVERAGE_INC(netdev_get_ifindex);
5701
5702 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5703 if (error) {
5704 /* ENODEV probably means that a vif disappeared asynchronously and
5705 * hasn't been removed from the database yet, so reduce the log level
5706 * to INFO for that case. */
5707 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
5708 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5709 netdev_name, ovs_strerror(error));
5710 return -error;
5711 }
5712 return ifr.ifr_ifindex;
5713 }
5714
5715 static int
5716 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5717 {
5718 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5719
5720 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5721 netdev_linux_update_via_netlink(netdev);
5722 }
5723
5724 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5725 /* Fall back to ioctl if netlink fails */
5726 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
5727
5728 if (ifindex < 0) {
5729 netdev->get_ifindex_error = -ifindex;
5730 netdev->ifindex = 0;
5731 } else {
5732 netdev->get_ifindex_error = 0;
5733 netdev->ifindex = ifindex;
5734 }
5735 netdev->cache_valid |= VALID_IFINDEX;
5736 }
5737
5738 *ifindexp = netdev->ifindex;
5739 return netdev->get_ifindex_error;
5740 }
5741
/* Refreshes cached link state on 'netdev' (interface flags, MTU, Ethernet
 * address, ifindex, and LAG-master membership) from a single RTM_GETLINK
 * netlink round trip, and bumps the netdev's change sequence number if any
 * cached value changed.  Returns 0 on success, otherwise a positive errno
 * value (EINVAL if the kernel's reply could not be parsed as RTM_NEWLINK). */
static int
netdev_linux_update_via_netlink(struct netdev_linux *netdev)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct rtnetlink_change chg;
    struct rtnetlink_change *change = &chg;
    int error;

    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
                        RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));

    /* The correct identifiers for a Linux device are netnsid and ifindex,
     * but ifindex changes as the port is moved to another network namespace
     * and the interface name statically stored in ovsdb. */
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
    if (netdev_linux_netnsid_is_remote(netdev)) {
        nl_msg_push_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
    }
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        ofpbuf_delete(reply);
        return error;
    }

    if (rtnetlink_parse(reply, change)
        && change->nlmsg_type == RTM_NEWLINK) {
        bool changed = false;
        error = 0;

        /* Update netdev from rtnl msg and increment its seq if needed. */
        if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
            /* IFF_RUNNING toggled since the last update: count it as a
             * carrier reset. */
            netdev->carrier_resets++;
            changed = true;
        }
        if (change->ifi_flags != netdev->ifi_flags) {
            netdev->ifi_flags = change->ifi_flags;
            changed = true;
        }
        if (change->mtu && change->mtu != netdev->mtu) {
            netdev->mtu = change->mtu;
            netdev->cache_valid |= VALID_MTU;
            netdev->netdev_mtu_error = 0;
            changed = true;
        }
        if (!eth_addr_is_zero(change->mac)
            && !eth_addr_equals(change->mac, netdev->etheraddr)) {
            netdev->etheraddr = change->mac;
            netdev->cache_valid |= VALID_ETHERADDR;
            netdev->ether_addr_error = 0;
            changed = true;
        }
        if (change->if_index != netdev->ifindex) {
            netdev->ifindex = change->if_index;
            netdev->cache_valid |= VALID_IFINDEX;
            netdev->get_ifindex_error = 0;
            changed = true;
        }
        /* An IFLA_MASTER of a recognized LAG kind means the device has been
         * enslaved; note that this flag is only ever set here, not cleared. */
        if (change->master && netdev_linux_kind_is_lag(change->master)) {
            netdev->is_lag_master = true;
        }
        if (changed) {
            netdev_change_seq_changed(&netdev->up);
        }
    } else {
        error = EINVAL;
    }

    ofpbuf_delete(reply);
    return error;
}
5817
5818 static int
5819 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5820 {
5821 struct ifreq ifr;
5822 int hwaddr_family;
5823 int error;
5824
5825 memset(&ifr, 0, sizeof ifr);
5826 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5827 COVERAGE_INC(netdev_get_hwaddr);
5828 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5829 if (error) {
5830 /* ENODEV probably means that a vif disappeared asynchronously and
5831 * hasn't been removed from the database yet, so reduce the log level
5832 * to INFO for that case. */
5833 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5834 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5835 netdev_name, ovs_strerror(error));
5836 return error;
5837 }
5838 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5839 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
5840 hwaddr_family != ARPHRD_NONE) {
5841 VLOG_INFO("%s device has unknown hardware address family %d",
5842 netdev_name, hwaddr_family);
5843 return EINVAL;
5844 }
5845 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5846 return 0;
5847 }
5848
5849 static int
5850 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5851 {
5852 struct ifreq ifr;
5853 int error;
5854
5855 memset(&ifr, 0, sizeof ifr);
5856 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5857 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5858 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5859 COVERAGE_INC(netdev_set_hwaddr);
5860 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5861 if (error) {
5862 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5863 netdev_name, ovs_strerror(error));
5864 }
5865 return error;
5866 }
5867
5868 static int
5869 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5870 int cmd, const char *cmd_name)
5871 {
5872 struct ifreq ifr;
5873 int error;
5874
5875 memset(&ifr, 0, sizeof ifr);
5876 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5877 ifr.ifr_data = (caddr_t) ecmd;
5878
5879 ecmd->cmd = cmd;
5880 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5881 if (error) {
5882 if (error != EOPNOTSUPP) {
5883 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5884 "failed: %s", cmd_name, name, ovs_strerror(error));
5885 } else {
5886 /* The device doesn't support this operation. That's pretty
5887 * common, so there's no point in logging anything. */
5888 }
5889 }
5890 return error;
5891 }
5892
5893 /* Returns an AF_PACKET raw socket or a negative errno value. */
5894 static int
5895 af_packet_sock(void)
5896 {
5897 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5898 static int sock;
5899
5900 if (ovsthread_once_start(&once)) {
5901 sock = socket(AF_PACKET, SOCK_RAW, 0);
5902 if (sock >= 0) {
5903 int error = set_nonblocking(sock);
5904 if (error) {
5905 close(sock);
5906 sock = -error;
5907 }
5908 } else {
5909 sock = -errno;
5910 VLOG_ERR("failed to create packet socket: %s",
5911 ovs_strerror(errno));
5912 }
5913 ovsthread_once_done(&once);
5914 }
5915
5916 return sock;
5917 }