]> git.proxmox.com Git - mirror_ovs.git/blob - lib/netdev-linux.c
netdev: Clean up class initialization.
[mirror_ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <sys/types.h>
24 #include <netinet/in.h>
25 #include <arpa/inet.h>
26 #include <inttypes.h>
27 #include <linux/filter.h>
28 #include <linux/gen_stats.h>
29 #include <linux/if_ether.h>
30 #include <linux/if_tun.h>
31 #include <linux/types.h>
32 #include <linux/ethtool.h>
33 #include <linux/mii.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/ioctl.h>
37 #include <sys/socket.h>
38 #include <sys/utsname.h>
39 #include <netpacket/packet.h>
40 #include <net/if.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <poll.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <unistd.h>
48
49 #include "coverage.h"
50 #include "dp-packet.h"
51 #include "dpif-netlink.h"
52 #include "dpif-netdev.h"
53 #include "openvswitch/dynamic-string.h"
54 #include "fatal-signal.h"
55 #include "hash.h"
56 #include "openvswitch/hmap.h"
57 #include "netdev-provider.h"
58 #include "netdev-tc-offloads.h"
59 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
62 #include "netlink.h"
63 #include "netnsid.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
67 #include "packets.h"
68 #include "openvswitch/poll-loop.h"
69 #include "rtnetlink.h"
70 #include "openvswitch/shash.h"
71 #include "socket-util.h"
72 #include "sset.h"
73 #include "tc.h"
74 #include "timer.h"
75 #include "unaligned.h"
76 #include "openvswitch/vlog.h"
77 #include "util.h"
78
79 VLOG_DEFINE_THIS_MODULE(netdev_linux);
80
81 COVERAGE_DEFINE(netdev_set_policing);
82 COVERAGE_DEFINE(netdev_arp_lookup);
83 COVERAGE_DEFINE(netdev_get_ifindex);
84 COVERAGE_DEFINE(netdev_get_hwaddr);
85 COVERAGE_DEFINE(netdev_set_hwaddr);
86 COVERAGE_DEFINE(netdev_get_ethtool);
87 COVERAGE_DEFINE(netdev_set_ethtool);
88
89 \f
90 #ifndef IFLA_IF_NETNSID
91 #define IFLA_IF_NETNSID 0x45
92 #endif
93 /* These were introduced in Linux 2.6.14, so they might be missing if we have
94 * old headers. */
95 #ifndef ADVERTISED_Pause
96 #define ADVERTISED_Pause (1 << 13)
97 #endif
98 #ifndef ADVERTISED_Asym_Pause
99 #define ADVERTISED_Asym_Pause (1 << 14)
100 #endif
101
102 /* These were introduced in Linux 2.6.24, so they might be missing if we
103 * have old headers. */
104 #ifndef ETHTOOL_GFLAGS
105 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
106 #endif
107 #ifndef ETHTOOL_SFLAGS
108 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
109 #endif
110
111 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
112 * headers. */
113 #ifndef TC_RTAB_SIZE
114 #define TC_RTAB_SIZE 1024
115 #endif
116
117 /* Linux 2.6.21 introduced struct tpacket_auxdata.
118 * Linux 2.6.27 added the tp_vlan_tci member.
119 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
120 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
121 * TP_STATUS_VLAN_TPID_VALID.
122 *
123 * With all this churn it's easiest to unconditionally define a replacement
124 * structure that has everything we want.
125 */
126 #ifndef PACKET_AUXDATA
127 #define PACKET_AUXDATA 8
128 #endif
129 #ifndef TP_STATUS_VLAN_VALID
130 #define TP_STATUS_VLAN_VALID (1 << 4)
131 #endif
132 #ifndef TP_STATUS_VLAN_TPID_VALID
133 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
134 #endif
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
/* Replacement for the kernel's struct tpacket_auxdata (see the comment
 * above): unconditionally includes the VLAN members that newer kernels
 * added, so the rest of this file can use them regardless of header age. */
struct tpacket_auxdata {
    uint32_t tp_status;     /* TP_STATUS_* flags. */
    uint32_t tp_len;        /* Length of the original packet. */
    uint32_t tp_snaplen;    /* Length actually captured. */
    uint16_t tp_mac;        /* Offset of the link-layer header. */
    uint16_t tp_net;        /* Offset of the network-layer header. */
    uint16_t tp_vlan_tci;   /* Valid only if TP_STATUS_VLAN_VALID is set. */
    uint16_t tp_vlan_tpid;  /* Valid only if TP_STATUS_VLAN_TPID_VALID set. */
};
146
147 /* Linux 2.6.27 introduced ethtool_cmd_speed
148 *
149 * To avoid revisiting problems reported with using configure to detect
150 * compatibility (see report at
151 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
152 * unconditionally replace ethtool_cmd_speed. */
153 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
154 static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
155 {
156 return ep->speed | (ep->speed_hi << 16);
157 }
158
159 /* Linux 2.6.30 introduced supported and advertised flags for
160 * 1G base KX, and 10G base KX4, KR and R. */
161 #ifndef SUPPORTED_1000baseKX_Full
162 #define SUPPORTED_1000baseKX_Full (1 << 17)
163 #define SUPPORTED_10000baseKX4_Full (1 << 18)
164 #define SUPPORTED_10000baseKR_Full (1 << 19)
165 #define SUPPORTED_10000baseR_FEC (1 << 20)
166 #define ADVERTISED_1000baseKX_Full (1 << 17)
167 #define ADVERTISED_10000baseKX4_Full (1 << 18)
168 #define ADVERTISED_10000baseKR_Full (1 << 19)
169 #define ADVERTISED_10000baseR_FEC (1 << 20)
170 #endif
171
172 /* Linux 3.5 introduced supported and advertised flags for
173 * 40G base KR4, CR4, SR4 and LR4. */
174 #ifndef SUPPORTED_40000baseKR4_Full
175 #define SUPPORTED_40000baseKR4_Full (1 << 23)
176 #define SUPPORTED_40000baseCR4_Full (1 << 24)
177 #define SUPPORTED_40000baseSR4_Full (1 << 25)
178 #define SUPPORTED_40000baseLR4_Full (1 << 26)
179 #define ADVERTISED_40000baseKR4_Full (1 << 23)
180 #define ADVERTISED_40000baseCR4_Full (1 << 24)
181 #define ADVERTISED_40000baseSR4_Full (1 << 25)
182 #define ADVERTISED_40000baseLR4_Full (1 << 26)
183 #endif
184
185 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
186 *
187 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
188 * 2.6.32-431.29.2.el6.x86_64 (see report at
189 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
190 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
191 * unconditionally define a replacement. */
192 #ifndef IFLA_STATS64
193 #define IFLA_STATS64 23
194 #endif
195 #define rtnl_link_stats64 rpl_rtnl_link_stats64
196 struct rtnl_link_stats64 {
197 uint64_t rx_packets;
198 uint64_t tx_packets;
199 uint64_t rx_bytes;
200 uint64_t tx_bytes;
201 uint64_t rx_errors;
202 uint64_t tx_errors;
203 uint64_t rx_dropped;
204 uint64_t tx_dropped;
205 uint64_t multicast;
206 uint64_t collisions;
207
208 uint64_t rx_length_errors;
209 uint64_t rx_over_errors;
210 uint64_t rx_crc_errors;
211 uint64_t rx_frame_errors;
212 uint64_t rx_fifo_errors;
213 uint64_t rx_missed_errors;
214
215 uint64_t tx_aborted_errors;
216 uint64_t tx_carrier_errors;
217 uint64_t tx_fifo_errors;
218 uint64_t tx_heartbeat_errors;
219 uint64_t tx_window_errors;
220
221 uint64_t rx_compressed;
222 uint64_t tx_compressed;
223 };
224
/* Bits for 'cache_valid' in struct netdev_linux below: each bit says that
 * the corresponding cached field (or group of fields) is up to date. */
enum {
    VALID_IFINDEX = 1 << 0,           /* 'ifindex' is cached. */
    VALID_ETHERADDR = 1 << 1,         /* 'etheraddr' is cached. */
    VALID_IN = 1 << 2,                /* Cached IP addresses are valid. */
    VALID_MTU = 1 << 3,               /* 'mtu' is cached. */
    VALID_POLICING = 1 << 4,          /* Policing parameters are cached. */
    VALID_VPORT_STAT_ERROR = 1 << 5,  /* 'vport_stats_error' is cached. */
    VALID_DRVINFO = 1 << 6,           /* 'drvinfo' is cached. */
    VALID_FEATURES = 1 << 7,          /* Feature sets are cached. */
};
235 \f
/* State kept for one LAG (bond/team) slave whose master is an OvS netdev. */
struct linux_lag_slave {
    uint32_t block_id;        /* TC ingress block the slave is bound to. */
    struct shash_node *node;  /* This slave's node in 'lag_shash'. */
};
240
241 /* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
242 static struct ovs_mutex lag_mutex = OVS_MUTEX_INITIALIZER;
243
244 /* All slaves whose LAG masters are network devices in OvS. */
245 static struct shash lag_shash OVS_GUARDED_BY(lag_mutex)
246 = SHASH_INITIALIZER(&lag_shash);
247
248 /* Traffic control. */
249
250 /* An instance of a traffic control class. Always associated with a particular
251 * network device.
252 *
253 * Each TC implementation subclasses this with whatever additional data it
254 * needs. */
255 struct tc {
256 const struct tc_ops *ops;
257 struct hmap queues; /* Contains "struct tc_queue"s.
258 * Read by generic TC layer.
259 * Written only by TC implementation. */
260 };
261
262 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
263
264 /* One traffic control queue.
265 *
266 * Each TC implementation subclasses this with whatever additional data it
267 * needs. */
268 struct tc_queue {
269 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
270 unsigned int queue_id; /* OpenFlow queue ID. */
271 long long int created; /* Time queue was created, in msecs. */
272 };
273
274 /* A particular kind of traffic control. Each implementation generally maps to
275 * one particular Linux qdisc class.
276 *
277 * The functions below return 0 if successful or a positive errno value on
278 * failure, except where otherwise noted. All of them must be provided, except
279 * where otherwise noted. */
280 struct tc_ops {
281 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
282 * This is null for tc_ops_default and tc_ops_other, for which there are no
283 * appropriate values. */
284 const char *linux_name;
285
286 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
287 const char *ovs_name;
288
289 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
290 * queues. The queues are numbered 0 through n_queues - 1. */
291 unsigned int n_queues;
292
293 /* Called to install this TC class on 'netdev'. The implementation should
294 * make the Netlink calls required to set up 'netdev' with the right qdisc
295 * and configure it according to 'details'. The implementation may assume
296 * that the current qdisc is the default; that is, there is no need for it
297 * to delete the current qdisc before installing itself.
298 *
299 * The contents of 'details' should be documented as valid for 'ovs_name'
300 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
301 * (which is built as ovs-vswitchd.conf.db(8)).
302 *
303 * This function must return 0 if and only if it sets 'netdev->tc' to an
304 * initialized 'struct tc'.
305 *
306 * (This function is null for tc_ops_other, which cannot be installed. For
307 * other TC classes it should always be nonnull.) */
308 int (*tc_install)(struct netdev *netdev, const struct smap *details);
309
310 /* Called when the netdev code determines (through a Netlink query) that
311 * this TC class's qdisc is installed on 'netdev', but we didn't install
312 * it ourselves and so don't know any of the details.
313 *
314 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
315 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
316 * implementation should parse the other attributes of 'nlmsg' as
317 * necessary to determine its configuration. If necessary it should also
318 * use Netlink queries to determine the configuration of queues on
319 * 'netdev'.
320 *
321 * This function must return 0 if and only if it sets 'netdev->tc' to an
322 * initialized 'struct tc'. */
323 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
324
325 /* Destroys the data structures allocated by the implementation as part of
326 * 'tc'. (This includes destroying 'tc->queues' by calling
327 * tc_destroy(tc).
328 *
329 * The implementation should not need to perform any Netlink calls. If
330 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
331 * (But it may not be desirable.)
332 *
333 * This function may be null if 'tc' is trivial. */
334 void (*tc_destroy)(struct tc *tc);
335
336 /* Retrieves details of 'netdev->tc' configuration into 'details'.
337 *
338 * The implementation should not need to perform any Netlink calls, because
339 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
340 * cached the configuration.
341 *
342 * The contents of 'details' should be documented as valid for 'ovs_name'
343 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
344 * (which is built as ovs-vswitchd.conf.db(8)).
345 *
346 * This function may be null if 'tc' is not configurable.
347 */
348 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
349
350 /* Reconfigures 'netdev->tc' according to 'details', performing any
351 * required Netlink calls to complete the reconfiguration.
352 *
353 * The contents of 'details' should be documented as valid for 'ovs_name'
354 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
355 * (which is built as ovs-vswitchd.conf.db(8)).
356 *
357 * This function may be null if 'tc' is not configurable.
358 */
359 int (*qdisc_set)(struct netdev *, const struct smap *details);
360
361 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
362 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
363 *
364 * The contents of 'details' should be documented as valid for 'ovs_name'
365 * in the "other_config" column in the "Queue" table in
366 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
367 *
368 * The implementation should not need to perform any Netlink calls, because
369 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
370 * cached the queue configuration.
371 *
372 * This function may be null if 'tc' does not have queues ('n_queues' is
373 * 0). */
374 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
375 struct smap *details);
376
377 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
378 * 'details', perfoming any required Netlink calls to complete the
379 * reconfiguration. The caller ensures that 'queue_id' is less than
380 * 'n_queues'.
381 *
382 * The contents of 'details' should be documented as valid for 'ovs_name'
383 * in the "other_config" column in the "Queue" table in
384 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
385 *
386 * This function may be null if 'tc' does not have queues or its queues are
387 * not configurable. */
388 int (*class_set)(struct netdev *, unsigned int queue_id,
389 const struct smap *details);
390
391 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
392 * tc_queue's within 'netdev->tc->queues'.
393 *
394 * This function may be null if 'tc' does not have queues or its queues
395 * cannot be deleted. */
396 int (*class_delete)(struct netdev *, struct tc_queue *queue);
397
398 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
399 * 'struct tc_queue's within 'netdev->tc->queues'.
400 *
401 * On success, initializes '*stats'.
402 *
403 * This function may be null if 'tc' does not have queues or if it cannot
404 * report queue statistics. */
405 int (*class_get_stats)(const struct netdev *netdev,
406 const struct tc_queue *queue,
407 struct netdev_queue_stats *stats);
408
409 /* Extracts queue stats from 'nlmsg', which is a response to a
410 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
411 *
412 * This function may be null if 'tc' does not have queues or if it cannot
413 * report queue statistics. */
414 int (*class_dump_stats)(const struct netdev *netdev,
415 const struct ofpbuf *nlmsg,
416 netdev_dump_queue_stats_cb *cb, void *aux);
417 };
418
419 static void
420 tc_init(struct tc *tc, const struct tc_ops *ops)
421 {
422 tc->ops = ops;
423 hmap_init(&tc->queues);
424 }
425
/* Releases the generic part of 'tc' (its queue map).  Does not free 'tc'
 * itself and performs no Netlink calls; see the tc_ops 'tc_destroy'
 * contract above. */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
431
432 static const struct tc_ops tc_ops_htb;
433 static const struct tc_ops tc_ops_hfsc;
434 static const struct tc_ops tc_ops_codel;
435 static const struct tc_ops tc_ops_fqcodel;
436 static const struct tc_ops tc_ops_sfq;
437 static const struct tc_ops tc_ops_default;
438 static const struct tc_ops tc_ops_noop;
439 static const struct tc_ops tc_ops_other;
440
441 static const struct tc_ops *const tcs[] = {
442 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
443 &tc_ops_hfsc, /* Hierarchical fair service curve. */
444 &tc_ops_codel, /* Controlled delay */
445 &tc_ops_fqcodel, /* Fair queue controlled delay */
446 &tc_ops_sfq, /* Stochastic fair queueing */
447 &tc_ops_noop, /* Non operating qos type. */
448 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
449 &tc_ops_other, /* Some other qdisc. */
450 NULL
451 };
452
453 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
454 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
455 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
456
457 static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
458 int type,
459 unsigned int flags,
460 struct ofpbuf *);
461 static int tc_add_policer(struct netdev *,
462 uint32_t kbits_rate, uint32_t kbits_burst);
463
464 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
465 struct nlattr **options);
466 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
467 struct nlattr **options,
468 struct netdev_queue_stats *);
469 static int tc_query_class(const struct netdev *,
470 unsigned int handle, unsigned int parent,
471 struct ofpbuf **replyp);
472 static int tc_delete_class(const struct netdev *, unsigned int handle);
473
474 static int tc_del_qdisc(struct netdev *netdev);
475 static int tc_query_qdisc(const struct netdev *netdev);
476
477 static int tc_calc_cell_log(unsigned int mtu);
478 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
479 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
480 const struct tc_ratespec *rate);
481 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
482 \f
483 struct netdev_linux {
484 struct netdev up;
485
486 /* Protects all members below. */
487 struct ovs_mutex mutex;
488
489 unsigned int cache_valid;
490
491 bool miimon; /* Link status of last poll. */
492 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
493 struct timer miimon_timer;
494
495 int netnsid; /* Network namespace ID. */
496 /* The following are figured out "on demand" only. They are only valid
497 * when the corresponding VALID_* bit in 'cache_valid' is set. */
498 int ifindex;
499 struct eth_addr etheraddr;
500 int mtu;
501 unsigned int ifi_flags;
502 long long int carrier_resets;
503 uint32_t kbits_rate; /* Policing data. */
504 uint32_t kbits_burst;
505 int vport_stats_error; /* Cached error code from vport_get_stats().
506 0 or an errno value. */
507 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
508 int ether_addr_error; /* Cached error code from set/get etheraddr. */
509 int netdev_policing_error; /* Cached error code from set policing. */
510 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
511 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
512
513 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
514 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
515 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
516
517 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
518 struct tc *tc;
519
520 /* For devices of class netdev_tap_class only. */
521 int tap_fd;
522 bool present; /* If the device is present in the namespace */
523 uint64_t tx_dropped; /* tap device can drop if the iface is down */
524
525 /* LAG information. */
526 bool is_lag_master; /* True if the netdev is a LAG master. */
527 };
528
/* Per-queue receive state for a Linux netdev. */
struct netdev_rxq_linux {
    struct netdev_rxq up;   /* Generic receive-queue base. */
    bool is_tap;            /* True for devices of netdev_tap_class. */
    int fd;                 /* Descriptor packets are read from — presumably
                             * an AF_PACKET socket, or the shared tap fd when
                             * 'is_tap'; confirm in the rxq open code. */
};
534
535 /* This is set pretty low because we probably won't learn anything from the
536 * additional log messages. */
537 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
538
539 /* Polling miimon status for all ports causes performance degradation when
540 * handling a large number of ports. If there are no devices using miimon, then
541 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
542 *
543 * Readers do not depend on this variable synchronizing with the related
544 * changes in the device miimon status, so we can use atomic_count. */
545 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
546
547 static void netdev_linux_run(const struct netdev_class *);
548
549 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
550 int cmd, const char *cmd_name);
551 static int get_flags(const struct netdev *, unsigned int *flags);
552 static int set_flags(const char *, unsigned int flags);
553 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
554 enum netdev_flags on, enum netdev_flags *old_flagsp)
555 OVS_REQUIRES(netdev->mutex);
556 static int get_ifindex(const struct netdev *, int *ifindexp);
557 static int do_set_addr(struct netdev *netdev,
558 int ioctl_nr, const char *ioctl_name,
559 struct in_addr addr);
560 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
561 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
562 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
563 static int af_packet_sock(void);
564 static bool netdev_linux_miimon_enabled(void);
565 static void netdev_linux_miimon_run(void);
566 static void netdev_linux_miimon_wait(void);
567 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
568
/* Returns true if 'netdev_class' is one of the Linux-backed classes defined
 * in this file.  They are recognized by sharing netdev_linux_run() as their
 * 'run' callback. */
static bool
is_netdev_linux_class(const struct netdev_class *netdev_class)
{
    return netdev_class->run == netdev_linux_run;
}
574
/* Returns true if 'netdev' is a tap device (netdev_tap_class). */
static bool
is_tap_netdev(const struct netdev *netdev)
{
    return netdev_get_class(netdev) == &netdev_tap_class;
}
580
/* Converts generic 'netdev' into its embedding struct netdev_linux.
 * Asserts that 'netdev' really belongs to a Linux-backed class. */
static struct netdev_linux *
netdev_linux_cast(const struct netdev *netdev)
{
    ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));

    return CONTAINER_OF(netdev, struct netdev_linux, up);
}
588
/* Converts generic 'rx' into its embedding struct netdev_rxq_linux.
 * Asserts that the owning netdev belongs to a Linux-backed class. */
static struct netdev_rxq_linux *
netdev_rxq_linux_cast(const struct netdev_rxq *rx)
{
    ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
    return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
}
595 \f
/* Queries the vport layer for the network namespace ID of 'netdev' and
 * caches the result in 'netdev->netnsid'.
 *
 * Returns 0 on success, otherwise a positive errno value.  ENOENT (no vport
 * API, e.g. the openvswitch kernel module is not loaded) is treated as
 * "local" but is still returned to the caller. */
static int
netdev_linux_netnsid_update__(struct netdev_linux *netdev)
{
    struct dpif_netlink_vport reply;
    struct ofpbuf *buf;
    int error;

    error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
    if (error) {
        if (error == ENOENT) {
            /* Assume it is local if there is no API (e.g. if the openvswitch
             * kernel module is not loaded). */
            netnsid_set_local(&netdev->netnsid);
        } else {
            netnsid_unset(&netdev->netnsid);
        }
        return error;
    }

    netnsid_set(&netdev->netnsid, reply.netnsid);
    ofpbuf_delete(buf);
    return 0;
}
619
620 static int
621 netdev_linux_netnsid_update(struct netdev_linux *netdev)
622 {
623 if (netnsid_is_unset(netdev->netnsid)) {
624 if (netdev_get_class(&netdev->up) == &netdev_tap_class) {
625 netnsid_set_local(&netdev->netnsid);
626 } else {
627 return netdev_linux_netnsid_update__(netdev);
628 }
629 }
630
631 return 0;
632 }
633
/* Returns true if 'netdev' lives in the network namespace identified by
 * 'nsid', refreshing the cached namespace ID first if necessary. */
static bool
netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_eq(netdev->netnsid, nsid);
}
640
/* Returns true if 'netdev' lives in a network namespace other than the one
 * OvS runs in, refreshing the cached namespace ID first if necessary. */
static bool
netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_is_remote(netdev->netnsid);
}
647
648 static int netdev_linux_update_via_netlink(struct netdev_linux *);
649 static void netdev_linux_update(struct netdev_linux *netdev, int,
650 const struct rtnetlink_change *)
651 OVS_REQUIRES(netdev->mutex);
652 static void netdev_linux_changed(struct netdev_linux *netdev,
653 unsigned int ifi_flags, unsigned int mask)
654 OVS_REQUIRES(netdev->mutex);
655
656 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
657 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
658 * if no such socket could be created. */
659 static struct nl_sock *
660 netdev_linux_notify_sock(void)
661 {
662 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
663 static struct nl_sock *sock;
664 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
665 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
666
667 if (ovsthread_once_start(&once)) {
668 int error;
669
670 error = nl_sock_create(NETLINK_ROUTE, &sock);
671 if (!error) {
672 size_t i;
673
674 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
675 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
676 if (error) {
677 nl_sock_destroy(sock);
678 sock = NULL;
679 break;
680 }
681 }
682 }
683 nl_sock_listen_all_nsid(sock, true);
684 ovsthread_once_done(&once);
685 }
686
687 return sock;
688 }
689
/* Returns true if at least one device currently has miimon polling enabled.
 * Uses a relaxed atomic read; exactness is not required (see 'miimon_cnt'). */
static bool
netdev_linux_miimon_enabled(void)
{
    return atomic_count_get(&miimon_cnt) > 0;
}
695
/* Returns true if rtnetlink device kind 'kind' names a Linux link
 * aggregation master ("bond" or "team"), false otherwise. */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    return !strcmp(kind, "bond") || !strcmp(kind, "team");
}
705
706 static void
707 netdev_linux_update_lag(struct rtnetlink_change *change)
708 OVS_REQUIRES(lag_mutex)
709 {
710 struct linux_lag_slave *lag;
711
712 if (!rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
713 return;
714 }
715
716 if (change->slave && netdev_linux_kind_is_lag(change->slave)) {
717 lag = shash_find_data(&lag_shash, change->ifname);
718
719 if (!lag) {
720 struct netdev *master_netdev;
721 char master_name[IFNAMSIZ];
722 uint32_t block_id;
723 int error = 0;
724
725 if_indextoname(change->master_ifindex, master_name);
726 master_netdev = netdev_from_name(master_name);
727 if (!master_netdev) {
728 return;
729 }
730
731 if (is_netdev_linux_class(master_netdev->netdev_class)) {
732 block_id = netdev_get_block_id(master_netdev);
733 if (!block_id) {
734 netdev_close(master_netdev);
735 return;
736 }
737
738 lag = xmalloc(sizeof *lag);
739 lag->block_id = block_id;
740 lag->node = shash_add(&lag_shash, change->ifname, lag);
741
742 /* LAG master is linux netdev so add slave to same block. */
743 error = tc_add_del_ingress_qdisc(change->if_index, true,
744 block_id);
745 if (error) {
746 VLOG_WARN("failed to bind LAG slave to master's block");
747 shash_delete(&lag_shash, lag->node);
748 free(lag);
749 }
750 }
751
752 netdev_close(master_netdev);
753 }
754 } else if (change->master_ifindex == 0) {
755 /* Check if this was a lag slave that has been freed. */
756 lag = shash_find_data(&lag_shash, change->ifname);
757
758 if (lag) {
759 tc_add_del_ingress_qdisc(change->if_index, false,
760 lag->block_id);
761 shash_delete(&lag_shash, lag->node);
762 free(lag);
763 }
764 }
765 }
766
/* The 'run' callback shared by all Linux-backed netdev classes.
 *
 * Drains the shared rtnetlink notification socket, applying each change to
 * the corresponding open netdev, or to the LAG bookkeeping when the changed
 * device is not open in OvS.  If the kernel reports that notifications were
 * dropped (ENOBUFS), falls back to re-reading the flags of every open Linux
 * netdev, since we cannot know which devices changed. */
static void
netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
{
    struct nl_sock *sock;
    int error;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_run();
    }

    sock = netdev_linux_notify_sock();
    if (!sock) {
        return;
    }

    do {
        uint64_t buf_stub[4096 / 8];
        int nsid;
        struct ofpbuf buf;

        ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
        error = nl_sock_recv(sock, &buf, &nsid, false);
        if (!error) {
            struct rtnetlink_change change;

            if (rtnetlink_parse(&buf, &change)) {
                struct netdev *netdev_ = NULL;
                char dev_name[IFNAMSIZ];

                /* Some notifications lack an interface name; resolve it
                 * from the ifindex so the lookups below can proceed. */
                if (!change.ifname) {
                    change.ifname = if_indextoname(change.if_index, dev_name);
                }

                if (change.ifname) {
                    netdev_ = netdev_from_name(change.ifname);
                }
                if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
                    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

                    ovs_mutex_lock(&netdev->mutex);
                    netdev_linux_update(netdev, nsid, &change);
                    ovs_mutex_unlock(&netdev->mutex);
                }
                else if (!netdev_ && change.ifname) {
                    /* Netdev is not present in OvS but its master could be. */
                    ovs_mutex_lock(&lag_mutex);
                    netdev_linux_update_lag(&change);
                    ovs_mutex_unlock(&lag_mutex);
                }
                netdev_close(netdev_);
            }
        } else if (error == ENOBUFS) {
            /* The kernel dropped notifications; refresh every device. */
            struct shash device_shash;
            struct shash_node *node;

            nl_sock_drain(sock);

            shash_init(&device_shash);
            netdev_get_devices(&netdev_linux_class, &device_shash);
            SHASH_FOR_EACH (node, &device_shash) {
                struct netdev *netdev_ = node->data;
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
                unsigned int flags;

                ovs_mutex_lock(&netdev->mutex);
                get_flags(netdev_, &flags);
                netdev_linux_changed(netdev, flags, 0);
                ovs_mutex_unlock(&netdev->mutex);

                netdev_close(netdev_);
            }
            shash_destroy(&device_shash);
        } else if (error != EAGAIN) {
            static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
            VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
                         ovs_strerror(error));
        }
        ofpbuf_uninit(&buf);
    } while (!error);
}
847
848 static void
849 netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
850 {
851 struct nl_sock *sock;
852
853 if (netdev_linux_miimon_enabled()) {
854 netdev_linux_miimon_wait();
855 }
856 sock = netdev_linux_notify_sock();
857 if (sock) {
858 nl_sock_wait(sock, POLLIN);
859 }
860 }
861
/* Records that 'dev' changed: bumps its change sequence number, counts a
 * carrier reset if IFF_RUNNING toggled, stores the new 'ifi_flags', and
 * clears every cached-field bit not preserved by 'mask'.  When VALID_IN is
 * invalidated, the global cached address list is flushed too. */
static void
netdev_linux_changed(struct netdev_linux *dev,
                     unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(dev->mutex)
{
    netdev_change_seq_changed(&dev->up);

    if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
        dev->carrier_resets++;
    }
    dev->ifi_flags = ifi_flags;

    /* 'mask' names the VALID_* bits to keep; everything else must be
     * re-fetched on demand. */
    dev->cache_valid &= mask;
    if (!(mask & VALID_IN)) {
        netdev_get_addrs_list_flush();
    }
}
879
/* Applies rtnetlink 'change' to 'dev's cached state.
 *
 * For RTM_NEWLINK, refreshes the cached flags, MTU, MAC address, ifindex
 * and LAG-master status directly from the message.  Other link messages
 * mark the device as no longer present.  Address-group messages only
 * invalidate the cached IP addresses. */
static void
netdev_linux_update__(struct netdev_linux *dev,
                      const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, and ip addresses. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;

                /* The mac addr has been changed, report it now. */
                rtnetlink_report_link();
            }

            if (change->master && netdev_linux_kind_is_lag(change->master)) {
                dev->is_lag_master = true;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
            dev->present = true;
        } else {
            /* FIXME */
            netdev_linux_changed(dev, change->ifi_flags, 0);
            dev->present = false;
            netnsid_unset(&dev->netnsid);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        OVS_NOT_REACHED();
    }
}
928
/* Applies 'change' to 'dev' only if the notification originated in the same
 * network namespace as the device ('nsid'); otherwise ignores it. */
static void
netdev_linux_update(struct netdev_linux *dev, int nsid,
                    const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (netdev_linux_netnsid_is_eq(dev, nsid)) {
        netdev_linux_update__(dev, change);
    }
}
938
939 static struct netdev *
940 netdev_linux_alloc(void)
941 {
942 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
943 return &netdev->up;
944 }
945
/* Construction steps shared by the system, internal, and tap netdev classes:
 * rejects reserved device names, marks the network namespace as unknown, and
 * initializes the per-device mutex.  Returns 0 on success or a positive
 * errno value (EINVAL for a reserved name). */
static int
netdev_linux_common_construct(struct netdev *netdev_)
{
    /* Prevent any attempt to create (or open) a network device named "default"
     * or "all".  These device names are effectively reserved on Linux because
     * /proc/sys/net/ipv4/conf/ always contains directories by these names.  By
     * itself this wouldn't call for any special treatment, but in practice if
     * a program tries to create devices with these names, it causes the kernel
     * to fire a "new device" notification event even though creation failed,
     * and in turn that causes OVS to wake up and try to create them again,
     * which ends up as a 100% CPU loop. */
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *name = netdev_->name;
    if (!strcmp(name, "default") || !strcmp(name, "all")) {
        static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
        VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
                     name);
        return EINVAL;
    }

    /* The device could be in the same network namespace or in another one. */
    netnsid_unset(&netdev->netnsid);
    ovs_mutex_init(&netdev->mutex);
    return 0;
}
971
/* Creates system and internal devices.  Returns 0 on success or a positive
 * errno value. */
static int
netdev_linux_construct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = netdev_linux_common_construct(netdev_);
    if (error) {
        return error;
    }

    /* Cache the interface flags up front; ENODEV means the device does not
     * (yet) exist in the kernel. */
    error = get_flags(&netdev->up, &netdev->ifi_flags);
    if (error == ENODEV) {
        if (netdev->up.netdev_class != &netdev_internal_class) {
            /* The device does not exist, so don't allow it to be opened. */
            return ENODEV;
        } else {
            /* "Internal" netdevs have to be created as netdev objects before
             * they exist in the kernel, because creating them in the kernel
             * happens by passing a netdev object to dpif_port_add().
             * Therefore, ignore the error. */
        }
    }

    return 0;
}
997
998 /* For most types of netdevs we open the device for each call of
999 * netdev_open(). However, this is not the case with tap devices,
1000 * since it is only possible to open the device once. In this
1001 * situation we share a single file descriptor, and consequently
1002 * buffers, across all readers. Therefore once data is read it will
1003 * be unavailable to other reads for tap devices. */
1004 static int
1005 netdev_linux_construct_tap(struct netdev *netdev_)
1006 {
1007 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1008 static const char tap_dev[] = "/dev/net/tun";
1009 const char *name = netdev_->name;
1010 struct ifreq ifr;
1011
1012 int error = netdev_linux_common_construct(netdev_);
1013 if (error) {
1014 return error;
1015 }
1016
1017 /* Open tap device. */
1018 netdev->tap_fd = open(tap_dev, O_RDWR);
1019 if (netdev->tap_fd < 0) {
1020 error = errno;
1021 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
1022 return error;
1023 }
1024
1025 /* Create tap device. */
1026 get_flags(&netdev->up, &netdev->ifi_flags);
1027 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
1028 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
1029 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
1030 VLOG_WARN("%s: creating tap device failed: %s", name,
1031 ovs_strerror(errno));
1032 error = errno;
1033 goto error_close;
1034 }
1035
1036 /* Make non-blocking. */
1037 error = set_nonblocking(netdev->tap_fd);
1038 if (error) {
1039 goto error_close;
1040 }
1041
1042 if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
1043 VLOG_WARN("%s: creating tap device failed (persist): %s", name,
1044 ovs_strerror(errno));
1045 error = errno;
1046 goto error_close;
1047 }
1048
1049 netdev->present = true;
1050 return 0;
1051
1052 error_close:
1053 close(netdev->tap_fd);
1054 return error;
1055 }
1056
/* Tears down state built during construction: destroys any attached traffic
 * control state, makes a tap device non-persistent and closes its fd, drops
 * this device's contribution to the global miimon device count, and destroys
 * the per-device mutex. */
static void
netdev_linux_destruct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (netdev->tc && netdev->tc->ops->tc_destroy) {
        netdev->tc->ops->tc_destroy(netdev->tc);
    }

    if (netdev_get_class(netdev_) == &netdev_tap_class
        && netdev->tap_fd >= 0)
    {
        /* Clear TUNSETPERSIST (set at construction) so that the kernel
         * deletes the tap device when the fd closes. */
        ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
        close(netdev->tap_fd);
    }

    if (netdev->miimon_interval > 0) {
        atomic_count_dec(&miimon_cnt);
    }

    ovs_mutex_destroy(&netdev->mutex);
}
1079
/* Releases the storage allocated by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
1086
1087 static struct netdev_rxq *
1088 netdev_linux_rxq_alloc(void)
1089 {
1090 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
1091 return &rx->up;
1092 }
1093
/* Sets up the receive path for one rx queue.  Tap devices reuse the device's
 * shared tap fd; all other devices get a dedicated non-blocking AF_PACKET
 * raw socket, bound to the interface, with a BPF filter that accepts only
 * inbound packets.  Returns 0 on success or a positive errno value. */
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        /* Shared fd, owned by the device, not by this rxq. */
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
            { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        /* Request PACKET_AUXDATA so that kernel-stripped VLAN tags can be
         * reconstructed on receive (see netdev_linux_rxq_recv_sock()). */
        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            /* setsockopt() returned -1; fetch the real errno value. */
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1181
1182 static void
1183 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
1184 {
1185 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1186
1187 if (!rx->is_tap) {
1188 close(rx->fd);
1189 }
1190 }
1191
/* Releases the storage allocated by netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
1199
1200 static ovs_be16
1201 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
1202 {
1203 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1204 return htons(aux->tp_vlan_tpid);
1205 } else if (double_tagged) {
1206 return htons(ETH_TYPE_VLAN_8021AD);
1207 } else {
1208 return htons(ETH_TYPE_VLAN_8021Q);
1209 }
1210 }
1211
1212 static bool
1213 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
1214 {
1215 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
1216 }
1217
/* Receives one packet from AF_PACKET socket 'fd' into 'buffer'.  If the
 * kernel stripped a VLAN tag on receive, the tag is reconstructed from
 * PACKET_AUXDATA and pushed back onto the frame.  Returns 0 on success or a
 * positive errno value; EMSGSIZE if the packet was larger than the buffer. */
static int
netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
{
    size_t size;
    ssize_t retval;
    struct iovec iov;
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffer;
    struct msghdr msgh;

    /* Reserve headroom for a single VLAN tag */
    dp_packet_reserve(buffer, VLAN_HEADER_LEN);
    size = dp_packet_tailroom(buffer);

    iov.iov_base = dp_packet_data(buffer);
    iov.iov_len = size;
    msgh.msg_name = NULL;
    msgh.msg_namelen = 0;
    msgh.msg_iov = &iov;
    msgh.msg_iovlen = 1;
    msgh.msg_control = &cmsg_buffer;
    msgh.msg_controllen = sizeof cmsg_buffer;
    msgh.msg_flags = 0;

    do {
        retval = recvmsg(fd, &msgh, MSG_TRUNC);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    } else if (retval > size) {
        /* With MSG_TRUNC, recvmsg() reports the full packet length even if
         * it did not fit in the buffer, so this detects truncation. */
        return EMSGSIZE;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);

    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
        const struct tpacket_auxdata *aux;

        if (cmsg->cmsg_level != SOL_PACKET
            || cmsg->cmsg_type != PACKET_AUXDATA
            || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
            continue;
        }

        aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
        if (auxdata_has_vlan_tci(aux)) {
            struct eth_header *eth;
            bool double_tagged;

            if (retval < ETH_HEADER_LEN) {
                return EINVAL;
            }

            /* If the frame already carries an 802.1Q tag, the tag the kernel
             * stripped must have been the outer (802.1ad) one. */
            eth = dp_packet_data(buffer);
            double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);

            eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
                          htons(aux->tp_vlan_tci));
            break;
        }
    }

    return 0;
}
1286
/* Reads one packet from tap fd 'fd' into the tailroom of 'buffer',
 * retrying on EINTR.  Returns 0 on success or a positive errno value. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t tailroom = dp_packet_tailroom(buffer);
    ssize_t n;

    do {
        n = read(fd, dp_packet_data(buffer), tailroom);
    } while (n < 0 && errno == EINTR);

    if (n < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n);
    return 0;
}
1304
1305 static int
1306 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
1307 int *qfill)
1308 {
1309 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1310 struct netdev *netdev = rx->up.netdev;
1311 struct dp_packet *buffer;
1312 ssize_t retval;
1313 int mtu;
1314
1315 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1316 mtu = ETH_PAYLOAD_MAX;
1317 }
1318
1319 /* Assume Ethernet port. No need to set packet_type. */
1320 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1321 DP_NETDEV_HEADROOM);
1322 retval = (rx->is_tap
1323 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1324 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1325
1326 if (retval) {
1327 if (retval != EAGAIN && retval != EMSGSIZE) {
1328 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1329 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1330 }
1331 dp_packet_delete(buffer);
1332 } else {
1333 dp_packet_batch_init_packet(batch, buffer);
1334 }
1335
1336 if (qfill) {
1337 *qfill = -ENOTSUP;
1338 }
1339
1340 return retval;
1341 }
1342
1343 static void
1344 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1345 {
1346 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1347 poll_fd_wait(rx->fd, POLLIN);
1348 }
1349
/* Discards any packets waiting to be received on 'rxq_'.  For a tap device
 * this reads and drops up to the interface's queue length of packets from
 * the shared fd; otherwise it drains the socket's receive buffer.  Returns 0
 * on success or a positive errno value. */
static int
netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    if (rx->is_tap) {
        struct ifreq ifr;
        /* Query the device's tx queue length to bound how many reads the
         * drain performs. */
        int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
                                        SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
        if (error) {
            return error;
        }
        drain_fd(rx->fd, ifr.ifr_qlen);
        return 0;
    } else {
        return drain_rcvbuf(rx->fd);
    }
}
1367
/* Transmits every packet in 'batch' on AF_PACKET socket 'sock' out the
 * interface with index 'ifindex', using sendmmsg() to hand the whole batch
 * to the kernel in as few system calls as possible.  Returns 0 if all
 * packets were accepted, otherwise a positive errno value. */
static int
netdev_linux_sock_batch_send(int sock, int ifindex,
                             struct dp_packet_batch *batch)
{
    const size_t size = dp_packet_batch_size(batch);
    /* We don't bother setting most fields in sockaddr_ll because the
     * kernel ignores them for SOCK_RAW. */
    struct sockaddr_ll sll = { .sll_family = AF_PACKET,
                               .sll_ifindex = ifindex };

    struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
    struct iovec *iov = xmalloc(sizeof(*iov) * size);

    /* One message per packet, each with a single iovec pointing at the
     * packet's data; all messages share the same destination 'sll'. */
    struct dp_packet *packet;
    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        iov[i].iov_base = dp_packet_data(packet);
        iov[i].iov_len = dp_packet_size(packet);
        mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
                                            .msg_namelen = sizeof sll,
                                            .msg_iov = &iov[i],
                                            .msg_iovlen = 1 };
    }

    int error = 0;
    for (uint32_t ofs = 0; ofs < size; ) {
        ssize_t retval;
        do {
            /* sendmmsg() may transmit only a prefix of the messages; loop
             * from 'ofs' until all are sent or an error occurs. */
            retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);
        if (error) {
            break;
        }
        ofs += retval;
    }

    free(mmsg);
    free(iov);
    return error;
}
1408
/* Use the tap fd to send 'batch' to tap device 'netdev'.  Using the tap fd is
 * essential, because packets sent to a tap device with an AF_PACKET socket
 * will loop back to be *received* again on the tap device.  This doesn't occur
 * on other interface types because we attach a socket filter to the rx
 * socket.
 *
 * Returns 0 on success or a positive errno value; EMSGSIZE on a partial
 * write. */
static int
netdev_linux_tap_batch_send(struct netdev *netdev_,
                            struct dp_packet_batch *batch)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct dp_packet *packet;

    /* The Linux tap driver returns EIO if the device is not up,
     * so if the device is not up, don't waste time sending it.
     * However, if the device is in another network namespace
     * then OVS can't retrieve the state. In that case, send the
     * packets anyway. */
    if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
        netdev->tx_dropped += dp_packet_batch_size(batch);
        return 0;
    }

    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        size_t size = dp_packet_size(packet);
        ssize_t retval;
        int error;

        do {
            retval = write(netdev->tap_fd, dp_packet_data(packet), size);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);

        if (error) {
            /* The Linux tap driver returns EIO if the device is not up. From
             * the OVS side this is not an error, so we ignore it; otherwise,
             * return the error. */
            if (error != EIO) {
                return error;
            }
        } else if (retval != size) {
            VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
                         "bytes of %"PRIuSIZE") on %s",
                         retval, size, netdev_get_name(netdev_));
            return EMSGSIZE;
        }
    }
    return 0;
}
1457
/* Sends 'batch' on 'netdev'.  Returns 0 if successful, otherwise a positive
 * errno value.  Returns EAGAIN without blocking if the packet cannot be queued
 * immediately.  Returns EMSGSIZE if a partial packet was transmitted or if
 * the packet is too big or too small to transmit on the device.
 *
 * The kernel maintains a packet transmission queue, so the caller is not
 * expected to do additional queuing of packets.
 *
 * Ownership of 'batch' is taken: it is freed on every return path. */
static int
netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
                  struct dp_packet_batch *batch,
                  bool concurrent_txq OVS_UNUSED)
{
    int error = 0;
    int sock = 0;

    if (!is_tap_netdev(netdev_)) {
        /* A local AF_PACKET socket cannot reach a device in another network
         * namespace. */
        if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_))) {
            error = EOPNOTSUPP;
            goto free_batch;
        }

        sock = af_packet_sock();
        if (sock < 0) {
            error = -sock;
            goto free_batch;
        }

        int ifindex = netdev_get_ifindex(netdev_);
        if (ifindex < 0) {
            error = -ifindex;
            goto free_batch;
        }

        error = netdev_linux_sock_batch_send(sock, ifindex, batch);
    } else {
        error = netdev_linux_tap_batch_send(netdev_, batch);
    }
    if (error) {
        if (error == ENOBUFS) {
            /* The Linux AF_PACKET implementation never blocks waiting
             * for room for packets, instead returning ENOBUFS.
             * Translate this into EAGAIN for the caller. */
            error = EAGAIN;
        } else {
            VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
    }

free_batch:
    dp_packet_delete_batch(batch, true);
    return error;
}
1511
1512 /* Registers with the poll loop to wake up from the next call to poll_block()
1513 * when the packet transmission queue has sufficient room to transmit a packet
1514 * with netdev_send().
1515 *
1516 * The kernel maintains a packet transmission queue, so the client is not
1517 * expected to do additional queuing of packets. Thus, this function is
1518 * unlikely to ever be used. It is included for completeness. */
1519 static void
1520 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1521 {
1522 if (is_tap_netdev(netdev)) {
1523 /* TAP device always accepts packets.*/
1524 poll_immediate_wake();
1525 }
1526 }
1527
/* Attempts to set 'netdev''s MAC address to 'mac'.  Returns 0 if successful,
 * otherwise a positive errno value.  Not supported for devices in another
 * network namespace (EOPNOTSUPP). */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Short-circuit if the cache already holds this address (or a sticky
     * error); otherwise invalidate so the new value gets re-cached below. */
    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    /* Cache the result; ENODEV is cached too so a missing device is not
     * retried on every call. */
    if (!error || error == ENODEV) {
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            netdev->etheraddr = mac;
        }
    }

    /* Restore a tap device that was up before we brought it down. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1572
/* Copies 'netdev''s MAC address to 'mac' which is passed as param.  The
 * address is cached; a netlink query populates the cache first, with an
 * ioctl fallback.  Returns 0 on success or a positive errno value. */
static int
netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
        netdev_linux_update_via_netlink(netdev);
    }

    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
        /* Fall back to ioctl if netlink fails */
        netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
                                                 &netdev->etheraddr);
        netdev->cache_valid |= VALID_ETHERADDR;
    }

    error = netdev->ether_addr_error;
    if (!error) {
        *mac = netdev->etheraddr;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1600
/* Stores 'netdev''s MTU in '*mtup'.  Tries a netlink query first, falling
 * back to the SIOCGIFMTU ioctl; either way the result (value or error) is
 * cached under VALID_MTU.  Returns 0 on success or a positive errno value.
 * Caller must hold netdev->mutex. */
static int
netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
{
    int error;

    if (!(netdev->cache_valid & VALID_MTU)) {
        netdev_linux_update_via_netlink(netdev);
    }

    if (!(netdev->cache_valid & VALID_MTU)) {
        /* Fall back to ioctl if netlink fails */
        struct ifreq ifr;

        netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
            netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }

    error = netdev->netdev_mtu_error;
    if (!error) {
        *mtup = netdev->mtu;
    }

    return error;
}
1627
1628 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1629 * in bytes, not including the hardware header; thus, this is typically 1500
1630 * bytes for Ethernet devices. */
1631 static int
1632 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1633 {
1634 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1635 int error;
1636
1637 ovs_mutex_lock(&netdev->mutex);
1638 error = netdev_linux_get_mtu__(netdev, mtup);
1639 ovs_mutex_unlock(&netdev->mutex);
1640
1641 return error;
1642 }
1643
/* Sets the maximum size of transmitted (MTU) for given device using linux
 * networking ioctl interface.  Returns 0 on success or a positive errno
 * value; EOPNOTSUPP for devices in another network namespace.
 */
static int
netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ifreq ifr;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Skip the ioctl if the cached MTU already matches (or a sticky error is
     * cached); otherwise invalidate the cache before changing it. */
    if (netdev->cache_valid & VALID_MTU) {
        error = netdev->netdev_mtu_error;
        if (error || netdev->mtu == mtu) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_MTU;
    }
    ifr.ifr_mtu = mtu;
    error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
                                SIOCSIFMTU, "SIOCSIFMTU");
    /* Cache the outcome; ENODEV is cached too so that a missing device is
     * not retried on every call. */
    if (!error || error == ENODEV) {
        netdev->netdev_mtu_error = error;
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }
exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1679
/* Returns the ifindex of 'netdev', if successful, as a positive number.
 * On failure, returns a negative errno value; in particular -EOPNOTSUPP for
 * a device in another network namespace. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int ifindex, error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }
    error = get_ifindex(netdev_, &ifindex);

exit:
    ovs_mutex_unlock(&netdev->mutex);
    /* 'ifindex' is only read when 'error' is zero, i.e. when it was set. */
    return error ? -error : ifindex;
}
1699
1700 static int
1701 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1702 {
1703 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1704
1705 ovs_mutex_lock(&netdev->mutex);
1706 if (netdev->miimon_interval > 0) {
1707 *carrier = netdev->miimon;
1708 } else {
1709 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1710 }
1711 ovs_mutex_unlock(&netdev->mutex);
1712
1713 return 0;
1714 }
1715
1716 static long long int
1717 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1718 {
1719 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1720 long long int carrier_resets;
1721
1722 ovs_mutex_lock(&netdev->mutex);
1723 carrier_resets = netdev->carrier_resets;
1724 ovs_mutex_unlock(&netdev->mutex);
1725
1726 return carrier_resets;
1727 }
1728
/* Issues MII ioctl 'cmd' ('cmd_name' is for logging) on device 'name',
 * copying 'data' in and out through the ifreq.  Returns 0 on success or a
 * positive errno value.
 *
 * NOTE(review): 'data' is memcpy'd over ifr.ifr_data (a pointer member)
 * rather than pointed to by it — this assumes the kernel's SIOCGMII*
 * convention of embedding struct mii_ioctl_data in the ifreq union; confirm
 * against the kernel's if_mii() helper. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
1743
/* Queries link status for device 'name' and stores it in '*miimon': first
 * tries reading the MII basic-mode status register, then falls back to the
 * ETHTOOL_GLINK ethtool command if MII is unavailable.  Returns 0 on success
 * or a positive errno value (in which case '*miimon' stays false). */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        }
    }
    if (error) {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* ETHTOOL_GLINK replies with struct ethtool_value layered over
             * the same buffer. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
1786
/* Sets the miimon polling interval for 'netdev_' to 'interval' milliseconds
 * (clamped to a minimum of 100 ms; an interval <= 0 disables polling).
 * Keeps the global count of miimon-enabled devices in sync and forces an
 * immediate poll by expiring the timer.  Always returns 0. */
static int
netdev_linux_set_miimon_interval(struct netdev *netdev_,
                                 long long int interval)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    ovs_mutex_lock(&netdev->mutex);
    interval = interval > 0 ? MAX(interval, 100) : 0;
    if (netdev->miimon_interval != interval) {
        /* Track only transitions between "enabled" and "disabled", so the
         * global count reflects the number of devices with miimon on. */
        if (interval && !netdev->miimon_interval) {
            atomic_count_inc(&miimon_cnt);
        } else if (!interval && netdev->miimon_interval) {
            atomic_count_dec(&miimon_cnt);
        }

        netdev->miimon_interval = interval;
        /* Expire the timer so the new setting takes effect right away. */
        timer_set_expired(&netdev->miimon_timer);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;
}
1809
/* Periodic miimon work: for every netdev-linux device whose miimon timer has
 * expired, re-reads the link state, records a change if the state differs,
 * and re-arms the timer. */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                dev->miimon = miimon;
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* netdev_get_devices() returned a reference; release it. */
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
1839
/* Registers with the poll loop to wake up when the next miimon timer on any
 * netdev-linux device expires. */
static void
netdev_linux_miimon_wait(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0) {
            timer_wait(&dev->miimon_timer);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* netdev_get_devices() returned a reference; release it. */
        netdev_close(netdev);
    }
    shash_destroy(&device_shash);
}
1861
/* Exchanges the values pointed to by 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved_a = *a;

    *a = *b;
    *b = saved_a;
}
1869
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is allowed to be misaligned. */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct ovs_vport_stats *src)
{
    dst->rx_packets = get_32aligned_u64(&src->rx_packets);
    dst->tx_packets = get_32aligned_u64(&src->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&src->rx_errors);
    dst->tx_errors = get_32aligned_u64(&src->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
    /* Counters that the vport layer does not track are reported as zero. */
    dst->multicast = 0;
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
}
1899
1900 static int
1901 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1902 {
1903 struct dpif_netlink_vport reply;
1904 struct ofpbuf *buf;
1905 int error;
1906
1907 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1908 if (error) {
1909 return error;
1910 } else if (!reply.stats) {
1911 ofpbuf_delete(buf);
1912 return EOPNOTSUPP;
1913 }
1914
1915 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1916
1917 ofpbuf_delete(buf);
1918
1919 return 0;
1920 }
1921
/* Best-effort fetch of vport-layer statistics into 'stats', caching the
 * resulting error code in netdev->vport_stats_error.  The fetch is retried
 * while the last attempt succeeded or no error has been cached yet; callers
 * check vport_stats_error to learn whether 'stats' was filled in. */
static void
get_stats_via_vport(const struct netdev *netdev_,
                    struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (!netdev->vport_stats_error ||
        !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
        int error;

        error = get_stats_via_vport__(netdev_, stats);
        if (error && error != ENOENT && error != ENODEV) {
            /* ENOENT/ENODEV just mean the port is not in the datapath;
             * anything else is worth logging. */
            VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
                         "(%s)",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
        netdev->vport_stats_error = error;
        netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
    }
}
1942
/* Retrieves current device stats for 'netdev-linux'.  Combines vport-layer
 * stats (when available) with kernel netdev stats; returns 0 on success or a
 * positive errno value when neither source could be read. */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; if the vport stats in 'stats' are good, report
         * success with those alone. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Use kernel netdev's packet and byte counts since vport's counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO are
         * enabled. */
        stats->rx_packets = dev_stats.rx_packets;
        stats->rx_bytes = dev_stats.rx_bytes;
        stats->tx_packets = dev_stats.tx_packets;
        stats->tx_bytes = dev_stats.tx_bytes;

        stats->rx_errors += dev_stats.rx_errors;
        stats->tx_errors += dev_stats.tx_errors;
        stats->rx_dropped += dev_stats.rx_dropped;
        stats->tx_dropped += dev_stats.tx_dropped;
        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
        stats->rx_length_errors += dev_stats.rx_length_errors;
        stats->rx_over_errors += dev_stats.rx_over_errors;
        stats->rx_crc_errors += dev_stats.rx_crc_errors;
        stats->rx_frame_errors += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1993
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal.  Returns 0 on success or a positive errno value when
 * neither the vport layer nor the kernel could supply stats. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; report success if the vport stats in 'stats' are
         * usable on their own. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer.  For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        /* Detailed error breakdowns are meaningless from the switch's
         * perspective for a tap/internal port; zero them. */
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled.  Note the rx/tx swap, as above. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped += dev_stats.tx_dropped;
        stats->tx_dropped += dev_stats.rx_dropped;

        stats->rx_errors += dev_stats.tx_errors;
        stats->tx_errors += dev_stats.rx_errors;

        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
    }
    /* Add packets we dropped ourselves (e.g. while the device was down). */
    stats->tx_dropped += netdev->tx_dropped;
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2056
2057 static int
2058 netdev_internal_get_stats(const struct netdev *netdev_,
2059 struct netdev_stats *stats)
2060 {
2061 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2062 int error;
2063
2064 ovs_mutex_lock(&netdev->mutex);
2065 get_stats_via_vport(netdev_, stats);
2066 error = netdev->vport_stats_error;
2067 ovs_mutex_unlock(&netdev->mutex);
2068
2069 return error;
2070 }
2071
/* Queries the device's link features via ETHTOOL_GSET and caches the
 * supported, advertised, and current feature sets (as NETDEV_F_* bitmaps)
 * in 'netdev'.  Results are cached under VALID_FEATURES; any error is
 * stored in netdev->get_features_error for later retrieval. */
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
    struct ethtool_cmd ecmd;
    uint32_t speed;
    int error;

    /* Use the cached result if we already read the features. */
    if (netdev->cache_valid & VALID_FEATURES) {
        return;
    }

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto out;
    }

    /* Supported features. */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    }
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Half) {
        netdev->supported |= NETDEV_F_100MB_HD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
        netdev->supported |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
        netdev->supported |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
        netdev->supported |= NETDEV_F_40GB_FD;
    }
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    }
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    }
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    }
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    }
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    }

    /* Advertised features. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
        netdev->advertised |= NETDEV_F_40GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    }
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    }
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    }
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    }
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    }

    /* Current settings. */
    speed = ethtool_cmd_speed(&ecmd);
    if (speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    } else if (speed == 40000) {
        /* Raw values instead of SPEED_40000/SPEED_100000 macros —
         * presumably to keep building against older kernel headers that
         * lack them; TODO confirm before converting to macros. */
        netdev->current = NETDEV_F_40GB_FD;
    } else if (speed == 100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (speed == 1000000) {
        netdev->current = NETDEV_F_1TB_FD;
    } else {
        netdev->current = 0;
    }

    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
    }

    if (ecmd.autoneg) {
        netdev->current |= NETDEV_F_AUTONEG;
    }

out:
    /* Cache even a failed read; the error is replayed to callers via
     * get_features_error until the cache is invalidated. */
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
}
2223
2224 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
2225 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2226 * Returns 0 if successful, otherwise a positive errno value. */
2227 static int
2228 netdev_linux_get_features(const struct netdev *netdev_,
2229 enum netdev_features *current,
2230 enum netdev_features *advertised,
2231 enum netdev_features *supported,
2232 enum netdev_features *peer)
2233 {
2234 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2235 int error;
2236
2237 ovs_mutex_lock(&netdev->mutex);
2238 if (netdev_linux_netnsid_is_remote(netdev)) {
2239 error = EOPNOTSUPP;
2240 goto exit;
2241 }
2242
2243 netdev_linux_read_features(netdev);
2244 if (!netdev->get_features_error) {
2245 *current = netdev->current;
2246 *advertised = netdev->advertised;
2247 *supported = netdev->supported;
2248 *peer = 0; /* XXX */
2249 }
2250 error = netdev->get_features_error;
2251
2252 exit:
2253 ovs_mutex_unlock(&netdev->mutex);
2254 return error;
2255 }
2256
2257 /* Set the features advertised by 'netdev' to 'advertise'. */
2258 static int
2259 netdev_linux_set_advertisements(struct netdev *netdev_,
2260 enum netdev_features advertise)
2261 {
2262 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2263 struct ethtool_cmd ecmd;
2264 int error;
2265
2266 ovs_mutex_lock(&netdev->mutex);
2267
2268 COVERAGE_INC(netdev_get_ethtool);
2269
2270 if (netdev_linux_netnsid_is_remote(netdev)) {
2271 error = EOPNOTSUPP;
2272 goto exit;
2273 }
2274
2275 memset(&ecmd, 0, sizeof ecmd);
2276 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2277 ETHTOOL_GSET, "ETHTOOL_GSET");
2278 if (error) {
2279 goto exit;
2280 }
2281
2282 ecmd.advertising = 0;
2283 if (advertise & NETDEV_F_10MB_HD) {
2284 ecmd.advertising |= ADVERTISED_10baseT_Half;
2285 }
2286 if (advertise & NETDEV_F_10MB_FD) {
2287 ecmd.advertising |= ADVERTISED_10baseT_Full;
2288 }
2289 if (advertise & NETDEV_F_100MB_HD) {
2290 ecmd.advertising |= ADVERTISED_100baseT_Half;
2291 }
2292 if (advertise & NETDEV_F_100MB_FD) {
2293 ecmd.advertising |= ADVERTISED_100baseT_Full;
2294 }
2295 if (advertise & NETDEV_F_1GB_HD) {
2296 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2297 }
2298 if (advertise & NETDEV_F_1GB_FD) {
2299 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2300 }
2301 if (advertise & NETDEV_F_10GB_FD) {
2302 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2303 }
2304 if (advertise & NETDEV_F_COPPER) {
2305 ecmd.advertising |= ADVERTISED_TP;
2306 }
2307 if (advertise & NETDEV_F_FIBER) {
2308 ecmd.advertising |= ADVERTISED_FIBRE;
2309 }
2310 if (advertise & NETDEV_F_AUTONEG) {
2311 ecmd.advertising |= ADVERTISED_Autoneg;
2312 }
2313 if (advertise & NETDEV_F_PAUSE) {
2314 ecmd.advertising |= ADVERTISED_Pause;
2315 }
2316 if (advertise & NETDEV_F_PAUSE_ASYM) {
2317 ecmd.advertising |= ADVERTISED_Asym_Pause;
2318 }
2319 COVERAGE_INC(netdev_set_ethtool);
2320 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2321 ETHTOOL_SSET, "ETHTOOL_SSET");
2322
2323 exit:
2324 ovs_mutex_unlock(&netdev->mutex);
2325 return error;
2326 }
2327
/* Attempts to set input rate limiting (policing) policy.  Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * Installs an ingress qdisc plus a tc policer with the given rate and burst
 * on 'netdev_'; a 'kbits_rate' of 0 removes policing entirely. */
static int
netdev_linux_set_policing(struct netdev *netdev_,
                          uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int ifindex;
    int error;

    /* When hardware flow offload is enabled, tc-based policing would
     * conflict with it, so refuse. */
    if (netdev_is_flow_api_enabled()) {
        if (kbits_rate) {
            VLOG_WARN_RL(&rl, "%s: policing with offload isn't supported",
                         netdev_name);
        }
        return EOPNOTSUPP;
    }

    /* Normalize the burst: 0 when unlimited, 8000 kbits by default. */
    kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
                   : kbits_burst);       /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        /* Managing tc on remote network namespaces is not supported. */
        error = EOPNOTSUPP;
        goto out;
    }

    if (netdev->cache_valid & VALID_POLICING) {
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kbits_burst == kbits_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        netdev->cache_valid &= ~VALID_POLICING;
    }

    error = get_ifindex(netdev_, &ifindex);
    if (error) {
        goto out;
    }

    COVERAGE_INC(netdev_set_policing);
    /* Remove any existing ingress qdisc. */
    error = tc_add_del_ingress_qdisc(ifindex, false, 0);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate) {
        /* Re-add a fresh ingress qdisc and attach the policer to it. */
        error = tc_add_del_ingress_qdisc(ifindex, true, 0);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }
    }

    netdev->kbits_rate = kbits_rate;
    netdev->kbits_burst = kbits_burst;

out:
    /* Cache success and ENODEV (device gone), so repeated attempts on a
     * missing device don't hammer the kernel; other errors are retried. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2408
2409 static int
2410 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2411 struct sset *types)
2412 {
2413 const struct tc_ops *const *opsp;
2414 for (opsp = tcs; *opsp != NULL; opsp++) {
2415 const struct tc_ops *ops = *opsp;
2416 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2417 sset_add(types, ops->ovs_name);
2418 }
2419 }
2420 return 0;
2421 }
2422
2423 static const struct tc_ops *
2424 tc_lookup_ovs_name(const char *name)
2425 {
2426 const struct tc_ops *const *opsp;
2427
2428 for (opsp = tcs; *opsp != NULL; opsp++) {
2429 const struct tc_ops *ops = *opsp;
2430 if (!strcmp(name, ops->ovs_name)) {
2431 return ops;
2432 }
2433 }
2434 return NULL;
2435 }
2436
2437 static const struct tc_ops *
2438 tc_lookup_linux_name(const char *name)
2439 {
2440 const struct tc_ops *const *opsp;
2441
2442 for (opsp = tcs; *opsp != NULL; opsp++) {
2443 const struct tc_ops *ops = *opsp;
2444 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2445 return ops;
2446 }
2447 }
2448 return NULL;
2449 }
2450
2451 static struct tc_queue *
2452 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2453 size_t hash)
2454 {
2455 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2456 struct tc_queue *queue;
2457
2458 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2459 if (queue->queue_id == queue_id) {
2460 return queue;
2461 }
2462 }
2463 return NULL;
2464 }
2465
/* Looks up the tc_queue with 'queue_id' on 'netdev', hashing the id itself.
 * Returns the queue, or NULL if it does not exist. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2471
2472 static int
2473 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2474 const char *type,
2475 struct netdev_qos_capabilities *caps)
2476 {
2477 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2478 if (!ops) {
2479 return EOPNOTSUPP;
2480 }
2481 caps->n_queues = ops->n_queues;
2482 return 0;
2483 }
2484
2485 static int
2486 netdev_linux_get_qos(const struct netdev *netdev_,
2487 const char **typep, struct smap *details)
2488 {
2489 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2490 int error;
2491
2492 ovs_mutex_lock(&netdev->mutex);
2493 if (netdev_linux_netnsid_is_remote(netdev)) {
2494 error = EOPNOTSUPP;
2495 goto exit;
2496 }
2497
2498 error = tc_query_qdisc(netdev_);
2499 if (!error) {
2500 *typep = netdev->tc->ops->ovs_name;
2501 error = (netdev->tc->ops->qdisc_get
2502 ? netdev->tc->ops->qdisc_get(netdev_, details)
2503 : 0);
2504 }
2505
2506 exit:
2507 ovs_mutex_unlock(&netdev->mutex);
2508 return error;
2509 }
2510
/* Configures QoS of kind 'type' with 'details' on 'netdev_'.  If a qdisc of
 * the same kind is already installed it is reconfigured in place; otherwise
 * the old qdisc is deleted and a new one installed.  Returns 0 on success
 * or a positive errno value. */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    /* The "noop" implementation needs no kernel state, so short-circuit
     * before taking the mutex or touching tc. */
    if (new_ops == &tc_ops_noop) {
        return new_ops->tc_install(netdev_, details);
    }

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same qdisc kind: reconfigure in place if supported. */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            goto exit;
        }
        /* tc_del_qdisc() is expected to clear netdev->tc on success. */
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc. */
        error = new_ops->tc_install(netdev_, details);
        /* On success netdev->tc must be repopulated; on failure it must
         * remain NULL. */
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2558
2559 static int
2560 netdev_linux_get_queue(const struct netdev *netdev_,
2561 unsigned int queue_id, struct smap *details)
2562 {
2563 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2564 int error;
2565
2566 ovs_mutex_lock(&netdev->mutex);
2567 if (netdev_linux_netnsid_is_remote(netdev)) {
2568 error = EOPNOTSUPP;
2569 goto exit;
2570 }
2571
2572 error = tc_query_qdisc(netdev_);
2573 if (!error) {
2574 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2575 error = (queue
2576 ? netdev->tc->ops->class_get(netdev_, queue, details)
2577 : ENOENT);
2578 }
2579
2580 exit:
2581 ovs_mutex_unlock(&netdev->mutex);
2582 return error;
2583 }
2584
2585 static int
2586 netdev_linux_set_queue(struct netdev *netdev_,
2587 unsigned int queue_id, const struct smap *details)
2588 {
2589 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2590 int error;
2591
2592 ovs_mutex_lock(&netdev->mutex);
2593 if (netdev_linux_netnsid_is_remote(netdev)) {
2594 error = EOPNOTSUPP;
2595 goto exit;
2596 }
2597
2598 error = tc_query_qdisc(netdev_);
2599 if (!error) {
2600 error = (queue_id < netdev->tc->ops->n_queues
2601 && netdev->tc->ops->class_set
2602 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2603 : EINVAL);
2604 }
2605
2606 exit:
2607 ovs_mutex_unlock(&netdev->mutex);
2608 return error;
2609 }
2610
2611 static int
2612 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2613 {
2614 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2615 int error;
2616
2617 ovs_mutex_lock(&netdev->mutex);
2618 if (netdev_linux_netnsid_is_remote(netdev)) {
2619 error = EOPNOTSUPP;
2620 goto exit;
2621 }
2622
2623 error = tc_query_qdisc(netdev_);
2624 if (!error) {
2625 if (netdev->tc->ops->class_delete) {
2626 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2627 error = (queue
2628 ? netdev->tc->ops->class_delete(netdev_, queue)
2629 : ENOENT);
2630 } else {
2631 error = EINVAL;
2632 }
2633 }
2634
2635 exit:
2636 ovs_mutex_unlock(&netdev->mutex);
2637 return error;
2638 }
2639
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into 'stats'.
 * Returns EOPNOTSUPP when the qdisc cannot report per-queue stats, ENOENT
 * when the queue does not exist, otherwise 0 or a positive errno value. */
static int
netdev_linux_get_queue_stats(const struct netdev *netdev_,
                             unsigned int queue_id,
                             struct netdev_queue_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get_stats) {
            const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
            if (queue) {
                /* The creation time comes from our own bookkeeping; the
                 * qdisc callback fills in the kernel-provided counters. */
                stats->created = queue->created;
                error = netdev->tc->ops->class_get_stats(netdev_, queue,
                                                         stats);
            } else {
                error = ENOENT;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2674
/* State for dumping the kernel's traffic classes (queues) of one netdev
 * over rtnetlink; created by start_queue_dump() and released by
 * finish_queue_dump(). */
struct queue_dump_state {
    struct nl_dump dump;    /* In-progress rtnetlink dump. */
    struct ofpbuf buf;      /* Reply buffer for nl_dump_next(). */
};
2679
/* Begins an RTM_GETTCLASS dump of 'netdev''s root traffic classes into
 * 'state'.  Returns true on success; false if the request could not be
 * constructed (in which case 'state' needs no cleanup). */
static bool
start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
    if (!tcmsg) {
        return false;
    }
    /* tcm_parent == 0 requests classes of the root qdisc. */
    tcmsg->tcm_parent = 0;
    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
    ofpbuf_uninit(&request);

    ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
    return true;
}
2697
/* Completes a queue dump started by start_queue_dump(): frees the reply
 * buffer and returns the dump's final status (0 on success, otherwise a
 * positive errno value). */
static int
finish_queue_dump(struct queue_dump_state *state)
{
    ofpbuf_uninit(&state->buf);
    return nl_dump_done(&state->dump);
}
2704
/* Iterator state for netdev_linux_queue_dump_{start,next,done}(): a
 * snapshot of the queue IDs taken under the netdev mutex at dump start. */
struct netdev_linux_queue_state {
    unsigned int *queues;   /* Snapshot of queue IDs (heap-allocated). */
    size_t cur_queue;       /* Index of the next ID to visit. */
    size_t n_queues;        /* Number of IDs in 'queues'. */
};
2710
/* Starts a dump of 'netdev_''s queues, storing iterator state in '*statep'
 * for use by netdev_linux_queue_dump_next().  Snapshots the current queue
 * IDs so the dump is stable even if queues change concurrently.  Returns 0
 * on success or a positive errno value; on success the state must be
 * released with netdev_linux_queue_dump_done(). */
static int
netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get) {
            struct netdev_linux_queue_state *state;
            struct tc_queue *queue;
            size_t i;

            /* Snapshot all queue IDs; xmalloc aborts on OOM, so no error
             * handling is needed for the allocations. */
            *statep = state = xmalloc(sizeof *state);
            state->n_queues = hmap_count(&netdev->tc->queues);
            state->cur_queue = 0;
            state->queues = xmalloc(state->n_queues * sizeof *state->queues);

            i = 0;
            HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
                state->queues[i++] = queue->queue_id;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2748
/* Advances a queue dump: stores the next queue's ID in '*queue_idp' and its
 * configuration in 'details'.  Returns 0 on success, EOF when the snapshot
 * is exhausted, or another positive errno value on error.  Queues deleted
 * since the snapshot was taken are silently skipped. */
static int
netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
                             unsigned int *queue_idp, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_linux_queue_state *state = state_;
    int error = EOF;    /* EOF signals "no more queues" to the caller. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    while (state->cur_queue < state->n_queues) {
        unsigned int queue_id = state->queues[state->cur_queue++];
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);

        if (queue) {
            *queue_idp = queue_id;
            error = netdev->tc->ops->class_get(netdev_, queue, details);
            break;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2778
/* Releases the iterator state allocated by netdev_linux_queue_dump_start().
 * Always returns 0. */
static int
netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
                             void *state_)
{
    struct netdev_linux_queue_state *state = state_;

    free(state->queues);
    free(state);
    return 0;
}
2789
/* Dumps statistics for all of 'netdev_''s queues, invoking 'cb' with 'aux'
 * for each.  Returns 0 on success or a positive errno value; if multiple
 * per-class failures occur, the last one wins. */
static int
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
                              netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct queue_dump_state state;

        if (!netdev->tc->ops->class_dump_stats) {
            error = EOPNOTSUPP;
        } else if (!start_queue_dump(netdev_, &state)) {
            error = ENODEV;
        } else {
            struct ofpbuf msg;
            int retval;

            /* Walk every class in the kernel dump, remembering the last
             * failure but continuing so all queues are still reported. */
            while (nl_dump_next(&state.dump, &msg, &state.buf)) {
                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
                                                           cb, aux);
                if (retval) {
                    error = retval;
                }
            }

            retval = finish_queue_dump(&state);
            if (retval) {
                error = retval;
            }
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2834
2835 static int
2836 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2837 struct in_addr netmask)
2838 {
2839 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2840 int error;
2841
2842 ovs_mutex_lock(&netdev->mutex);
2843 if (netdev_linux_netnsid_is_remote(netdev)) {
2844 error = EOPNOTSUPP;
2845 goto exit;
2846 }
2847
2848 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2849 if (!error) {
2850 if (address.s_addr != INADDR_ANY) {
2851 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2852 "SIOCSIFNETMASK", netmask);
2853 }
2854 }
2855
2856 exit:
2857 ovs_mutex_unlock(&netdev->mutex);
2858 return error;
2859 }
2860
/* Retrieves the lists of IP addresses and netmasks assigned to 'netdev_',
 * storing heap-allocated arrays in '*addr' and '*mask' and their length in
 * '*n_cnt'.  Returns 0 if successful, otherwise a positive errno value.
 * (The previous comment here described an older single-IPv6 interface and
 * no longer matched this function.) */
static int
netdev_linux_get_addr_list(const struct netdev *netdev_,
                          struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2883
/* Fills the generic socket address 'sa' with an AF_INET address holding
 * IPv4 address 'addr' and port 0, zeroing any bytes of '*sa' not covered
 * by the sockaddr_in. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2896
2897 static int
2898 do_set_addr(struct netdev *netdev,
2899 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2900 {
2901 struct ifreq ifr;
2902
2903 make_in4_sockaddr(&ifr.ifr_addr, addr);
2904 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2905 ioctl_name);
2906 }
2907
/* Adds 'router' as a default IP gateway via the SIOCADDRT ioctl (route to
 * 0.0.0.0/0 through 'router').  Returns 0 on success or a positive errno
 * value; failures are also logged. */
static int
netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
{
    struct in_addr any = { INADDR_ANY };
    struct rtentry rt;
    int error;

    memset(&rt, 0, sizeof rt);
    make_in4_sockaddr(&rt.rt_dst, any);
    make_in4_sockaddr(&rt.rt_gateway, router);
    make_in4_sockaddr(&rt.rt_genmask, any);
    rt.rt_flags = RTF_UP | RTF_GATEWAY;
    error = af_inet_ioctl(SIOCADDRT, &rt);
    if (error) {
        VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
    }
    return error;
}
2927
/* Determines how to reach 'host' by parsing /proc/net/route: stores the
 * gateway (or 0.0.0.0 if the host is directly reachable) in '*next_hop'
 * and the outgoing device's name, heap-allocated, in '*netdev_name'.
 * Returns 0 on a match, ENXIO if no route covers 'host', or the errno from
 * failing to open the route table. */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        /* Line 1 is the column-header row; route entries start at line 2. */
        if (++ln >= 2) {
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                             fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* The output of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need any endian
             * conversions here. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    fclose(stream);
    return ENXIO;
}
2987
/* Fills 'smap' with driver name/version and firmware version obtained via
 * ETHTOOL_GDRVINFO, caching the result under VALID_DRVINFO.  Returns 0 on
 * success or a positive errno value. */
static int
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* netdev_linux_do_ethtool() takes an ethtool_cmd pointer, but
         * ETHTOOL_GDRVINFO actually fills an ethtool_drvinfo; the cast
         * reuses the helper for the drvinfo buffer stored in 'netdev'. */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        cmd,
                                        ETHTOOL_GDRVINFO,
                                        "ETHTOOL_GDRVINFO");
        if (!error) {
            netdev->cache_valid |= VALID_DRVINFO;
        }
    }

    if (!error) {
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
        smap_add(smap, "driver_version", netdev->drvinfo.version);
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
3018
/* Reports status for an internal device: these are implemented by Open
 * vSwitch itself, so the driver name is a constant.  Always returns 0. */
static int
netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
                           struct smap *smap)
{
    smap_add(smap, "driver_name", "openvswitch");
    return 0;
}
3026
/* Returns the tc block id for 'netdev_': its ifindex when it is a LAG
 * master (so all slaves can share one flow block), otherwise 0 meaning "no
 * block". */
static uint32_t
netdev_linux_get_block_id(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    uint32_t block_id = 0;

    ovs_mutex_lock(&netdev->mutex);
    /* Ensure the linux netdev has had its fields populated. */
    if (!(netdev->cache_valid & VALID_IFINDEX)) {
        netdev_linux_update_via_netlink(netdev);
    }

    /* Only assigning block ids to linux netdevs that are LAG masters. */
    if (netdev->is_lag_master) {
        block_id = netdev->ifindex;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return block_id;
}
3047
/* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
 * returns 0.  Otherwise, it returns a positive errno value; in particular,
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
static int
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, struct eth_addr *mac)
{
    struct arpreq r;
    struct sockaddr_in sin;
    int retval;

    /* Build the SIOCGARP request: protocol address 'ip', hardware family
     * Ethernet, restricted to this device. */
    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    sin.sin_port = 0;
    memcpy(&r.arp_pa, &sin, sizeof sin);
    r.arp_ha.sa_family = ARPHRD_ETHER;
    r.arp_flags = 0;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
    if (!retval) {
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO just means "no entry", which is an expected outcome and not
         * worth logging. */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
    }
    return retval;
}
3080
3081 static int
3082 nd_to_iff_flags(enum netdev_flags nd)
3083 {
3084 int iff = 0;
3085 if (nd & NETDEV_UP) {
3086 iff |= IFF_UP;
3087 }
3088 if (nd & NETDEV_PROMISC) {
3089 iff |= IFF_PROMISC;
3090 }
3091 if (nd & NETDEV_LOOPBACK) {
3092 iff |= IFF_LOOPBACK;
3093 }
3094 return iff;
3095 }
3096
3097 static int
3098 iff_to_nd_flags(int iff)
3099 {
3100 enum netdev_flags nd = 0;
3101 if (iff & IFF_UP) {
3102 nd |= NETDEV_UP;
3103 }
3104 if (iff & IFF_PROMISC) {
3105 nd |= NETDEV_PROMISC;
3106 }
3107 if (iff & IFF_LOOPBACK) {
3108 nd |= NETDEV_LOOPBACK;
3109 }
3110 return nd;
3111 }
3112
/* Turns off the NETDEV_* flags in 'off' and turns on those in 'on' for
 * 'netdev', storing the previous flags in '*old_flagsp'.  Returns 0 on
 * success or a positive errno value.  Caller must hold netdev->mutex. */
static int
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
{
    int old_flags, new_flags;
    int error = 0;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* Re-read the flags to keep the cache in sync; the result of this
         * best-effort refresh is deliberately ignored (presumably so that
         * the set_flags() error, if any, is what gets reported). */
        get_flags(&netdev->up, &netdev->ifi_flags);
    }

    return error;
}
3131
/* netdev_class update_flags implementation: changes or queries the device
 * flags.  With 'on'/'off' bits set this modifies flags (unsupported for
 * remote network namespaces); with both zero it only reads the current
 * flags.  Returns 0 on success or a positive errno value. */
static int
netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
                          enum netdev_flags on, enum netdev_flags *old_flagsp)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (on || off) {
        /* Changing flags over netlink isn't supported yet. */
        if (netdev_linux_netnsid_is_remote(netdev)) {
            error = EOPNOTSUPP;
            goto exit;
        }
        error = update_flags(netdev, off, on, old_flagsp);
    } else {
        /* Try reading flags over netlink, or fall back to ioctl. */
        if (!netdev_linux_update_via_netlink(netdev)) {
            *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
        } else {
            error = update_flags(netdev, off, on, old_flagsp);
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3160
/* Netdev-provider handlers shared by all three Linux-backed netdev classes
 * ("system", "tap", and "internal").  Handlers that differ per class, such as
 * construction, statistics, and status, are filled in by each class
 * definition below. */
#define NETDEV_LINUX_CLASS_COMMON                               \
    .run = netdev_linux_run,                                    \
    .wait = netdev_linux_wait,                                  \
    .alloc = netdev_linux_alloc,                                \
    .destruct = netdev_linux_destruct,                          \
    .dealloc = netdev_linux_dealloc,                            \
    .send = netdev_linux_send,                                  \
    .send_wait = netdev_linux_send_wait,                        \
    .set_etheraddr = netdev_linux_set_etheraddr,                \
    .get_etheraddr = netdev_linux_get_etheraddr,                \
    .get_mtu = netdev_linux_get_mtu,                            \
    .set_mtu = netdev_linux_set_mtu,                            \
    .get_ifindex = netdev_linux_get_ifindex,                    \
    .get_carrier = netdev_linux_get_carrier,                    \
    .get_carrier_resets = netdev_linux_get_carrier_resets,      \
    .set_miimon_interval = netdev_linux_set_miimon_interval,    \
    .set_advertisements = netdev_linux_set_advertisements,      \
    .set_policing = netdev_linux_set_policing,                  \
    .get_qos_types = netdev_linux_get_qos_types,                \
    .get_qos_capabilities = netdev_linux_get_qos_capabilities,  \
    .get_qos = netdev_linux_get_qos,                            \
    .set_qos = netdev_linux_set_qos,                            \
    .get_queue = netdev_linux_get_queue,                        \
    .set_queue = netdev_linux_set_queue,                        \
    .delete_queue = netdev_linux_delete_queue,                  \
    .get_queue_stats = netdev_linux_get_queue_stats,            \
    .queue_dump_start = netdev_linux_queue_dump_start,          \
    .queue_dump_next = netdev_linux_queue_dump_next,            \
    .queue_dump_done = netdev_linux_queue_dump_done,            \
    .dump_queue_stats = netdev_linux_dump_queue_stats,          \
    .set_in4 = netdev_linux_set_in4,                            \
    .get_addr_list = netdev_linux_get_addr_list,                \
    .add_router = netdev_linux_add_router,                      \
    .get_next_hop = netdev_linux_get_next_hop,                  \
    .arp_lookup = netdev_linux_arp_lookup,                      \
    .update_flags = netdev_linux_update_flags,                  \
    .rxq_alloc = netdev_linux_rxq_alloc,                        \
    .rxq_construct = netdev_linux_rxq_construct,                \
    .rxq_destruct = netdev_linux_rxq_destruct,                  \
    .rxq_dealloc = netdev_linux_rxq_dealloc,                    \
    .rxq_recv = netdev_linux_rxq_recv,                          \
    .rxq_wait = netdev_linux_rxq_wait,                          \
    .rxq_drain = netdev_linux_rxq_drain

/* The "system" class: ordinary kernel network devices.  The only class that
 * supports the TC flow-offload API and block ids. */
const struct netdev_class netdev_linux_class = {
    NETDEV_LINUX_CLASS_COMMON,
    LINUX_FLOW_OFFLOAD_API,
    .type = "system",
    .construct = netdev_linux_construct,
    .get_stats = netdev_linux_get_stats,
    .get_features = netdev_linux_get_features,
    .get_status = netdev_linux_get_status,
    .get_block_id = netdev_linux_get_block_id
};

/* The "tap" class: TUN/TAP devices created and owned by OVS itself. */
const struct netdev_class netdev_tap_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "tap",
    .construct = netdev_linux_construct_tap,
    .get_stats = netdev_tap_get_stats,
    .get_features = netdev_linux_get_features,
    .get_status = netdev_linux_get_status,
};

/* The "internal" class: datapath-internal ports. */
const struct netdev_class netdev_internal_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "internal",
    .construct = netdev_linux_construct,
    .get_stats = netdev_internal_get_stats,
    .get_status = netdev_internal_get_status,
};
3232 \f
3233
/* CoDel is a classless qdisc, so it exposes no configurable queues. */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET 1
#define TCA_CODEL_LIMIT 2
#define TCA_CODEL_INTERVAL 3

/* OVS-side state for a "linux-codel" qdisc.  The values are passed through
 * to the kernel verbatim; per tc-codel(8) the kernel interprets 'target' and
 * 'interval' as microseconds and 'limit' as packets. */
struct codel {
    struct tc tc;
    uint32_t target;    /* Acceptable minimum standing queue delay. */
    uint32_t limit;     /* Hard cap on queue size. */
    uint32_t interval;  /* Sliding window for delay measurement. */
};
3250
3251 static struct codel *
3252 codel_get__(const struct netdev *netdev_)
3253 {
3254 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3255 return CONTAINER_OF(netdev->tc, struct codel, tc);
3256 }
3257
3258 static void
3259 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3260 uint32_t interval)
3261 {
3262 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3263 struct codel *codel;
3264
3265 codel = xmalloc(sizeof *codel);
3266 tc_init(&codel->tc, &tc_ops_codel);
3267 codel->target = target;
3268 codel->limit = limit;
3269 codel->interval = interval;
3270
3271 netdev->tc = &codel->tc;
3272 }
3273
/* Installs a "codel" qdisc as the root qdisc of 'netdev', replacing any
 * existing root qdisc.  A zero 'target', 'limit', or 'interval' selects the
 * defaults 5000, 10240, and 100000, respectively.
 *
 * Roughly equivalent to "tc qdisc replace dev <dev> root codel ...".
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                    uint32_t interval)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval;
    int error;

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Substitute defaults for any unset (zero) parameter. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;

    nl_msg_put_string(&request, TCA_KIND, "codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
        "target %u, limit %u, interval %u error %d(%s)",
        netdev_get_name(netdev),
        otarget, olimit, ointerval,
        error, ovs_strerror(error));
    }
    return error;
}
3315
3316 static void
3317 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3318 const struct smap *details, struct codel *codel)
3319 {
3320 codel->target = smap_get_ullong(details, "target", 0);
3321 codel->limit = smap_get_ullong(details, "limit", 0);
3322 codel->interval = smap_get_ullong(details, "interval", 0);
3323
3324 if (!codel->target) {
3325 codel->target = 5000;
3326 }
3327 if (!codel->limit) {
3328 codel->limit = 10240;
3329 }
3330 if (!codel->interval) {
3331 codel->interval = 100000;
3332 }
3333 }
3334
/* tc_ops "tc_install" handler for CoDel: configures the kernel qdisc from
 * 'details' and, on success, records the resulting parameters on 'netdev'.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
codel_tc_install(struct netdev *netdev, const struct smap *details)
{
    int error;
    struct codel codel;

    codel_parse_qdisc_details__(netdev, details, &codel);
    error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
                                codel.interval);
    if (!error) {
        codel_install__(netdev, codel.target, codel.limit, codel.interval);
    }
    return error;
}

/* Extracts CoDel parameters from the nested Netlink attribute 'nl_options'
 * into '*codel'.
 *
 * Returns 0 if successful, otherwise EPROTO if the attributes do not match
 * the expected policy. */
static int
codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
{
    static const struct nl_policy tca_codel_policy[] = {
        [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
        [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
        [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];

    if (!nl_parse_nested(nl_options, tca_codel_policy,
                         attrs, ARRAY_SIZE(tca_codel_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
        return EPROTO;
    }

    codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
    codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
    codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
    return 0;
}
3372
/* tc_ops "tc_load" handler for CoDel: initializes OVS's view of an existing
 * kernel CoDel qdisc from the Netlink message 'nlmsg' that describes it.
 * (The qdisc kind returned by tc_parse_qdisc() is not re-checked here; the
 * caller already dispatched on it.)
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
{
    struct nlattr *nlattr;
    const char * kind;
    int error;
    struct codel codel;

    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
    if (error != 0) {
        return error;
    }

    error = codel_parse_tca_options__(nlattr, &codel);
    if (error != 0) {
        return error;
    }

    codel_install__(netdev, codel.target, codel.limit, codel.interval);
    return 0;
}


/* tc_ops "tc_destroy" handler for CoDel: releases the CoDel state that
 * codel_install__() allocated. */
static void
codel_tc_destroy(struct tc *tc)
{
    struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
    tc_destroy(tc);
    free(codel);
}
3403
/* tc_ops "qdisc_get" handler for CoDel: reports the cached parameters as
 * qdisc "details".  Always returns 0. */
static int
codel_qdisc_get(const struct netdev *netdev, struct smap *details)
{
    const struct codel *codel = codel_get__(netdev);
    smap_add_format(details, "target", "%u", codel->target);
    smap_add_format(details, "limit", "%u", codel->limit);
    smap_add_format(details, "interval", "%u", codel->interval);
    return 0;
}

/* tc_ops "qdisc_set" handler for CoDel: re-records the parameters parsed
 * from 'details' on 'netdev'.
 *
 * NOTE(review): codel_install__() allocates a fresh tc and points
 * 'netdev->tc' at it; the previously installed tc does not appear to be
 * freed here, and the codel_get__() assignments that follow write to the
 * instance just installed -- confirm ownership expectations with the tc
 * framework. */
static int
codel_qdisc_set(struct netdev *netdev, const struct smap *details)
{
    struct codel codel;

    codel_parse_qdisc_details__(netdev, details, &codel);
    codel_install__(netdev, codel.target, codel.limit, codel.interval);
    codel_get__(netdev)->target = codel.target;
    codel_get__(netdev)->limit = codel.limit;
    codel_get__(netdev)->interval = codel.interval;
    return 0;
}

/* Handler table for the "linux-codel" QoS type. */
static const struct tc_ops tc_ops_codel = {
    .linux_name = "codel",
    .ovs_name = "linux-codel",
    .n_queues = CODEL_N_QUEUES,
    .tc_install = codel_tc_install,
    .tc_load = codel_tc_load,
    .tc_destroy = codel_tc_destroy,
    .qdisc_get = codel_qdisc_get,
    .qdisc_set = codel_qdisc_set,
};
3437 \f
/* FQ-CoDel traffic control class. */

/* FQ-CoDel is a classless qdisc, so it exposes no configurable queues. */
#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET 1
#define TCA_FQ_CODEL_LIMIT 2
#define TCA_FQ_CODEL_INTERVAL 3
#define TCA_FQ_CODEL_ECN 4
#define TCA_FQ_CODEL_FLOWS 5
#define TCA_FQ_CODEL_QUANTUM 6

/* OVS-side state for a "linux-fq_codel" qdisc.  Values are passed through to
 * the kernel verbatim; per tc-fq_codel(8), 'target' and 'interval' are in
 * microseconds, 'limit' is in packets, 'flows' is a flow count, and
 * 'quantum' is in bytes. */
struct fqcodel {
    struct tc tc;
    uint32_t target;
    uint32_t limit;
    uint32_t interval;
    uint32_t flows;
    uint32_t quantum;
};
3461
/* Returns the FQ-CoDel state embedded in 'netdev_''s currently installed
 * tc. */
static struct fqcodel *
fqcodel_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
}

/* Hangs a freshly allocated FQ-CoDel tc off 'netdev_', recording the given
 * parameters for later queries.  Does not touch the kernel. */
static void
fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
                  uint32_t interval, uint32_t flows, uint32_t quantum)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct fqcodel *fqcodel;

    fqcodel = xmalloc(sizeof *fqcodel);
    tc_init(&fqcodel->tc, &tc_ops_fqcodel);
    fqcodel->target = target;
    fqcodel->limit = limit;
    fqcodel->interval = interval;
    fqcodel->flows = flows;
    fqcodel->quantum = quantum;

    netdev->tc = &fqcodel->tc;
}
3486
/* Installs an "fq_codel" qdisc as the root qdisc of 'netdev', replacing any
 * existing root qdisc.  Zero parameters select the defaults noted below.
 *
 * NOTE(review): the fallback interval here (100000) disagrees with the
 * 1000000 default substituted by fqcodel_parse_qdisc_details__() -- confirm
 * which value is intended.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                      uint32_t interval, uint32_t flows, uint32_t quantum)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval, oflows,  oquantum;
    int error;

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;
    oflows = flows ? flows : 1024;
    oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
                                            not mtu */

    nl_msg_put_string(&request, TCA_KIND, "fq_codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
        "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval, oflows, oquantum,
                     error, ovs_strerror(error));
    }
    return error;
}
3533
3534 static void
3535 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3536 const struct smap *details, struct fqcodel *fqcodel)
3537 {
3538 fqcodel->target = smap_get_ullong(details, "target", 0);
3539 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3540 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3541 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3542 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3543
3544 if (!fqcodel->target) {
3545 fqcodel->target = 5000;
3546 }
3547 if (!fqcodel->limit) {
3548 fqcodel->limit = 10240;
3549 }
3550 if (!fqcodel->interval) {
3551 fqcodel->interval = 1000000;
3552 }
3553 if (!fqcodel->flows) {
3554 fqcodel->flows = 1024;
3555 }
3556 if (!fqcodel->quantum) {
3557 fqcodel->quantum = 1514;
3558 }
3559 }
3560
/* tc_ops "tc_install" handler for FQ-CoDel: configures the kernel qdisc from
 * 'details' and, on success, records the resulting parameters on 'netdev'.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
{
    int error;
    struct fqcodel fqcodel;

    fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
    error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
                                  fqcodel.interval, fqcodel.flows,
                                  fqcodel.quantum);
    if (!error) {
        fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
                          fqcodel.interval, fqcodel.flows, fqcodel.quantum);
    }
    return error;
}
3577
3578 static int
3579 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3580 {
3581 static const struct nl_policy tca_fqcodel_policy[] = {
3582 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3583 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3584 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3585 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3586 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3587 };
3588
3589 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3590
3591 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3592 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3593 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3594 return EPROTO;
3595 }
3596
3597 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3598 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3599 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3600 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3601 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3602 return 0;
3603 }
3604
/* tc_ops "tc_load" handler for FQ-CoDel: initializes OVS's view of an
 * existing kernel fq_codel qdisc from the Netlink message 'nlmsg' that
 * describes it.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
{
    struct nlattr *nlattr;
    const char * kind;
    int error;
    struct fqcodel fqcodel;

    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
    if (error != 0) {
        return error;
    }

    error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
    if (error != 0) {
        return error;
    }

    fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
                      fqcodel.flows, fqcodel.quantum);
    return 0;
}

/* tc_ops "tc_destroy" handler for FQ-CoDel: releases the state that
 * fqcodel_install__() allocated. */
static void
fqcodel_tc_destroy(struct tc *tc)
{
    struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
    tc_destroy(tc);
    free(fqcodel);
}
3635
/* tc_ops "qdisc_get" handler for FQ-CoDel: reports the cached parameters as
 * qdisc "details".  Always returns 0. */
static int
fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
{
    const struct fqcodel *fqcodel = fqcodel_get__(netdev);
    smap_add_format(details, "target", "%u", fqcodel->target);
    smap_add_format(details, "limit", "%u", fqcodel->limit);
    smap_add_format(details, "interval", "%u", fqcodel->interval);
    smap_add_format(details, "flows", "%u", fqcodel->flows);
    smap_add_format(details, "quantum", "%u", fqcodel->quantum);
    return 0;
}

/* tc_ops "qdisc_set" handler for FQ-CoDel: re-records the parameters parsed
 * from 'details' on 'netdev'.
 *
 * NOTE(review): fqcodel_install__() replaces 'netdev->tc' with a fresh
 * allocation; the previous tc does not appear to be freed here -- confirm
 * ownership expectations with the tc framework. */
static int
fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
{
    struct fqcodel fqcodel;

    fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
    fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
                      fqcodel.flows, fqcodel.quantum);
    fqcodel_get__(netdev)->target = fqcodel.target;
    fqcodel_get__(netdev)->limit = fqcodel.limit;
    fqcodel_get__(netdev)->interval = fqcodel.interval;
    fqcodel_get__(netdev)->flows = fqcodel.flows;
    fqcodel_get__(netdev)->quantum = fqcodel.quantum;
    return 0;
}

/* Handler table for the "linux-fq_codel" QoS type. */
static const struct tc_ops tc_ops_fqcodel = {
    .linux_name = "fq_codel",
    .ovs_name = "linux-fq_codel",
    .n_queues = FQCODEL_N_QUEUES,
    .tc_install = fqcodel_tc_install,
    .tc_load = fqcodel_tc_load,
    .tc_destroy = fqcodel_tc_destroy,
    .qdisc_get = fqcodel_qdisc_get,
    .qdisc_set = fqcodel_qdisc_set,
};
3674 \f
/* SFQ traffic control class. */

/* SFQ is a classless qdisc, so it exposes no configurable queues. */
#define SFQ_N_QUEUES 0x0000

/* OVS-side state for a "linux-sfq" qdisc.  Per tc-sfq(8), 'quantum' is the
 * per-round byte allotment (typically the MTU) and 'perturb' is the hash
 * perturbation period in seconds. */
struct sfq {
    struct tc tc;
    uint32_t quantum;
    uint32_t perturb;
};

/* Returns the SFQ state embedded in 'netdev_''s currently installed tc. */
static struct sfq *
sfq_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct sfq, tc);
}
3691
/* Hangs a freshly allocated SFQ tc off 'netdev_', recording 'quantum' and
 * 'perturb' (note the parameter order: quantum first, then perturb) for
 * later queries.  Does not touch the kernel. */
static void
sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct sfq *sfq;

    sfq = xmalloc(sizeof *sfq);
    tc_init(&sfq->tc, &tc_ops_sfq);
    sfq->perturb = perturb;
    sfq->quantum = quantum;

    netdev->tc = &sfq->tc;
}
3705
/* Installs an "sfq" qdisc as the root qdisc of 'netdev', replacing any
 * existing root qdisc.  A zero 'quantum' falls back to the device MTU (or
 * the kernel default if the MTU is unavailable); a zero 'perturb' falls
 * back to 10 seconds.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
{
    struct tc_sfq_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int mtu;
    int mtu_error, error;
    mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    if (!quantum) {
        if (!mtu_error) {
            opt.quantum = mtu; /* if we cannot find mtu, use default */
        }
    } else {
        opt.quantum = quantum;
    }

    if (!perturb) {
        opt.perturb_period = 10;
    } else {
        opt.perturb_period = perturb;
    }

    nl_msg_put_string(&request, TCA_KIND, "sfq");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "quantum %u, perturb %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.quantum, opt.perturb_period,
                     error, ovs_strerror(error));
    }
    return error;
}
3754
/* Fills '*sfq' from the qdisc configuration in 'details'.  A missing or zero
 * "perturb" defaults to 10 seconds; a missing or zero "quantum" defaults to
 * the device MTU, and is left at zero (with a warning) if the MTU cannot be
 * determined. */
static void
sfq_parse_qdisc_details__(struct netdev *netdev,
                          const struct smap *details, struct sfq *sfq)
{
    sfq->perturb = smap_get_ullong(details, "perturb", 0);
    sfq->quantum = smap_get_ullong(details, "quantum", 0);

    if (!sfq->perturb) {
        sfq->perturb = 10;
    }

    if (!sfq->quantum) {
        int mtu;
        if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
            sfq->quantum = mtu;
        } else {
            VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
                         "device without mtu");
        }
    }
}

/* tc_ops "tc_install" handler for SFQ: configures the kernel qdisc from
 * 'details' and, on success, records the resulting parameters on 'netdev'.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
sfq_tc_install(struct netdev *netdev, const struct smap *details)
{
    int error;
    struct sfq sfq;

    sfq_parse_qdisc_details__(netdev, details, &sfq);
    error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
    if (!error) {
        sfq_install__(netdev, sfq.quantum, sfq.perturb);
    }
    return error;
}
3790
/* tc_ops "tc_load" handler for SFQ: initializes OVS's view of an existing
 * kernel sfq qdisc from the Netlink message 'nlmsg' that describes it.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
{
    const struct tc_sfq_qopt *sfq;
    struct nlattr *nlattr;
    const char *kind;
    int error;

    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
    if (error == 0) {
        sfq = nl_attr_get(nlattr);
        /* Fix: sfq_install__() takes (netdev, quantum, perturb); the
         * arguments were previously passed in the opposite order, so the
         * cached quantum and perturbation period were swapped whenever an
         * existing qdisc was loaded. */
        sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
        return 0;
    }

    return error;
}
3808
/* tc_ops "tc_destroy" handler for SFQ: releases the state that
 * sfq_install__() allocated. */
static void
sfq_tc_destroy(struct tc *tc)
{
    struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
    tc_destroy(tc);
    free(sfq);
}

/* tc_ops "qdisc_get" handler for SFQ: reports the cached parameters as qdisc
 * "details".  Always returns 0. */
static int
sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
{
    const struct sfq *sfq = sfq_get__(netdev);
    smap_add_format(details, "quantum", "%u", sfq->quantum);
    smap_add_format(details, "perturb", "%u", sfq->perturb);
    return 0;
}

/* tc_ops "qdisc_set" handler for SFQ: re-records the parameters parsed from
 * 'details' on 'netdev'.
 *
 * NOTE(review): sfq_install__() replaces 'netdev->tc' with a fresh
 * allocation; the previous tc does not appear to be freed here -- confirm
 * ownership expectations with the tc framework. */
static int
sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
{
    struct sfq sfq;

    sfq_parse_qdisc_details__(netdev, details, &sfq);
    sfq_install__(netdev, sfq.quantum, sfq.perturb);
    sfq_get__(netdev)->quantum = sfq.quantum;
    sfq_get__(netdev)->perturb = sfq.perturb;
    return 0;
}

/* Handler table for the "linux-sfq" QoS type. */
static const struct tc_ops tc_ops_sfq = {
    .linux_name = "sfq",
    .ovs_name = "linux-sfq",
    .n_queues = SFQ_N_QUEUES,
    .tc_install = sfq_tc_install,
    .tc_load = sfq_tc_load,
    .tc_destroy = sfq_tc_destroy,
    .qdisc_get = sfq_qdisc_get,
    .qdisc_set = sfq_qdisc_set,
};
3848 \f
/* HTB traffic control class. */

/* Maximum number of HTB queues (classes) that OVS manages per device. */
#define HTB_N_QUEUES 0xf000
/* Value for tc_htb_glob.rate2quantum; see htb_setup_class__() where the
 * quantum is forced to at least the MTU to make HTB ignore r2q. */
#define HTB_RATE2QUANTUM 10

/* OVS-side state for a "linux-htb" root qdisc. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};

/* OVS-side state for one HTB class (one OVS queue). */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};

/* Returns the HTB state embedded in 'netdev_''s currently installed tc. */
static struct htb *
htb_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct htb, tc);
}
3873
/* Hangs a freshly allocated HTB tc off 'netdev_', recording 'max_rate' for
 * later queries.  Does not touch the kernel.
 *
 * Note that 'max_rate' arrives as uint64_t but 'struct htb' stores it in an
 * unsigned int, so values above UINT_MAX truncate -- presumably acceptable
 * for the rates in use; TODO confirm. */
static void
htb_install__(struct netdev *netdev_, uint64_t max_rate)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct htb *htb;

    htb = xmalloc(sizeof *htb);
    tc_init(&htb->tc, &tc_ops_htb);
    htb->max_rate = max_rate;

    netdev->tc = &htb->tc;
}
3886
/* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_setup_qdisc__(struct netdev *netdev)
{
    size_t opt_offset;
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    /* Unclassified traffic goes to class 1:1 ("default 1"). */
    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = HTB_RATE2QUANTUM;
    opt.version = 3;
    opt.defcls = 1;

    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    return tc_transact(&request, NULL);
}
3921
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * Returns 0 if successful, otherwise a positive errno value (including if
 * the device's MTU, which the rate tables depend on, cannot be obtained). */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
        opt.quantum = mtu;
    }
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
3981
/* Parses Netlink attributes in 'options' for HTB parameters and stores a
 * description of them into 'details'.  The description complies with the
 * specification given in the vswitch database documentation for linux-htb
 * queue details. */
static int
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
{
    static const struct nl_policy tca_htb_policy[] = {
        [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
                            .min_len = sizeof(struct tc_htb_opt) },
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
    const struct tc_htb_opt *htb;

    if (!nl_parse_nested(nl_options, tca_htb_policy,
                         attrs, ARRAY_SIZE(tca_htb_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HTB class options");
        return EPROTO;
    }

    htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
    class->min_rate = htb->rate.rate;
    class->max_rate = htb->ceil.rate;
    /* Kernel reports the burst in time ticks; convert back to bytes. */
    class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
    class->priority = htb->prio;
    return 0;
}

/* Parses an RTM_NEWTCLASS-style message 'tcmsg' describing an HTB class.
 * On success, stores the OVS queue number in '*queue_id' (if nonnull), the
 * class parameters in '*options' (if nonnull), and statistics in '*stats'
 * (if nonnull).
 *
 * Returns 0 if successful, otherwise a positive errno value (EPROTO if the
 * class handle does not correspond to a valid OVS queue). */
static int
htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                  struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct nlattr *nl_options;
    unsigned int handle;
    int error;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (!error && queue_id) {
        unsigned int major = tc_get_major(handle);
        unsigned int minor = tc_get_minor(handle);
        /* OVS queue N is HTB class 1:(N+1). */
        if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
            *queue_id = minor - 1;
        } else {
            error = EPROTO;
        }
    }
    if (!error && options) {
        error = htb_parse_tca_options__(nl_options, options);
    }
    return error;
}
4035
/* Fills '*hc' with root-qdisc parameters from 'details'.  "max-rate" is
 * given in bits/s and stored in bytes/s; if absent or zero, it falls back to
 * the link speed advertised by the device (or 100 Mbps if that is unknown).
 * min_rate is pinned to max_rate for the root class. */
static void
htb_parse_qdisc_details__(struct netdev *netdev_,
                          const struct smap *details, struct htb_class *hc)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
    if (!hc->max_rate) {
        enum netdev_features current;

        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }
    hc->min_rate = hc->max_rate;
    hc->burst = 0;
    hc->priority = 0;
}
4054
/* Fills '*hc' with per-queue HTB class parameters from 'details', clamping
 * them as described below.  Rates in 'details' are bits/s; 'hc' stores
 * bytes/s.
 *
 * Returns 0 if successful, otherwise a positive errno value (the device MTU
 * is required and its lookup can fail). */
static int
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
{
    const struct htb *htb = htb_get__(netdev);
    int mtu, error;
    unsigned long long int max_rate_bit;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate */
    max_rate_bit = smap_get_ullong(details, "max-rate", 0);
    hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = smap_get_ullong(details, "burst", 0) / 8;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority */
    hc->priority = smap_get_ullong(details, "priority", 0);

    return 0;
}
4099
/* Queries the kernel for the HTB class with the given 'handle' and 'parent'
 * on 'netdev' and parses the reply into '*options' and '*stats' (either may
 * be null).
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = htb_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}

/* tc_ops "tc_install" handler for HTB: creates the root qdisc and its
 * default class 1:fffe, then records the state on 'netdev'.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_tc_install(struct netdev *netdev, const struct smap *details)
{
    int error;

    error = htb_setup_qdisc__(netdev);
    if (!error) {
        struct htb_class hc;

        htb_parse_qdisc_details__(netdev, details, &hc);
        error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
                                  tc_make_handle(1, 0), &hc);
        if (!error) {
            htb_install__(netdev, hc.max_rate);
        }
    }
    return error;
}
4134
/* Converts a generic tc_queue pointer back to its enclosing htb_class. */
static struct htb_class *
htb_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct htb_class, tc_queue);
}

/* Creates or updates the cached htb_class for 'queue_id' on 'netdev' so that
 * it carries the parameters in 'hc'.  A queue that does not exist yet is
 * allocated and inserted into the tc's queue map. */
static void
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
                   const struct htb_class *hc)
{
    struct htb *htb = htb_get__(netdev);
    size_t hash = hash_int(queue_id, 0);
    struct tc_queue *queue;
    struct htb_class *hcp;

    queue = tc_find_queue__(netdev, queue_id, hash);
    if (queue) {
        hcp = htb_class_cast__(queue);
    } else {
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
    }

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
    hcp->burst = hc->burst;
    hcp->priority = hc->priority;
}
4166
/* tc_ops callback: reads the kernel's existing HTB configuration on
 * 'netdev' into a fresh htb object, including every class that parses as
 * an OVS queue. */
static int
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct htb_class hc;

    /* Get qdisc options. */
    hc.max_rate = 0;            /* Fallback if the class query fails. */
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    htb_install__(netdev, hc.max_rate);

    /* Get queues. */
    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Classes that fail to parse are silently skipped, not fatal. */
        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            htb_update_queue__(netdev, queue_id, &hc);
        }
    }
    finish_queue_dump(&state);

    return 0;
}
4194
/* tc_ops callback: frees every queued htb_class (popping each entry out of
 * the queue map as it goes) and then the htb itself. */
static void
htb_tc_destroy(struct tc *tc)
{
    struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
    struct htb_class *hc;

    HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
        free(hc);
    }
    tc_destroy(tc);
    free(htb);
}
4207
4208 static int
4209 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
4210 {
4211 const struct htb *htb = htb_get__(netdev);
4212 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
4213 return 0;
4214 }
4215
4216 static int
4217 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
4218 {
4219 struct htb_class hc;
4220 int error;
4221
4222 htb_parse_qdisc_details__(netdev, details, &hc);
4223 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4224 tc_make_handle(1, 0), &hc);
4225 if (!error) {
4226 htb_get__(netdev)->max_rate = hc.max_rate;
4227 }
4228 return error;
4229 }
4230
4231 static int
4232 htb_class_get(const struct netdev *netdev OVS_UNUSED,
4233 const struct tc_queue *queue, struct smap *details)
4234 {
4235 const struct htb_class *hc = htb_class_cast__(queue);
4236
4237 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4238 if (hc->min_rate != hc->max_rate) {
4239 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4240 }
4241 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
4242 if (hc->priority) {
4243 smap_add_format(details, "priority", "%u", hc->priority);
4244 }
4245 return 0;
4246 }
4247
4248 static int
4249 htb_class_set(struct netdev *netdev, unsigned int queue_id,
4250 const struct smap *details)
4251 {
4252 struct htb_class hc;
4253 int error;
4254
4255 error = htb_parse_class_details__(netdev, details, &hc);
4256 if (error) {
4257 return error;
4258 }
4259
4260 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4261 tc_make_handle(1, 0xfffe), &hc);
4262 if (error) {
4263 return error;
4264 }
4265
4266 htb_update_queue__(netdev, queue_id, &hc);
4267 return 0;
4268 }
4269
4270 static int
4271 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
4272 {
4273 struct htb_class *hc = htb_class_cast__(queue);
4274 struct htb *htb = htb_get__(netdev);
4275 int error;
4276
4277 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4278 if (!error) {
4279 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
4280 free(hc);
4281 }
4282 return error;
4283 }
4284
/* tc_ops callback: fetches kernel statistics for 'queue' (class
 * 1:<queue_id+1> under parent 1:0xfffe) into 'stats'. */
static int
htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
                    struct netdev_queue_stats *stats)
{
    return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
                             tc_make_handle(1, 0xfffe), NULL, stats);
}
4292
4293 static int
4294 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4295 const struct ofpbuf *nlmsg,
4296 netdev_dump_queue_stats_cb *cb, void *aux)
4297 {
4298 struct netdev_queue_stats stats;
4299 unsigned int handle, major, minor;
4300 int error;
4301
4302 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4303 if (error) {
4304 return error;
4305 }
4306
4307 major = tc_get_major(handle);
4308 minor = tc_get_minor(handle);
4309 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4310 (*cb)(minor - 1, &stats, aux);
4311 }
4312 return 0;
4313 }
4314
/* tc_ops vtable for the "linux-htb" QoS type.  Every callback is
 * implemented, so HTB supports full queue manipulation and statistics. */
static const struct tc_ops tc_ops_htb = {
    .linux_name = "htb",
    .ovs_name = "linux-htb",
    .n_queues = HTB_N_QUEUES,
    .tc_install = htb_tc_install,
    .tc_load = htb_tc_load,
    .tc_destroy = htb_tc_destroy,
    .qdisc_get = htb_qdisc_get,
    .qdisc_set = htb_qdisc_set,
    .class_get = htb_class_get,
    .class_set = htb_class_set,
    .class_delete = htb_class_delete,
    .class_get_stats = htb_class_get_stats,
    .class_dump_stats = htb_class_dump_stats
};
4330 \f
4331 /* "linux-hfsc" traffic control class. */
4332
4333 #define HFSC_N_QUEUES 0xf000
4334
/* Per-netdev state for an installed HFSC qdisc. */
struct hfsc {
    struct tc tc;           /* Generic traffic-control state. */
    uint32_t max_rate;      /* Root class rate cap, in bytes/s. */
};
4339
/* One HFSC class, i.e. one OVS queue. */
struct hfsc_class {
    struct tc_queue tc_queue;   /* Generic queue state; hashed by queue_id. */
    uint32_t min_rate;          /* In bytes/s. */
    uint32_t max_rate;          /* In bytes/s. */
};
4345
4346 static struct hfsc *
4347 hfsc_get__(const struct netdev *netdev_)
4348 {
4349 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4350 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
4351 }
4352
/* Returns the hfsc_class that embeds 'queue'. */
static struct hfsc_class *
hfsc_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
}
4358
4359 static void
4360 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4361 {
4362 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4363 struct hfsc *hfsc;
4364
4365 hfsc = xmalloc(sizeof *hfsc);
4366 tc_init(&hfsc->tc, &tc_ops_hfsc);
4367 hfsc->max_rate = max_rate;
4368 netdev->tc = &hfsc->tc;
4369 }
4370
4371 static void
4372 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4373 const struct hfsc_class *hc)
4374 {
4375 size_t hash;
4376 struct hfsc *hfsc;
4377 struct hfsc_class *hcp;
4378 struct tc_queue *queue;
4379
4380 hfsc = hfsc_get__(netdev);
4381 hash = hash_int(queue_id, 0);
4382
4383 queue = tc_find_queue__(netdev, queue_id, hash);
4384 if (queue) {
4385 hcp = hfsc_class_cast__(queue);
4386 } else {
4387 hcp = xmalloc(sizeof *hcp);
4388 queue = &hcp->tc_queue;
4389 queue->queue_id = queue_id;
4390 queue->created = time_msec();
4391 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4392 }
4393
4394 hcp->min_rate = hc->min_rate;
4395 hcp->max_rate = hc->max_rate;
4396 }
4397
/* Extracts an HFSC class's min-rate and max-rate from its nested TCA_OPTIONS
 * attribute 'nl_options' into 'class'.
 *
 * Only the restricted shape that OVS itself installs is accepted (see
 * hfsc_setup_class__(): linear curves with RSC == FSC, and RSC <= USC).
 * Returns 0 on success, EPROTO if the options cannot be parsed or use
 * unsupported features. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    /* OVS only ever installs curves with m1 == d == 0 (see
     * hfsc_setup_class__()), so anything else was configured externally. */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
4456
/* Parses netlink message 'tcmsg' describing one HFSC class.  Stores the OVS
 * queue ID (class handle minor - 1) in '*queue_id', the class's rates in
 * '*options', and its statistics in '*stats'; each output may be NULL to
 * skip it.  Returns 0 on success, otherwise a positive errno value. */
static int
hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                   struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    int error;
    unsigned int handle;
    struct nlattr *nl_options;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (error) {
        return error;
    }

    if (queue_id) {
        unsigned int major, minor;

        major = tc_get_major(handle);
        minor = tc_get_minor(handle);
        /* Only handles 1:1 through 1:HFSC_N_QUEUES correspond to OVS
         * queues; others (e.g. the 1:0xfffe default class) do not. */
        if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
            *queue_id = minor - 1;
        } else {
            return EPROTO;
        }
    }

    if (options) {
        error = hfsc_parse_tca_options__(nl_options, options);
    }

    return error;
}
4489
4490 static int
4491 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4492 unsigned int parent, struct hfsc_class *options,
4493 struct netdev_queue_stats *stats)
4494 {
4495 int error;
4496 struct ofpbuf *reply;
4497
4498 error = tc_query_class(netdev, handle, parent, &reply);
4499 if (error) {
4500 return error;
4501 }
4502
4503 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4504 ofpbuf_delete(reply);
4505 return error;
4506 }
4507
/* Fills 'class' with a single rate (used for both min and max) taken from
 * 'details''s "max-rate" key (bits/s), converted to bytes/s.  When
 * "max-rate" is absent or zero, falls back to the link's current
 * feature-advertised speed, or 100 Mbps if that is unknown. */
static void
hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
                           struct hfsc_class *class)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
    if (!max_rate) {
        enum netdev_features current;

        netdev_linux_read_features(netdev);
        /* Treat a feature-read failure as "no features", yielding the
         * 100 Mbps default below. */
        current = !netdev->get_features_error ? netdev->current : 0;
        max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }

    class->min_rate = max_rate;
    class->max_rate = max_rate;
}
4526
4527 static int
4528 hfsc_parse_class_details__(struct netdev *netdev,
4529 const struct smap *details,
4530 struct hfsc_class * class)
4531 {
4532 const struct hfsc *hfsc;
4533 uint32_t min_rate, max_rate;
4534
4535 hfsc = hfsc_get__(netdev);
4536
4537 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4538 min_rate = MAX(min_rate, 1);
4539 min_rate = MIN(min_rate, hfsc->max_rate);
4540
4541 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
4542 max_rate = MAX(max_rate, min_rate);
4543 max_rate = MIN(max_rate, hfsc->max_rate);
4544
4545 class->min_rate = min_rate;
4546 class->max_rate = max_rate;
4547
4548 return 0;
4549 }
4550
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1".
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_setup_qdisc__(struct netdev * netdev)
{
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    /* Remove any existing root qdisc first; the result is deliberately
     * ignored since there may not be one. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    opt.defcls = 1;     /* Unclassified traffic goes to class 1:1. */

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    return tc_transact(&request, NULL);
}
4581
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>"
 *
 * NLM_F_CREATE without NLM_F_EXCL means an existing class with the same
 * handle is replaced.  Returns 0 if successful, otherwise a positive errno
 * value (also logged, rate-limited). */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Linear service curves only: m1 = d = 0, slope m2 in bytes/s. */
    min.m1 = 0;
    min.d = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d = 0;
    max.m2 = class->max_rate;

    /* 'min' serves as both the real-time (RSC) and link-sharing (FSC)
     * curve; 'max' is the upper limit (USC). */
    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
4633
4634 static int
4635 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4636 {
4637 int error;
4638 struct hfsc_class class;
4639
4640 error = hfsc_setup_qdisc__(netdev);
4641
4642 if (error) {
4643 return error;
4644 }
4645
4646 hfsc_parse_qdisc_details__(netdev, details, &class);
4647 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4648 tc_make_handle(1, 0), &class);
4649
4650 if (error) {
4651 return error;
4652 }
4653
4654 hfsc_install__(netdev, class.max_rate);
4655 return 0;
4656 }
4657
/* tc_ops callback: reads the kernel's existing HFSC configuration on
 * 'netdev' into a fresh hfsc object, including every class that parses as
 * an OVS queue. */
static int
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct hfsc_class hc;

    hc.max_rate = 0;            /* Fallback if the class query fails. */
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    hfsc_install__(netdev, hc.max_rate);

    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }

    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Classes that fail to parse are silently skipped, not fatal. */
        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            hfsc_update_queue__(netdev, queue_id, &hc);
        }
    }

    finish_queue_dump(&state);
    return 0;
}
4684
4685 static void
4686 hfsc_tc_destroy(struct tc *tc)
4687 {
4688 struct hfsc *hfsc;
4689 struct hfsc_class *hc, *next;
4690
4691 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4692
4693 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4694 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4695 free(hc);
4696 }
4697
4698 tc_destroy(tc);
4699 free(hfsc);
4700 }
4701
4702 static int
4703 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4704 {
4705 const struct hfsc *hfsc;
4706 hfsc = hfsc_get__(netdev);
4707 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
4708 return 0;
4709 }
4710
4711 static int
4712 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4713 {
4714 int error;
4715 struct hfsc_class class;
4716
4717 hfsc_parse_qdisc_details__(netdev, details, &class);
4718 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4719 tc_make_handle(1, 0), &class);
4720
4721 if (!error) {
4722 hfsc_get__(netdev)->max_rate = class.max_rate;
4723 }
4724
4725 return error;
4726 }
4727
4728 static int
4729 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4730 const struct tc_queue *queue, struct smap *details)
4731 {
4732 const struct hfsc_class *hc;
4733
4734 hc = hfsc_class_cast__(queue);
4735 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4736 if (hc->min_rate != hc->max_rate) {
4737 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4738 }
4739 return 0;
4740 }
4741
4742 static int
4743 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4744 const struct smap *details)
4745 {
4746 int error;
4747 struct hfsc_class class;
4748
4749 error = hfsc_parse_class_details__(netdev, details, &class);
4750 if (error) {
4751 return error;
4752 }
4753
4754 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4755 tc_make_handle(1, 0xfffe), &class);
4756 if (error) {
4757 return error;
4758 }
4759
4760 hfsc_update_queue__(netdev, queue_id, &class);
4761 return 0;
4762 }
4763
4764 static int
4765 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4766 {
4767 int error;
4768 struct hfsc *hfsc;
4769 struct hfsc_class *hc;
4770
4771 hc = hfsc_class_cast__(queue);
4772 hfsc = hfsc_get__(netdev);
4773
4774 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4775 if (!error) {
4776 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4777 free(hc);
4778 }
4779 return error;
4780 }
4781
/* tc_ops callback: fetches kernel statistics for 'queue' (class
 * 1:<queue_id+1> under parent 1:0xfffe) into 'stats'. */
static int
hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
                     struct netdev_queue_stats *stats)
{
    return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
                              tc_make_handle(1, 0xfffe), NULL, stats);
}
4789
4790 static int
4791 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4792 const struct ofpbuf *nlmsg,
4793 netdev_dump_queue_stats_cb *cb, void *aux)
4794 {
4795 struct netdev_queue_stats stats;
4796 unsigned int handle, major, minor;
4797 int error;
4798
4799 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4800 if (error) {
4801 return error;
4802 }
4803
4804 major = tc_get_major(handle);
4805 minor = tc_get_minor(handle);
4806 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4807 (*cb)(minor - 1, &stats, aux);
4808 }
4809 return 0;
4810 }
4811
/* tc_ops vtable for the "linux-hfsc" QoS type.  Every callback is
 * implemented, so HFSC supports full queue manipulation and statistics. */
static const struct tc_ops tc_ops_hfsc = {
    .linux_name = "hfsc",
    .ovs_name = "linux-hfsc",
    .n_queues = HFSC_N_QUEUES,
    .tc_install = hfsc_tc_install,
    .tc_load = hfsc_tc_load,
    .tc_destroy = hfsc_tc_destroy,
    .qdisc_get = hfsc_qdisc_get,
    .qdisc_set = hfsc_qdisc_set,
    .class_get = hfsc_class_get,
    .class_set = hfsc_class_set,
    .class_delete = hfsc_class_delete,
    .class_get_stats = hfsc_class_get_stats,
    .class_dump_stats = hfsc_class_dump_stats,
};
4827 \f
4828 /* "linux-noop" traffic control class. */
4829
/* Points 'netdev_' at a shared, immutable placeholder tc object. */
static void
noop_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    /* NOTE(review): initialized with tc_ops_default rather than tc_ops_noop,
     * matching default_install__() — looks intentional since this tc is
     * never written to, but confirm. */
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    netdev->tc = CONST_CAST(struct tc *, &tc);
}
4838
/* tc_ops callback: "installing" the noop QoS type just records the shared
 * placeholder tc; 'details' is ignored.  Always succeeds. */
static int
noop_tc_install(struct netdev *netdev,
                const struct smap *details OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
4846
/* tc_ops callback: loading the noop QoS type records the shared placeholder
 * tc without inspecting the kernel state.  Always succeeds. */
static int
noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
4853
/* tc_ops vtable for the "linux-noop" QoS type.  No linux_name and no
 * class/qdisc callbacks: OVS neither configures nor queries the kernel. */
static const struct tc_ops tc_ops_noop = {
    .ovs_name = "linux-noop",               /* ovs_name */
    .tc_install = noop_tc_install,
    .tc_load = noop_tc_load,
};
4859 \f
4860 /* "linux-default" traffic control class.
4861 *
4862 * This class represents the default, unnamed Linux qdisc. It corresponds to
4863 * the "" (empty string) QoS type in the OVS database. */
4864
/* Points 'netdev_' at the shared, immutable tc for the default qdisc. */
static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
4875
/* tc_ops callback: "installing" the default QoS type just records the
 * shared placeholder tc; 'details' is ignored.  Always succeeds. */
static int
default_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
4883
/* tc_ops callback: loading the default QoS type records the shared
 * placeholder tc without inspecting the kernel state.  Always succeeds. */
static int
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
4890
/* tc_ops vtable for the default (empty-string) QoS type; see the section
 * comment above. */
static const struct tc_ops tc_ops_default = {
    .ovs_name = "",                         /* ovs_name */
    .tc_install = default_tc_install,
    .tc_load = default_tc_load,
};
4896 \f
/* "linux-other" traffic control class.
 *
 * Represents a qdisc of a kind that OVS does not recognize: it can be
 * reported (loaded) but not installed or configured. */
4900
/* tc_ops callback: records the shared placeholder tc for a qdisc whose kind
 * OVS does not recognize.  Always succeeds. */
static int
other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
    return 0;
}
4912
/* tc_ops vtable for the "linux-other" QoS type.  Only tc_load is provided:
 * an unrecognized qdisc can be acknowledged but not installed or
 * configured. */
static const struct tc_ops tc_ops_other = {
    .ovs_name = "linux-other",
    .tc_load = other_tc_load,
};
4917 \f
4918 /* Traffic control. */
4919
4920 /* Number of kernel "tc" ticks per second. */
4921 static double ticks_per_s;
4922
4923 /* Number of kernel "jiffies" per second. This is used for the purpose of
4924 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4925 * one jiffy's worth of data.
4926 *
4927 * There are two possibilities here:
4928 *
4929 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4930 * approximate range of 100 to 1024. That means that we really need to
4931 * make sure that the qdisc can buffer that much data.
4932 *
4933 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4934 * has finely granular timers and there's no need to fudge additional room
4935 * for buffers. (There's no extra effort needed to implement that: the
4936 * large 'buffer_hz' is used as a divisor, so practically any number will
4937 * come out as 0 in the division. Small integer results in the case of
4938 * really high dividends won't have any real effect anyhow.)
4939 */
4940 static unsigned int buffer_hz;
4941
4942 static struct tcmsg *
4943 netdev_linux_tc_make_request(const struct netdev *netdev, int type,
4944 unsigned int flags, struct ofpbuf *request)
4945 {
4946 int ifindex;
4947 int error;
4948
4949 error = get_ifindex(netdev, &ifindex);
4950 if (error) {
4951 return NULL;
4952 }
4953
4954 return tc_make_request(ifindex, type, flags, request);
4955 }
4956
4957 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4958 * of 'kbits_burst'.
4959 *
4960 * This function is equivalent to running:
4961 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4962 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4963 * mtu 65535 drop
4964 *
4965 * The configuration and stats may be seen with the following command:
4966 * /sbin/tc -s filter show dev <devname> parent ffff:
4967 *
4968 * Returns 0 if successful, otherwise a positive errno value.
4969 */
4970 static int
4971 tc_add_policer(struct netdev *netdev,
4972 uint32_t kbits_rate, uint32_t kbits_burst)
4973 {
4974 struct tc_police tc_police;
4975 struct ofpbuf request;
4976 struct tcmsg *tcmsg;
4977 size_t basic_offset;
4978 size_t police_offset;
4979 int error;
4980 int mtu = 65535;
4981
4982 memset(&tc_police, 0, sizeof tc_police);
4983 tc_police.action = TC_POLICE_SHOT;
4984 tc_police.mtu = mtu;
4985 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4986
4987 /* The following appears wrong in one way: In networking a kilobit is
4988 * usually 1000 bits but this uses 1024 bits.
4989 *
4990 * However if you "fix" those problems then "tc filter show ..." shows
4991 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4992 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4993 * tc's point of view. Whatever. */
4994 tc_police.burst = tc_bytes_to_ticks(
4995 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
4996
4997 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
4998 NLM_F_EXCL | NLM_F_CREATE, &request);
4999 if (!tcmsg) {
5000 return ENODEV;
5001 }
5002 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
5003 tcmsg->tcm_info = tc_make_handle(49,
5004 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
5005
5006 nl_msg_put_string(&request, TCA_KIND, "basic");
5007 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
5008 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
5009 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
5010 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
5011 nl_msg_end_nested(&request, police_offset);
5012 nl_msg_end_nested(&request, basic_offset);
5013
5014 error = tc_transact(&request, NULL);
5015 if (error) {
5016 return error;
5017 }
5018
5019 return 0;
5020 }
5021
/* Reads /proc/net/psched once (thread-safe) and initializes the global
 * 'ticks_per_s' and 'buffer_hz' from it, falling back to 1.0 and 100 on any
 * failure. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *   (Before that, there are hints that it was 1000000000.)
     *
     * - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *   above.
     *
     *                         /proc/net/psched
     *     -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d ticks_per_s     buffer_hz
     *     ------- --------- ---------- ------------- ----------- -------------
     * [1] 819,200 1,000,000  1,000,000           100     819,200           100
     * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [4]   1,000     1,024  1,000,000           100     976,562           100
     * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
     * [6]   1,000        64  1,000,000           249  15,625,000           249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    if (!ovsthread_once_start(&once)) {
        return;
    }

    /* Defaults used if the file is missing or malformed. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    if (!a || !b || !c) {
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
5104
/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
 * rate of 'rate' bytes per second.  The result is truncated toward zero. */
static unsigned int
tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
{
    read_psched();              /* Ensures ticks_per_s is initialized. */
    return (rate * ticks) / ticks_per_s;
}
5113
/* Returns the number of ticks that it would take to transmit 'size' bytes at a
 * rate of 'rate' bytes per second, or 0 when 'rate' is zero (avoiding a
 * division by zero). */
static unsigned int
tc_bytes_to_ticks(unsigned int rate, unsigned int size)
{
    read_psched();              /* Ensures ticks_per_s is initialized. */
    return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
}
5122
/* Returns the number of bytes that need to be reserved for qdisc buffering at
 * a transmission rate of 'rate' bytes per second, i.e. one jiffy's worth of
 * data (see the comment on 'buffer_hz' above). */
static unsigned int
tc_buffer_per_jiffy(unsigned int rate)
{
    read_psched();              /* Ensures buffer_hz is initialized. */
    return rate / buffer_hz;
}
5131
/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
 * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
 * stores NULL into it if it is absent.
 *
 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
 * On failure both outputs are set to NULL.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
               struct nlattr **options)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    /* Attributes start after the netlink and tcmsg headers. */
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");
        goto error;
    }

    if (kind) {
        *kind = nl_attr_get_string(ta[TCA_KIND]);
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    return 0;

error:
    if (kind) {
        *kind = NULL;
    }
    if (options) {
        *options = NULL;
    }
    return EPROTO;
}
5176
/* Given Netlink 'msg' that describes a class, extracts the class's full
 * handle (tcm_handle, i.e. major:minor — not just the queue ID) into
 * '*handlep', its TCA_OPTIONS attribute into '*options', and its queue
 * statistics into '*stats'.  Any of the output arguments may be null.
 *
 * On failure '*options' is set to NULL and '*stats' is zeroed.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        /* Queue drops are the closest thing available to "tx errors". */
        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
5251
/* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev'.  On success stores the kernel's echoed reply in '*replyp',
 * which the caller must free with ofpbuf_delete().
 *
 * Returns 0 if successful, otherwise a positive errno value (also logged,
 * rate-limited). */
static int
tc_query_class(const struct netdev *netdev,
               unsigned int handle, unsigned int parent,
               struct ofpbuf **replyp)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    /* NLM_F_ECHO asks the kernel to send the class back in its reply. */
    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    error = tc_transact(&request, replyp);
    if (error) {
        VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     ovs_strerror(error));
    }
    return error;
}
5281
5282 /* Equivalent to "tc class del dev <name> handle <handle>". */
5283 static int
5284 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5285 {
5286 struct ofpbuf request;
5287 struct tcmsg *tcmsg;
5288 int error;
5289
5290 tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5291 if (!tcmsg) {
5292 return ENODEV;
5293 }
5294 tcmsg->tcm_handle = handle;
5295 tcmsg->tcm_parent = 0;
5296
5297 error = tc_transact(&request, NULL);
5298 if (error) {
5299 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5300 netdev_get_name(netdev),
5301 tc_get_major(handle), tc_get_minor(handle),
5302 ovs_strerror(error));
5303 }
5304 return error;
5305 }
5306
/* Equivalent to "tc qdisc del dev <name> root".
 *
 * On success, also discards the cached 'tc' state on 'netdev_' so that a
 * later tc_query_qdisc() re-reads the qdisc from the kernel.  Returns 0 if
 * successful, otherwise a positive errno value. */
static int
tc_del_qdisc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* Delete the root qdisc (handle 1:0 is the handle this module assigns to
     * the qdiscs it creates; see tc_query_qdisc()). */
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    error = tc_transact(&request, NULL);
    if (error == EINVAL) {
        /* EINVAL probably means that the default qdisc was in use, in which
         * case we've accomplished our purpose. */
        error = 0;
    }
    if (!error && netdev->tc) {
        /* Invalidate the cached qdisc state, letting the tc implementation
         * release its private data first if it defines a destructor. */
        if (netdev->tc->ops->tc_destroy) {
            netdev->tc->ops->tc_destroy(netdev->tc);
        }
        netdev->tc = NULL;
    }
    return error;
}
5337
5338 static bool
5339 getqdisc_is_safe(void)
5340 {
5341 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5342 static bool safe = false;
5343
5344 if (ovsthread_once_start(&once)) {
5345 struct utsname utsname;
5346 int major, minor;
5347
5348 if (uname(&utsname) == -1) {
5349 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5350 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5351 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5352 } else if (major < 2 || (major == 2 && minor < 35)) {
5353 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5354 utsname.release);
5355 } else {
5356 safe = true;
5357 }
5358 ovsthread_once_done(&once);
5359 }
5360 return safe;
5361 }
5362
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value.
 *
 * As a side effect, on success instantiates 'netdev->tc' from the matching
 * 'struct tc_ops' (falling back to tc_ops_other for qdiscs this module does
 * not understand, or tc_ops_default for the kernel's built-in qdisc). */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    /* Already cached: nothing to do. */
    if (netdev->tc) {
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            /* Unparseable reply: treat it as a qdisc we don't understand
             * rather than failing the whole query. */
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it.  tc_load() sets netdev->tc exactly when it succeeds,
     * which the assertion below double-checks. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
5442
5443 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5444 approximate the time to transmit packets of various lengths. For an MTU of
5445 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5446 represents two possible packet lengths; for a MTU of 513 through 1024, four
5447 possible lengths; and so on.
5448
5449 Returns, for the specified 'mtu', the number of bits that packet lengths
5450 need to be shifted right to fit within such a 256-entry table. */
5451 static int
5452 tc_calc_cell_log(unsigned int mtu)
5453 {
5454 int cell_log;
5455
5456 if (!mtu) {
5457 mtu = ETH_PAYLOAD_MAX;
5458 }
5459 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5460
5461 for (cell_log = 0; mtu >= 256; cell_log++) {
5462 mtu >>= 1;
5463 }
5464
5465 return cell_log;
5466 }
5467
/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'.
 *
 * NOTE(review): the kernel's tc_ratespec.rate field is 32 bits wide, so a
 * 'Bps' of 2**32 or more is truncated here -- confirm that callers cap the
 * rate accordingly. */
static void
tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
{
    memset(rate, 0, sizeof *rate);
    rate->cell_log = tc_calc_cell_log(mtu);
    /* rate->overhead = 0; */           /* New in 2.6.24, not yet in some */
    /* rate->cell_align = 0; */         /* distro headers. */
    /* 'mpu' is the floor applied to per-packet sizes when building the rtab
     * (see tc_put_rtab()). */
    rate->mpu = ETH_TOTAL_MIN;
    rate->rate = Bps;
}
5480
/* Appends to 'msg' an "rtab" table, as a Netlink attribute of the given
 * 'type', describing the per-size transmit times implied by 'rate'.
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *ticks;
    unsigned int idx;

    ticks = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (idx = 0; idx < TC_RTAB_SIZE / sizeof *ticks; idx++) {
        /* Entry 'idx' covers packets up to this many bytes, but never bill
         * for less than the minimum packet unit. */
        unsigned int size = (idx + 1) << rate->cell_log;

        ticks[idx] = tc_bytes_to_ticks(rate->rate,
                                       size < rate->mpu ? rate->mpu : size);
    }
}
5500
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* The burst must cover at least one jiffy's worth of traffic plus a full
     * packet; anything smaller the kernel could not honor. */
    unsigned int floor = tc_buffer_per_jiffy(Bps) + mtu;

    return tc_bytes_to_ticks(Bps, burst_bytes > floor ? burst_bytes : floor);
}
5511 \f
5512 /* Linux-only functions declared in netdev-linux.h */
5513
/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'.  If
 * 'enable' is true, the bit is set.  Otherwise, it is cleared.
 *
 * Returns 0 if successful (including when the flag already had the desired
 * value), EOPNOTSUPP if the device accepted the change but did not actually
 * apply it, or another positive errno value on error. */
int
netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
                              const char *flag_name, bool enable)
{
    const char *netdev_name = netdev_get_name(netdev);
    struct ethtool_value evalue;
    uint32_t new_flags;
    int error;

    /* Step 1: read the device's current ethtool flags. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    /* Step 2: compute the desired flag word; skip the write if the device
     * already has it. */
    COVERAGE_INC(netdev_set_ethtool);
    new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
    if (new_flags == evalue.data) {
        return 0;
    }
    evalue.data = new_flags;
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
    if (error) {
        return error;
    }

    /* Step 3: re-read the flags to verify the change took effect --
     * presumably some drivers accept ETHTOOL_SFLAGS without applying every
     * bit, hence the explicit check below. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    if (new_flags != evalue.data) {
        VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
                     "device %s failed", enable ? "enable" : "disable",
                     flag_name, netdev_name);
        return EOPNOTSUPP;
    }

    return 0;
}
5565 \f
5566 /* Utility functions. */
5567
5568 /* Copies 'src' into 'dst', performing format conversion in the process. */
5569 static void
5570 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5571 const struct rtnl_link_stats *src)
5572 {
5573 dst->rx_packets = src->rx_packets;
5574 dst->tx_packets = src->tx_packets;
5575 dst->rx_bytes = src->rx_bytes;
5576 dst->tx_bytes = src->tx_bytes;
5577 dst->rx_errors = src->rx_errors;
5578 dst->tx_errors = src->tx_errors;
5579 dst->rx_dropped = src->rx_dropped;
5580 dst->tx_dropped = src->tx_dropped;
5581 dst->multicast = src->multicast;
5582 dst->collisions = src->collisions;
5583 dst->rx_length_errors = src->rx_length_errors;
5584 dst->rx_over_errors = src->rx_over_errors;
5585 dst->rx_crc_errors = src->rx_crc_errors;
5586 dst->rx_frame_errors = src->rx_frame_errors;
5587 dst->rx_fifo_errors = src->rx_fifo_errors;
5588 dst->rx_missed_errors = src->rx_missed_errors;
5589 dst->tx_aborted_errors = src->tx_aborted_errors;
5590 dst->tx_carrier_errors = src->tx_carrier_errors;
5591 dst->tx_fifo_errors = src->tx_fifo_errors;
5592 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5593 dst->tx_window_errors = src->tx_window_errors;
5594 }
5595
5596 /* Copies 'src' into 'dst', performing format conversion in the process. */
5597 static void
5598 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5599 const struct rtnl_link_stats64 *src)
5600 {
5601 dst->rx_packets = src->rx_packets;
5602 dst->tx_packets = src->tx_packets;
5603 dst->rx_bytes = src->rx_bytes;
5604 dst->tx_bytes = src->tx_bytes;
5605 dst->rx_errors = src->rx_errors;
5606 dst->tx_errors = src->tx_errors;
5607 dst->rx_dropped = src->rx_dropped;
5608 dst->tx_dropped = src->tx_dropped;
5609 dst->multicast = src->multicast;
5610 dst->collisions = src->collisions;
5611 dst->rx_length_errors = src->rx_length_errors;
5612 dst->rx_over_errors = src->rx_over_errors;
5613 dst->rx_crc_errors = src->rx_crc_errors;
5614 dst->rx_frame_errors = src->rx_frame_errors;
5615 dst->rx_fifo_errors = src->rx_fifo_errors;
5616 dst->rx_missed_errors = src->rx_missed_errors;
5617 dst->tx_aborted_errors = src->tx_aborted_errors;
5618 dst->tx_carrier_errors = src->tx_carrier_errors;
5619 dst->tx_fifo_errors = src->tx_fifo_errors;
5620 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5621 dst->tx_window_errors = src->tx_window_errors;
5622 }
5623
/* Retrieves statistics for 'netdev_' into 'stats' by issuing an RTM_GETLINK
 * request, preferring the 64-bit IFLA_STATS64 attribute and falling back to
 * the 32-bit IFLA_STATS.  Returns 0 if successful, otherwise a positive
 * errno value. */
static int
get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    int error;

    /* Filtering all counters by default */
    /* (All-ones presumably marks counters the kernel did not report --
     * NOTE(review): confirm against how callers interpret 'stats'.) */
    memset(stats, 0xFF, sizeof(struct netdev_stats));

    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
                        RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        return error;
    }

    /* Skip past the netlink and ifinfomsg headers to reach the attributes. */
    if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
        const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
        if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
            netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
            error = 0;
        } else {
            /* No (or truncated) 64-bit stats; try the 32-bit variant. */
            a = nl_attr_find(reply, 0, IFLA_STATS);
            if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
                netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
                error = 0;
            } else {
                VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
                error = EPROTO;
            }
        }
    } else {
        VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
        error = EPROTO;
    }


    ofpbuf_delete(reply);
    return error;
}
5670
5671 static int
5672 get_flags(const struct netdev *dev, unsigned int *flags)
5673 {
5674 struct ifreq ifr;
5675 int error;
5676
5677 *flags = 0;
5678 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5679 if (!error) {
5680 *flags = ifr.ifr_flags;
5681 }
5682 return error;
5683 }
5684
5685 static int
5686 set_flags(const char *name, unsigned int flags)
5687 {
5688 struct ifreq ifr;
5689
5690 ifr.ifr_flags = flags;
5691 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5692 }
5693
5694 int
5695 linux_get_ifindex(const char *netdev_name)
5696 {
5697 struct ifreq ifr;
5698 int error;
5699
5700 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5701 COVERAGE_INC(netdev_get_ifindex);
5702
5703 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5704 if (error) {
5705 /* ENODEV probably means that a vif disappeared asynchronously and
5706 * hasn't been removed from the database yet, so reduce the log level
5707 * to INFO for that case. */
5708 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
5709 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5710 netdev_name, ovs_strerror(error));
5711 return -error;
5712 }
5713 return ifr.ifr_ifindex;
5714 }
5715
5716 static int
5717 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5718 {
5719 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5720
5721 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5722 netdev_linux_update_via_netlink(netdev);
5723 }
5724
5725 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5726 /* Fall back to ioctl if netlink fails */
5727 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
5728
5729 if (ifindex < 0) {
5730 netdev->get_ifindex_error = -ifindex;
5731 netdev->ifindex = 0;
5732 } else {
5733 netdev->get_ifindex_error = 0;
5734 netdev->ifindex = ifindex;
5735 }
5736 netdev->cache_valid |= VALID_IFINDEX;
5737 }
5738
5739 *ifindexp = netdev->ifindex;
5740 return netdev->get_ifindex_error;
5741 }
5742
/* Refreshes the state cached on 'netdev' (interface flags, MTU, Ethernet
 * address, ifindex, LAG membership) from a single RTM_GETLINK round trip,
 * bumping the netdev's change sequence number if anything actually changed.
 * Returns 0 if successful, otherwise a positive errno value. */
static int
netdev_linux_update_via_netlink(struct netdev_linux *netdev)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct rtnetlink_change chg;
    struct rtnetlink_change *change = &chg;
    int error;

    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
                        RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));

    /* The correct identifiers for a Linux device are netnsid and ifindex,
     * but ifindex changes as the port is moved to another network namespace
     * and the interface name statically stored in ovsdb. */
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
    if (netdev_linux_netnsid_is_remote(netdev)) {
        nl_msg_push_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
    }
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        ofpbuf_delete(reply);
        return error;
    }

    if (rtnetlink_parse(reply, change)
        && change->nlmsg_type == RTM_NEWLINK) {
        bool changed = false;
        error = 0;

        /* Update netdev from rtnl msg and increment its seq if needed. */
        if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
            /* IFF_RUNNING toggled: the link went up or down, so count a
             * carrier reset. */
            netdev->carrier_resets++;
            changed = true;
        }
        if (change->ifi_flags != netdev->ifi_flags) {
            netdev->ifi_flags = change->ifi_flags;
            changed = true;
        }
        /* A zero MTU or all-zero MAC in the reply means the attribute was
         * absent, so leave the cached value alone in those cases. */
        if (change->mtu && change->mtu != netdev->mtu) {
            netdev->mtu = change->mtu;
            netdev->cache_valid |= VALID_MTU;
            netdev->netdev_mtu_error = 0;
            changed = true;
        }
        if (!eth_addr_is_zero(change->mac)
            && !eth_addr_equals(change->mac, netdev->etheraddr)) {
            netdev->etheraddr = change->mac;
            netdev->cache_valid |= VALID_ETHERADDR;
            netdev->ether_addr_error = 0;
            changed = true;
        }
        if (change->if_index != netdev->ifindex) {
            netdev->ifindex = change->if_index;
            netdev->cache_valid |= VALID_IFINDEX;
            netdev->get_ifindex_error = 0;
            changed = true;
        }
        if (change->master && netdev_linux_kind_is_lag(change->master)) {
            netdev->is_lag_master = true;
        }
        if (changed) {
            netdev_change_seq_changed(&netdev->up);
        }
    } else {
        /* Reply did not parse as an RTM_NEWLINK message. */
        error = EINVAL;
    }

    ofpbuf_delete(reply);
    return error;
}
5818
5819 static int
5820 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5821 {
5822 struct ifreq ifr;
5823 int hwaddr_family;
5824 int error;
5825
5826 memset(&ifr, 0, sizeof ifr);
5827 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5828 COVERAGE_INC(netdev_get_hwaddr);
5829 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5830 if (error) {
5831 /* ENODEV probably means that a vif disappeared asynchronously and
5832 * hasn't been removed from the database yet, so reduce the log level
5833 * to INFO for that case. */
5834 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5835 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5836 netdev_name, ovs_strerror(error));
5837 return error;
5838 }
5839 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5840 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
5841 hwaddr_family != ARPHRD_NONE) {
5842 VLOG_INFO("%s device has unknown hardware address family %d",
5843 netdev_name, hwaddr_family);
5844 return EINVAL;
5845 }
5846 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5847 return 0;
5848 }
5849
5850 static int
5851 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5852 {
5853 struct ifreq ifr;
5854 int error;
5855
5856 memset(&ifr, 0, sizeof ifr);
5857 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5858 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5859 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5860 COVERAGE_INC(netdev_set_hwaddr);
5861 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5862 if (error) {
5863 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5864 netdev_name, ovs_strerror(error));
5865 }
5866 return error;
5867 }
5868
5869 static int
5870 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5871 int cmd, const char *cmd_name)
5872 {
5873 struct ifreq ifr;
5874 int error;
5875
5876 memset(&ifr, 0, sizeof ifr);
5877 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5878 ifr.ifr_data = (caddr_t) ecmd;
5879
5880 ecmd->cmd = cmd;
5881 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5882 if (error) {
5883 if (error != EOPNOTSUPP) {
5884 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5885 "failed: %s", cmd_name, name, ovs_strerror(error));
5886 } else {
5887 /* The device doesn't support this operation. That's pretty
5888 * common, so there's no point in logging anything. */
5889 }
5890 }
5891 return error;
5892 }
5893
5894 /* Returns an AF_PACKET raw socket or a negative errno value. */
5895 static int
5896 af_packet_sock(void)
5897 {
5898 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5899 static int sock;
5900
5901 if (ovsthread_once_start(&once)) {
5902 sock = socket(AF_PACKET, SOCK_RAW, 0);
5903 if (sock >= 0) {
5904 int error = set_nonblocking(sock);
5905 if (error) {
5906 close(sock);
5907 sock = -error;
5908 }
5909 } else {
5910 sock = -errno;
5911 VLOG_ERR("failed to create packet socket: %s",
5912 ovs_strerror(errno));
5913 }
5914 ovsthread_once_done(&once);
5915 }
5916
5917 return sock;
5918 }