/* lib/netdev-linux.c -- Linux-specific netdev implementation (Open vSwitch). */
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <sys/types.h>
24 #include <netinet/in.h>
25 #include <arpa/inet.h>
26 #include <inttypes.h>
27 #include <linux/filter.h>
28 #include <linux/gen_stats.h>
29 #include <linux/if_ether.h>
30 #include <linux/if_tun.h>
31 #include <linux/types.h>
32 #include <linux/ethtool.h>
33 #include <linux/mii.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/ioctl.h>
37 #include <sys/socket.h>
38 #include <sys/utsname.h>
39 #include <netpacket/packet.h>
40 #include <net/if.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <poll.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <unistd.h>
48
49 #include "coverage.h"
50 #include "dp-packet.h"
51 #include "dpif-netlink.h"
52 #include "dpif-netdev.h"
53 #include "openvswitch/dynamic-string.h"
54 #include "fatal-signal.h"
55 #include "hash.h"
56 #include "openvswitch/hmap.h"
57 #include "netdev-provider.h"
58 #include "netdev-tc-offloads.h"
59 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
62 #include "netlink.h"
63 #include "netnsid.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
67 #include "packets.h"
68 #include "openvswitch/poll-loop.h"
69 #include "rtnetlink.h"
70 #include "openvswitch/shash.h"
71 #include "socket-util.h"
72 #include "sset.h"
73 #include "tc.h"
74 #include "timer.h"
75 #include "unaligned.h"
76 #include "openvswitch/vlog.h"
77 #include "util.h"
78
79 VLOG_DEFINE_THIS_MODULE(netdev_linux);
80
81 COVERAGE_DEFINE(netdev_set_policing);
82 COVERAGE_DEFINE(netdev_arp_lookup);
83 COVERAGE_DEFINE(netdev_get_ifindex);
84 COVERAGE_DEFINE(netdev_get_hwaddr);
85 COVERAGE_DEFINE(netdev_set_hwaddr);
86 COVERAGE_DEFINE(netdev_get_ethtool);
87 COVERAGE_DEFINE(netdev_set_ethtool);
88
89 \f
90 #ifndef IFLA_IF_NETNSID
91 #define IFLA_IF_NETNSID 0x45
92 #endif
93 /* These were introduced in Linux 2.6.14, so they might be missing if we have
94 * old headers. */
95 #ifndef ADVERTISED_Pause
96 #define ADVERTISED_Pause (1 << 13)
97 #endif
98 #ifndef ADVERTISED_Asym_Pause
99 #define ADVERTISED_Asym_Pause (1 << 14)
100 #endif
101
102 /* These were introduced in Linux 2.6.24, so they might be missing if we
103 * have old headers. */
104 #ifndef ETHTOOL_GFLAGS
105 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
106 #endif
107 #ifndef ETHTOOL_SFLAGS
108 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
109 #endif
110
111 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
112 * headers. */
113 #ifndef TC_RTAB_SIZE
114 #define TC_RTAB_SIZE 1024
115 #endif
116
117 /* Linux 2.6.21 introduced struct tpacket_auxdata.
118 * Linux 2.6.27 added the tp_vlan_tci member.
119 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
120 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
121 * TP_STATUS_VLAN_TPID_VALID.
122 *
123 * With all this churn it's easiest to unconditionally define a replacement
124 * structure that has everything we want.
125 */
126 #ifndef PACKET_AUXDATA
127 #define PACKET_AUXDATA 8
128 #endif
129 #ifndef TP_STATUS_VLAN_VALID
130 #define TP_STATUS_VLAN_VALID (1 << 4)
131 #endif
132 #ifndef TP_STATUS_VLAN_TPID_VALID
133 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
134 #endif
/* Shadow any kernel-provided definition with our own so that every member we
 * need -- including the VLAN fields added over several kernel releases (see
 * the history above) -- is always present. */
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;    /* TP_STATUS_* bits, e.g. TP_STATUS_VLAN_VALID. */
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;  /* Valid only if TP_STATUS_VLAN_VALID is set. */
    uint16_t tp_vlan_tpid; /* Valid only if TP_STATUS_VLAN_TPID_VALID is set. */
};
146
147 /* Linux 2.6.27 introduced ethtool_cmd_speed
148 *
149 * To avoid revisiting problems reported with using configure to detect
150 * compatibility (see report at
151 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
152 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Returns the link speed encoded in 'ep': the low 16 bits come from 'speed'
 * and the high 16 bits from 'speed_hi'. */
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    uint32_t hi = ep->speed_hi;
    uint32_t lo = ep->speed;

    return (hi << 16) | lo;
}
158
159 /* Linux 2.6.30 introduced supported and advertised flags for
160 * 1G base KX, and 10G base KX4, KR and R. */
161 #ifndef SUPPORTED_1000baseKX_Full
162 #define SUPPORTED_1000baseKX_Full (1 << 17)
163 #define SUPPORTED_10000baseKX4_Full (1 << 18)
164 #define SUPPORTED_10000baseKR_Full (1 << 19)
165 #define SUPPORTED_10000baseR_FEC (1 << 20)
166 #define ADVERTISED_1000baseKX_Full (1 << 17)
167 #define ADVERTISED_10000baseKX4_Full (1 << 18)
168 #define ADVERTISED_10000baseKR_Full (1 << 19)
169 #define ADVERTISED_10000baseR_FEC (1 << 20)
170 #endif
171
172 /* Linux 3.5 introduced supported and advertised flags for
173 * 40G base KR4, CR4, SR4 and LR4. */
174 #ifndef SUPPORTED_40000baseKR4_Full
175 #define SUPPORTED_40000baseKR4_Full (1 << 23)
176 #define SUPPORTED_40000baseCR4_Full (1 << 24)
177 #define SUPPORTED_40000baseSR4_Full (1 << 25)
178 #define SUPPORTED_40000baseLR4_Full (1 << 26)
179 #define ADVERTISED_40000baseKR4_Full (1 << 23)
180 #define ADVERTISED_40000baseCR4_Full (1 << 24)
181 #define ADVERTISED_40000baseSR4_Full (1 << 25)
182 #define ADVERTISED_40000baseLR4_Full (1 << 26)
183 #endif
184
185 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
186 *
187 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
188 * 2.6.32-431.29.2.el6.x86_64 (see report at
189 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
190 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
191 * unconditionally define a replacement. */
#ifndef IFLA_STATS64
#define IFLA_STATS64 23
#endif
#define rtnl_link_stats64 rpl_rtnl_link_stats64
/* Local, unconditional mirror of the kernel's 64-bit link statistics
 * structure (see the comment above for why we do not rely on the kernel
 * headers).  Field names follow the kernel's struct rtnl_link_stats64. */
struct rtnl_link_stats64 {
    /* Aggregate packet, byte, error and drop counters. */
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Detailed receive-error breakdown. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Detailed transmit-error breakdown. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    /* Counters for compressed packets (e.g. on SLIP/PPP links --
     * presumably; confirm against <linux/if_link.h>). */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
224
/* Bits for struct netdev_linux's 'cache_valid' member: each flag records that
 * the corresponding cached field (or group of fields) was fetched
 * successfully and is still current. */
enum {
    VALID_IFINDEX           = 1 << 0,  /* 'ifindex'. */
    VALID_ETHERADDR         = 1 << 1,  /* 'etheraddr'. */
    VALID_IN                = 1 << 2,  /* Cached IPv4/IPv6 addresses. */
    VALID_MTU               = 1 << 3,  /* 'mtu'. */
    VALID_POLICING          = 1 << 4,  /* 'kbits_rate', 'kbits_burst'. */
    VALID_VPORT_STAT_ERROR  = 1 << 5,  /* 'vport_stats_error'. */
    VALID_DRVINFO           = 1 << 6,  /* 'drvinfo'. */
    VALID_FEATURES          = 1 << 7,  /* 'current', 'advertised', 'supported'. */
};
235 \f
/* State tracked for one slave of a Linux LAG (bond/team) whose master is a
 * netdev open in OvS; used to keep the slave's ingress qdisc bound to the
 * master's tc block (see netdev_linux_update_lag()). */
struct linux_lag_slave {
    uint32_t block_id;          /* tc block shared with the LAG master. */
    struct shash_node *node;    /* This slave's entry in 'lag_shash'. */
};
240
241 /* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
242 static struct ovs_mutex lag_mutex = OVS_MUTEX_INITIALIZER;
243
244 /* All slaves whose LAG masters are network devices in OvS. */
245 static struct shash lag_shash OVS_GUARDED_BY(lag_mutex)
246 = SHASH_INITIALIZER(&lag_shash);
247
248 /* Traffic control. */
249
250 /* An instance of a traffic control class. Always associated with a particular
251 * network device.
252 *
253 * Each TC implementation subclasses this with whatever additional data it
254 * needs. */
255 struct tc {
256 const struct tc_ops *ops;
257 struct hmap queues; /* Contains "struct tc_queue"s.
258 * Read by generic TC layer.
259 * Written only by TC implementation. */
260 };
261
262 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
263
264 /* One traffic control queue.
265 *
266 * Each TC implementation subclasses this with whatever additional data it
267 * needs. */
268 struct tc_queue {
269 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
270 unsigned int queue_id; /* OpenFlow queue ID. */
271 long long int created; /* Time queue was created, in msecs. */
272 };
273
274 /* A particular kind of traffic control. Each implementation generally maps to
275 * one particular Linux qdisc class.
276 *
277 * The functions below return 0 if successful or a positive errno value on
278 * failure, except where otherwise noted. All of them must be provided, except
279 * where otherwise noted. */
280 struct tc_ops {
281 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
282 * This is null for tc_ops_default and tc_ops_other, for which there are no
283 * appropriate values. */
284 const char *linux_name;
285
286 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
287 const char *ovs_name;
288
289 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
290 * queues. The queues are numbered 0 through n_queues - 1. */
291 unsigned int n_queues;
292
293 /* Called to install this TC class on 'netdev'. The implementation should
294 * make the Netlink calls required to set up 'netdev' with the right qdisc
295 * and configure it according to 'details'. The implementation may assume
296 * that the current qdisc is the default; that is, there is no need for it
297 * to delete the current qdisc before installing itself.
298 *
299 * The contents of 'details' should be documented as valid for 'ovs_name'
300 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
301 * (which is built as ovs-vswitchd.conf.db(8)).
302 *
303 * This function must return 0 if and only if it sets 'netdev->tc' to an
304 * initialized 'struct tc'.
305 *
306 * (This function is null for tc_ops_other, which cannot be installed. For
307 * other TC classes it should always be nonnull.) */
308 int (*tc_install)(struct netdev *netdev, const struct smap *details);
309
310 /* Called when the netdev code determines (through a Netlink query) that
311 * this TC class's qdisc is installed on 'netdev', but we didn't install
312 * it ourselves and so don't know any of the details.
313 *
314 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
315 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
316 * implementation should parse the other attributes of 'nlmsg' as
317 * necessary to determine its configuration. If necessary it should also
318 * use Netlink queries to determine the configuration of queues on
319 * 'netdev'.
320 *
321 * This function must return 0 if and only if it sets 'netdev->tc' to an
322 * initialized 'struct tc'. */
323 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
324
325 /* Destroys the data structures allocated by the implementation as part of
326 * 'tc'. (This includes destroying 'tc->queues' by calling
327 * tc_destroy(tc).
328 *
329 * The implementation should not need to perform any Netlink calls. If
330 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
331 * (But it may not be desirable.)
332 *
333 * This function may be null if 'tc' is trivial. */
334 void (*tc_destroy)(struct tc *tc);
335
336 /* Retrieves details of 'netdev->tc' configuration into 'details'.
337 *
338 * The implementation should not need to perform any Netlink calls, because
339 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
340 * cached the configuration.
341 *
342 * The contents of 'details' should be documented as valid for 'ovs_name'
343 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
344 * (which is built as ovs-vswitchd.conf.db(8)).
345 *
346 * This function may be null if 'tc' is not configurable.
347 */
348 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
349
350 /* Reconfigures 'netdev->tc' according to 'details', performing any
351 * required Netlink calls to complete the reconfiguration.
352 *
353 * The contents of 'details' should be documented as valid for 'ovs_name'
354 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
355 * (which is built as ovs-vswitchd.conf.db(8)).
356 *
357 * This function may be null if 'tc' is not configurable.
358 */
359 int (*qdisc_set)(struct netdev *, const struct smap *details);
360
361 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
362 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
363 *
364 * The contents of 'details' should be documented as valid for 'ovs_name'
365 * in the "other_config" column in the "Queue" table in
366 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
367 *
368 * The implementation should not need to perform any Netlink calls, because
369 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
370 * cached the queue configuration.
371 *
372 * This function may be null if 'tc' does not have queues ('n_queues' is
373 * 0). */
374 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
375 struct smap *details);
376
377 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
378 * 'details', perfoming any required Netlink calls to complete the
379 * reconfiguration. The caller ensures that 'queue_id' is less than
380 * 'n_queues'.
381 *
382 * The contents of 'details' should be documented as valid for 'ovs_name'
383 * in the "other_config" column in the "Queue" table in
384 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
385 *
386 * This function may be null if 'tc' does not have queues or its queues are
387 * not configurable. */
388 int (*class_set)(struct netdev *, unsigned int queue_id,
389 const struct smap *details);
390
391 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
392 * tc_queue's within 'netdev->tc->queues'.
393 *
394 * This function may be null if 'tc' does not have queues or its queues
395 * cannot be deleted. */
396 int (*class_delete)(struct netdev *, struct tc_queue *queue);
397
398 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
399 * 'struct tc_queue's within 'netdev->tc->queues'.
400 *
401 * On success, initializes '*stats'.
402 *
403 * This function may be null if 'tc' does not have queues or if it cannot
404 * report queue statistics. */
405 int (*class_get_stats)(const struct netdev *netdev,
406 const struct tc_queue *queue,
407 struct netdev_queue_stats *stats);
408
409 /* Extracts queue stats from 'nlmsg', which is a response to a
410 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
411 *
412 * This function may be null if 'tc' does not have queues or if it cannot
413 * report queue statistics. */
414 int (*class_dump_stats)(const struct netdev *netdev,
415 const struct ofpbuf *nlmsg,
416 netdev_dump_queue_stats_cb *cb, void *aux);
417 };
418
419 static void
420 tc_init(struct tc *tc, const struct tc_ops *ops)
421 {
422 tc->ops = ops;
423 hmap_init(&tc->queues);
424 }
425
/* Releases the generic part of 'tc' (the 'queues' hmap).  TC implementations'
 * 'tc_destroy' callbacks are expected to call this after freeing their own
 * data (see struct tc_ops). */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
431
432 static const struct tc_ops tc_ops_htb;
433 static const struct tc_ops tc_ops_hfsc;
434 static const struct tc_ops tc_ops_codel;
435 static const struct tc_ops tc_ops_fqcodel;
436 static const struct tc_ops tc_ops_sfq;
437 static const struct tc_ops tc_ops_default;
438 static const struct tc_ops tc_ops_noop;
439 static const struct tc_ops tc_ops_other;
440
441 static const struct tc_ops *const tcs[] = {
442 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
443 &tc_ops_hfsc, /* Hierarchical fair service curve. */
444 &tc_ops_codel, /* Controlled delay */
445 &tc_ops_fqcodel, /* Fair queue controlled delay */
446 &tc_ops_sfq, /* Stochastic fair queueing */
447 &tc_ops_noop, /* Non operating qos type. */
448 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
449 &tc_ops_other, /* Some other qdisc. */
450 NULL
451 };
452
453 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
454 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
455 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
456
457 static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
458 int type,
459 unsigned int flags,
460 struct ofpbuf *);
461 static int tc_add_policer(struct netdev *,
462 uint32_t kbits_rate, uint32_t kbits_burst);
463
464 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
465 struct nlattr **options);
466 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
467 struct nlattr **options,
468 struct netdev_queue_stats *);
469 static int tc_query_class(const struct netdev *,
470 unsigned int handle, unsigned int parent,
471 struct ofpbuf **replyp);
472 static int tc_delete_class(const struct netdev *, unsigned int handle);
473
474 static int tc_del_qdisc(struct netdev *netdev);
475 static int tc_query_qdisc(const struct netdev *netdev);
476
477 static int tc_calc_cell_log(unsigned int mtu);
478 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
479 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
480 const struct tc_ratespec *rate);
481 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
482 \f
483 struct netdev_linux {
484 struct netdev up;
485
486 /* Protects all members below. */
487 struct ovs_mutex mutex;
488
489 unsigned int cache_valid;
490
491 bool miimon; /* Link status of last poll. */
492 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
493 struct timer miimon_timer;
494
495 int netnsid; /* Network namespace ID. */
496 /* The following are figured out "on demand" only. They are only valid
497 * when the corresponding VALID_* bit in 'cache_valid' is set. */
498 int ifindex;
499 struct eth_addr etheraddr;
500 int mtu;
501 unsigned int ifi_flags;
502 long long int carrier_resets;
503 uint32_t kbits_rate; /* Policing data. */
504 uint32_t kbits_burst;
505 int vport_stats_error; /* Cached error code from vport_get_stats().
506 0 or an errno value. */
507 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
508 int ether_addr_error; /* Cached error code from set/get etheraddr. */
509 int netdev_policing_error; /* Cached error code from set policing. */
510 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
511 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
512
513 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
514 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
515 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
516
517 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
518 struct tc *tc;
519
520 /* For devices of class netdev_tap_class only. */
521 int tap_fd;
522 bool present; /* If the device is present in the namespace */
523 uint64_t tx_dropped; /* tap device can drop if the iface is down */
524
525 /* LAG information. */
526 bool is_lag_master; /* True if the netdev is a LAG master. */
527 };
528
529 struct netdev_rxq_linux {
530 struct netdev_rxq up;
531 bool is_tap;
532 int fd;
533 };
534
535 /* This is set pretty low because we probably won't learn anything from the
536 * additional log messages. */
537 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
538
539 /* Polling miimon status for all ports causes performance degradation when
540 * handling a large number of ports. If there are no devices using miimon, then
541 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
542 *
543 * Readers do not depend on this variable synchronizing with the related
544 * changes in the device miimon status, so we can use atomic_count. */
545 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
546
547 static void netdev_linux_run(const struct netdev_class *);
548
549 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
550 int cmd, const char *cmd_name);
551 static int get_flags(const struct netdev *, unsigned int *flags);
552 static int set_flags(const char *, unsigned int flags);
553 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
554 enum netdev_flags on, enum netdev_flags *old_flagsp)
555 OVS_REQUIRES(netdev->mutex);
556 static int get_ifindex(const struct netdev *, int *ifindexp);
557 static int do_set_addr(struct netdev *netdev,
558 int ioctl_nr, const char *ioctl_name,
559 struct in_addr addr);
560 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
561 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
562 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
563 static int af_packet_sock(void);
564 static bool netdev_linux_miimon_enabled(void);
565 static void netdev_linux_miimon_run(void);
566 static void netdev_linux_miimon_wait(void);
567 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
568
569 static bool
570 is_netdev_linux_class(const struct netdev_class *netdev_class)
571 {
572 return netdev_class->run == netdev_linux_run;
573 }
574
575 static bool
576 is_tap_netdev(const struct netdev *netdev)
577 {
578 return netdev_get_class(netdev) == &netdev_tap_class;
579 }
580
/* Converts generic 'netdev' into its containing netdev_linux.  Asserts that
 * 'netdev' really belongs to one of the classes implemented in this file. */
static struct netdev_linux *
netdev_linux_cast(const struct netdev *netdev)
{
    ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));

    return CONTAINER_OF(netdev, struct netdev_linux, up);
}
588
/* Converts generic 'rx' into its containing netdev_rxq_linux.  Asserts that
 * the rxq's netdev belongs to one of the classes implemented in this file. */
static struct netdev_rxq_linux *
netdev_rxq_linux_cast(const struct netdev_rxq *rx)
{
    ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
    return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
}
595 \f
/* Queries the datapath vport API for the network namespace ID of 'netdev' and
 * caches it in 'netdev->netnsid'.
 *
 * Returns 0 on success.  On failure returns a positive errno value; for
 * ENOENT the device is assumed to be local, for any other error the cached
 * netnsid is left unset. */
static int
netdev_linux_netnsid_update__(struct netdev_linux *netdev)
{
    struct dpif_netlink_vport reply;
    struct ofpbuf *buf;
    int error;

    error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
    if (error) {
        if (error == ENOENT) {
            /* Assume it is local if there is no API (e.g. if the openvswitch
             * kernel module is not loaded). */
            netnsid_set_local(&netdev->netnsid);
        } else {
            netnsid_unset(&netdev->netnsid);
        }
        return error;
    }

    netnsid_set(&netdev->netnsid, reply.netnsid);
    ofpbuf_delete(buf);
    return 0;
}
619
620 static int
621 netdev_linux_netnsid_update(struct netdev_linux *netdev)
622 {
623 if (netnsid_is_unset(netdev->netnsid)) {
624 if (netdev_get_class(&netdev->up) == &netdev_tap_class) {
625 netnsid_set_local(&netdev->netnsid);
626 } else {
627 return netdev_linux_netnsid_update__(netdev);
628 }
629 }
630
631 return 0;
632 }
633
/* Returns true if namespace ID 'nsid' matches the namespace that 'netdev' is
 * known to live in, refreshing the cached netnsid first if necessary. */
static bool
netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_eq(netdev->netnsid, nsid);
}
640
/* Returns true if 'netdev' lives in a network namespace other than the local
 * one, refreshing the cached netnsid first if necessary. */
static bool
netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_is_remote(netdev->netnsid);
}
647
648 static int netdev_linux_update_via_netlink(struct netdev_linux *);
649 static void netdev_linux_update(struct netdev_linux *netdev, int,
650 const struct rtnetlink_change *)
651 OVS_REQUIRES(netdev->mutex);
652 static void netdev_linux_changed(struct netdev_linux *netdev,
653 unsigned int ifi_flags, unsigned int mask)
654 OVS_REQUIRES(netdev->mutex);
655
656 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
657 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
658 * if no such socket could be created. */
659 static struct nl_sock *
660 netdev_linux_notify_sock(void)
661 {
662 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
663 static struct nl_sock *sock;
664 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
665 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
666
667 if (ovsthread_once_start(&once)) {
668 int error;
669
670 error = nl_sock_create(NETLINK_ROUTE, &sock);
671 if (!error) {
672 size_t i;
673
674 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
675 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
676 if (error) {
677 nl_sock_destroy(sock);
678 sock = NULL;
679 break;
680 }
681 }
682 }
683 nl_sock_listen_all_nsid(sock, true);
684 ovsthread_once_done(&once);
685 }
686
687 return sock;
688 }
689
/* Returns true if at least one open netdev currently uses miimon link
 * monitoring, in which case the run/wait hooks must poll it. */
static bool
netdev_linux_miimon_enabled(void)
{
    return atomic_count_get(&miimon_cnt) > 0;
}
695
/* Returns true if 'kind' names a Linux link type treated as a LAG master,
 * i.e. "bond" or "team". */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    return !strcmp(kind, "bond") || !strcmp(kind, "team");
}
705
/* Tracks LAG (bond/team) slaves whose master is a netdev open in OvS: when a
 * slave of a linux-class master appears, its ingress qdisc is bound to the
 * master's tc block; when the device leaves its LAG, the binding and the
 * tracking entry are removed.  Called for rtnetlink link changes that do not
 * correspond to any netdev open in OvS (see netdev_linux_run()). */
static void
netdev_linux_update_lag(struct rtnetlink_change *change)
    OVS_REQUIRES(lag_mutex)
{
    struct linux_lag_slave *lag;

    /* Only link-group messages carry master/slave information. */
    if (!rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
        return;
    }

    if (change->slave && netdev_linux_kind_is_lag(change->slave)) {
        lag = shash_find_data(&lag_shash, change->ifname);

        if (!lag) {
            struct netdev *master_netdev;
            char master_name[IFNAMSIZ];
            uint32_t block_id;
            int error = 0;

            if_indextoname(change->master_ifindex, master_name);
            master_netdev = netdev_from_name(master_name);
            if (!master_netdev) {
                /* Master is not open in OvS; nothing to track. */
                return;
            }

            if (is_netdev_linux_class(master_netdev->netdev_class)) {
                block_id = netdev_get_block_id(master_netdev);
                if (!block_id) {
                    /* Master has no tc block to share. */
                    netdev_close(master_netdev);
                    return;
                }

                lag = xmalloc(sizeof *lag);
                lag->block_id = block_id;
                lag->node = shash_add(&lag_shash, change->ifname, lag);

                /* LAG master is linux netdev so add slave to same block. */
                error = tc_add_del_ingress_qdisc(change->if_index, true,
                                                 block_id);
                if (error) {
                    VLOG_WARN("failed to bind LAG slave to master's block");
                    shash_delete(&lag_shash, lag->node);
                    free(lag);
                }
            }

            netdev_close(master_netdev);
        }
    } else if (change->master_ifindex == 0) {
        /* Check if this was a lag slave that has been freed. */
        lag = shash_find_data(&lag_shash, change->ifname);

        if (lag) {
            tc_add_del_ingress_qdisc(change->if_index, false,
                                     lag->block_id);
            shash_delete(&lag_shash, lag->node);
            free(lag);
        }
    }
}
766
/* netdev_class 'run' callback shared by all netdev-linux classes.  Polls
 * miimon if any device uses it, then drains the rtnetlink notification
 * socket, applying each change to the matching open netdev or, failing that,
 * handing it to the LAG tracking code.  If the kernel reports ENOBUFS
 * (notifications were dropped), falls back to re-reading the flags of every
 * open system device so no change is missed. */
static void
netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
{
    struct nl_sock *sock;
    int error;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_run();
    }

    sock = netdev_linux_notify_sock();
    if (!sock) {
        return;
    }

    do {
        uint64_t buf_stub[4096 / 8];
        int nsid;
        struct ofpbuf buf;

        ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
        error = nl_sock_recv(sock, &buf, &nsid, false);
        if (!error) {
            struct rtnetlink_change change;

            if (rtnetlink_parse(&buf, &change)) {
                struct netdev *netdev_ = NULL;
                char dev_name[IFNAMSIZ];

                /* Some messages lack an interface name; recover it from the
                 * ifindex so the device can be looked up. */
                if (!change.ifname) {
                    change.ifname = if_indextoname(change.if_index, dev_name);
                }

                if (change.ifname) {
                    netdev_ = netdev_from_name(change.ifname);
                }
                if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
                    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

                    ovs_mutex_lock(&netdev->mutex);
                    netdev_linux_update(netdev, nsid, &change);
                    ovs_mutex_unlock(&netdev->mutex);
                }
                else if (!netdev_ && change.ifname) {
                    /* Netdev is not present in OvS but its master could be. */
                    ovs_mutex_lock(&lag_mutex);
                    netdev_linux_update_lag(&change);
                    ovs_mutex_unlock(&lag_mutex);
                }
                netdev_close(netdev_);
            }
        } else if (error == ENOBUFS) {
            /* The socket overflowed and notifications were lost: drain it
             * and refresh every open device's flags by polling. */
            struct shash device_shash;
            struct shash_node *node;

            nl_sock_drain(sock);

            shash_init(&device_shash);
            netdev_get_devices(&netdev_linux_class, &device_shash);
            SHASH_FOR_EACH (node, &device_shash) {
                struct netdev *netdev_ = node->data;
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
                unsigned int flags;

                ovs_mutex_lock(&netdev->mutex);
                get_flags(netdev_, &flags);
                netdev_linux_changed(netdev, flags, 0);
                ovs_mutex_unlock(&netdev->mutex);

                netdev_close(netdev_);
            }
            shash_destroy(&device_shash);
        } else if (error != EAGAIN) {
            static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
            VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
                         ovs_strerror(error));
        }
        ofpbuf_uninit(&buf);
    } while (!error);
}
847
848 static void
849 netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
850 {
851 struct nl_sock *sock;
852
853 if (netdev_linux_miimon_enabled()) {
854 netdev_linux_miimon_wait();
855 }
856 sock = netdev_linux_notify_sock();
857 if (sock) {
858 nl_sock_wait(sock, POLLIN);
859 }
860 }
861
/* Records that 'dev' changed: bumps its change sequence number, counts
 * carrier resets, stores the new interface flags, and invalidates all cached
 * state except the VALID_* bits kept in 'mask'. */
static void
netdev_linux_changed(struct netdev_linux *dev,
                     unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(dev->mutex)
{
    netdev_change_seq_changed(&dev->up);

    /* Each toggle of IFF_RUNNING counts as one carrier reset. */
    if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
        dev->carrier_resets++;
    }
    dev->ifi_flags = ifi_flags;

    dev->cache_valid &= mask;
    if (!(mask & VALID_IN)) {
        /* Cached IP addresses are stale: flush the global address list
         * cache too. */
        netdev_get_addrs_list_flush();
    }
}
879
/* Applies a parsed rtnetlink 'change' to 'dev'.  RTM_NEWLINK messages refresh
 * the cached flags, MTU, MAC address, ifindex and LAG-master status;
 * RTM_DELLINK marks the device absent; address-group messages merely
 * invalidate the cached IP addresses. */
static void
netdev_linux_update__(struct netdev_linux *dev,
                      const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, and ip addresses. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;

                /* The mac addr has been changed, report it now. */
                rtnetlink_report_link();
            }

            if (change->master && netdev_linux_kind_is_lag(change->master)) {
                dev->is_lag_master = true;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
            dev->present = true;
        } else {
            /* FIXME */
            /* Link deleted: invalidate everything and forget the namespace
             * the device was in. */
            netdev_linux_changed(dev, change->ifi_flags, 0);
            dev->present = false;
            netnsid_unset(&dev->netnsid);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        OVS_NOT_REACHED();
    }
}
928
929 static void
930 netdev_linux_update(struct netdev_linux *dev, int nsid,
931 const struct rtnetlink_change *change)
932 OVS_REQUIRES(dev->mutex)
933 {
934 if (netdev_linux_netnsid_is_eq(dev, nsid)) {
935 netdev_linux_update__(dev, change);
936 }
937 }
938
/* netdev_class 'alloc' callback: allocates a zeroed netdev_linux and returns
 * its embedded generic netdev.  Real initialization happens later in the
 * 'construct' callback. */
static struct netdev *
netdev_linux_alloc(void)
{
    struct netdev_linux *netdev = xzalloc(sizeof *netdev);
    return &netdev->up;
}
945
/* Construction steps shared by all netdev-linux classes: rejects reserved
 * device names and initializes the netnsid and mutex.  Returns 0 on success
 * or EINVAL for a forbidden name. */
static int
netdev_linux_common_construct(struct netdev *netdev_)
{
    /* Prevent any attempt to create (or open) a network device named "default"
     * or "all".  These device names are effectively reserved on Linux because
     * /proc/sys/net/ipv4/conf/ always contains directories by these names.  By
     * itself this wouldn't call for any special treatment, but in practice if
     * a program tries to create devices with these names, it causes the kernel
     * to fire a "new device" notification event even though creation failed,
     * and in turn that causes OVS to wake up and try to create them again,
     * which ends up as a 100% CPU loop. */
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *name = netdev_->name;
    if (!strcmp(name, "default") || !strcmp(name, "all")) {
        static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
        VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
                     name);
        return EINVAL;
    }

    /* The device could be in the same network namespace or in another one. */
    netnsid_unset(&netdev->netnsid);
    ovs_mutex_init(&netdev->mutex);
    return 0;
}
971
972 /* Creates system and internal devices. */
973 static int
974 netdev_linux_construct(struct netdev *netdev_)
975 {
976 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
977 int error = netdev_linux_common_construct(netdev_);
978 if (error) {
979 return error;
980 }
981
982 error = get_flags(&netdev->up, &netdev->ifi_flags);
983 if (error == ENODEV) {
984 if (netdev->up.netdev_class != &netdev_internal_class) {
985 /* The device does not exist, so don't allow it to be opened. */
986 return ENODEV;
987 } else {
988 /* "Internal" netdevs have to be created as netdev objects before
989 * they exist in the kernel, because creating them in the kernel
990 * happens by passing a netdev object to dpif_port_add().
991 * Therefore, ignore the error. */
992 }
993 }
994
995 return 0;
996 }
997
998 /* For most types of netdevs we open the device for each call of
999 * netdev_open(). However, this is not the case with tap devices,
1000 * since it is only possible to open the device once. In this
1001 * situation we share a single file descriptor, and consequently
1002 * buffers, across all readers. Therefore once data is read it will
1003 * be unavailable to other reads for tap devices. */
1004 static int
1005 netdev_linux_construct_tap(struct netdev *netdev_)
1006 {
1007 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1008 static const char tap_dev[] = "/dev/net/tun";
1009 const char *name = netdev_->name;
1010 struct ifreq ifr;
1011
1012 int error = netdev_linux_common_construct(netdev_);
1013 if (error) {
1014 return error;
1015 }
1016
1017 /* Open tap device. */
1018 netdev->tap_fd = open(tap_dev, O_RDWR);
1019 if (netdev->tap_fd < 0) {
1020 error = errno;
1021 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
1022 return error;
1023 }
1024
1025 /* Create tap device. */
1026 get_flags(&netdev->up, &netdev->ifi_flags);
1027 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
1028 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
1029 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
1030 VLOG_WARN("%s: creating tap device failed: %s", name,
1031 ovs_strerror(errno));
1032 error = errno;
1033 goto error_close;
1034 }
1035
1036 /* Make non-blocking. */
1037 error = set_nonblocking(netdev->tap_fd);
1038 if (error) {
1039 goto error_close;
1040 }
1041
1042 if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
1043 VLOG_WARN("%s: creating tap device failed (persist): %s", name,
1044 ovs_strerror(errno));
1045 error = errno;
1046 goto error_close;
1047 }
1048
1049 netdev->present = true;
1050 return 0;
1051
1052 error_close:
1053 close(netdev->tap_fd);
1054 return error;
1055 }
1056
/* Tears down 'netdev_': destroys any attached traffic-control state, removes
 * a tap device's kernel persistence and closes its fd, releases this
 * device's share of the global miimon count, and destroys the mutex. */
static void
netdev_linux_destruct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (netdev->tc && netdev->tc->ops->tc_destroy) {
        netdev->tc->ops->tc_destroy(netdev->tc);
    }

    if (netdev_get_class(netdev_) == &netdev_tap_class
        && netdev->tap_fd >= 0)
    {
        /* Drop persistence so the kernel deletes the tap device on close. */
        ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
        close(netdev->tap_fd);
    }

    if (netdev->miimon_interval > 0) {
        /* This device contributed to the global miimon device count. */
        atomic_count_dec(&miimon_cnt);
    }

    ovs_mutex_destroy(&netdev->mutex);
}
1079
/* Frees the storage obtained from netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
1086
1087 static struct netdev_rxq *
1088 netdev_linux_rxq_alloc(void)
1089 {
1090 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
1091 return &rx->up;
1092 }
1093
/* Sets up the receive path for 'rxq_'.  Tap devices reuse the netdev's
 * shared tap fd; every other device gets a dedicated AF_PACKET raw socket
 * bound to the interface, with a BPF program attached that accepts only
 * inbound packets.  Returns 0 on success, otherwise a positive errno
 * value. */
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        /* All rxqs of a tap device share the netdev's single fd. */
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
            { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        /* Request PACKET_AUXDATA control messages so that VLAN tags stripped
         * by the kernel can be recovered in netdev_linux_rxq_recv_sock(). */
        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            /* setsockopt() returned -1 here; fetch the real error code. */
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    /* Only close the fd if we actually opened one (rx->fd stays below zero
     * when socket() itself failed). */
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1181
1182 static void
1183 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
1184 {
1185 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1186
1187 if (!rx->is_tap) {
1188 close(rx->fd);
1189 }
1190 }
1191
/* Frees the storage obtained from netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
1199
1200 static ovs_be16
1201 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
1202 {
1203 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1204 return htons(aux->tp_vlan_tpid);
1205 } else if (double_tagged) {
1206 return htons(ETH_TYPE_VLAN_8021AD);
1207 } else {
1208 return htons(ETH_TYPE_VLAN_8021Q);
1209 }
1210 }
1211
/* Returns true if 'aux' carries VLAN tag information: either the kernel
 * flagged a valid VLAN (TP_STATUS_VLAN_VALID) or the TCI itself is nonzero
 * (the only indication available on older kernels). */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    return (aux->tp_status & TP_STATUS_VLAN_VALID) || aux->tp_vlan_tci;
}
1217
/* Receives one packet from AF_PACKET socket 'fd' into 'buffer', using
 * PACKET_AUXDATA control messages to reconstruct a VLAN tag that the kernel
 * stripped from the frame.  Returns 0 on success, otherwise a positive errno
 * value (EMSGSIZE when the packet did not fit in 'buffer'). */
static int
netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
{
    size_t size;
    ssize_t retval;
    struct iovec iov;
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffer;
    struct msghdr msgh;

    /* Reserve headroom for a single VLAN tag */
    dp_packet_reserve(buffer, VLAN_HEADER_LEN);
    size = dp_packet_tailroom(buffer);

    iov.iov_base = dp_packet_data(buffer);
    iov.iov_len = size;
    msgh.msg_name = NULL;
    msgh.msg_namelen = 0;
    msgh.msg_iov = &iov;
    msgh.msg_iovlen = 1;
    msgh.msg_control = &cmsg_buffer;
    msgh.msg_controllen = sizeof cmsg_buffer;
    msgh.msg_flags = 0;

    do {
        /* MSG_TRUNC makes recvmsg() return the packet's full length even
         * when truncated, so oversized packets are detectable below. */
        retval = recvmsg(fd, &msgh, MSG_TRUNC);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    } else if (retval > size) {
        return EMSGSIZE;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);

    /* Scan control messages for the packet's auxiliary data, which carries
     * any VLAN tag the kernel removed from the frame. */
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
        const struct tpacket_auxdata *aux;

        if (cmsg->cmsg_level != SOL_PACKET
            || cmsg->cmsg_type != PACKET_AUXDATA
            || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
            continue;
        }

        aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
        if (auxdata_has_vlan_tci(aux)) {
            struct eth_header *eth;
            bool double_tagged;

            /* A frame shorter than an Ethernet header cannot carry a tag. */
            if (retval < ETH_HEADER_LEN) {
                return EINVAL;
            }

            /* If the remaining ethertype is already 802.1Q, the stripped tag
             * was the outer tag of a double-tagged packet. */
            eth = dp_packet_data(buffer);
            double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);

            /* Push the stripped tag back into the packet data, using the
             * headroom reserved above. */
            eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
                          htons(aux->tp_vlan_tci));
            break;
        }
    }

    return 0;
}
1286
/* Reads one packet from tap device fd 'fd' into 'buffer'.  Returns 0 on
 * success, otherwise a positive errno value. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t tailroom = dp_packet_tailroom(buffer);
    ssize_t n;

    do {
        n = read(fd, dp_packet_data(buffer), tailroom);
    } while (n < 0 && errno == EINTR);

    if (n < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n);
    return 0;
}
1304
1305 static int
1306 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
1307 int *qfill)
1308 {
1309 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1310 struct netdev *netdev = rx->up.netdev;
1311 struct dp_packet *buffer;
1312 ssize_t retval;
1313 int mtu;
1314
1315 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1316 mtu = ETH_PAYLOAD_MAX;
1317 }
1318
1319 /* Assume Ethernet port. No need to set packet_type. */
1320 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1321 DP_NETDEV_HEADROOM);
1322 retval = (rx->is_tap
1323 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1324 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1325
1326 if (retval) {
1327 if (retval != EAGAIN && retval != EMSGSIZE) {
1328 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1329 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1330 }
1331 dp_packet_delete(buffer);
1332 } else {
1333 dp_packet_batch_init_packet(batch, buffer);
1334 }
1335
1336 if (qfill) {
1337 *qfill = -ENOTSUP;
1338 }
1339
1340 return retval;
1341 }
1342
1343 static void
1344 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1345 {
1346 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1347 poll_fd_wait(rx->fd, POLLIN);
1348 }
1349
1350 static int
1351 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1352 {
1353 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1354 if (rx->is_tap) {
1355 struct ifreq ifr;
1356 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1357 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1358 if (error) {
1359 return error;
1360 }
1361 drain_fd(rx->fd, ifr.ifr_qlen);
1362 return 0;
1363 } else {
1364 return drain_rcvbuf(rx->fd);
1365 }
1366 }
1367
/* Sends every packet in 'batch' on AF_PACKET socket 'sock', addressed to the
 * device with index 'ifindex', using sendmmsg() so the whole batch goes to
 * the kernel in as few system calls as possible.  Returns 0 on success,
 * otherwise a positive errno value. */
static int
netdev_linux_sock_batch_send(int sock, int ifindex,
                             struct dp_packet_batch *batch)
{
    const size_t size = dp_packet_batch_size(batch);
    /* We don't bother setting most fields in sockaddr_ll because the
     * kernel ignores them for SOCK_RAW. */
    struct sockaddr_ll sll = { .sll_family = AF_PACKET,
                               .sll_ifindex = ifindex };

    struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
    struct iovec *iov = xmalloc(sizeof(*iov) * size);

    /* Build one message header and one iovec per packet in the batch. */
    struct dp_packet *packet;
    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        iov[i].iov_base = dp_packet_data(packet);
        iov[i].iov_len = dp_packet_size(packet);
        mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
                                            .msg_namelen = sizeof sll,
                                            .msg_iov = &iov[i],
                                            .msg_iovlen = 1 };
    }

    /* sendmmsg() may transmit only a prefix of the remaining messages, so
     * loop until every message has been handed to the kernel or an error
     * occurs. */
    int error = 0;
    for (uint32_t ofs = 0; ofs < size; ) {
        ssize_t retval;
        do {
            retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);
        if (error) {
            break;
        }
        ofs += retval;
    }

    free(mmsg);
    free(iov);
    return error;
}
1408
/* Use the tap fd to send 'batch' to tap device 'netdev'.  Using the tap fd is
 * essential, because packets sent to a tap device with an AF_PACKET socket
 * will loop back to be *received* again on the tap device.  This doesn't occur
 * on other interface types because we attach a socket filter to the rx
 * socket.
 *
 * Returns 0 on success, otherwise a positive errno value. */
static int
netdev_linux_tap_batch_send(struct netdev *netdev_,
                            struct dp_packet_batch *batch)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct dp_packet *packet;

    /* The Linux tap driver returns EIO if the device is not up,
     * so if the device is not up, don't waste time sending it.
     * However, if the device is in another network namespace
     * then OVS can't retrieve the state. In that case, send the
     * packets anyway. */
    if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
        netdev->tx_dropped += dp_packet_batch_size(batch);
        return 0;
    }

    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        size_t size = dp_packet_size(packet);
        ssize_t retval;
        int error;

        do {
            retval = write(netdev->tap_fd, dp_packet_data(packet), size);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);

        if (error) {
            /* The Linux tap driver returns EIO if the device is not up. From
             * the OVS side this is not an error, so we ignore it; otherwise,
             * return the error. */
            if (error != EIO) {
                return error;
            }
        } else if (retval != size) {
            /* A short write to a tap device means a truncated frame was
             * delivered; report EMSGSIZE to the caller. */
            VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
                         "bytes of %"PRIuSIZE") on %s",
                         retval, size, netdev_get_name(netdev_));
            return EMSGSIZE;
        }
    }
    return 0;
}
1457
/* Sends 'batch' on 'netdev'.  Returns 0 if successful, otherwise a positive
 * errno value.  Returns EAGAIN without blocking if the packet cannot be queued
 * immediately.  Returns EMSGSIZE if a partial packet was transmitted or if
 * the packet is too big or too small to transmit on the device.
 *
 * The kernel maintains a packet transmission queue, so the caller is not
 * expected to do additional queuing of packets. */
static int
netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
                  struct dp_packet_batch *batch,
                  bool concurrent_txq OVS_UNUSED)
{
    int error = 0;
    int sock = 0;

    if (!is_tap_netdev(netdev_)) {
        /* Devices in another network namespace cannot be sent to via the
         * local AF_PACKET socket. */
        if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_))) {
            error = EOPNOTSUPP;
            goto free_batch;
        }

        /* af_packet_sock() returns a negative errno value on failure. */
        sock = af_packet_sock();
        if (sock < 0) {
            error = -sock;
            goto free_batch;
        }

        /* netdev_get_ifindex() returns a negative errno value on failure. */
        int ifindex = netdev_get_ifindex(netdev_);
        if (ifindex < 0) {
            error = -ifindex;
            goto free_batch;
        }

        error = netdev_linux_sock_batch_send(sock, ifindex, batch);
    } else {
        /* Tap devices must be written through their own fd; see
         * netdev_linux_tap_batch_send(). */
        error = netdev_linux_tap_batch_send(netdev_, batch);
    }
    if (error) {
        if (error == ENOBUFS) {
            /* The Linux AF_PACKET implementation never blocks waiting
             * for room for packets, instead returning ENOBUFS.
             * Translate this into EAGAIN for the caller. */
            error = EAGAIN;
        } else {
            VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
    }

free_batch:
    /* The batch is consumed here regardless of the outcome. */
    dp_packet_delete_batch(batch, true);
    return error;
}
1511
1512 /* Registers with the poll loop to wake up from the next call to poll_block()
1513 * when the packet transmission queue has sufficient room to transmit a packet
1514 * with netdev_send().
1515 *
1516 * The kernel maintains a packet transmission queue, so the client is not
1517 * expected to do additional queuing of packets. Thus, this function is
1518 * unlikely to ever be used. It is included for completeness. */
1519 static void
1520 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1521 {
1522 if (is_tap_netdev(netdev)) {
1523 /* TAP device always accepts packets.*/
1524 poll_immediate_wake();
1525 }
1526 }
1527
/* Attempts to set 'netdev''s MAC address to 'mac'.  Returns 0 if successful,
 * otherwise a positive errno value. */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    /* Devices in other network namespaces cannot be configured from here. */
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        /* Nothing to do if the cached lookup failed or the address already
         * matches; otherwise invalidate the cache before changing it. */
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    if (!error || error == ENODEV) {
        /* Cache the outcome (including ENODEV) so that repeated attempts on
         * a missing device do not redo the ioctl. */
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            netdev->etheraddr = mac;
        }
    }

    /* Bring a tap device back up if it was up before we lowered it. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1572
/* Copies 'netdev''s MAC address to 'mac' which is passed as param.
 * Returns 0 on success, otherwise a positive errno value. */
static int
netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    /* First try to populate the cache via a netlink query. */
    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
        netdev_linux_update_via_netlink(netdev);
    }

    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
        /* Fall back to ioctl if netlink fails */
        netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
                                                 &netdev->etheraddr);
        netdev->cache_valid |= VALID_ETHERADDR;
    }

    /* 'etheraddr' is only meaningful when the cached query succeeded. */
    error = netdev->ether_addr_error;
    if (!error) {
        *mac = netdev->etheraddr;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1600
/* Retrieves 'netdev''s MTU into '*mtup', using the cached value when valid,
 * otherwise querying the kernel via netlink with an ioctl fallback.  Returns
 * 0 on success, otherwise a positive errno value. */
static int
netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
{
    int error;

    if (!(netdev->cache_valid & VALID_MTU)) {
        netdev_linux_update_via_netlink(netdev);
    }

    if (!(netdev->cache_valid & VALID_MTU)) {
        /* Fall back to ioctl if netlink fails */
        struct ifreq ifr;

        /* NOTE(review): when the ioctl fails, ifr.ifr_mtu is uninitialized
         * but is still copied into netdev->mtu; this looks harmless because
         * 'mtu' is only read when 'netdev_mtu_error' is 0 — confirm. */
        netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
            netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }

    error = netdev->netdev_mtu_error;
    if (!error) {
        *mtup = netdev->mtu;
    }

    return error;
}
1627
1628 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1629 * in bytes, not including the hardware header; thus, this is typically 1500
1630 * bytes for Ethernet devices. */
1631 static int
1632 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1633 {
1634 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1635 int error;
1636
1637 ovs_mutex_lock(&netdev->mutex);
1638 error = netdev_linux_get_mtu__(netdev, mtup);
1639 ovs_mutex_unlock(&netdev->mutex);
1640
1641 return error;
1642 }
1643
/* Sets the maximum size of transmitted (MTU) for given device using linux
 * networking ioctl interface.  Returns 0 on success, otherwise a positive
 * errno value.
 */
static int
netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ifreq ifr;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    /* Devices in other network namespaces cannot be configured from here. */
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    if (netdev->cache_valid & VALID_MTU) {
        error = netdev->netdev_mtu_error;
        /* Nothing to do if the cached query failed or the MTU already
         * matches; otherwise invalidate the cache before changing it. */
        if (error || netdev->mtu == mtu) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_MTU;
    }
    ifr.ifr_mtu = mtu;
    error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
                                SIOCSIFMTU, "SIOCSIFMTU");
    /* Cache the outcome, including ENODEV for a missing device. */
    if (!error || error == ENODEV) {
        netdev->netdev_mtu_error = error;
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }
exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1679
/* Returns the ifindex of 'netdev', if successful, as a positive number.
 * On failure, returns a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int ifindex, error;

    ovs_mutex_lock(&netdev->mutex);
    /* Devices in other network namespaces are not supported here. */
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }
    error = get_ifindex(netdev_, &ifindex);

exit:
    ovs_mutex_unlock(&netdev->mutex);
    /* 'ifindex' is only meaningful when 'error' is zero. */
    return error ? -error : ifindex;
}
1699
1700 static int
1701 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1702 {
1703 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1704
1705 ovs_mutex_lock(&netdev->mutex);
1706 if (netdev->miimon_interval > 0) {
1707 *carrier = netdev->miimon;
1708 } else {
1709 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1710 }
1711 ovs_mutex_unlock(&netdev->mutex);
1712
1713 return 0;
1714 }
1715
1716 static long long int
1717 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1718 {
1719 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1720 long long int carrier_resets;
1721
1722 ovs_mutex_lock(&netdev->mutex);
1723 carrier_resets = netdev->carrier_resets;
1724 ovs_mutex_unlock(&netdev->mutex);
1725
1726 return carrier_resets;
1727 }
1728
/* Executes MII ioctl 'cmd' (named 'cmd_name' for error reporting) on device
 * 'name', marshalling 'data' in and out through the ifreq.  Returns 0 on
 * success, otherwise a positive errno value. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    /* The MII data is passed inline in the ifr_data bytes, not by pointer,
     * so copy it in and back out around the ioctl. */
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
1743
/* Queries link status for device 'name' into '*miimon', first via the MII
 * registers and, if that fails, via ethtool's ETHTOOL_GLINK.  Returns 0 on
 * success, otherwise a positive errno value (with '*miimon' left false). */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            /* BMSR_LSTATUS is the link-up bit of the basic mode status
             * register. */
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        }
    }
    if (error) {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* ETHTOOL_GLINK replies with an ethtool_value overlaid on the
             * ethtool_cmd buffer; extract it via memcpy. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
1786
1787 static int
1788 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1789 long long int interval)
1790 {
1791 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1792
1793 ovs_mutex_lock(&netdev->mutex);
1794 interval = interval > 0 ? MAX(interval, 100) : 0;
1795 if (netdev->miimon_interval != interval) {
1796 if (interval && !netdev->miimon_interval) {
1797 atomic_count_inc(&miimon_cnt);
1798 } else if (!interval && netdev->miimon_interval) {
1799 atomic_count_dec(&miimon_cnt);
1800 }
1801
1802 netdev->miimon_interval = interval;
1803 timer_set_expired(&netdev->miimon_timer);
1804 }
1805 ovs_mutex_unlock(&netdev->mutex);
1806
1807 return 0;
1808 }
1809
/* Polls link state via MII for every netdev-linux device whose miimon timer
 * has expired, recording link changes and rearming each timer. */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                /* Link state changed; record it and notify listeners. */
                dev->miimon = miimon;
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* netdev_get_devices() took a reference on each device. */
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
1839
1840 static void
1841 netdev_linux_miimon_wait(void)
1842 {
1843 struct shash device_shash;
1844 struct shash_node *node;
1845
1846 shash_init(&device_shash);
1847 netdev_get_devices(&netdev_linux_class, &device_shash);
1848 SHASH_FOR_EACH (node, &device_shash) {
1849 struct netdev *netdev = node->data;
1850 struct netdev_linux *dev = netdev_linux_cast(netdev);
1851
1852 ovs_mutex_lock(&dev->mutex);
1853 if (dev->miimon_interval > 0) {
1854 timer_wait(&dev->miimon_timer);
1855 }
1856 ovs_mutex_unlock(&dev->mutex);
1857 netdev_close(netdev);
1858 }
1859 shash_destroy(&device_shash);
1860 }
1861
/* Exchanges the values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved = *b;

    *b = *a;
    *a = saved;
}
1869
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is allowed to be misaligned. */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct ovs_vport_stats *src)
{
    /* get_32aligned_u64() safely reads the 64-bit counters even when 'src'
     * is only 32-bit aligned. */
    dst->rx_packets = get_32aligned_u64(&src->rx_packets);
    dst->tx_packets = get_32aligned_u64(&src->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&src->rx_errors);
    dst->tx_errors = get_32aligned_u64(&src->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
    /* ovs_vport_stats has no equivalents for the detailed counters below,
     * so report them as zero. */
    dst->multicast = 0;
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
}
1899
1900 static int
1901 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1902 {
1903 struct dpif_netlink_vport reply;
1904 struct ofpbuf *buf;
1905 int error;
1906
1907 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1908 if (error) {
1909 return error;
1910 } else if (!reply.stats) {
1911 ofpbuf_delete(buf);
1912 return EOPNOTSUPP;
1913 }
1914
1915 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1916
1917 ofpbuf_delete(buf);
1918
1919 return 0;
1920 }
1921
/* Attempts to fill 'stats' from the vport layer, caching the resulting error
 * status in 'netdev->vport_stats_error' so that callers can decide whether
 * to fall back to kernel netdev statistics. */
static void
get_stats_via_vport(const struct netdev *netdev_,
                    struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    /* Re-query only while the last attempt succeeded or no attempt has been
     * cached yet. */
    if (!netdev->vport_stats_error ||
        !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
        int error;

        error = get_stats_via_vport__(netdev_, stats);
        if (error && error != ENOENT && error != ENODEV) {
            /* ENOENT/ENODEV just mean this netdev is not a vport, which is
             * not worth logging. */
            VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
                         "(%s)",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
        netdev->vport_stats_error = error;
        netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
    }
}
1942
/* Retrieves current device stats for 'netdev-linux'.  Combines vport-layer
 * counters (when available) with kernel netdev counters.  Returns 0 on
 * success, otherwise a positive errno value. */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    /* Try the vport layer first; success/failure is cached in 'netdev'. */
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* A netlink failure is not fatal if the vport stats were obtained. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Use kernel netdev's packet and byte counts since vport's counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO are
         * enabled. */
        stats->rx_packets = dev_stats.rx_packets;
        stats->rx_bytes = dev_stats.rx_bytes;
        stats->tx_packets = dev_stats.tx_packets;
        stats->tx_bytes = dev_stats.tx_bytes;

        /* Error and drop counters are accumulated from both sources. */
        stats->rx_errors += dev_stats.rx_errors;
        stats->tx_errors += dev_stats.tx_errors;
        stats->rx_dropped += dev_stats.rx_dropped;
        stats->tx_dropped += dev_stats.tx_dropped;
        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
        stats->rx_length_errors += dev_stats.rx_length_errors;
        stats->rx_over_errors += dev_stats.rx_over_errors;
        stats->rx_crc_errors += dev_stats.rx_crc_errors;
        stats->rx_frame_errors += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1993
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal.  Returns 0 on success, otherwise a positive errno
 * value. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    /* Try the vport layer first; success/failure is cached in 'netdev'. */
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* A netlink failure is not fatal if the vport stats were obtained. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer.  For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        /* The detailed per-direction error counters would be misleading
         * after the swap, so report them as zero. */
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled.  Note the rx/tx swap, as explained above. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped += dev_stats.tx_dropped;
        stats->tx_dropped += dev_stats.rx_dropped;

        stats->rx_errors += dev_stats.tx_errors;
        stats->tx_errors += dev_stats.rx_errors;

        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
    }
    /* Account for packets we dropped ourselves while the device was down. */
    stats->tx_dropped += netdev->tx_dropped;
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2056
2057 static int
2058 netdev_internal_get_stats(const struct netdev *netdev_,
2059 struct netdev_stats *stats)
2060 {
2061 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2062 int error;
2063
2064 ovs_mutex_lock(&netdev->mutex);
2065 get_stats_via_vport(netdev_, stats);
2066 error = netdev->vport_stats_error;
2067 ovs_mutex_unlock(&netdev->mutex);
2068
2069 return error;
2070 }
2071
/* Queries the device's link features via the ETHTOOL_GSET ioctl and caches
 * the result in netdev->supported, netdev->advertised and netdev->current
 * as NETDEV_F_* bitmaps.  The result (or error) is cached under
 * VALID_FEATURES; call sites read netdev->get_features_error afterwards.
 * Caller must hold netdev->mutex (all callers in this file do). */
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
    struct ethtool_cmd ecmd;
    uint32_t speed;
    int error;

    /* Cached result still valid; nothing to do. */
    if (netdev->cache_valid & VALID_FEATURES) {
        return;
    }

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto out;
    }

    /* Supported features: map SUPPORTED_* ethtool bits to NETDEV_F_*. */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    }
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Half) {
        netdev->supported |= NETDEV_F_100MB_HD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
        netdev->supported |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
        netdev->supported |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
        netdev->supported |= NETDEV_F_40GB_FD;
    }
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    }
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    }
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    }
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    }
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    }

    /* Advertised features: same mapping, from ADVERTISED_* bits. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
        netdev->advertised |= NETDEV_F_40GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    }
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    }
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    }
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    }
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    }

    /* Current settings. */
    speed = ethtool_cmd_speed(&ecmd);
    if (speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    } else if (speed == 40000) {
        /* Literal Mb/s values rather than SPEED_40000 etc. — presumably
         * because the symbolic constants are missing from older kernel
         * headers; TODO confirm before converting to named constants. */
        netdev->current = NETDEV_F_40GB_FD;
    } else if (speed == 100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (speed == 1000000) {
        netdev->current = NETDEV_F_1TB_FD;
    } else {
        netdev->current = 0;
    }

    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
    }

    if (ecmd.autoneg) {
        netdev->current |= NETDEV_F_AUTONEG;
    }

out:
    /* Cache the outcome either way; errors are replayed from
     * netdev->get_features_error until the cache is invalidated. */
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
}
2223
2224 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
2225 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2226 * Returns 0 if successful, otherwise a positive errno value. */
2227 static int
2228 netdev_linux_get_features(const struct netdev *netdev_,
2229 enum netdev_features *current,
2230 enum netdev_features *advertised,
2231 enum netdev_features *supported,
2232 enum netdev_features *peer)
2233 {
2234 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2235 int error;
2236
2237 ovs_mutex_lock(&netdev->mutex);
2238 if (netdev_linux_netnsid_is_remote(netdev)) {
2239 error = EOPNOTSUPP;
2240 goto exit;
2241 }
2242
2243 netdev_linux_read_features(netdev);
2244 if (!netdev->get_features_error) {
2245 *current = netdev->current;
2246 *advertised = netdev->advertised;
2247 *supported = netdev->supported;
2248 *peer = 0; /* XXX */
2249 }
2250 error = netdev->get_features_error;
2251
2252 exit:
2253 ovs_mutex_unlock(&netdev->mutex);
2254 return error;
2255 }
2256
/* Set the features advertised by 'netdev' to 'advertise'.
 *
 * Reads the current ethtool settings with ETHTOOL_GSET, rewrites only the
 * 'advertising' bitmask from the NETDEV_F_* bits in 'advertise', then writes
 * the settings back with ETHTOOL_SSET.  Returns 0 on success, otherwise a
 * positive errno value. */
static int
netdev_linux_set_advertisements(struct netdev *netdev_,
                                enum netdev_features advertise)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ethtool_cmd ecmd;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    COVERAGE_INC(netdev_get_ethtool);

    /* Devices in another network namespace are not supported. */
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto exit;
    }

    /* Translate NETDEV_F_* bits into the corresponding ADVERTISED_* bits. */
    ecmd.advertising = 0;
    if (advertise & NETDEV_F_10MB_HD) {
        ecmd.advertising |= ADVERTISED_10baseT_Half;
    }
    if (advertise & NETDEV_F_10MB_FD) {
        ecmd.advertising |= ADVERTISED_10baseT_Full;
    }
    if (advertise & NETDEV_F_100MB_HD) {
        ecmd.advertising |= ADVERTISED_100baseT_Half;
    }
    if (advertise & NETDEV_F_100MB_FD) {
        ecmd.advertising |= ADVERTISED_100baseT_Full;
    }
    if (advertise & NETDEV_F_1GB_HD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Half;
    }
    if (advertise & NETDEV_F_1GB_FD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Full;
    }
    if (advertise & NETDEV_F_10GB_FD) {
        ecmd.advertising |= ADVERTISED_10000baseT_Full;
    }
    if (advertise & NETDEV_F_COPPER) {
        ecmd.advertising |= ADVERTISED_TP;
    }
    if (advertise & NETDEV_F_FIBER) {
        ecmd.advertising |= ADVERTISED_FIBRE;
    }
    if (advertise & NETDEV_F_AUTONEG) {
        ecmd.advertising |= ADVERTISED_Autoneg;
    }
    if (advertise & NETDEV_F_PAUSE) {
        ecmd.advertising |= ADVERTISED_Pause;
    }
    if (advertise & NETDEV_F_PAUSE_ASYM) {
        ecmd.advertising |= ADVERTISED_Asym_Pause;
    }
    COVERAGE_INC(netdev_set_ethtool);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_SSET, "ETHTOOL_SSET");

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2327
/* Attempts to set input rate limiting (policing) policy.  Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * Implemented by (re)installing an ingress qdisc on the device and, when
 * 'kbits_rate' is nonzero, attaching a tc policer to it.  Results are cached
 * under VALID_POLICING so that repeated calls with unchanged parameters are
 * no-ops. */
static int
netdev_linux_set_policing(struct netdev *netdev_,
                          uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int ifindex;
    int error;

    /* Policing via this path conflicts with hardware flow offload. */
    if (netdev_is_flow_api_enabled()) {
        if (kbits_rate) {
            VLOG_WARN_RL(&rl, "%s: policing with offload isn't supported",
                         netdev_name);
        }
        return EOPNOTSUPP;
    }

    kbits_burst = (!kbits_rate ? 0       /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
                   : kbits_burst);       /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    /* Devices in another network namespace are not supported. */
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto out;
    }

    if (netdev->cache_valid & VALID_POLICING) {
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kbits_burst == kbits_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        netdev->cache_valid &= ~VALID_POLICING;
    }

    error = get_ifindex(netdev_, &ifindex);
    if (error) {
        goto out;
    }

    COVERAGE_INC(netdev_set_policing);
    /* Remove any existing ingress qdisc. */
    error = tc_add_del_ingress_qdisc(ifindex, false, 0);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate) {
        error = tc_add_del_ingress_qdisc(ifindex, true, 0);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }
    }

    netdev->kbits_rate = kbits_rate;
    netdev->kbits_burst = kbits_burst;

out:
    /* Cache successes and ENODEV (device gone) so we do not retry forever;
     * transient errors leave the cache invalid and will be retried. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2408
2409 static int
2410 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2411 struct sset *types)
2412 {
2413 const struct tc_ops *const *opsp;
2414 for (opsp = tcs; *opsp != NULL; opsp++) {
2415 const struct tc_ops *ops = *opsp;
2416 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2417 sset_add(types, ops->ovs_name);
2418 }
2419 }
2420 return 0;
2421 }
2422
2423 static const struct tc_ops *
2424 tc_lookup_ovs_name(const char *name)
2425 {
2426 const struct tc_ops *const *opsp;
2427
2428 for (opsp = tcs; *opsp != NULL; opsp++) {
2429 const struct tc_ops *ops = *opsp;
2430 if (!strcmp(name, ops->ovs_name)) {
2431 return ops;
2432 }
2433 }
2434 return NULL;
2435 }
2436
2437 static const struct tc_ops *
2438 tc_lookup_linux_name(const char *name)
2439 {
2440 const struct tc_ops *const *opsp;
2441
2442 for (opsp = tcs; *opsp != NULL; opsp++) {
2443 const struct tc_ops *ops = *opsp;
2444 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2445 return ops;
2446 }
2447 }
2448 return NULL;
2449 }
2450
/* Searches the bucket for 'hash' in netdev's queue hmap for the queue with
 * 'queue_id'.  Returns it, or NULL if there is none.  'hash' must be the
 * hash of 'queue_id' (see tc_find_queue()). */
static struct tc_queue *
tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
                size_t hash)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct tc_queue *queue;

    HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
        if (queue->queue_id == queue_id) {
            return queue;
        }
    }
    return NULL;
}
2465
2466 static struct tc_queue *
2467 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2468 {
2469 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
2470 }
2471
2472 static int
2473 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2474 const char *type,
2475 struct netdev_qos_capabilities *caps)
2476 {
2477 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2478 if (!ops) {
2479 return EOPNOTSUPP;
2480 }
2481 caps->n_queues = ops->n_queues;
2482 return 0;
2483 }
2484
/* Retrieves the current QoS configuration: stores the type name in '*typep'
 * and its details in 'details'.  Returns 0 on success, otherwise a positive
 * errno value (EOPNOTSUPP for devices in another network namespace). */
static int
netdev_linux_get_qos(const struct netdev *netdev_,
                     const char **typep, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* tc_query_qdisc() populates netdev->tc on success. */
    error = tc_query_qdisc(netdev_);
    if (!error) {
        *typep = netdev->tc->ops->ovs_name;
        /* qdisc_get is optional; a missing hook means no details. */
        error = (netdev->tc->ops->qdisc_get
                 ? netdev->tc->ops->qdisc_get(netdev_, details)
                 : 0);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2510
/* Installs QoS type 'type' with 'details' on 'netdev_'.  If the requested
 * type is already installed, its configuration is updated in place;
 * otherwise the existing qdisc is deleted and the new one installed.
 * Returns 0 on success, otherwise a positive errno value. */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    /* The "noop" type is handled without touching the kernel or taking the
     * mutex. */
    if (new_ops == &tc_ops_noop) {
        return new_ops->tc_install(netdev_, details);
    }

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same type: reconfigure in place if the type supports it. */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            goto exit;
        }
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc.  tc_install sets netdev->tc on success. */
        error = new_ops->tc_install(netdev_, details);
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2558
/* Stores the configuration of queue 'queue_id' into 'details'.  Returns 0 on
 * success, ENOENT if the queue does not exist, or another positive errno. */
static int
netdev_linux_get_queue(const struct netdev *netdev_,
                       unsigned int queue_id, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
        error = (queue
                ? netdev->tc->ops->class_get(netdev_, queue, details)
                : ENOENT);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2584
/* Configures queue 'queue_id' with 'details'.  Returns 0 on success, EINVAL
 * if 'queue_id' is out of range or the QoS type does not support per-queue
 * configuration, or another positive errno. */
static int
netdev_linux_set_queue(struct netdev *netdev_,
                       unsigned int queue_id, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        /* class_set is optional; require both a valid id and the hook. */
        error = (queue_id < netdev->tc->ops->n_queues
                 && netdev->tc->ops->class_set
                 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
                 : EINVAL);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2610
/* Deletes queue 'queue_id'.  Returns 0 on success, ENOENT if the queue does
 * not exist, EINVAL if the QoS type does not support deletion, or another
 * positive errno. */
static int
netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_delete) {
            struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
            error = (queue
                     ? netdev->tc->ops->class_delete(netdev_, queue)
                     : ENOENT);
        } else {
            error = EINVAL;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2639
/* Retrieves statistics for queue 'queue_id' into 'stats'.  Returns 0 on
 * success, ENOENT if the queue does not exist, EOPNOTSUPP if the QoS type
 * does not expose per-class stats, or another positive errno. */
static int
netdev_linux_get_queue_stats(const struct netdev *netdev_,
                             unsigned int queue_id,
                             struct netdev_queue_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get_stats) {
            const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
            if (queue) {
                /* Creation time is tracked locally, not by the kernel. */
                stats->created = queue->created;
                error = netdev->tc->ops->class_get_stats(netdev_, queue,
                                                         stats);
            } else {
                error = ENOENT;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2674
/* State carried across a netlink RTM_GETTCLASS dump of a device's traffic
 * classes; see start_queue_dump() and finish_queue_dump(). */
struct queue_dump_state {
    struct nl_dump dump;    /* In-progress netlink dump. */
    struct ofpbuf buf;      /* Reply buffer reused across nl_dump_next(). */
};
2679
/* Begins a netlink dump of the traffic classes on 'netdev', initializing
 * '*state'.  Returns true on success, false if the request could not be
 * composed (in which case '*state' needs no cleanup). */
static bool
start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
    if (!tcmsg) {
        return false;
    }
    /* tcm_parent == 0 dumps classes at every level of the hierarchy. */
    tcmsg->tcm_parent = 0;
    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
    ofpbuf_uninit(&request);

    ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
    return true;
}
2697
2698 static int
2699 finish_queue_dump(struct queue_dump_state *state)
2700 {
2701 ofpbuf_uninit(&state->buf);
2702 return nl_dump_done(&state->dump);
2703 }
2704
/* Iterator state for queue dumps, allocated by
 * netdev_linux_queue_dump_start() and freed by
 * netdev_linux_queue_dump_done(). */
struct netdev_linux_queue_state {
    unsigned int *queues;   /* Snapshot of all queue ids at dump start. */
    size_t cur_queue;       /* Index of the next queue id to visit. */
    size_t n_queues;        /* Number of entries in 'queues'. */
};
2710
/* Starts a queue dump, storing iterator state in '*statep'.  Takes a
 * snapshot of the queue ids so the dump is unaffected by concurrent queue
 * changes.  Returns 0 on success, otherwise a positive errno value. */
static int
netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get) {
            struct netdev_linux_queue_state *state;
            struct tc_queue *queue;
            size_t i;

            *statep = state = xmalloc(sizeof *state);
            state->n_queues = hmap_count(&netdev->tc->queues);
            state->cur_queue = 0;
            state->queues = xmalloc(state->n_queues * sizeof *state->queues);

            /* Copy out the ids; the hmap itself may change after unlock. */
            i = 0;
            HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
                state->queues[i++] = queue->queue_id;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2748
/* Advances a queue dump started by netdev_linux_queue_dump_start().  On
 * success stores the queue id in '*queue_idp' and its configuration in
 * 'details' and returns 0.  Returns EOF when the snapshot is exhausted.
 * Queue ids deleted since the snapshot was taken are silently skipped. */
static int
netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
                             unsigned int *queue_idp, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_linux_queue_state *state = state_;
    int error = EOF;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    while (state->cur_queue < state->n_queues) {
        unsigned int queue_id = state->queues[state->cur_queue++];
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);

        if (queue) {
            *queue_idp = queue_id;
            error = netdev->tc->ops->class_get(netdev_, queue, details);
            break;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2778
2779 static int
2780 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2781 void *state_)
2782 {
2783 struct netdev_linux_queue_state *state = state_;
2784
2785 free(state->queues);
2786 free(state);
2787 return 0;
2788 }
2789
/* Invokes 'cb' with 'aux' once per queue, feeding it stats parsed from a
 * netlink class dump.  Returns 0 on success, otherwise a positive errno
 * value; per-message parse failures are recorded but do not abort the
 * dump. */
static int
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
                              netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct queue_dump_state state;

        if (!netdev->tc->ops->class_dump_stats) {
            error = EOPNOTSUPP;
        } else if (!start_queue_dump(netdev_, &state)) {
            error = ENODEV;
        } else {
            struct ofpbuf msg;
            int retval;

            /* Keep the last nonzero status but process every message. */
            while (nl_dump_next(&state.dump, &msg, &state.buf)) {
                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
                                                           cb, aux);
                if (retval) {
                    error = retval;
                }
            }

            retval = finish_queue_dump(&state);
            if (retval) {
                error = retval;
            }
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2834
/* Assigns IPv4 'address' with 'netmask' to 'netdev_' via SIOCSIFADDR /
 * SIOCSIFNETMASK.  The netmask is only set for a non-INADDR_ANY address
 * (setting INADDR_ANY effectively clears the address).  Returns 0 on
 * success, otherwise a positive errno value. */
static int
netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
                     struct in_addr netmask)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
    if (!error) {
        if (address.s_addr != INADDR_ANY) {
            error = do_set_addr(netdev_, SIOCSIFNETMASK,
                                "SIOCSIFNETMASK", netmask);
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2860
/* Retrieves all addresses assigned to 'netdev_': stores malloc'd arrays of
 * addresses and masks in '*addr' and '*mask' and their count in '*n_cnt'.
 * The caller owns and must free the arrays.  Returns 0 on success, otherwise
 * a positive errno value.  (The previous comment here described an older
 * single-IPv6-address interface and did not match this function.) */
static int
netdev_linux_get_addr_list(const struct netdev *netdev_,
                          struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2883
2884 static void
2885 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2886 {
2887 struct sockaddr_in sin;
2888 memset(&sin, 0, sizeof sin);
2889 sin.sin_family = AF_INET;
2890 sin.sin_addr = addr;
2891 sin.sin_port = 0;
2892
2893 memset(sa, 0, sizeof *sa);
2894 memcpy(sa, &sin, sizeof sin);
2895 }
2896
/* Issues address-setting ioctl 'ioctl_nr' (named 'ioctl_name' for logging)
 * on 'netdev' with 'addr' as the argument.  Returns 0 on success, otherwise
 * a positive errno value.  af_inet_ifreq_ioctl() fills in the interface
 * name, so only the address field of 'ifr' is set here. */
static int
do_set_addr(struct netdev *netdev,
            int ioctl_nr, const char *ioctl_name, struct in_addr addr)
{
    struct ifreq ifr;

    make_in4_sockaddr(&ifr.ifr_addr, addr);
    return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
                               ioctl_name);
}
2907
2908 /* Adds 'router' as a default IP gateway. */
2909 static int
2910 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2911 {
2912 struct in_addr any = { INADDR_ANY };
2913 struct rtentry rt;
2914 int error;
2915
2916 memset(&rt, 0, sizeof rt);
2917 make_in4_sockaddr(&rt.rt_dst, any);
2918 make_in4_sockaddr(&rt.rt_gateway, router);
2919 make_in4_sockaddr(&rt.rt_genmask, any);
2920 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2921 error = af_inet_ioctl(SIOCADDRT, &rt);
2922 if (error) {
2923 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2924 }
2925 return error;
2926 }
2927
/* Looks up the route to 'host' by scanning /proc/net/route.  On a match,
 * stores the gateway (or 0 if directly reachable) in '*next_hop', a malloc'd
 * copy of the outgoing interface name in '*netdev_name' (caller frees), and
 * returns 0.  Returns ENXIO if no route matches, or another positive errno
 * on failure to read the table.
 *
 * NOTE(review): returns the first matching route in file order, not the
 * longest-prefix match — presumably acceptable for its callers; confirm
 * before reusing elsewhere. */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        if (++ln >= 2) {
            /* Skip the header line (ln == 1); parse the rest. */
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                        fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* The output of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need any endian
             * conversions here. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    fclose(stream);
    return ENXIO;
}
2987
/* Adds driver name/version and firmware version to 'smap', querying the
 * driver info via ETHTOOL_GDRVINFO on first use and caching it under
 * VALID_DRVINFO.  Returns 0 on success, otherwise a positive errno value. */
static int
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* netdev_linux_do_ethtool() takes a struct ethtool_cmd *, so the
         * drvinfo struct is passed through a cast. */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        cmd,
                                        ETHTOOL_GDRVINFO,
                                        "ETHTOOL_GDRVINFO");
        if (!error) {
            netdev->cache_valid |= VALID_DRVINFO;
        }
    }

    if (!error) {
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
        smap_add(smap, "driver_version", netdev->drvinfo.version);
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
3018
/* Status for internal devices is a fixed driver name; there is no hardware
 * to query.  Always returns 0. */
static int
netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
                           struct smap *smap)
{
    smap_add(smap, "driver_name", "openvswitch");
    return 0;
}
3026
/* Returns the tc block id for 'netdev_': its ifindex if it is a LAG master,
 * otherwise 0 (no block). */
static uint32_t
netdev_linux_get_block_id(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    uint32_t block_id = 0;

    ovs_mutex_lock(&netdev->mutex);
    /* Ensure the linux netdev has had its fields populated. */
    if (!(netdev->cache_valid & VALID_IFINDEX)) {
        netdev_linux_update_via_netlink(netdev);
    }

    /* Only assigning block ids to linux netdevs that are LAG masters. */
    if (netdev->is_lag_master) {
        block_id = netdev->ifindex;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return block_id;
}
3047
/* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
 * returns 0.  Otherwise, it returns a positive errno value; in particular,
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
static int
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, struct eth_addr *mac)
{
    struct arpreq r;
    struct sockaddr_in sin;
    int retval;

    /* Build the SIOCGARP request: protocol address plus device name. */
    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    sin.sin_port = 0;
    memcpy(&r.arp_pa, &sin, sizeof sin);
    r.arp_ha.sa_family = ARPHRD_ETHER;
    r.arp_flags = 0;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
    if (!retval) {
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO just means "no entry", which is expected; anything else is
         * worth logging. */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
    }
    return retval;
}
3080
3081 static int
3082 nd_to_iff_flags(enum netdev_flags nd)
3083 {
3084 int iff = 0;
3085 if (nd & NETDEV_UP) {
3086 iff |= IFF_UP;
3087 }
3088 if (nd & NETDEV_PROMISC) {
3089 iff |= IFF_PROMISC;
3090 }
3091 if (nd & NETDEV_LOOPBACK) {
3092 iff |= IFF_LOOPBACK;
3093 }
3094 return iff;
3095 }
3096
3097 static int
3098 iff_to_nd_flags(int iff)
3099 {
3100 enum netdev_flags nd = 0;
3101 if (iff & IFF_UP) {
3102 nd |= NETDEV_UP;
3103 }
3104 if (iff & IFF_PROMISC) {
3105 nd |= NETDEV_PROMISC;
3106 }
3107 if (iff & IFF_LOOPBACK) {
3108 nd |= NETDEV_LOOPBACK;
3109 }
3110 return nd;
3111 }
3112
/* Applies flag changes to the kernel device: clears 'off' bits, sets 'on'
 * bits, and stores the previous flags in '*old_flagsp'.  No ioctl is issued
 * if nothing changes.  Returns 0 on success, otherwise a positive errno
 * value.  Caller must hold netdev->mutex (enforced by OVS_REQUIRES). */
static int
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
{
    int old_flags, new_flags;
    int error = 0;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* Re-read the kernel's view regardless of set_flags() outcome, so the
         * cached ifi_flags stays in sync. */
        get_flags(&netdev->up, &netdev->ifi_flags);
    }

    return error;
}
3131
/* netdev_class update_flags hook: changes flags per 'off'/'on' and stores
 * the previous flags in '*old_flagsp'.  With no bits to change, this is a
 * pure read, which can be served over netlink even though writes cannot.
 * Returns 0 on success, otherwise a positive errno value. */
static int
netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
                          enum netdev_flags on, enum netdev_flags *old_flagsp)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (on || off) {
        /* Changing flags over netlink isn't supported yet. */
        if (netdev_linux_netnsid_is_remote(netdev)) {
            error = EOPNOTSUPP;
            goto exit;
        }
        error = update_flags(netdev, off, on, old_flagsp);
    } else {
        /* Try reading flags over netlink, or fall back to ioctl. */
        if (!netdev_linux_update_via_netlink(netdev)) {
            *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
        } else {
            /* off == on == 0, so update_flags() only reads. */
            error = update_flags(netdev, off, on, old_flagsp);
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3160
/* Expands to a struct netdev_class initializer shared by the "system",
 * "tap", and "internal" classes below.  The positional NULL entries are
 * optional netdev_class operations that the Linux implementations do not
 * provide; where a NULL carries an inline comment, that comment names the
 * corresponding struct field (presumed from struct netdev_class order --
 * confirm against netdev-provider.h when editing). */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
                           GET_FEATURES, GET_STATUS,            \
                           FLOW_OFFLOAD_API, GET_BLOCK_ID)      \
{                                                               \
    NAME,                                                       \
    false,                      /* is_pmd */                    \
                                                                \
    NULL,                                                       \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    netdev_linux_alloc,                                         \
    CONSTRUCT,                                                  \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
    NULL,                       /* build header */              \
    NULL,                       /* push header */               \
    NULL,                       /* pop header */                \
    NULL,                       /* get_numa_id */               \
    NULL,                       /* set_tx_multiq */             \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    NULL,                                                       \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
    NULL,                       /* get_pt_mode */               \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_addr_list,                                 \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
    NULL,                       /* reconfigure */               \
                                                                \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \
                                                                \
    FLOW_OFFLOAD_API,                                           \
    GET_BLOCK_ID                                                \
}
3238
/* The "system" class: ordinary kernel network devices, with hardware stats,
 * feature/status reporting, and the TC-based flow offload API. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_construct,
        netdev_linux_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status,
        LINUX_FLOW_OFFLOAD_API,
        netdev_linux_get_block_id);
3248
/* The "tap" class: userspace tap devices.  Uses the tap-specific stats
 * callback and offers no flow offload. */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_construct_tap,
        netdev_tap_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status,
        NO_OFFLOAD_API,
        NULL);
3258
/* The "internal" class: OVS internal ports.  No feature negotiation
 * (get_features is NULL) and no flow offload. */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_construct,
        netdev_internal_get_stats,
        NULL,                  /* get_features */
        netdev_internal_get_status,
        NO_OFFLOAD_API,
        NULL);
3268 \f
3269
/* CoDel traffic control class: a classless qdisc, hence zero queues. */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET 1
#define TCA_CODEL_LIMIT 2
#define TCA_CODEL_INTERVAL 3

/* OVS's cached view of a CoDel qdisc: the generic tc base plus the three
 * CoDel parameters (units are whatever the kernel codel qdisc uses for
 * the corresponding TCA_CODEL_* attributes). */
struct codel {
    struct tc tc;
    uint32_t target;      /* "target" parameter. */
    uint32_t limit;       /* "limit" parameter. */
    uint32_t interval;    /* "interval" parameter. */
};
3286
3287 static struct codel *
3288 codel_get__(const struct netdev *netdev_)
3289 {
3290 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3291 return CONTAINER_OF(netdev->tc, struct codel, tc);
3292 }
3293
3294 static void
3295 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3296 uint32_t interval)
3297 {
3298 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3299 struct codel *codel;
3300
3301 codel = xmalloc(sizeof *codel);
3302 tc_init(&codel->tc, &tc_ops_codel);
3303 codel->target = target;
3304 codel->limit = limit;
3305 codel->interval = interval;
3306
3307 netdev->tc = &codel->tc;
3308 }
3309
/* Replaces 'netdev''s root qdisc with a "codel" qdisc configured with the
 * given parameters, roughly "tc qdisc replace dev <dev> root codel ...".
 * Zero parameters fall back to defaults (target 5000, limit 10240,
 * interval 100000).
 *
 * Returns 0 on success, ENODEV if the netlink request could not be built,
 * otherwise a positive errno from tc_transact(). */
static int
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                    uint32_t interval)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval;
    int error;

    /* Remove any pre-existing root qdisc first. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Substitute defaults for unset (zero) parameters. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;

    nl_msg_put_string(&request, TCA_KIND, "codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval,
                     error, ovs_strerror(error));
    }
    return error;
}
3351
3352 static void
3353 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3354 const struct smap *details, struct codel *codel)
3355 {
3356 codel->target = smap_get_ullong(details, "target", 0);
3357 codel->limit = smap_get_ullong(details, "limit", 0);
3358 codel->interval = smap_get_ullong(details, "interval", 0);
3359
3360 if (!codel->target) {
3361 codel->target = 5000;
3362 }
3363 if (!codel->limit) {
3364 codel->limit = 10240;
3365 }
3366 if (!codel->interval) {
3367 codel->interval = 100000;
3368 }
3369 }
3370
3371 static int
3372 codel_tc_install(struct netdev *netdev, const struct smap *details)
3373 {
3374 int error;
3375 struct codel codel;
3376
3377 codel_parse_qdisc_details__(netdev, details, &codel);
3378 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3379 codel.interval);
3380 if (!error) {
3381 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3382 }
3383 return error;
3384 }
3385
3386 static int
3387 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3388 {
3389 static const struct nl_policy tca_codel_policy[] = {
3390 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3391 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3392 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3393 };
3394
3395 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3396
3397 if (!nl_parse_nested(nl_options, tca_codel_policy,
3398 attrs, ARRAY_SIZE(tca_codel_policy))) {
3399 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3400 return EPROTO;
3401 }
3402
3403 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3404 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3405 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
3406 return 0;
3407 }
3408
3409 static int
3410 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3411 {
3412 struct nlattr *nlattr;
3413 const char * kind;
3414 int error;
3415 struct codel codel;
3416
3417 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3418 if (error != 0) {
3419 return error;
3420 }
3421
3422 error = codel_parse_tca_options__(nlattr, &codel);
3423 if (error != 0) {
3424 return error;
3425 }
3426
3427 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3428 return 0;
3429 }
3430
3431
3432 static void
3433 codel_tc_destroy(struct tc *tc)
3434 {
3435 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3436 tc_destroy(tc);
3437 free(codel);
3438 }
3439
3440 static int
3441 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3442 {
3443 const struct codel *codel = codel_get__(netdev);
3444 smap_add_format(details, "target", "%u", codel->target);
3445 smap_add_format(details, "limit", "%u", codel->limit);
3446 smap_add_format(details, "interval", "%u", codel->interval);
3447 return 0;
3448 }
3449
3450 static int
3451 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3452 {
3453 struct codel codel;
3454
3455 codel_parse_qdisc_details__(netdev, details, &codel);
3456 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3457 codel_get__(netdev)->target = codel.target;
3458 codel_get__(netdev)->limit = codel.limit;
3459 codel_get__(netdev)->interval = codel.interval;
3460 return 0;
3461 }
3462
/* CoDel qdisc operations.  The trailing NULL entries are presumably the
 * per-class operations, which CoDel, being classless, does not need --
 * confirm against struct tc_ops when editing. */
static const struct tc_ops tc_ops_codel = {
    "codel",                      /* linux_name */
    "linux-codel",                /* ovs_name */
    CODEL_N_QUEUES,               /* n_queues */
    codel_tc_install,
    codel_tc_load,
    codel_tc_destroy,
    codel_qdisc_get,
    codel_qdisc_set,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3478 \f
/* FQ-CoDel traffic control class: classless, hence zero queues. */

#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET 1
#define TCA_FQ_CODEL_LIMIT 2
#define TCA_FQ_CODEL_INTERVAL 3
#define TCA_FQ_CODEL_ECN 4
#define TCA_FQ_CODEL_FLOWS 5
#define TCA_FQ_CODEL_QUANTUM 6

/* OVS's cached view of an fq_codel qdisc: the generic tc base plus the
 * parameters passed through to the kernel via the TCA_FQ_CODEL_*
 * attributes above. */
struct fqcodel {
    struct tc tc;
    uint32_t target;      /* "target" parameter. */
    uint32_t limit;       /* "limit" parameter. */
    uint32_t interval;    /* "interval" parameter. */
    uint32_t flows;       /* "flows" parameter. */
    uint32_t quantum;     /* "quantum" parameter. */
};
3502
3503 static struct fqcodel *
3504 fqcodel_get__(const struct netdev *netdev_)
3505 {
3506 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3507 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3508 }
3509
3510 static void
3511 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3512 uint32_t interval, uint32_t flows, uint32_t quantum)
3513 {
3514 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3515 struct fqcodel *fqcodel;
3516
3517 fqcodel = xmalloc(sizeof *fqcodel);
3518 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3519 fqcodel->target = target;
3520 fqcodel->limit = limit;
3521 fqcodel->interval = interval;
3522 fqcodel->flows = flows;
3523 fqcodel->quantum = quantum;
3524
3525 netdev->tc = &fqcodel->tc;
3526 }
3527
/* Replaces 'netdev''s root qdisc with an "fq_codel" qdisc configured with
 * the given parameters.  Zero parameters fall back to defaults (target
 * 5000, limit 10240, interval 100000, flows 1024, quantum 1514).
 *
 * Returns 0 on success, ENODEV if the netlink request could not be built,
 * otherwise a positive errno from tc_transact(). */
static int
fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                      uint32_t interval, uint32_t flows, uint32_t quantum)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval, oflows, oquantum;
    int error;

    /* Remove any pre-existing root qdisc first. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Substitute defaults for unset (zero) parameters. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;
    oflows = flows ? flows : 1024;
    oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
                                            not mtu */

    nl_msg_put_string(&request, TCA_KIND, "fq_codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval, oflows, oquantum,
                     error, ovs_strerror(error));
    }
    return error;
}
3574
3575 static void
3576 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3577 const struct smap *details, struct fqcodel *fqcodel)
3578 {
3579 fqcodel->target = smap_get_ullong(details, "target", 0);
3580 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3581 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3582 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3583 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3584
3585 if (!fqcodel->target) {
3586 fqcodel->target = 5000;
3587 }
3588 if (!fqcodel->limit) {
3589 fqcodel->limit = 10240;
3590 }
3591 if (!fqcodel->interval) {
3592 fqcodel->interval = 1000000;
3593 }
3594 if (!fqcodel->flows) {
3595 fqcodel->flows = 1024;
3596 }
3597 if (!fqcodel->quantum) {
3598 fqcodel->quantum = 1514;
3599 }
3600 }
3601
3602 static int
3603 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3604 {
3605 int error;
3606 struct fqcodel fqcodel;
3607
3608 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3609 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3610 fqcodel.interval, fqcodel.flows,
3611 fqcodel.quantum);
3612 if (!error) {
3613 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3614 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3615 }
3616 return error;
3617 }
3618
3619 static int
3620 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3621 {
3622 static const struct nl_policy tca_fqcodel_policy[] = {
3623 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3624 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3625 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3626 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3627 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3628 };
3629
3630 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3631
3632 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3633 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3634 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3635 return EPROTO;
3636 }
3637
3638 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3639 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3640 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3641 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3642 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3643 return 0;
3644 }
3645
3646 static int
3647 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3648 {
3649 struct nlattr *nlattr;
3650 const char * kind;
3651 int error;
3652 struct fqcodel fqcodel;
3653
3654 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3655 if (error != 0) {
3656 return error;
3657 }
3658
3659 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3660 if (error != 0) {
3661 return error;
3662 }
3663
3664 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3665 fqcodel.flows, fqcodel.quantum);
3666 return 0;
3667 }
3668
3669 static void
3670 fqcodel_tc_destroy(struct tc *tc)
3671 {
3672 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3673 tc_destroy(tc);
3674 free(fqcodel);
3675 }
3676
3677 static int
3678 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3679 {
3680 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3681 smap_add_format(details, "target", "%u", fqcodel->target);
3682 smap_add_format(details, "limit", "%u", fqcodel->limit);
3683 smap_add_format(details, "interval", "%u", fqcodel->interval);
3684 smap_add_format(details, "flows", "%u", fqcodel->flows);
3685 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3686 return 0;
3687 }
3688
3689 static int
3690 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3691 {
3692 struct fqcodel fqcodel;
3693
3694 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3695 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3696 fqcodel.flows, fqcodel.quantum);
3697 fqcodel_get__(netdev)->target = fqcodel.target;
3698 fqcodel_get__(netdev)->limit = fqcodel.limit;
3699 fqcodel_get__(netdev)->interval = fqcodel.interval;
3700 fqcodel_get__(netdev)->flows = fqcodel.flows;
3701 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3702 return 0;
3703 }
3704
/* fq_codel qdisc operations.  The trailing NULL entries are presumably the
 * per-class operations, which fq_codel, being classless, does not need --
 * confirm against struct tc_ops when editing. */
static const struct tc_ops tc_ops_fqcodel = {
    "fq_codel",                   /* linux_name */
    "linux-fq_codel",             /* ovs_name */
    FQCODEL_N_QUEUES,             /* n_queues */
    fqcodel_tc_install,
    fqcodel_tc_load,
    fqcodel_tc_destroy,
    fqcodel_qdisc_get,
    fqcodel_qdisc_set,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3720 \f
/* SFQ traffic control class: classless, hence zero queues. */

#define SFQ_N_QUEUES 0x0000

/* OVS's cached view of an sfq qdisc (see struct tc_sfq_qopt in
 * <linux/pkt_sched.h> for the kernel-side equivalents). */
struct sfq {
    struct tc tc;
    uint32_t quantum;   /* "quantum" parameter. */
    uint32_t perturb;   /* "perturb" parameter (perturb_period). */
};
3730
3731 static struct sfq *
3732 sfq_get__(const struct netdev *netdev_)
3733 {
3734 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3735 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3736 }
3737
3738 static void
3739 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3740 {
3741 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3742 struct sfq *sfq;
3743
3744 sfq = xmalloc(sizeof *sfq);
3745 tc_init(&sfq->tc, &tc_ops_sfq);
3746 sfq->perturb = perturb;
3747 sfq->quantum = quantum;
3748
3749 netdev->tc = &sfq->tc;
3750 }
3751
/* Replaces 'netdev''s root qdisc with an "sfq" qdisc configured with the
 * given parameters.  A zero 'quantum' falls back to the device MTU when it
 * can be determined (otherwise left zero for the kernel to default); a zero
 * 'perturb' falls back to 10.
 *
 * Returns 0 on success, ENODEV if the netlink request could not be built,
 * otherwise a positive errno from tc_transact(). */
static int
sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
{
    struct tc_sfq_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int mtu;
    int mtu_error, error;
    /* MTU lookup may fail; the error is only consulted when 'quantum' is
     * unset below. */
    mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);

    /* Remove any pre-existing root qdisc first. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    if (!quantum) {
        if (!mtu_error) {
            opt.quantum = mtu; /* if we cannot find mtu, use default */
        }
    } else {
        opt.quantum = quantum;
    }

    if (!perturb) {
        opt.perturb_period = 10;
    } else {
        opt.perturb_period = perturb;
    }

    nl_msg_put_string(&request, TCA_KIND, "sfq");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "quantum %u, perturb %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.quantum, opt.perturb_period,
                     error, ovs_strerror(error));
    }
    return error;
}
3800
3801 static void
3802 sfq_parse_qdisc_details__(struct netdev *netdev,
3803 const struct smap *details, struct sfq *sfq)
3804 {
3805 sfq->perturb = smap_get_ullong(details, "perturb", 0);
3806 sfq->quantum = smap_get_ullong(details, "quantum", 0);
3807
3808 if (!sfq->perturb) {
3809 sfq->perturb = 10;
3810 }
3811
3812 if (!sfq->quantum) {
3813 int mtu;
3814 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
3815 sfq->quantum = mtu;
3816 } else {
3817 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3818 "device without mtu");
3819 }
3820 }
3821 }
3822
3823 static int
3824 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3825 {
3826 int error;
3827 struct sfq sfq;
3828
3829 sfq_parse_qdisc_details__(netdev, details, &sfq);
3830 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3831 if (!error) {
3832 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3833 }
3834 return error;
3835 }
3836
3837 static int
3838 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3839 {
3840 const struct tc_sfq_qopt *sfq;
3841 struct nlattr *nlattr;
3842 const char * kind;
3843 int error;
3844
3845 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3846 if (error == 0) {
3847 sfq = nl_attr_get(nlattr);
3848 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
3849 return 0;
3850 }
3851
3852 return error;
3853 }
3854
3855 static void
3856 sfq_tc_destroy(struct tc *tc)
3857 {
3858 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3859 tc_destroy(tc);
3860 free(sfq);
3861 }
3862
3863 static int
3864 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3865 {
3866 const struct sfq *sfq = sfq_get__(netdev);
3867 smap_add_format(details, "quantum", "%u", sfq->quantum);
3868 smap_add_format(details, "perturb", "%u", sfq->perturb);
3869 return 0;
3870 }
3871
3872 static int
3873 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3874 {
3875 struct sfq sfq;
3876
3877 sfq_parse_qdisc_details__(netdev, details, &sfq);
3878 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3879 sfq_get__(netdev)->quantum = sfq.quantum;
3880 sfq_get__(netdev)->perturb = sfq.perturb;
3881 return 0;
3882 }
3883
/* SFQ qdisc operations.  The trailing NULL entries are presumably the
 * per-class operations, which SFQ, being classless, does not need --
 * confirm against struct tc_ops when editing. */
static const struct tc_ops tc_ops_sfq = {
    "sfq",                        /* linux_name */
    "linux-sfq",                  /* ovs_name */
    SFQ_N_QUEUES,                 /* n_queues */
    sfq_tc_install,
    sfq_tc_load,
    sfq_tc_destroy,
    sfq_qdisc_get,
    sfq_qdisc_set,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3899 \f
/* HTB traffic control class. */

#define HTB_N_QUEUES 0xf000       /* Maximum number of HTB queues (classes). */
#define HTB_RATE2QUANTUM 10       /* r2q divisor passed to the kernel. */

/* OVS's cached view of an HTB qdisc. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};

/* One HTB class (one OVS queue).  Embedded tc_queue links it into the
 * qdisc's queue hmap. */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
3917
3918 static struct htb *
3919 htb_get__(const struct netdev *netdev_)
3920 {
3921 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3922 return CONTAINER_OF(netdev->tc, struct htb, tc);
3923 }
3924
3925 static void
3926 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3927 {
3928 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3929 struct htb *htb;
3930
3931 htb = xmalloc(sizeof *htb);
3932 tc_init(&htb->tc, &tc_ops_htb);
3933 htb->max_rate = max_rate;
3934
3935 netdev->tc = &htb->tc;
3936 }
3937
/* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 *
 * Returns 0 on success, ENODEV if the netlink request could not be built,
 * otherwise a positive errno from tc_transact(). */
static int
htb_setup_qdisc__(struct netdev *netdev)
{
    size_t opt_offset;
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    /* Remove any pre-existing root qdisc first. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = HTB_RATE2QUANTUM;
    opt.version = 3;
    opt.defcls = 1;             /* Unclassified traffic goes to class 1:1. */

    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    return tc_transact(&request, NULL);
}
3972
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * Returns 0 on success, ENODEV if the netlink request could not be built,
 * otherwise a positive errno (including when the device MTU cannot be
 * determined, which HTB rate tables require). */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
        opt.quantum = mtu;
    }
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    /* Rate tables for 'rate' and 'ceil' must accompany the parameters. */
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
4032
/* Parses Netlink attributes in 'options' for HTB parameters and stores a
 * description of them into 'details'.  The description complies with the
 * specification given in the vswitch database documentation for linux-htb
 * queue details.
 *
 * Returns 0 on success, EPROTO if TCA_HTB_PARMS is missing or malformed. */
static int
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
{
    static const struct nl_policy tca_htb_policy[] = {
        [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
                            .min_len = sizeof(struct tc_htb_opt) },
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
    const struct tc_htb_opt *htb;

    if (!nl_parse_nested(nl_options, tca_htb_policy,
                         attrs, ARRAY_SIZE(tca_htb_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HTB class options");
        return EPROTO;
    }

    htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
    class->min_rate = htb->rate.rate;
    class->max_rate = htb->ceil.rate;
    /* The kernel reports 'buffer' in ticks; convert back to bytes. */
    class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
    class->priority = htb->prio;
    return 0;
}
4061
4062 static int
4063 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4064 struct htb_class *options,
4065 struct netdev_queue_stats *stats)
4066 {
4067 struct nlattr *nl_options;
4068 unsigned int handle;
4069 int error;
4070
4071 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4072 if (!error && queue_id) {
4073 unsigned int major = tc_get_major(handle);
4074 unsigned int minor = tc_get_minor(handle);
4075 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4076 *queue_id = minor - 1;
4077 } else {
4078 error = EPROTO;
4079 }
4080 }
4081 if (!error && options) {
4082 error = htb_parse_tca_options__(nl_options, options);
4083 }
4084 return error;
4085 }
4086
/* Parses qdisc-level HTB configuration from 'details' into 'hc'.
 * 'max-rate' in 'details' is in bits/s; it is stored in bytes/s.  When
 * unset, falls back to the link's feature-derived speed, or 100 Mbps when
 * the features are unknown.  min_rate is pinned to max_rate for the
 * default class; burst and priority are zeroed. */
static void
htb_parse_qdisc_details__(struct netdev *netdev_,
                          const struct smap *details, struct htb_class *hc)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
    if (!hc->max_rate) {
        enum netdev_features current;

        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }
    hc->min_rate = hc->max_rate;
    hc->burst = 0;
    hc->priority = 0;
}
4105
/* Parses per-class HTB configuration from 'details' into 'hc'.  Rates in
 * 'details' are in bits/s and stored in bytes/s, clamped against the device
 * MTU and the qdisc-wide max_rate.
 *
 * Returns 0 on success, or a positive errno when the MTU cannot be
 * determined (HTB needs it for clamping). */
static int
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
{
    const struct htb *htb = htb_get__(netdev);
    int mtu, error;
    unsigned long long int max_rate_bit;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate: defaults to the qdisc-wide rate; clamped to
     * [min_rate, qdisc max_rate]. */
    max_rate_bit = smap_get_ullong(details, "max-rate", 0);
    hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = smap_get_ullong(details, "burst", 0) / 8;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority */
    hc->priority = smap_get_ullong(details, "priority", 0);

    return 0;
}
4150
4151 static int
4152 htb_query_class__(const struct netdev *netdev, unsigned int handle,
4153 unsigned int parent, struct htb_class *options,
4154 struct netdev_queue_stats *stats)
4155 {
4156 struct ofpbuf *reply;
4157 int error;
4158
4159 error = tc_query_class(netdev, handle, parent, &reply);
4160 if (!error) {
4161 error = htb_parse_tcmsg__(reply, NULL, options, stats);
4162 ofpbuf_delete(reply);
4163 }
4164 return error;
4165 }
4166
4167 static int
4168 htb_tc_install(struct netdev *netdev, const struct smap *details)
4169 {
4170 int error;
4171
4172 error = htb_setup_qdisc__(netdev);
4173 if (!error) {
4174 struct htb_class hc;
4175
4176 htb_parse_qdisc_details__(netdev, details, &hc);
4177 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4178 tc_make_handle(1, 0), &hc);
4179 if (!error) {
4180 htb_install__(netdev, hc.max_rate);
4181 }
4182 }
4183 return error;
4184 }
4185
4186 static struct htb_class *
4187 htb_class_cast__(const struct tc_queue *queue)
4188 {
4189 return CONTAINER_OF(queue, struct htb_class, tc_queue);
4190 }
4191
/* Creates or updates OVS's cached record of HTB queue 'queue_id' on
 * 'netdev' with the parameters in 'hc'.  A new record is inserted into the
 * qdisc's queue hmap when the queue was not previously known. */
static void
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
                   const struct htb_class *hc)
{
    struct htb *htb = htb_get__(netdev);
    size_t hash = hash_int(queue_id, 0);
    struct tc_queue *queue;
    struct htb_class *hcp;

    queue = tc_find_queue__(netdev, queue_id, hash);
    if (queue) {
        hcp = htb_class_cast__(queue);
    } else {
        /* First sighting of this queue: allocate and link a new record. */
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
    }

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
    hcp->burst = hc->burst;
    hcp->priority = hc->priority;
}
4217
/* "tc_load" for HTB: queries the default class (1:0xfffe) for the
 * qdisc-wide max-rate, then dumps the kernel's classes and caches each one
 * as an OVS queue.
 *
 * Returns 0 on success, or ENODEV if the class dump cannot be started. */
static int
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct htb_class hc;

    /* Get qdisc options.  hc.max_rate is pre-zeroed so a failed query
     * (whose error is ignored here) leaves a max-rate of 0. */
    hc.max_rate = 0;
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    htb_install__(netdev, hc.max_rate);

    /* Get queues. */
    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Unparseable classes are skipped rather than failing the load. */
        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            htb_update_queue__(netdev, queue_id, &hc);
        }
    }
    finish_queue_dump(&state);

    return 0;
}
4245
/* Frees the HTB tc state: every cached queue, the generic tc state, and
 * finally the 'struct htb' wrapper itself. */
static void
htb_tc_destroy(struct tc *tc)
{
    struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
    struct htb_class *hc;

    /* HMAP_FOR_EACH_POP removes each node as it iterates, so no separate
     * hmap_remove() is needed before free(). */
    HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
        free(hc);
    }
    tc_destroy(tc);
    free(htb);
}
4258
4259 static int
4260 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
4261 {
4262 const struct htb *htb = htb_get__(netdev);
4263 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
4264 return 0;
4265 }
4266
4267 static int
4268 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
4269 {
4270 struct htb_class hc;
4271 int error;
4272
4273 htb_parse_qdisc_details__(netdev, details, &hc);
4274 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4275 tc_make_handle(1, 0), &hc);
4276 if (!error) {
4277 htb_get__(netdev)->max_rate = hc.max_rate;
4278 }
4279 return error;
4280 }
4281
4282 static int
4283 htb_class_get(const struct netdev *netdev OVS_UNUSED,
4284 const struct tc_queue *queue, struct smap *details)
4285 {
4286 const struct htb_class *hc = htb_class_cast__(queue);
4287
4288 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4289 if (hc->min_rate != hc->max_rate) {
4290 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4291 }
4292 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
4293 if (hc->priority) {
4294 smap_add_format(details, "priority", "%u", hc->priority);
4295 }
4296 return 0;
4297 }
4298
4299 static int
4300 htb_class_set(struct netdev *netdev, unsigned int queue_id,
4301 const struct smap *details)
4302 {
4303 struct htb_class hc;
4304 int error;
4305
4306 error = htb_parse_class_details__(netdev, details, &hc);
4307 if (error) {
4308 return error;
4309 }
4310
4311 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4312 tc_make_handle(1, 0xfffe), &hc);
4313 if (error) {
4314 return error;
4315 }
4316
4317 htb_update_queue__(netdev, queue_id, &hc);
4318 return 0;
4319 }
4320
4321 static int
4322 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
4323 {
4324 struct htb_class *hc = htb_class_cast__(queue);
4325 struct htb *htb = htb_get__(netdev);
4326 int error;
4327
4328 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4329 if (!error) {
4330 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
4331 free(hc);
4332 }
4333 return error;
4334 }
4335
4336 static int
4337 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4338 struct netdev_queue_stats *stats)
4339 {
4340 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4341 tc_make_handle(1, 0xfffe), NULL, stats);
4342 }
4343
4344 static int
4345 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4346 const struct ofpbuf *nlmsg,
4347 netdev_dump_queue_stats_cb *cb, void *aux)
4348 {
4349 struct netdev_queue_stats stats;
4350 unsigned int handle, major, minor;
4351 int error;
4352
4353 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4354 if (error) {
4355 return error;
4356 }
4357
4358 major = tc_get_major(handle);
4359 minor = tc_get_minor(handle);
4360 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4361 (*cb)(minor - 1, &stats, aux);
4362 }
4363 return 0;
4364 }
4365
/* HTB vtable; hooked up to the OVS QoS type "linux-htb". */
static const struct tc_ops tc_ops_htb = {
    "htb",                      /* linux_name */
    "linux-htb",                /* ovs_name */
    HTB_N_QUEUES,               /* n_queues */
    htb_tc_install,             /* tc_install */
    htb_tc_load,                /* tc_load */
    htb_tc_destroy,             /* tc_destroy */
    htb_qdisc_get,              /* qdisc_get */
    htb_qdisc_set,              /* qdisc_set */
    htb_class_get,              /* class_get */
    htb_class_set,              /* class_set */
    htb_class_delete,           /* class_delete */
    htb_class_get_stats,        /* class_get_stats */
    htb_class_dump_stats        /* class_dump_stats */
};
4381 \f
/* "linux-hfsc" traffic control class. */

#define HFSC_N_QUEUES 0xf000    /* Maximum queues per netdev for HFSC. */

/* tc state for an HFSC root qdisc managed by OVS. */
struct hfsc {
    struct tc tc;               /* Generic tc state; embedded. */
    uint32_t max_rate;          /* Qdisc ceiling, in bytes/s. */
};

/* One HFSC leaf class, i.e. one OVS queue. */
struct hfsc_class {
    struct tc_queue tc_queue;   /* Generic queue state; embedded. */
    uint32_t min_rate;          /* In bytes/s. */
    uint32_t max_rate;          /* In bytes/s. */
};
4396
4397 static struct hfsc *
4398 hfsc_get__(const struct netdev *netdev_)
4399 {
4400 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4401 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
4402 }
4403
4404 static struct hfsc_class *
4405 hfsc_class_cast__(const struct tc_queue *queue)
4406 {
4407 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4408 }
4409
4410 static void
4411 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4412 {
4413 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4414 struct hfsc *hfsc;
4415
4416 hfsc = xmalloc(sizeof *hfsc);
4417 tc_init(&hfsc->tc, &tc_ops_hfsc);
4418 hfsc->max_rate = max_rate;
4419 netdev->tc = &hfsc->tc;
4420 }
4421
4422 static void
4423 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4424 const struct hfsc_class *hc)
4425 {
4426 size_t hash;
4427 struct hfsc *hfsc;
4428 struct hfsc_class *hcp;
4429 struct tc_queue *queue;
4430
4431 hfsc = hfsc_get__(netdev);
4432 hash = hash_int(queue_id, 0);
4433
4434 queue = tc_find_queue__(netdev, queue_id, hash);
4435 if (queue) {
4436 hcp = hfsc_class_cast__(queue);
4437 } else {
4438 hcp = xmalloc(sizeof *hcp);
4439 queue = &hcp->tc_queue;
4440 queue->queue_id = queue_id;
4441 queue->created = time_msec();
4442 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4443 }
4444
4445 hcp->min_rate = hc->min_rate;
4446 hcp->max_rate = hc->max_rate;
4447 }
4448
/* Extracts min/max rates from the TCA_OPTIONS of a dumped HFSC class into
 * 'class'.  Only the restricted shape that OVS itself configures is
 * accepted (linear curves, real-time == link-share, min <= max); anything
 * else yields EPROTO. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    /* RSC = real-time, FSC = link-share, USC = upper-limit service curve. */
    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    /* OVS only ever writes linear curves (m1 == 0, d == 0); see
     * hfsc_setup_class__(). */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
4507
/* Parses a dumped HFSC class message.  Any of 'queue_id', 'options', and
 * 'stats' may be null to skip that output.  Returns 0 on success, a
 * positive errno (EPROTO for malformed or non-OVS classes) on failure. */
static int
hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                   struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    int error;
    unsigned int handle;
    struct nlattr *nl_options;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (error) {
        return error;
    }

    if (queue_id) {
        unsigned int major, minor;

        /* Only classes 1:1 through 1:HFSC_N_QUEUES are OVS queues;
         * queue ID is the minor number minus one. */
        major = tc_get_major(handle);
        minor = tc_get_minor(handle);
        if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
            *queue_id = minor - 1;
        } else {
            return EPROTO;
        }
    }

    if (options) {
        error = hfsc_parse_tca_options__(nl_options, options);
    }

    return error;
}
4540
4541 static int
4542 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4543 unsigned int parent, struct hfsc_class *options,
4544 struct netdev_queue_stats *stats)
4545 {
4546 int error;
4547 struct ofpbuf *reply;
4548
4549 error = tc_query_class(netdev, handle, parent, &reply);
4550 if (error) {
4551 return error;
4552 }
4553
4554 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4555 ofpbuf_delete(reply);
4556 return error;
4557 }
4558
/* Derives the root-class configuration from 'details'.  "max-rate" arrives
 * in bits/s; internal rates are bytes/s.  The default class uses
 * min == max. */
static void
hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
                           struct hfsc_class *class)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
    if (!max_rate) {
        enum netdev_features current;

        /* No explicit ceiling: fall back to the link speed, defaulting to
         * 100 Mbps when the features cannot be read. */
        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }

    class->min_rate = max_rate;
    class->max_rate = max_rate;
}
4577
4578 static int
4579 hfsc_parse_class_details__(struct netdev *netdev,
4580 const struct smap *details,
4581 struct hfsc_class * class)
4582 {
4583 const struct hfsc *hfsc;
4584 uint32_t min_rate, max_rate;
4585
4586 hfsc = hfsc_get__(netdev);
4587
4588 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4589 min_rate = MAX(min_rate, 1);
4590 min_rate = MIN(min_rate, hfsc->max_rate);
4591
4592 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
4593 max_rate = MAX(max_rate, min_rate);
4594 max_rate = MIN(max_rate, hfsc->max_rate);
4595
4596 class->min_rate = min_rate;
4597 class->max_rate = max_rate;
4598
4599 return 0;
4600 }
4601
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
static int
hfsc_setup_qdisc__(struct netdev * netdev)
{
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    /* Remove any existing root qdisc first so the add cannot collide. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    /* Root qdisc handle 1:0. */
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* 'defcls' is the class unclassified traffic falls into. */
    memset(&opt, 0, sizeof opt);
    opt.defcls = 1;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    return tc_transact(&request, NULL);
}
4632
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>" */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Linear service curves only: m1 (initial slope) and d (transition
     * point) are zero; m2 is the steady-state rate in bytes/s. */
    min.m1 = 0;
    min.d = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d = 0;
    max.m2 = class->max_rate;

    /* 'min' is used for both the real-time (RSC) and link-share (FSC)
     * curves; 'max' becomes the upper limit (USC). */
    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
4684
4685 static int
4686 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4687 {
4688 int error;
4689 struct hfsc_class class;
4690
4691 error = hfsc_setup_qdisc__(netdev);
4692
4693 if (error) {
4694 return error;
4695 }
4696
4697 hfsc_parse_qdisc_details__(netdev, details, &class);
4698 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4699 tc_make_handle(1, 0), &class);
4700
4701 if (error) {
4702 return error;
4703 }
4704
4705 hfsc_install__(netdev, class.max_rate);
4706 return 0;
4707 }
4708
/* Loads existing HFSC state from the kernel into the netdev's tc state.
 * Returns 0 on success, ENODEV if the queue dump cannot be started. */
static int
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct hfsc_class hc;

    /* If the query fails, 'hc.max_rate' keeps its 0 default; the tc state
     * is installed either way so the netdev always has a valid 'tc'. */
    hc.max_rate = 0;
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    hfsc_install__(netdev, hc.max_rate);

    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }

    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Classes that do not parse as OVS-style HFSC queues are skipped. */
        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            hfsc_update_queue__(netdev, queue_id, &hc);
        }
    }

    finish_queue_dump(&state);
    return 0;
}
4735
4736 static void
4737 hfsc_tc_destroy(struct tc *tc)
4738 {
4739 struct hfsc *hfsc;
4740 struct hfsc_class *hc, *next;
4741
4742 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4743
4744 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4745 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4746 free(hc);
4747 }
4748
4749 tc_destroy(tc);
4750 free(hfsc);
4751 }
4752
4753 static int
4754 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4755 {
4756 const struct hfsc *hfsc;
4757 hfsc = hfsc_get__(netdev);
4758 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
4759 return 0;
4760 }
4761
4762 static int
4763 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4764 {
4765 int error;
4766 struct hfsc_class class;
4767
4768 hfsc_parse_qdisc_details__(netdev, details, &class);
4769 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4770 tc_make_handle(1, 0), &class);
4771
4772 if (!error) {
4773 hfsc_get__(netdev)->max_rate = class.max_rate;
4774 }
4775
4776 return error;
4777 }
4778
4779 static int
4780 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4781 const struct tc_queue *queue, struct smap *details)
4782 {
4783 const struct hfsc_class *hc;
4784
4785 hc = hfsc_class_cast__(queue);
4786 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4787 if (hc->min_rate != hc->max_rate) {
4788 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4789 }
4790 return 0;
4791 }
4792
4793 static int
4794 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4795 const struct smap *details)
4796 {
4797 int error;
4798 struct hfsc_class class;
4799
4800 error = hfsc_parse_class_details__(netdev, details, &class);
4801 if (error) {
4802 return error;
4803 }
4804
4805 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4806 tc_make_handle(1, 0xfffe), &class);
4807 if (error) {
4808 return error;
4809 }
4810
4811 hfsc_update_queue__(netdev, queue_id, &class);
4812 return 0;
4813 }
4814
4815 static int
4816 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4817 {
4818 int error;
4819 struct hfsc *hfsc;
4820 struct hfsc_class *hc;
4821
4822 hc = hfsc_class_cast__(queue);
4823 hfsc = hfsc_get__(netdev);
4824
4825 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4826 if (!error) {
4827 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4828 free(hc);
4829 }
4830 return error;
4831 }
4832
4833 static int
4834 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4835 struct netdev_queue_stats *stats)
4836 {
4837 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4838 tc_make_handle(1, 0xfffe), NULL, stats);
4839 }
4840
4841 static int
4842 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4843 const struct ofpbuf *nlmsg,
4844 netdev_dump_queue_stats_cb *cb, void *aux)
4845 {
4846 struct netdev_queue_stats stats;
4847 unsigned int handle, major, minor;
4848 int error;
4849
4850 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4851 if (error) {
4852 return error;
4853 }
4854
4855 major = tc_get_major(handle);
4856 minor = tc_get_minor(handle);
4857 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4858 (*cb)(minor - 1, &stats, aux);
4859 }
4860 return 0;
4861 }
4862
/* HFSC vtable; hooked up to the OVS QoS type "linux-hfsc". */
static const struct tc_ops tc_ops_hfsc = {
    "hfsc",                     /* linux_name */
    "linux-hfsc",               /* ovs_name */
    HFSC_N_QUEUES,              /* n_queues */
    hfsc_tc_install,            /* tc_install */
    hfsc_tc_load,               /* tc_load */
    hfsc_tc_destroy,            /* tc_destroy */
    hfsc_qdisc_get,             /* qdisc_get */
    hfsc_qdisc_set,             /* qdisc_set */
    hfsc_class_get,             /* class_get */
    hfsc_class_set,             /* class_set */
    hfsc_class_delete,          /* class_delete */
    hfsc_class_get_stats,       /* class_get_stats */
    hfsc_class_dump_stats       /* class_dump_stats */
};
4878 \f
4879 /* "linux-noop" traffic control class. */
4880
/* Points the netdev at a shared, immutable tc object for the no-op class.
 *
 * NOTE(review): this initializes the tc with '&tc_ops_default' rather than
 * '&tc_ops_noop'.  Presumably that makes a device configured with QoS type
 * "linux-noop" report the default (empty) type afterwards — confirm whether
 * this is intentional before changing it. */
static void
noop_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    netdev->tc = CONST_CAST(struct tc *, &tc);
}
4889
/* "Installs" the no-op QoS: nothing is configured in the kernel; only the
 * netdev's tc state is recorded.  Always succeeds. */
static int
noop_tc_install(struct netdev *netdev,
                const struct smap *details OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
4897
/* Loads the no-op QoS state; the kernel qdisc message is ignored. */
static int
noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
4904
/* Vtable for the "linux-noop" QoS type: OVS leaves the device's qdisc
 * entirely alone, so every optional operation is null. */
static const struct tc_ops tc_ops_noop = {
    NULL,                       /* linux_name */
    "linux-noop",               /* ovs_name */
    0,                          /* n_queues */
    noop_tc_install,            /* tc_install */
    noop_tc_load,               /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4920 \f
4921 /* "linux-default" traffic control class.
4922 *
4923 * This class represents the default, unnamed Linux qdisc. It corresponds to
4924 * the "" (empty string) QoS type in the OVS database. */
4925
/* Points the netdev at a shared, immutable tc object for the default
 * (unnamed) qdisc class. */
static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
4936
/* "Installs" the default QoS type: records the shared tc state only;
 * the kernel qdisc is not modified.  Always succeeds. */
static int
default_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
4944
/* Loads the default QoS state; the kernel qdisc message is ignored. */
static int
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
4951
/* Vtable for the "" (empty string) QoS type: the default, unnamed Linux
 * qdisc.  No queue or qdisc manipulation is supported. */
static const struct tc_ops tc_ops_default = {
    NULL,                       /* linux_name */
    "",                         /* ovs_name */
    0,                          /* n_queues */
    default_tc_install,         /* tc_install */
    default_tc_load,            /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4967 \f
4968 /* "linux-other" traffic control class.
4969 *
4970 * */
4971
/* Records "linux-other" tc state for a qdisc OVS does not recognize.
 * The qdisc message itself is ignored.  Always succeeds. */
static int
other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
    return 0;
}
4983
/* Vtable for the "linux-other" QoS type: a qdisc OVS did not configure and
 * cannot manage.  'tc_install' is null, so this type cannot be set — only
 * loaded from an existing kernel configuration. */
static const struct tc_ops tc_ops_other = {
    NULL,                       /* linux_name */
    "linux-other",              /* ovs_name */
    0,                          /* n_queues */
    NULL,                       /* tc_install */
    other_tc_load,              /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4999 \f
5000 /* Traffic control. */
5001
5002 /* Number of kernel "tc" ticks per second. */
5003 static double ticks_per_s;
5004
5005 /* Number of kernel "jiffies" per second. This is used for the purpose of
5006 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
5007 * one jiffy's worth of data.
5008 *
5009 * There are two possibilities here:
5010 *
5011 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
5012 * approximate range of 100 to 1024. That means that we really need to
5013 * make sure that the qdisc can buffer that much data.
5014 *
5015 * - 'buffer_hz' is an absurdly large number. That means that the kernel
5016 * has finely granular timers and there's no need to fudge additional room
5017 * for buffers. (There's no extra effort needed to implement that: the
5018 * large 'buffer_hz' is used as a divisor, so practically any number will
5019 * come out as 0 in the division. Small integer results in the case of
5020 * really high dividends won't have any real effect anyhow.)
5021 */
5022 static unsigned int buffer_hz;
5023
5024 static struct tcmsg *
5025 netdev_linux_tc_make_request(const struct netdev *netdev, int type,
5026 unsigned int flags, struct ofpbuf *request)
5027 {
5028 int ifindex;
5029 int error;
5030
5031 error = get_ifindex(netdev, &ifindex);
5032 if (error) {
5033 return NULL;
5034 }
5035
5036 return tc_make_request(ifindex, type, flags, request);
5037 }
5038
5039 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
5040 * of 'kbits_burst'.
5041 *
5042 * This function is equivalent to running:
5043 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
5044 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
5045 * mtu 65535 drop
5046 *
5047 * The configuration and stats may be seen with the following command:
5048 * /sbin/tc -s filter show dev <devname> parent ffff:
5049 *
5050 * Returns 0 if successful, otherwise a positive errno value.
5051 */
5052 static int
5053 tc_add_policer(struct netdev *netdev,
5054 uint32_t kbits_rate, uint32_t kbits_burst)
5055 {
5056 struct tc_police tc_police;
5057 struct ofpbuf request;
5058 struct tcmsg *tcmsg;
5059 size_t basic_offset;
5060 size_t police_offset;
5061 int error;
5062 int mtu = 65535;
5063
5064 memset(&tc_police, 0, sizeof tc_police);
5065 tc_police.action = TC_POLICE_SHOT;
5066 tc_police.mtu = mtu;
5067 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
5068
5069 /* The following appears wrong in one way: In networking a kilobit is
5070 * usually 1000 bits but this uses 1024 bits.
5071 *
5072 * However if you "fix" those problems then "tc filter show ..." shows
5073 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
5074 * 1,000,000 bits, whereas this actually ends up doing the right thing from
5075 * tc's point of view. Whatever. */
5076 tc_police.burst = tc_bytes_to_ticks(
5077 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
5078
5079 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
5080 NLM_F_EXCL | NLM_F_CREATE, &request);
5081 if (!tcmsg) {
5082 return ENODEV;
5083 }
5084 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
5085 tcmsg->tcm_info = tc_make_handle(49,
5086 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
5087
5088 nl_msg_put_string(&request, TCA_KIND, "basic");
5089 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
5090 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
5091 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
5092 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
5093 nl_msg_end_nested(&request, police_offset);
5094 nl_msg_end_nested(&request, basic_offset);
5095
5096 error = tc_transact(&request, NULL);
5097 if (error) {
5098 return error;
5099 }
5100
5101 return 0;
5102 }
5103
/* Reads /proc/net/psched once per process and derives 'ticks_per_s' and
 * 'buffer_hz' from it.  On any failure the conservative defaults
 * ticks_per_s = 1.0 and buffer_hz = 100 remain in effect. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *     (Before that, there are hints that it was 1000000000.)
     *
     *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *     above.
     *
     *                         /proc/net/psched
     *     -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d ticks_per_s     buffer_hz
     *     ------- --------- ---------- ------------- ----------- -------------
     * [1] 819,200 1,000,000  1,000,000           100     819,200           100
     * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [4]   1,000     1,024  1,000,000           100     976,562           100
     * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
     * [6]   1,000        64  1,000,000           249  15,625,000           249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    /* Run the body only once; later callers just see the cached globals. */
    if (!ovsthread_once_start(&once)) {
        return;
    }

    /* Fallback values used when the file is missing or unparseable. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    /* 'a' and 'c' are divisors/multipliers below; zero would be invalid. */
    if (!a || !c) {
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
5186
5187 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5188 * rate of 'rate' bytes per second. */
5189 static unsigned int
5190 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
5191 {
5192 read_psched();
5193 return (rate * ticks) / ticks_per_s;
5194 }
5195
/* Returns the number of ticks that it would take to transmit 'size' bytes at a
 * rate of 'rate' bytes per second.  Returns 0 when 'rate' is 0 to avoid
 * dividing by zero.  The multiplication is widened to 64 bits to avoid
 * overflow for large sizes or tick rates. */
static unsigned int
tc_bytes_to_ticks(unsigned int rate, unsigned int size)
{
    read_psched();
    return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
}
5204
/* Returns the number of bytes that need to be reserved for qdisc buffering at
 * a transmission rate of 'rate' bytes per second.  With a very large
 * 'buffer_hz' (see its comment above) this quotient is effectively 0. */
static unsigned int
tc_buffer_per_jiffy(unsigned int rate)
{
    read_psched();
    return rate / buffer_hz;
}
5213
/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
 * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
 * stores NULL into it if it is absent.
 *
 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
 * Returns 0 if successful, otherwise a positive errno value.  On failure,
 * '*kind' and '*options' (when nonnull) are set to NULL. */
static int
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
               struct nlattr **options)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    /* Attributes follow the netlink header plus the fixed tcmsg header. */
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");
        goto error;
    }

    if (kind) {
        *kind = nl_attr_get_string(ta[TCA_KIND]);
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    return 0;

error:
    if (kind) {
        *kind = NULL;
    }
    if (options) {
        *options = NULL;
    }
    return EPROTO;
}
5258
/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
 * into '*options', and its queue statistics into '*stats'.  Any of the output
 * arguments may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value.  On failure,
 * '*options' is set to NULL and '*stats' is zeroed (when nonnull). */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        /* The class handle lives in the fixed tcmsg header, not in an
         * attribute. */
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        /* Queue drops are the closest thing we have to "tx errors". */
        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
5333
5334 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5335 * on 'netdev'. */
5336 static int
5337 tc_query_class(const struct netdev *netdev,
5338 unsigned int handle, unsigned int parent,
5339 struct ofpbuf **replyp)
5340 {
5341 struct ofpbuf request;
5342 struct tcmsg *tcmsg;
5343 int error;
5344
5345 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
5346 &request);
5347 if (!tcmsg) {
5348 return ENODEV;
5349 }
5350 tcmsg->tcm_handle = handle;
5351 tcmsg->tcm_parent = parent;
5352
5353 error = tc_transact(&request, replyp);
5354 if (error) {
5355 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5356 netdev_get_name(netdev),
5357 tc_get_major(handle), tc_get_minor(handle),
5358 tc_get_major(parent), tc_get_minor(parent),
5359 ovs_strerror(error));
5360 }
5361 return error;
5362 }
5363
5364 /* Equivalent to "tc class del dev <name> handle <handle>". */
5365 static int
5366 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5367 {
5368 struct ofpbuf request;
5369 struct tcmsg *tcmsg;
5370 int error;
5371
5372 tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5373 if (!tcmsg) {
5374 return ENODEV;
5375 }
5376 tcmsg->tcm_handle = handle;
5377 tcmsg->tcm_parent = 0;
5378
5379 error = tc_transact(&request, NULL);
5380 if (error) {
5381 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5382 netdev_get_name(netdev),
5383 tc_get_major(handle), tc_get_minor(handle),
5384 ovs_strerror(error));
5385 }
5386 return error;
5387 }
5388
5389 /* Equivalent to "tc qdisc del dev <name> root". */
5390 static int
5391 tc_del_qdisc(struct netdev *netdev_)
5392 {
5393 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5394 struct ofpbuf request;
5395 struct tcmsg *tcmsg;
5396 int error;
5397
5398 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5399 if (!tcmsg) {
5400 return ENODEV;
5401 }
5402 tcmsg->tcm_handle = tc_make_handle(1, 0);
5403 tcmsg->tcm_parent = TC_H_ROOT;
5404
5405 error = tc_transact(&request, NULL);
5406 if (error == EINVAL) {
5407 /* EINVAL probably means that the default qdisc was in use, in which
5408 * case we've accomplished our purpose. */
5409 error = 0;
5410 }
5411 if (!error && netdev->tc) {
5412 if (netdev->tc->ops->tc_destroy) {
5413 netdev->tc->ops->tc_destroy(netdev->tc);
5414 }
5415 netdev->tc = NULL;
5416 }
5417 return error;
5418 }
5419
5420 static bool
5421 getqdisc_is_safe(void)
5422 {
5423 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5424 static bool safe = false;
5425
5426 if (ovsthread_once_start(&once)) {
5427 struct utsname utsname;
5428 int major, minor;
5429
5430 if (uname(&utsname) == -1) {
5431 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5432 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5433 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5434 } else if (major < 2 || (major == 2 && minor < 35)) {
5435 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5436 utsname.release);
5437 } else {
5438 safe = true;
5439 }
5440 ovsthread_once_done(&once);
5441 }
5442 return safe;
5443 }
5444
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value. */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    if (netdev->tc) {
        /* Already cached; nothing to query. */
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            /* Unparseable reply: treat it as a qdisc we do not manage. */
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it.  tc_load() is expected to set netdev->tc exactly when
     * it succeeds, which the assertion below enforces. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
5524
5525 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5526 approximate the time to transmit packets of various lengths. For an MTU of
5527 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5528 represents two possible packet lengths; for a MTU of 513 through 1024, four
5529 possible lengths; and so on.
5530
5531 Returns, for the specified 'mtu', the number of bits that packet lengths
5532 need to be shifted right to fit within such a 256-entry table. */
5533 static int
5534 tc_calc_cell_log(unsigned int mtu)
5535 {
5536 int cell_log;
5537
5538 if (!mtu) {
5539 mtu = ETH_PAYLOAD_MAX;
5540 }
5541 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5542
5543 for (cell_log = 0; mtu >= 256; cell_log++) {
5544 mtu >>= 1;
5545 }
5546
5547 return cell_log;
5548 }
5549
5550 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5551 * of 'mtu'. */
5552 static void
5553 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5554 {
5555 memset(rate, 0, sizeof *rate);
5556 rate->cell_log = tc_calc_cell_log(mtu);
5557 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5558 /* rate->cell_align = 0; */ /* distro headers. */
5559 rate->mpu = ETH_TOTAL_MIN;
5560 rate->rate = Bps;
5561 }
5562
5563 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5564 * attribute of the specified "type".
5565 *
5566 * See tc_calc_cell_log() above for a description of "rtab"s. */
5567 static void
5568 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5569 {
5570 uint32_t *rtab;
5571 unsigned int i;
5572
5573 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5574 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5575 unsigned packet_size = (i + 1) << rate->cell_log;
5576 if (packet_size < rate->mpu) {
5577 packet_size = rate->mpu;
5578 }
5579 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5580 }
5581 }
5582
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes'
 * of 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Never let the burst fall below one jiffy's worth of traffic plus one
     * full packet. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > min_burst ? burst_bytes : min_burst;

    return tc_bytes_to_ticks(Bps, burst);
}
5593 \f
5594 /* Linux-only functions declared in netdev-linux.h */
5595
5596 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5597 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5598 int
5599 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5600 const char *flag_name, bool enable)
5601 {
5602 const char *netdev_name = netdev_get_name(netdev);
5603 struct ethtool_value evalue;
5604 uint32_t new_flags;
5605 int error;
5606
5607 COVERAGE_INC(netdev_get_ethtool);
5608 memset(&evalue, 0, sizeof evalue);
5609 error = netdev_linux_do_ethtool(netdev_name,
5610 (struct ethtool_cmd *)&evalue,
5611 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5612 if (error) {
5613 return error;
5614 }
5615
5616 COVERAGE_INC(netdev_set_ethtool);
5617 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5618 if (new_flags == evalue.data) {
5619 return 0;
5620 }
5621 evalue.data = new_flags;
5622 error = netdev_linux_do_ethtool(netdev_name,
5623 (struct ethtool_cmd *)&evalue,
5624 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5625 if (error) {
5626 return error;
5627 }
5628
5629 COVERAGE_INC(netdev_get_ethtool);
5630 memset(&evalue, 0, sizeof evalue);
5631 error = netdev_linux_do_ethtool(netdev_name,
5632 (struct ethtool_cmd *)&evalue,
5633 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5634 if (error) {
5635 return error;
5636 }
5637
5638 if (new_flags != evalue.data) {
5639 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5640 "device %s failed", enable ? "enable" : "disable",
5641 flag_name, netdev_name);
5642 return EOPNOTSUPP;
5643 }
5644
5645 return 0;
5646 }
5647 \f
5648 /* Utility functions. */
5649
/* Copies 'src' into 'dst', performing format conversion in the process.
 * This is a straight field-by-field copy: both structures use identical
 * member names, so each assignment widens the kernel's counter into the
 * corresponding netdev_stats field. */
static void
netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
                                  const struct rtnl_link_stats *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
5677
/* Copies 'src' into 'dst', performing format conversion in the process.
 * Identical in structure to netdev_stats_from_rtnl_link_stats() but for the
 * 64-bit IFLA_STATS64 counters; the member names again map one-to-one. */
static void
netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
                                    const struct rtnl_link_stats64 *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
5705
5706 static int
5707 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5708 {
5709 struct ofpbuf request;
5710 struct ofpbuf *reply;
5711 int error;
5712
5713 /* Filtering all counters by default */
5714 memset(stats, 0xFF, sizeof(struct netdev_stats));
5715
5716 ofpbuf_init(&request, 0);
5717 nl_msg_put_nlmsghdr(&request,
5718 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5719 RTM_GETLINK, NLM_F_REQUEST);
5720 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5721 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5722 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5723 ofpbuf_uninit(&request);
5724 if (error) {
5725 return error;
5726 }
5727
5728 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5729 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5730 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5731 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5732 error = 0;
5733 } else {
5734 a = nl_attr_find(reply, 0, IFLA_STATS);
5735 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5736 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5737 error = 0;
5738 } else {
5739 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5740 error = EPROTO;
5741 }
5742 }
5743 } else {
5744 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5745 error = EPROTO;
5746 }
5747
5748
5749 ofpbuf_delete(reply);
5750 return error;
5751 }
5752
5753 static int
5754 get_flags(const struct netdev *dev, unsigned int *flags)
5755 {
5756 struct ifreq ifr;
5757 int error;
5758
5759 *flags = 0;
5760 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5761 if (!error) {
5762 *flags = ifr.ifr_flags;
5763 }
5764 return error;
5765 }
5766
5767 static int
5768 set_flags(const char *name, unsigned int flags)
5769 {
5770 struct ifreq ifr;
5771
5772 ifr.ifr_flags = flags;
5773 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5774 }
5775
5776 int
5777 linux_get_ifindex(const char *netdev_name)
5778 {
5779 struct ifreq ifr;
5780 int error;
5781
5782 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5783 COVERAGE_INC(netdev_get_ifindex);
5784
5785 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5786 if (error) {
5787 /* ENODEV probably means that a vif disappeared asynchronously and
5788 * hasn't been removed from the database yet, so reduce the log level
5789 * to INFO for that case. */
5790 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
5791 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5792 netdev_name, ovs_strerror(error));
5793 return -error;
5794 }
5795 return ifr.ifr_ifindex;
5796 }
5797
5798 static int
5799 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5800 {
5801 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5802
5803 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5804 netdev_linux_update_via_netlink(netdev);
5805 }
5806
5807 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5808 /* Fall back to ioctl if netlink fails */
5809 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
5810
5811 if (ifindex < 0) {
5812 netdev->get_ifindex_error = -ifindex;
5813 netdev->ifindex = 0;
5814 } else {
5815 netdev->get_ifindex_error = 0;
5816 netdev->ifindex = ifindex;
5817 }
5818 netdev->cache_valid |= VALID_IFINDEX;
5819 }
5820
5821 *ifindexp = netdev->ifindex;
5822 return netdev->get_ifindex_error;
5823 }
5824
/* Refreshes 'netdev''s cached link state (flags, carrier resets, MTU, MAC
 * address, ifindex, LAG membership) from a single RTM_GETLINK query, and
 * bumps the netdev's change sequence number if anything changed.  Returns 0
 * on success, a positive errno value otherwise (EINVAL if the kernel's reply
 * could not be parsed as an RTM_NEWLINK message). */
static int
netdev_linux_update_via_netlink(struct netdev_linux *netdev)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct rtnetlink_change chg;
    struct rtnetlink_change *change = &chg;  /* Parsed form of the reply. */
    int error;

    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
                        RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));

    /* The correct identifiers for a Linux device are netnsid and ifindex,
     * but ifindex changes as the port is moved to another network namespace
     * and the interface name statically stored in ovsdb. */
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
    if (netdev_linux_netnsid_is_remote(netdev)) {
        nl_msg_push_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
    }
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        /* Free any reply buffer even on failure. */
        ofpbuf_delete(reply);
        return error;
    }

    if (rtnetlink_parse(reply, change)
        && change->nlmsg_type == RTM_NEWLINK) {
        bool changed = false;
        error = 0;

        /* Update netdev from rtnl msg and increment its seq if needed. */
        if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
            /* IFF_RUNNING toggled, i.e. the carrier went up or down. */
            netdev->carrier_resets++;
            changed = true;
        }
        if (change->ifi_flags != netdev->ifi_flags) {
            netdev->ifi_flags = change->ifi_flags;
            changed = true;
        }
        if (change->mtu && change->mtu != netdev->mtu) {
            netdev->mtu = change->mtu;
            netdev->cache_valid |= VALID_MTU;
            netdev->netdev_mtu_error = 0;
            changed = true;
        }
        if (!eth_addr_is_zero(change->mac)
            && !eth_addr_equals(change->mac, netdev->etheraddr)) {
            netdev->etheraddr = change->mac;
            netdev->cache_valid |= VALID_ETHERADDR;
            netdev->ether_addr_error = 0;
            changed = true;
        }
        if (change->if_index != netdev->ifindex) {
            netdev->ifindex = change->if_index;
            netdev->cache_valid |= VALID_IFINDEX;
            netdev->get_ifindex_error = 0;
            changed = true;
        }
        if (change->master && netdev_linux_kind_is_lag(change->master)) {
            /* Note: LAG membership does not by itself bump the change seq. */
            netdev->is_lag_master = true;
        }
        if (changed) {
            netdev_change_seq_changed(&netdev->up);
        }
    } else {
        error = EINVAL;
    }

    ofpbuf_delete(reply);
    return error;
}
5900
5901 static int
5902 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5903 {
5904 struct ifreq ifr;
5905 int hwaddr_family;
5906 int error;
5907
5908 memset(&ifr, 0, sizeof ifr);
5909 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5910 COVERAGE_INC(netdev_get_hwaddr);
5911 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5912 if (error) {
5913 /* ENODEV probably means that a vif disappeared asynchronously and
5914 * hasn't been removed from the database yet, so reduce the log level
5915 * to INFO for that case. */
5916 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5917 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5918 netdev_name, ovs_strerror(error));
5919 return error;
5920 }
5921 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5922 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
5923 hwaddr_family != ARPHRD_NONE) {
5924 VLOG_INFO("%s device has unknown hardware address family %d",
5925 netdev_name, hwaddr_family);
5926 return EINVAL;
5927 }
5928 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5929 return 0;
5930 }
5931
5932 static int
5933 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5934 {
5935 struct ifreq ifr;
5936 int error;
5937
5938 memset(&ifr, 0, sizeof ifr);
5939 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5940 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5941 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5942 COVERAGE_INC(netdev_set_hwaddr);
5943 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5944 if (error) {
5945 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5946 netdev_name, ovs_strerror(error));
5947 }
5948 return error;
5949 }
5950
5951 static int
5952 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5953 int cmd, const char *cmd_name)
5954 {
5955 struct ifreq ifr;
5956 int error;
5957
5958 memset(&ifr, 0, sizeof ifr);
5959 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5960 ifr.ifr_data = (caddr_t) ecmd;
5961
5962 ecmd->cmd = cmd;
5963 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5964 if (error) {
5965 if (error != EOPNOTSUPP) {
5966 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5967 "failed: %s", cmd_name, name, ovs_strerror(error));
5968 } else {
5969 /* The device doesn't support this operation. That's pretty
5970 * common, so there's no point in logging anything. */
5971 }
5972 }
5973 return error;
5974 }
5975
5976 /* Returns an AF_PACKET raw socket or a negative errno value. */
5977 static int
5978 af_packet_sock(void)
5979 {
5980 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5981 static int sock;
5982
5983 if (ovsthread_once_start(&once)) {
5984 sock = socket(AF_PACKET, SOCK_RAW, 0);
5985 if (sock >= 0) {
5986 int error = set_nonblocking(sock);
5987 if (error) {
5988 close(sock);
5989 sock = -error;
5990 }
5991 } else {
5992 sock = -errno;
5993 VLOG_ERR("failed to create packet socket: %s",
5994 ovs_strerror(errno));
5995 }
5996 ovsthread_once_done(&once);
5997 }
5998
5999 return sock;
6000 }