]> git.proxmox.com Git - ovs.git/blob - lib/netdev-linux.c
netlink linux: enable listening to all nsids
[ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <sys/types.h>
24 #include <netinet/in.h>
25 #include <arpa/inet.h>
26 #include <inttypes.h>
27 #include <linux/filter.h>
28 #include <linux/gen_stats.h>
29 #include <linux/if_ether.h>
30 #include <linux/if_tun.h>
31 #include <linux/types.h>
32 #include <linux/ethtool.h>
33 #include <linux/mii.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/ioctl.h>
37 #include <sys/socket.h>
38 #include <sys/utsname.h>
39 #include <netpacket/packet.h>
40 #include <net/if.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <poll.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <unistd.h>
48
49 #include "coverage.h"
50 #include "dp-packet.h"
51 #include "dpif-netlink.h"
52 #include "dpif-netdev.h"
53 #include "openvswitch/dynamic-string.h"
54 #include "fatal-signal.h"
55 #include "hash.h"
56 #include "openvswitch/hmap.h"
57 #include "netdev-provider.h"
58 #include "netdev-tc-offloads.h"
59 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
62 #include "netlink.h"
63 #include "netnsid.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
67 #include "packets.h"
68 #include "openvswitch/poll-loop.h"
69 #include "rtnetlink.h"
70 #include "openvswitch/shash.h"
71 #include "socket-util.h"
72 #include "sset.h"
73 #include "tc.h"
74 #include "timer.h"
75 #include "unaligned.h"
76 #include "openvswitch/vlog.h"
77 #include "util.h"
78
79 VLOG_DEFINE_THIS_MODULE(netdev_linux);
80
81 COVERAGE_DEFINE(netdev_set_policing);
82 COVERAGE_DEFINE(netdev_arp_lookup);
83 COVERAGE_DEFINE(netdev_get_ifindex);
84 COVERAGE_DEFINE(netdev_get_hwaddr);
85 COVERAGE_DEFINE(netdev_set_hwaddr);
86 COVERAGE_DEFINE(netdev_get_ethtool);
87 COVERAGE_DEFINE(netdev_set_ethtool);
88
89 \f
90 #ifndef IFLA_IF_NETNSID
91 #define IFLA_IF_NETNSID 0x45
92 #endif
93 /* These were introduced in Linux 2.6.14, so they might be missing if we have
94 * old headers. */
95 #ifndef ADVERTISED_Pause
96 #define ADVERTISED_Pause (1 << 13)
97 #endif
98 #ifndef ADVERTISED_Asym_Pause
99 #define ADVERTISED_Asym_Pause (1 << 14)
100 #endif
101
102 /* These were introduced in Linux 2.6.24, so they might be missing if we
103 * have old headers. */
104 #ifndef ETHTOOL_GFLAGS
105 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
106 #endif
107 #ifndef ETHTOOL_SFLAGS
108 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
109 #endif
110
111 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
112 * headers. */
113 #ifndef TC_RTAB_SIZE
114 #define TC_RTAB_SIZE 1024
115 #endif
116
117 /* Linux 2.6.21 introduced struct tpacket_auxdata.
118 * Linux 2.6.27 added the tp_vlan_tci member.
119 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
120 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
121 * TP_STATUS_VLAN_TPID_VALID.
122 *
123 * With all this churn it's easiest to unconditionally define a replacement
124 * structure that has everything we want.
125 */
126 #ifndef PACKET_AUXDATA
127 #define PACKET_AUXDATA 8
128 #endif
129 #ifndef TP_STATUS_VLAN_VALID
130 #define TP_STATUS_VLAN_VALID (1 << 4)
131 #endif
132 #ifndef TP_STATUS_VLAN_TPID_VALID
133 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
134 #endif
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
/* Replacement for the kernel's 'struct tpacket_auxdata' that unconditionally
 * includes the VLAN members added across several kernel releases (see the
 * comment above). */
struct tpacket_auxdata {
    uint32_t tp_status;     /* TP_STATUS_* flags. */
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;   /* Valid only if TP_STATUS_VLAN_VALID is set. */
    uint16_t tp_vlan_tpid;  /* Valid only if TP_STATUS_VLAN_TPID_VALID is set. */
};
146
147 /* Linux 2.6.27 introduced ethtool_cmd_speed
148 *
149 * To avoid revisiting problems reported with using configure to detect
150 * compatibility (see report at
151 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
152 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Returns the link speed encoded in 'ep', combining the low 16 bits from
 * 'speed' with the high 16 bits from 'speed_hi'. */
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    uint32_t hi = ep->speed_hi;

    return (hi << 16) | ep->speed;
}
158
159 /* Linux 2.6.30 introduced supported and advertised flags for
160 * 1G base KX, and 10G base KX4, KR and R. */
161 #ifndef SUPPORTED_1000baseKX_Full
162 #define SUPPORTED_1000baseKX_Full (1 << 17)
163 #define SUPPORTED_10000baseKX4_Full (1 << 18)
164 #define SUPPORTED_10000baseKR_Full (1 << 19)
165 #define SUPPORTED_10000baseR_FEC (1 << 20)
166 #define ADVERTISED_1000baseKX_Full (1 << 17)
167 #define ADVERTISED_10000baseKX4_Full (1 << 18)
168 #define ADVERTISED_10000baseKR_Full (1 << 19)
169 #define ADVERTISED_10000baseR_FEC (1 << 20)
170 #endif
171
172 /* Linux 3.5 introduced supported and advertised flags for
173 * 40G base KR4, CR4, SR4 and LR4. */
174 #ifndef SUPPORTED_40000baseKR4_Full
175 #define SUPPORTED_40000baseKR4_Full (1 << 23)
176 #define SUPPORTED_40000baseCR4_Full (1 << 24)
177 #define SUPPORTED_40000baseSR4_Full (1 << 25)
178 #define SUPPORTED_40000baseLR4_Full (1 << 26)
179 #define ADVERTISED_40000baseKR4_Full (1 << 23)
180 #define ADVERTISED_40000baseCR4_Full (1 << 24)
181 #define ADVERTISED_40000baseSR4_Full (1 << 25)
182 #define ADVERTISED_40000baseLR4_Full (1 << 26)
183 #endif
184
185 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
186 *
187 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
188 * 2.6.32-431.29.2.el6.x86_64 (see report at
189 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
190 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
191 * unconditionally define a replacement. */
192 #ifndef IFLA_STATS64
193 #define IFLA_STATS64 23
194 #endif
#define rtnl_link_stats64 rpl_rtnl_link_stats64
/* Unconditional replacement for the kernel's 'struct rtnl_link_stats64',
 * used when parsing IFLA_STATS64 attributes (see the comment above for why
 * this is defined unconditionally). */
struct rtnl_link_stats64 {
    /* Basic interface counters. */
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Detailed receive errors. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Detailed transmit errors. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    /* Compression-related counters. */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
224
/* Bits for 'cache_valid' in struct netdev_linux, indicating which of the
 * on-demand cached fields currently hold valid data. */
enum {
    VALID_IFINDEX          = 1 << 0,  /* 'ifindex' is valid. */
    VALID_ETHERADDR        = 1 << 1,  /* 'etheraddr' is valid. */
    VALID_IN               = 1 << 2,  /* Cached IP addresses are valid. */
    VALID_MTU              = 1 << 3,  /* 'mtu' is valid. */
    VALID_POLICING         = 1 << 4,  /* 'kbits_rate'/'kbits_burst' valid. */
    VALID_VPORT_STAT_ERROR = 1 << 5,  /* 'vport_stats_error' is valid. */
    VALID_DRVINFO          = 1 << 6,  /* 'drvinfo' is valid. */
    VALID_FEATURES         = 1 << 7,  /* Feature fields are valid. */
};
235 \f
236 /* Traffic control. */
237
238 /* An instance of a traffic control class. Always associated with a particular
239 * network device.
240 *
241 * Each TC implementation subclasses this with whatever additional data it
242 * needs. */
struct tc {
    const struct tc_ops *ops;   /* The TC implementation for this instance. */
    struct hmap queues;         /* Contains "struct tc_queue"s.
                                 * Read by generic TC layer.
                                 * Written only by TC implementation. */
};

/* Static initializer equivalent to tc_init(TC, OPS). */
#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
251
252 /* One traffic control queue.
253 *
254 * Each TC implementation subclasses this with whatever additional data it
255 * needs. */
struct tc_queue {
    struct hmap_node hmap_node; /* Element in struct tc's "queues" hmap. */
    unsigned int queue_id;      /* OpenFlow queue ID. */
    long long int created;      /* Time queue was created, in msecs. */
};
261
262 /* A particular kind of traffic control. Each implementation generally maps to
263 * one particular Linux qdisc class.
264 *
265 * The functions below return 0 if successful or a positive errno value on
266 * failure, except where otherwise noted. All of them must be provided, except
267 * where otherwise noted. */
struct tc_ops {
    /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
     * This is null for tc_ops_default and tc_ops_other, for which there are no
     * appropriate values. */
    const char *linux_name;

    /* Name used in OVS database, e.g. "linux-htb".  Must be nonnull. */
    const char *ovs_name;

    /* Number of supported OpenFlow queues, 0 for qdiscs that have no
     * queues.  The queues are numbered 0 through n_queues - 1. */
    unsigned int n_queues;

    /* Called to install this TC class on 'netdev'.  The implementation should
     * make the Netlink calls required to set up 'netdev' with the right qdisc
     * and configure it according to 'details'.  The implementation may assume
     * that the current qdisc is the default; that is, there is no need for it
     * to delete the current qdisc before installing itself.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'.
     *
     * (This function is null for tc_ops_other, which cannot be installed.  For
     * other TC classes it should always be nonnull.) */
    int (*tc_install)(struct netdev *netdev, const struct smap *details);

    /* Called when the netdev code determines (through a Netlink query) that
     * this TC class's qdisc is installed on 'netdev', but we didn't install
     * it ourselves and so don't know any of the details.
     *
     * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
     * 'netdev'.  The TCA_KIND attribute of 'nlmsg' is 'linux_name'.  The
     * implementation should parse the other attributes of 'nlmsg' as
     * necessary to determine its configuration.  If necessary it should also
     * use Netlink queries to determine the configuration of queues on
     * 'netdev'.
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'. */
    int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);

    /* Destroys the data structures allocated by the implementation as part of
     * 'tc'.  (This includes destroying 'tc->queues' by calling
     * tc_destroy(tc).)
     *
     * The implementation should not need to perform any Netlink calls.  If
     * desirable, the caller is responsible for deconfiguring the kernel qdisc.
     * (But it may not be desirable.)
     *
     * This function may be null if 'tc' is trivial. */
    void (*tc_destroy)(struct tc *tc);

    /* Retrieves details of 'netdev->tc' configuration into 'details'.
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the configuration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_get)(const struct netdev *netdev, struct smap *details);

    /* Reconfigures 'netdev->tc' according to 'details', performing any
     * required Netlink calls to complete the reconfiguration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_set)(struct netdev *, const struct smap *details);

    /* Retrieves details of 'queue' on 'netdev->tc' into 'details'.  'queue' is
     * one of the 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the queue configuration.
     *
     * This function may be null if 'tc' does not have queues ('n_queues' is
     * 0). */
    int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
                     struct smap *details);

    /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
     * 'details', performing any required Netlink calls to complete the
     * reconfiguration.  The caller ensures that 'queue_id' is less than
     * 'n_queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' does not have queues or its queues are
     * not configurable. */
    int (*class_set)(struct netdev *, unsigned int queue_id,
                     const struct smap *details);

    /* Deletes 'queue' from 'netdev->tc'.  'queue' is one of the 'struct
     * tc_queue's within 'netdev->tc->queues'.
     *
     * This function may be null if 'tc' does not have queues or its queues
     * cannot be deleted. */
    int (*class_delete)(struct netdev *, struct tc_queue *queue);

    /* Obtains stats for 'queue' from 'netdev->tc'.  'queue' is one of the
     * 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * On success, initializes '*stats'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_get_stats)(const struct netdev *netdev,
                           const struct tc_queue *queue,
                           struct netdev_queue_stats *stats);

    /* Extracts queue stats from 'nlmsg', which is a response to a
     * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_dump_stats)(const struct netdev *netdev,
                            const struct ofpbuf *nlmsg,
                            netdev_dump_queue_stats_cb *cb, void *aux);
};
406
/* Initializes 'tc' as an instance of the TC implementation 'ops', with an
 * initially empty set of queues. */
static void
tc_init(struct tc *tc, const struct tc_ops *ops)
{
    tc->ops = ops;
    hmap_init(&tc->queues);
}
413
/* Releases the generic resources of 'tc' (its 'queues' hmap).  Per-queue
 * data and 'tc' itself remain the responsibility of the caller (see the
 * 'tc_destroy' callback in struct tc_ops). */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
419
/* TC implementations, all defined as static objects in this file. */
static const struct tc_ops tc_ops_htb;
static const struct tc_ops tc_ops_hfsc;
static const struct tc_ops tc_ops_codel;
static const struct tc_ops tc_ops_fqcodel;
static const struct tc_ops tc_ops_sfq;
static const struct tc_ops tc_ops_default;
static const struct tc_ops tc_ops_noop;
static const struct tc_ops tc_ops_other;

/* All supported TC implementations, as a NULL-terminated array. */
static const struct tc_ops *const tcs[] = {
    &tc_ops_htb,      /* Hierarchy token bucket (see tc-htb(8)). */
    &tc_ops_hfsc,     /* Hierarchical fair service curve. */
    &tc_ops_codel,    /* Controlled delay */
    &tc_ops_fqcodel,  /* Fair queue controlled delay */
    &tc_ops_sfq,      /* Stochastic fair queueing */
    &tc_ops_noop,     /* Non operating qos type. */
    &tc_ops_default,  /* Default qdisc (see tc-pfifo_fast(8)). */
    &tc_ops_other,    /* Some other qdisc. */
    NULL
};
440
441 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
442 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
443 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
444
445 static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
446 int type,
447 unsigned int flags,
448 struct ofpbuf *);
449 static int tc_add_policer(struct netdev *,
450 uint32_t kbits_rate, uint32_t kbits_burst);
451
452 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
453 struct nlattr **options);
454 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
455 struct nlattr **options,
456 struct netdev_queue_stats *);
457 static int tc_query_class(const struct netdev *,
458 unsigned int handle, unsigned int parent,
459 struct ofpbuf **replyp);
460 static int tc_delete_class(const struct netdev *, unsigned int handle);
461
462 static int tc_del_qdisc(struct netdev *netdev);
463 static int tc_query_qdisc(const struct netdev *netdev);
464
465 static int tc_calc_cell_log(unsigned int mtu);
466 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
467 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
468 const struct tc_ratespec *rate);
469 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
470 \f
/* Per-device state for a Linux network device. */
struct netdev_linux {
    struct netdev up;

    /* Protects all members below. */
    struct ovs_mutex mutex;

    unsigned int cache_valid;          /* Bitmap of VALID_* flags. */

    bool miimon;                       /* Link status of last poll. */
    long long int miimon_interval;     /* Miimon Poll rate. Disabled if <= 0. */
    struct timer miimon_timer;

    int netnsid;                       /* Network namespace ID. */
    /* The following are figured out "on demand" only.  They are only valid
     * when the corresponding VALID_* bit in 'cache_valid' is set. */
    int ifindex;
    struct eth_addr etheraddr;
    int mtu;
    unsigned int ifi_flags;
    long long int carrier_resets;
    uint32_t kbits_rate;               /* Policing data. */
    uint32_t kbits_burst;
    int vport_stats_error;             /* Cached error code from
                                        * vport_get_stats().
                                        * 0 or an errno value. */
    int netdev_mtu_error;              /* Cached error code from SIOCGIFMTU
                                        * or SIOCSIFMTU. */
    int ether_addr_error;              /* Cached error code from set/get
                                        * etheraddr. */
    int netdev_policing_error;         /* Cached error code from set
                                        * policing. */
    int get_features_error;            /* Cached error code from
                                        * ETHTOOL_GSET. */
    int get_ifindex_error;             /* Cached error code from
                                        * SIOCGIFINDEX. */

    enum netdev_features current;      /* Cached from ETHTOOL_GSET. */
    enum netdev_features advertised;   /* Cached from ETHTOOL_GSET. */
    enum netdev_features supported;    /* Cached from ETHTOOL_GSET. */

    struct ethtool_drvinfo drvinfo;    /* Cached from ETHTOOL_GDRVINFO. */
    struct tc *tc;                     /* Traffic control state. */

    /* For devices of class netdev_tap_class only. */
    int tap_fd;                        /* FD from opening /dev/net/tun. */
    bool present;                      /* If the device is present in the
                                        * namespace */
    uint64_t tx_dropped;               /* tap device can drop if the iface is
                                        * down */
};
513
/* Receive queue for a Linux netdev. */
struct netdev_rxq_linux {
    struct netdev_rxq up;
    bool is_tap;    /* True for tap devices (see netdev_linux_rxq_construct). */
    int fd;         /* The shared tap_fd for tap devices, otherwise a
                     * receive socket set up at construction time. */
};
519
520 /* This is set pretty low because we probably won't learn anything from the
521 * additional log messages. */
522 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
523
524 /* Polling miimon status for all ports causes performance degradation when
525 * handling a large number of ports. If there are no devices using miimon, then
526 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
527 *
528 * Readers do not depend on this variable synchronizing with the related
529 * changes in the device miimon status, so we can use atomic_count. */
530 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
531
532 static void netdev_linux_run(const struct netdev_class *);
533
534 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
535 int cmd, const char *cmd_name);
536 static int get_flags(const struct netdev *, unsigned int *flags);
537 static int set_flags(const char *, unsigned int flags);
538 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
539 enum netdev_flags on, enum netdev_flags *old_flagsp)
540 OVS_REQUIRES(netdev->mutex);
541 static int get_ifindex(const struct netdev *, int *ifindexp);
542 static int do_set_addr(struct netdev *netdev,
543 int ioctl_nr, const char *ioctl_name,
544 struct in_addr addr);
545 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
546 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
547 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
548 static int af_packet_sock(void);
549 static bool netdev_linux_miimon_enabled(void);
550 static void netdev_linux_miimon_run(void);
551 static void netdev_linux_miimon_wait(void);
552 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
553
554 static bool
555 is_netdev_linux_class(const struct netdev_class *netdev_class)
556 {
557 return netdev_class->run == netdev_linux_run;
558 }
559
560 static bool
561 is_tap_netdev(const struct netdev *netdev)
562 {
563 return netdev_get_class(netdev) == &netdev_tap_class;
564 }
565
566 static struct netdev_linux *
567 netdev_linux_cast(const struct netdev *netdev)
568 {
569 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
570
571 return CONTAINER_OF(netdev, struct netdev_linux, up);
572 }
573
574 static struct netdev_rxq_linux *
575 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
576 {
577 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
578 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
579 }
580 \f
581 static int
582 netdev_linux_netnsid_update__(struct netdev_linux *netdev)
583 {
584 struct dpif_netlink_vport reply;
585 struct ofpbuf *buf;
586 int error;
587
588 error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
589 if (error) {
590 netnsid_unset(&netdev->netnsid);
591 return error;
592 }
593
594 netnsid_set(&netdev->netnsid, reply.netnsid);
595 ofpbuf_delete(buf);
596 return 0;
597 }
598
599 static int
600 netdev_linux_netnsid_update(struct netdev_linux *netdev)
601 {
602 if (netnsid_is_unset(netdev->netnsid)) {
603 return netdev_linux_netnsid_update__(netdev);
604 }
605
606 return 0;
607 }
608
609 static bool
610 netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
611 {
612 netdev_linux_netnsid_update(netdev);
613 return netnsid_eq(netdev->netnsid, nsid);
614 }
615
616 static bool
617 netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
618 {
619 netdev_linux_netnsid_update(netdev);
620 return netnsid_is_remote(netdev->netnsid);
621 }
622
623 static int netdev_linux_update_via_netlink(struct netdev_linux *);
624 static void netdev_linux_update(struct netdev_linux *netdev, int,
625 const struct rtnetlink_change *)
626 OVS_REQUIRES(netdev->mutex);
627 static void netdev_linux_changed(struct netdev_linux *netdev,
628 unsigned int ifi_flags, unsigned int mask)
629 OVS_REQUIRES(netdev->mutex);
630
631 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
632 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
633 * if no such socket could be created. */
634 static struct nl_sock *
635 netdev_linux_notify_sock(void)
636 {
637 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
638 static struct nl_sock *sock;
639 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
640 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
641
642 if (ovsthread_once_start(&once)) {
643 int error;
644
645 error = nl_sock_create(NETLINK_ROUTE, &sock);
646 if (!error) {
647 size_t i;
648
649 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
650 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
651 if (error) {
652 nl_sock_destroy(sock);
653 sock = NULL;
654 break;
655 }
656 }
657 }
658 nl_sock_listen_all_nsid(sock, true);
659 ovsthread_once_done(&once);
660 }
661
662 return sock;
663 }
664
665 static bool
666 netdev_linux_miimon_enabled(void)
667 {
668 return atomic_count_get(&miimon_cnt) > 0;
669 }
670
/* Periodic work for Linux netdev classes: runs the miimon poller when
 * enabled, then drains all pending rtnetlink notifications, applying each
 * parsed change to the matching netdev.  If the kernel reports ENOBUFS
 * (notifications were dropped because the receive buffer overflowed), the
 * flags of every Linux netdev are re-read so no change goes unnoticed. */
static void
netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
{
    struct nl_sock *sock;
    int error;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_run();
    }

    sock = netdev_linux_notify_sock();
    if (!sock) {
        return;
    }

    do {
        uint64_t buf_stub[4096 / 8];
        int nsid;
        struct ofpbuf buf;

        ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
        error = nl_sock_recv(sock, &buf, &nsid, false);
        if (!error) {
            struct rtnetlink_change change;

            if (rtnetlink_parse(&buf, &change)) {
                struct netdev *netdev_ = NULL;
                char dev_name[IFNAMSIZ];

                if (!change.ifname) {
                    /* The notification carried no interface name; recover it
                     * from the interface index. */
                    change.ifname = if_indextoname(change.if_index, dev_name);
                }

                if (change.ifname) {
                    netdev_ = netdev_from_name(change.ifname);
                }
                if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
                    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

                    ovs_mutex_lock(&netdev->mutex);
                    netdev_linux_update(netdev, nsid, &change);
                    ovs_mutex_unlock(&netdev->mutex);
                }
                /* netdev_close() tolerates NULL. */
                netdev_close(netdev_);
            }
        } else if (error == ENOBUFS) {
            /* Notifications were lost: discard whatever is still queued and
             * conservatively treat every Linux device as changed. */
            struct shash device_shash;
            struct shash_node *node;

            nl_sock_drain(sock);

            shash_init(&device_shash);
            netdev_get_devices(&netdev_linux_class, &device_shash);
            SHASH_FOR_EACH (node, &device_shash) {
                struct netdev *netdev_ = node->data;
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
                unsigned int flags;

                ovs_mutex_lock(&netdev->mutex);
                get_flags(netdev_, &flags);
                netdev_linux_changed(netdev, flags, 0);
                ovs_mutex_unlock(&netdev->mutex);

                netdev_close(netdev_);
            }
            shash_destroy(&device_shash);
        } else if (error != EAGAIN) {
            static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
            VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
                         ovs_strerror(error));
        }
        ofpbuf_uninit(&buf);
    } while (!error);  /* EAGAIN means the socket is drained; stop. */
}
745
746 static void
747 netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
748 {
749 struct nl_sock *sock;
750
751 if (netdev_linux_miimon_enabled()) {
752 netdev_linux_miimon_wait();
753 }
754 sock = netdev_linux_notify_sock();
755 if (sock) {
756 nl_sock_wait(sock, POLLIN);
757 }
758 }
759
760 static void
761 netdev_linux_changed(struct netdev_linux *dev,
762 unsigned int ifi_flags, unsigned int mask)
763 OVS_REQUIRES(dev->mutex)
764 {
765 netdev_change_seq_changed(&dev->up);
766
767 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
768 dev->carrier_resets++;
769 }
770 dev->ifi_flags = ifi_flags;
771
772 dev->cache_valid &= mask;
773 if (!(mask & VALID_IN)) {
774 netdev_get_addrs_list_flush();
775 }
776 }
777
/* Applies the parsed rtnetlink 'change' to 'dev's cached state.
 *
 * For RTM_NEWLINK, refreshes flags and any MTU/MAC/ifindex carried by the
 * message and marks the device present.  For other link messages the device
 * is marked absent and its cached state (including the nsid) is dropped.
 * Address-group messages only invalidate the cached IP addresses. */
static void
netdev_linux_update__(struct netdev_linux *dev,
                      const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, and ip addresses. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;

                /* The mac addr has been changed, report it now. */
                rtnetlink_report_link();
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
            dev->present = true;
        } else {
            /* FIXME */
            netdev_linux_changed(dev, change->ifi_flags, 0);
            dev->present = false;
            netnsid_unset(&dev->netnsid);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        OVS_NOT_REACHED();
    }
}
822
823 static void
824 netdev_linux_update(struct netdev_linux *dev, int nsid,
825 const struct rtnetlink_change *change)
826 OVS_REQUIRES(dev->mutex)
827 {
828 if (netdev_linux_netnsid_is_eq(dev, nsid)) {
829 netdev_linux_update__(dev, change);
830 }
831 }
832
833 static struct netdev *
834 netdev_linux_alloc(void)
835 {
836 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
837 return &netdev->up;
838 }
839
840 static int
841 netdev_linux_common_construct(struct netdev *netdev_)
842 {
843 /* Prevent any attempt to create (or open) a network device named "default"
844 * or "all". These device names are effectively reserved on Linux because
845 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
846 * itself this wouldn't call for any special treatment, but in practice if
847 * a program tries to create devices with these names, it causes the kernel
848 * to fire a "new device" notification event even though creation failed,
849 * and in turn that causes OVS to wake up and try to create them again,
850 * which ends up as a 100% CPU loop. */
851 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
852 const char *name = netdev_->name;
853 if (!strcmp(name, "default") || !strcmp(name, "all")) {
854 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
855 VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
856 name);
857 return EINVAL;
858 }
859
860 /* The device could be in the same network namespace or in another one. */
861 netnsid_unset(&netdev->netnsid);
862 ovs_mutex_init(&netdev->mutex);
863 return 0;
864 }
865
866 /* Creates system and internal devices. */
867 static int
868 netdev_linux_construct(struct netdev *netdev_)
869 {
870 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
871 int error = netdev_linux_common_construct(netdev_);
872 if (error) {
873 return error;
874 }
875
876 error = get_flags(&netdev->up, &netdev->ifi_flags);
877 if (error == ENODEV) {
878 if (netdev->up.netdev_class != &netdev_internal_class) {
879 /* The device does not exist, so don't allow it to be opened. */
880 return ENODEV;
881 } else {
882 /* "Internal" netdevs have to be created as netdev objects before
883 * they exist in the kernel, because creating them in the kernel
884 * happens by passing a netdev object to dpif_port_add().
885 * Therefore, ignore the error. */
886 }
887 }
888
889 return 0;
890 }
891
892 /* For most types of netdevs we open the device for each call of
893 * netdev_open(). However, this is not the case with tap devices,
894 * since it is only possible to open the device once. In this
895 * situation we share a single file descriptor, and consequently
896 * buffers, across all readers. Therefore once data is read it will
897 * be unavailable to other reads for tap devices. */
898 static int
899 netdev_linux_construct_tap(struct netdev *netdev_)
900 {
901 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
902 static const char tap_dev[] = "/dev/net/tun";
903 const char *name = netdev_->name;
904 struct ifreq ifr;
905
906 int error = netdev_linux_common_construct(netdev_);
907 if (error) {
908 return error;
909 }
910
911 /* Open tap device. */
912 netdev->tap_fd = open(tap_dev, O_RDWR);
913 if (netdev->tap_fd < 0) {
914 error = errno;
915 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
916 return error;
917 }
918
919 /* Create tap device. */
920 get_flags(&netdev->up, &netdev->ifi_flags);
921 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
922 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
923 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
924 VLOG_WARN("%s: creating tap device failed: %s", name,
925 ovs_strerror(errno));
926 error = errno;
927 goto error_close;
928 }
929
930 /* Make non-blocking. */
931 error = set_nonblocking(netdev->tap_fd);
932 if (error) {
933 goto error_close;
934 }
935
936 if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
937 VLOG_WARN("%s: creating tap device failed (persist): %s", name,
938 ovs_strerror(errno));
939 error = errno;
940 goto error_close;
941 }
942
943 return 0;
944
945 error_close:
946 close(netdev->tap_fd);
947 return error;
948 }
949
/* Tears down 'netdev_''s Linux-specific state; the reverse of
 * netdev_linux_construct()/netdev_linux_construct_tap(). */
static void
netdev_linux_destruct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    /* Release any traffic-control state attached to this device. */
    if (netdev->tc && netdev->tc->ops->tc_destroy) {
        netdev->tc->ops->tc_destroy(netdev->tc);
    }

    /* For tap devices, drop kernel persistence and close the shared tap fd
     * that was opened in netdev_linux_construct_tap(). */
    if (netdev_get_class(netdev_) == &netdev_tap_class
        && netdev->tap_fd >= 0)
    {
        ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
        close(netdev->tap_fd);
    }

    /* Keep the global count of miimon-enabled devices in step. */
    if (netdev->miimon_interval > 0) {
        atomic_count_dec(&miimon_cnt);
    }

    /* Destroy the mutex last: everything above may rely on it existing. */
    ovs_mutex_destroy(&netdev->mutex);
}
972
/* Frees the memory obtained for 'netdev_' at allocation time. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
979
980 static struct netdev_rxq *
981 netdev_linux_rxq_alloc(void)
982 {
983 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
984 return &rx->up;
985 }
986
/* Sets up the receive path for 'rxq_': for tap devices, reuses the shared
 * tap fd; otherwise creates and configures a bound AF_PACKET socket.
 * Returns 0 if successful, otherwise a positive errno value. */
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        /* Tap devices share the single fd opened at construct time; it is
         * owned by the netdev, not by this rxq. */
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* BPF program that accepts only inbound packets (drops packets that
         * this host transmitted, so tx is not looped back into rx).
         * Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
            { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        /* Request PACKET_AUXDATA control messages so the receive path can
         * recover the kernel-stripped VLAN tag (see
         * netdev_linux_rxq_recv_sock()). */
        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets.  setsockopt() returns -1 on
         * failure, so 'error' is replaced by the real errno value below. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1074
1075 static void
1076 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
1077 {
1078 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1079
1080 if (!rx->is_tap) {
1081 close(rx->fd);
1082 }
1083 }
1084
/* Frees the container allocated by netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
1092
1093 static ovs_be16
1094 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
1095 {
1096 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1097 return htons(aux->tp_vlan_tpid);
1098 } else if (double_tagged) {
1099 return htons(ETH_TYPE_VLAN_8021AD);
1100 } else {
1101 return htons(ETH_TYPE_VLAN_8021Q);
1102 }
1103 }
1104
/* Returns true if 'aux' describes a frame with a kernel-stripped VLAN tag:
 * either the kernel flagged the VLAN information valid, or the TCI itself is
 * nonzero. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    if (aux->tp_status & TP_STATUS_VLAN_VALID) {
        return true;
    }
    return aux->tp_vlan_tci != 0;
}
1110
/* Receives one frame from packet socket 'fd' into 'buffer', reinserting any
 * VLAN tag the kernel stripped (recovered from PACKET_AUXDATA).  Returns 0
 * on success, EMSGSIZE if the frame was truncated, otherwise a positive
 * errno value. */
static int
netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
{
    size_t size;
    ssize_t retval;
    struct iovec iov;
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffer;
    struct msghdr msgh;

    /* Reserve headroom for a single VLAN tag */
    dp_packet_reserve(buffer, VLAN_HEADER_LEN);
    size = dp_packet_tailroom(buffer);

    iov.iov_base = dp_packet_data(buffer);
    iov.iov_len = size;
    msgh.msg_name = NULL;
    msgh.msg_namelen = 0;
    msgh.msg_iov = &iov;
    msgh.msg_iovlen = 1;
    msgh.msg_control = &cmsg_buffer;
    msgh.msg_controllen = sizeof cmsg_buffer;
    msgh.msg_flags = 0;

    /* MSG_TRUNC makes recvmsg() report the full frame length even when the
     * frame did not fit, so truncation can be detected below. */
    do {
        retval = recvmsg(fd, &msgh, MSG_TRUNC);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    } else if (retval > size) {
        return EMSGSIZE;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);

    /* Scan the control messages for PACKET_AUXDATA; if it carries VLAN
     * information, push the stripped tag back into the frame. */
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
        const struct tpacket_auxdata *aux;

        if (cmsg->cmsg_level != SOL_PACKET
            || cmsg->cmsg_type != PACKET_AUXDATA
            || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
            continue;
        }

        aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
        if (auxdata_has_vlan_tci(aux)) {
            struct eth_header *eth;
            bool double_tagged;

            /* Frame too short to even hold an Ethernet header. */
            if (retval < ETH_HEADER_LEN) {
                return EINVAL;
            }

            /* An inner 802.1Q ethertype means the stripped outer tag was
             * part of a double-tagged (QinQ) frame. */
            eth = dp_packet_data(buffer);
            double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);

            eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
                          htons(aux->tp_vlan_tci));
            break;
        }
    }

    return 0;
}
1179
/* Reads one frame from tap fd 'fd' into 'buffer''s tailroom.  Returns 0 on
 * success, otherwise a positive errno value. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t room = dp_packet_tailroom(buffer);
    ssize_t n;

    do {
        n = read(fd, dp_packet_data(buffer), room);
    } while (n < 0 && errno == EINTR);

    if (n < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n);
    return 0;
}
1197
1198 static int
1199 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch)
1200 {
1201 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1202 struct netdev *netdev = rx->up.netdev;
1203 struct dp_packet *buffer;
1204 ssize_t retval;
1205 int mtu;
1206
1207 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1208 mtu = ETH_PAYLOAD_MAX;
1209 }
1210
1211 /* Assume Ethernet port. No need to set packet_type. */
1212 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1213 DP_NETDEV_HEADROOM);
1214 retval = (rx->is_tap
1215 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1216 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1217
1218 if (retval) {
1219 if (retval != EAGAIN && retval != EMSGSIZE) {
1220 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1221 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1222 }
1223 dp_packet_delete(buffer);
1224 } else {
1225 dp_packet_batch_init_packet(batch, buffer);
1226 }
1227
1228 return retval;
1229 }
1230
1231 static void
1232 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1233 {
1234 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1235 poll_fd_wait(rx->fd, POLLIN);
1236 }
1237
1238 static int
1239 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1240 {
1241 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1242 if (rx->is_tap) {
1243 struct ifreq ifr;
1244 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1245 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1246 if (error) {
1247 return error;
1248 }
1249 drain_fd(rx->fd, ifr.ifr_qlen);
1250 return 0;
1251 } else {
1252 return drain_rcvbuf(rx->fd);
1253 }
1254 }
1255
1256 static int
1257 netdev_linux_sock_batch_send(int sock, int ifindex,
1258 struct dp_packet_batch *batch)
1259 {
1260 const size_t size = dp_packet_batch_size(batch);
1261 /* We don't bother setting most fields in sockaddr_ll because the
1262 * kernel ignores them for SOCK_RAW. */
1263 struct sockaddr_ll sll = { .sll_family = AF_PACKET,
1264 .sll_ifindex = ifindex };
1265
1266 struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
1267 struct iovec *iov = xmalloc(sizeof(*iov) * size);
1268
1269 struct dp_packet *packet;
1270 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
1271 iov[i].iov_base = dp_packet_data(packet);
1272 iov[i].iov_len = dp_packet_size(packet);
1273 mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
1274 .msg_namelen = sizeof sll,
1275 .msg_iov = &iov[i],
1276 .msg_iovlen = 1 };
1277 }
1278
1279 int error = 0;
1280 for (uint32_t ofs = 0; ofs < size; ) {
1281 ssize_t retval;
1282 do {
1283 retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
1284 error = retval < 0 ? errno : 0;
1285 } while (error == EINTR);
1286 if (error) {
1287 break;
1288 }
1289 ofs += retval;
1290 }
1291
1292 free(mmsg);
1293 free(iov);
1294 return error;
1295 }
1296
/* Use the tap fd to send 'batch' to tap device 'netdev'.  Using the tap fd is
 * essential, because packets sent to a tap device with an AF_PACKET socket
 * will loop back to be *received* again on the tap device.  This doesn't occur
 * on other interface types because we attach a socket filter to the rx
 * socket.
 *
 * Returns 0 if successful (dropped packets count as success), otherwise a
 * positive errno value. */
static int
netdev_linux_tap_batch_send(struct netdev *netdev_,
                            struct dp_packet_batch *batch)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct dp_packet *packet;

    /* The Linux tap driver returns EIO if the device is not up,
     * so if the device is not up, don't waste time sending it.
     * However, if the device is in another network namespace
     * then OVS can't retrieve the state. In that case, send the
     * packets anyway. */
    if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
        netdev->tx_dropped += dp_packet_batch_size(batch);
        return 0;
    }

    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        size_t size = dp_packet_size(packet);
        ssize_t retval;
        int error;

        /* Retry writes interrupted by signals. */
        do {
            retval = write(netdev->tap_fd, dp_packet_data(packet), size);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);

        if (error) {
            /* The Linux tap driver returns EIO if the device is not up. From
             * the OVS side this is not an error, so we ignore it; otherwise,
             * return the error. */
            if (error != EIO) {
                return error;
            }
        } else if (retval != size) {
            /* A short write means the frame went out truncated. */
            VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
                         "bytes of %"PRIuSIZE") on %s",
                         retval, size, netdev_get_name(netdev_));
            return EMSGSIZE;
        }
    }
    return 0;
}
1345
1346 /* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
1347 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1348 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1349 * the packet is too big or too small to transmit on the device.
1350 *
1351 * The kernel maintains a packet transmission queue, so the caller is not
1352 * expected to do additional queuing of packets. */
1353 static int
1354 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1355 struct dp_packet_batch *batch,
1356 bool concurrent_txq OVS_UNUSED)
1357 {
1358 int error = 0;
1359 int sock = 0;
1360
1361 if (!is_tap_netdev(netdev_)) {
1362 sock = af_packet_sock();
1363 if (sock < 0) {
1364 error = -sock;
1365 goto free_batch;
1366 }
1367
1368 int ifindex = netdev_get_ifindex(netdev_);
1369 if (ifindex < 0) {
1370 error = -ifindex;
1371 goto free_batch;
1372 }
1373
1374 error = netdev_linux_sock_batch_send(sock, ifindex, batch);
1375 } else {
1376 error = netdev_linux_tap_batch_send(netdev_, batch);
1377 }
1378 if (error) {
1379 if (error == ENOBUFS) {
1380 /* The Linux AF_PACKET implementation never blocks waiting
1381 * for room for packets, instead returning ENOBUFS.
1382 * Translate this into EAGAIN for the caller. */
1383 error = EAGAIN;
1384 } else {
1385 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1386 netdev_get_name(netdev_), ovs_strerror(error));
1387 }
1388 }
1389
1390 free_batch:
1391 dp_packet_delete_batch(batch, true);
1392 return error;
1393 }
1394
1395 /* Registers with the poll loop to wake up from the next call to poll_block()
1396 * when the packet transmission queue has sufficient room to transmit a packet
1397 * with netdev_send().
1398 *
1399 * The kernel maintains a packet transmission queue, so the client is not
1400 * expected to do additional queuing of packets. Thus, this function is
1401 * unlikely to ever be used. It is included for completeness. */
1402 static void
1403 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1404 {
1405 if (is_tap_netdev(netdev)) {
1406 /* TAP device always accepts packets.*/
1407 poll_immediate_wake();
1408 }
1409 }
1410
/* Attempts to set 'netdev''s MAC address to 'mac'.  Returns 0 if successful,
 * otherwise a positive errno value. */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    /* Consult the cache first: a cached error or an unchanged address means
     * there is nothing to do. */
    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    /* Cache the outcome, including ENODEV, so a missing device is not
     * re-probed on every call. */
    if (!error || error == ENODEV) {
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            netdev->etheraddr = mac;
        }
    }

    /* Bring a tap device back up if it was up before. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1451
1452 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1453 static int
1454 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1455 {
1456 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1457 int error;
1458
1459 ovs_mutex_lock(&netdev->mutex);
1460 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1461 netdev_linux_update_via_netlink(netdev);
1462 }
1463
1464 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1465 /* Fall back to ioctl if netlink fails */
1466 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1467 &netdev->etheraddr);
1468 netdev->cache_valid |= VALID_ETHERADDR;
1469 }
1470
1471 error = netdev->ether_addr_error;
1472 if (!error) {
1473 *mac = netdev->etheraddr;
1474 }
1475 ovs_mutex_unlock(&netdev->mutex);
1476
1477 return error;
1478 }
1479
1480 static int
1481 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1482 {
1483 int error;
1484
1485 if (!(netdev->cache_valid & VALID_MTU)) {
1486 netdev_linux_update_via_netlink(netdev);
1487 }
1488
1489 if (!(netdev->cache_valid & VALID_MTU)) {
1490 /* Fall back to ioctl if netlink fails */
1491 struct ifreq ifr;
1492
1493 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1494 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1495 netdev->mtu = ifr.ifr_mtu;
1496 netdev->cache_valid |= VALID_MTU;
1497 }
1498
1499 error = netdev->netdev_mtu_error;
1500 if (!error) {
1501 *mtup = netdev->mtu;
1502 }
1503
1504 return error;
1505 }
1506
1507 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1508 * in bytes, not including the hardware header; thus, this is typically 1500
1509 * bytes for Ethernet devices. */
1510 static int
1511 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1512 {
1513 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1514 int error;
1515
1516 ovs_mutex_lock(&netdev->mutex);
1517 error = netdev_linux_get_mtu__(netdev, mtup);
1518 ovs_mutex_unlock(&netdev->mutex);
1519
1520 return error;
1521 }
1522
1523 /* Sets the maximum size of transmitted (MTU) for given device using linux
1524 * networking ioctl interface.
1525 */
1526 static int
1527 netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
1528 {
1529 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1530 struct ifreq ifr;
1531 int error;
1532
1533 ovs_mutex_lock(&netdev->mutex);
1534 if (netdev->cache_valid & VALID_MTU) {
1535 error = netdev->netdev_mtu_error;
1536 if (error || netdev->mtu == mtu) {
1537 goto exit;
1538 }
1539 netdev->cache_valid &= ~VALID_MTU;
1540 }
1541 ifr.ifr_mtu = mtu;
1542 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1543 SIOCSIFMTU, "SIOCSIFMTU");
1544 if (!error || error == ENODEV) {
1545 netdev->netdev_mtu_error = error;
1546 netdev->mtu = ifr.ifr_mtu;
1547 netdev->cache_valid |= VALID_MTU;
1548 }
1549 exit:
1550 ovs_mutex_unlock(&netdev->mutex);
1551 return error;
1552 }
1553
1554 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1555 * On failure, returns a negative errno value. */
1556 static int
1557 netdev_linux_get_ifindex(const struct netdev *netdev_)
1558 {
1559 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1560 int ifindex, error;
1561
1562 ovs_mutex_lock(&netdev->mutex);
1563 error = get_ifindex(netdev_, &ifindex);
1564 ovs_mutex_unlock(&netdev->mutex);
1565
1566 return error ? -error : ifindex;
1567 }
1568
1569 static int
1570 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1571 {
1572 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1573
1574 ovs_mutex_lock(&netdev->mutex);
1575 if (netdev->miimon_interval > 0) {
1576 *carrier = netdev->miimon;
1577 } else {
1578 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1579 }
1580 ovs_mutex_unlock(&netdev->mutex);
1581
1582 return 0;
1583 }
1584
1585 static long long int
1586 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1587 {
1588 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1589 long long int carrier_resets;
1590
1591 ovs_mutex_lock(&netdev->mutex);
1592 carrier_resets = netdev->carrier_resets;
1593 ovs_mutex_unlock(&netdev->mutex);
1594
1595 return carrier_resets;
1596 }
1597
/* Executes MII ioctl 'cmd' ('cmd_name' is for error reporting) on device
 * 'name', using 'data' for both input and output.
 *
 * The kernel's SIOC[GS]MII* ioctls interpret the ifreq's ifr_ifru union
 * itself as a 'struct mii_ioctl_data', so 'data' is copied into and out of
 * that storage (starting at 'ifr_data') rather than being pointed to.
 * Returns 0 on success, otherwise a positive errno value. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
1612
/* Queries link status for device 'name' into '*miimon', first via the MII
 * registers, then falling back to ethtool's ETHTOOL_GLINK if MII fails.
 * '*miimon' is false unless a query succeeds and reports link up.  Returns 0
 * on success, otherwise a positive errno value from the last method tried. */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            /* BMSR link-status bit set means the link is up. */
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        }
    }
    if (error) {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* ETHTOOL_GLINK replies with a 'struct ethtool_value' in the
             * same buffer; copy it out to read the link flag. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
1655
1656 static int
1657 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1658 long long int interval)
1659 {
1660 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1661
1662 ovs_mutex_lock(&netdev->mutex);
1663 interval = interval > 0 ? MAX(interval, 100) : 0;
1664 if (netdev->miimon_interval != interval) {
1665 if (interval && !netdev->miimon_interval) {
1666 atomic_count_inc(&miimon_cnt);
1667 } else if (!interval && netdev->miimon_interval) {
1668 atomic_count_dec(&miimon_cnt);
1669 }
1670
1671 netdev->miimon_interval = interval;
1672 timer_set_expired(&netdev->miimon_timer);
1673 }
1674 ovs_mutex_unlock(&netdev->mutex);
1675
1676 return 0;
1677 }
1678
/* Polls MII link status for every netdev-linux device whose miimon timer has
 * expired, notifying listeners when the link state changed. */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        /* Only poll devices with miimon enabled and an expired timer. */
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                dev->miimon = miimon;
                /* Report the change so waiters re-read carrier state. */
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            /* Re-arm the timer for the next poll. */
            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* Done with this netdev. */
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
1708
/* Arranges to wake the poll loop when any device's miimon timer expires. */
static void
netdev_linux_miimon_wait(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);

        ovs_mutex_lock(&dev->mutex);
        /* Only devices with miimon enabled contribute a wakeup. */
        if (dev->miimon_interval > 0) {
            timer_wait(&dev->miimon_timer);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* Done with this netdev. */
        netdev_close(netdev);
    }
    shash_destroy(&device_shash);
}
1730
/* Exchanges the values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved_a = *a;

    *a = *b;
    *b = saved_a;
}
1738
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is allowed to be misaligned. */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct ovs_vport_stats *src)
{
    /* The vport stats carry only these eight basic counters; read them with
     * get_32aligned_u64() since 'src' may be only 32-bit aligned. */
    dst->rx_packets = get_32aligned_u64(&src->rx_packets);
    dst->tx_packets = get_32aligned_u64(&src->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&src->rx_errors);
    dst->tx_errors = get_32aligned_u64(&src->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
    /* Counters the vport layer does not track are reported as zero.  Note
     * that only the fields below are written; any further members of 'dst'
     * are left untouched. */
    dst->multicast = 0;
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
}
1768
1769 static int
1770 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1771 {
1772 struct dpif_netlink_vport reply;
1773 struct ofpbuf *buf;
1774 int error;
1775
1776 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1777 if (error) {
1778 return error;
1779 } else if (!reply.stats) {
1780 ofpbuf_delete(buf);
1781 return EOPNOTSUPP;
1782 }
1783
1784 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1785
1786 ofpbuf_delete(buf);
1787
1788 return 0;
1789 }
1790
1791 static void
1792 get_stats_via_vport(const struct netdev *netdev_,
1793 struct netdev_stats *stats)
1794 {
1795 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1796
1797 if (!netdev->vport_stats_error ||
1798 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1799 int error;
1800
1801 error = get_stats_via_vport__(netdev_, stats);
1802 if (error && error != ENOENT && error != ENODEV) {
1803 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1804 "(%s)",
1805 netdev_get_name(netdev_), ovs_strerror(error));
1806 }
1807 netdev->vport_stats_error = error;
1808 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1809 }
1810 }
1811
/* Retrieves current device stats for 'netdev-linux'.  Merges vport-layer and
 * kernel netlink statistics; returns 0 on success, otherwise a positive
 * errno value (only when neither source is available). */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; if the vport stats succeeded they are already in
         * '*stats', so report success. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Use kernel netdev's packet and byte counts since vport's counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO are
         * enabled. */
        stats->rx_packets = dev_stats.rx_packets;
        stats->rx_bytes = dev_stats.rx_bytes;
        stats->tx_packets = dev_stats.tx_packets;
        stats->tx_bytes = dev_stats.tx_bytes;

        /* Error and drop counters are accumulated from both sources. */
        stats->rx_errors += dev_stats.rx_errors;
        stats->tx_errors += dev_stats.tx_errors;
        stats->rx_dropped += dev_stats.rx_dropped;
        stats->tx_dropped += dev_stats.tx_dropped;
        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
        stats->rx_length_errors += dev_stats.rx_length_errors;
        stats->rx_over_errors += dev_stats.rx_over_errors;
        stats->rx_crc_errors += dev_stats.rx_crc_errors;
        stats->rx_frame_errors += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1862
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal.  Returns 0 on success, otherwise a positive errno value
 * (only when neither stats source is available). */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; if the vport stats succeeded they are already in
         * '*stats', so report success. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer.  For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        /* The detailed error counters below are direction-specific and do
         * not survive the rx/tx swap meaningfully, so zero them. */
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled.  rx/tx are swapped for the same reason as above. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped += dev_stats.tx_dropped;
        stats->tx_dropped += dev_stats.rx_dropped;

        stats->rx_errors += dev_stats.tx_errors;
        stats->tx_errors += dev_stats.rx_errors;

        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
    }
    /* Account for packets this layer dropped before handing to the tap fd
     * (see netdev_linux_tap_batch_send()). */
    stats->tx_dropped += netdev->tx_dropped;
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1925
/* Retrieves stats for an internal device from the datapath vport layer,
 * which is the only stats source used here.  Returns 0 on success, otherwise
 * the cached vport-stats errno value. */
static int
netdev_internal_get_stats(const struct netdev *netdev_,
                          struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = netdev->vport_stats_error;
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1940
/* Refreshes the cached NETDEV_F_* feature bitmaps for 'netdev' by querying
 * the device with the ETHTOOL_GSET ioctl, unless VALID_FEATURES is already
 * set.  Fills in 'netdev->supported', 'netdev->advertised' and
 * 'netdev->current', and records the outcome (0 or a positive errno value)
 * in 'netdev->get_features_error'.  The result, success or failure, stays
 * cached until VALID_FEATURES is invalidated elsewhere.
 *
 * Callers in this file invoke this with 'netdev->mutex' held. */
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
    struct ethtool_cmd ecmd;
    uint32_t speed;
    int error;

    if (netdev->cache_valid & VALID_FEATURES) {
        return;
    }

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto out;
    }

    /* Supported features: translate each SUPPORTED_* ethtool bit into the
     * corresponding NETDEV_F_* bit. */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    }
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Half) {
        netdev->supported |= NETDEV_F_100MB_HD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
        netdev->supported |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
        netdev->supported |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
        netdev->supported |= NETDEV_F_40GB_FD;
    }
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    }
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    }
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    }
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    }
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    }

    /* Advertised features: same translation, for the ADVERTISED_* bits. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
        netdev->advertised |= NETDEV_F_40GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    }
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    }
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    }
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    }
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    }

    /* Current settings. */
    speed = ethtool_cmd_speed(&ecmd);
    if (speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    } else if (speed == 40000) {
        /* Raw numeric speed values here — presumably because older kernel
         * headers do not define SPEED_40000 and above; TODO confirm. */
        netdev->current = NETDEV_F_40GB_FD;
    } else if (speed == 100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (speed == 1000000) {
        netdev->current = NETDEV_F_1TB_FD;
    } else {
        netdev->current = 0;
    }

    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
    }

    if (ecmd.autoneg) {
        netdev->current |= NETDEV_F_AUTONEG;
    }

out:
    /* Cache the outcome, including failures: the query is not retried until
     * VALID_FEATURES is cleared. */
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
}
2092
2093 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
2094 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2095 * Returns 0 if successful, otherwise a positive errno value. */
2096 static int
2097 netdev_linux_get_features(const struct netdev *netdev_,
2098 enum netdev_features *current,
2099 enum netdev_features *advertised,
2100 enum netdev_features *supported,
2101 enum netdev_features *peer)
2102 {
2103 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2104 int error;
2105
2106 ovs_mutex_lock(&netdev->mutex);
2107 netdev_linux_read_features(netdev);
2108 if (!netdev->get_features_error) {
2109 *current = netdev->current;
2110 *advertised = netdev->advertised;
2111 *supported = netdev->supported;
2112 *peer = 0; /* XXX */
2113 }
2114 error = netdev->get_features_error;
2115 ovs_mutex_unlock(&netdev->mutex);
2116
2117 return error;
2118 }
2119
2120 /* Set the features advertised by 'netdev' to 'advertise'. */
2121 static int
2122 netdev_linux_set_advertisements(struct netdev *netdev_,
2123 enum netdev_features advertise)
2124 {
2125 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2126 struct ethtool_cmd ecmd;
2127 int error;
2128
2129 ovs_mutex_lock(&netdev->mutex);
2130
2131 COVERAGE_INC(netdev_get_ethtool);
2132 memset(&ecmd, 0, sizeof ecmd);
2133 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2134 ETHTOOL_GSET, "ETHTOOL_GSET");
2135 if (error) {
2136 goto exit;
2137 }
2138
2139 ecmd.advertising = 0;
2140 if (advertise & NETDEV_F_10MB_HD) {
2141 ecmd.advertising |= ADVERTISED_10baseT_Half;
2142 }
2143 if (advertise & NETDEV_F_10MB_FD) {
2144 ecmd.advertising |= ADVERTISED_10baseT_Full;
2145 }
2146 if (advertise & NETDEV_F_100MB_HD) {
2147 ecmd.advertising |= ADVERTISED_100baseT_Half;
2148 }
2149 if (advertise & NETDEV_F_100MB_FD) {
2150 ecmd.advertising |= ADVERTISED_100baseT_Full;
2151 }
2152 if (advertise & NETDEV_F_1GB_HD) {
2153 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2154 }
2155 if (advertise & NETDEV_F_1GB_FD) {
2156 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2157 }
2158 if (advertise & NETDEV_F_10GB_FD) {
2159 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2160 }
2161 if (advertise & NETDEV_F_COPPER) {
2162 ecmd.advertising |= ADVERTISED_TP;
2163 }
2164 if (advertise & NETDEV_F_FIBER) {
2165 ecmd.advertising |= ADVERTISED_FIBRE;
2166 }
2167 if (advertise & NETDEV_F_AUTONEG) {
2168 ecmd.advertising |= ADVERTISED_Autoneg;
2169 }
2170 if (advertise & NETDEV_F_PAUSE) {
2171 ecmd.advertising |= ADVERTISED_Pause;
2172 }
2173 if (advertise & NETDEV_F_PAUSE_ASYM) {
2174 ecmd.advertising |= ADVERTISED_Asym_Pause;
2175 }
2176 COVERAGE_INC(netdev_set_ethtool);
2177 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2178 ETHTOOL_SSET, "ETHTOOL_SSET");
2179
2180 exit:
2181 ovs_mutex_unlock(&netdev->mutex);
2182 return error;
2183 }
2184
/* Attempts to set input rate limiting (policing) policy.  Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * Policing is implemented by installing an ingress qdisc plus a policer;
 * a 'kbits_rate' of 0 removes any existing policing.  The configured values
 * and certain errors are cached under VALID_POLICING so that repeated calls
 * with unchanged settings do nothing. */
static int
netdev_linux_set_policing(struct netdev *netdev_,
                          uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int ifindex;
    int error;

    /* With the TC flow offload API enabled this policing path is refused
     * outright (the warning is only worth emitting if a rate was actually
     * requested). */
    if (netdev_is_flow_api_enabled()) {
        if (kbits_rate) {
            VLOG_WARN_RL(&rl, "%s: policing with offload isn't supported",
                         netdev_name);
        }
        return EOPNOTSUPP;
    }

    kbits_burst = (!kbits_rate ? 0       /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
                   : kbits_burst);       /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev->cache_valid & VALID_POLICING) {
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kbits_burst == kbits_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        netdev->cache_valid &= ~VALID_POLICING;
    }

    error = get_ifindex(netdev_, &ifindex);
    if (error) {
        goto out;
    }

    COVERAGE_INC(netdev_set_policing);
    /* Remove any existing ingress qdisc. */
    error = tc_add_del_ingress_qdisc(ifindex, false);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate) {
        error = tc_add_del_ingress_qdisc(ifindex, true);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }
    }

    netdev->kbits_rate = kbits_rate;
    netdev->kbits_burst = kbits_burst;

out:
    /* Only success and ENODEV are cached; other errors leave the cache
     * invalid so a later call retries. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2260
2261 static int
2262 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2263 struct sset *types)
2264 {
2265 const struct tc_ops *const *opsp;
2266 for (opsp = tcs; *opsp != NULL; opsp++) {
2267 const struct tc_ops *ops = *opsp;
2268 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2269 sset_add(types, ops->ovs_name);
2270 }
2271 }
2272 return 0;
2273 }
2274
2275 static const struct tc_ops *
2276 tc_lookup_ovs_name(const char *name)
2277 {
2278 const struct tc_ops *const *opsp;
2279
2280 for (opsp = tcs; *opsp != NULL; opsp++) {
2281 const struct tc_ops *ops = *opsp;
2282 if (!strcmp(name, ops->ovs_name)) {
2283 return ops;
2284 }
2285 }
2286 return NULL;
2287 }
2288
2289 static const struct tc_ops *
2290 tc_lookup_linux_name(const char *name)
2291 {
2292 const struct tc_ops *const *opsp;
2293
2294 for (opsp = tcs; *opsp != NULL; opsp++) {
2295 const struct tc_ops *ops = *opsp;
2296 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2297 return ops;
2298 }
2299 }
2300 return NULL;
2301 }
2302
2303 static struct tc_queue *
2304 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2305 size_t hash)
2306 {
2307 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2308 struct tc_queue *queue;
2309
2310 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2311 if (queue->queue_id == queue_id) {
2312 return queue;
2313 }
2314 }
2315 return NULL;
2316 }
2317
/* Returns the tc_queue with ID 'queue_id' on 'netdev', or NULL if there is
 * none.  Convenience wrapper around tc_find_queue__() that computes the
 * hash.  Callers in this file hold 'netdev's mutex when calling this. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2323
2324 static int
2325 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2326 const char *type,
2327 struct netdev_qos_capabilities *caps)
2328 {
2329 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2330 if (!ops) {
2331 return EOPNOTSUPP;
2332 }
2333 caps->n_queues = ops->n_queues;
2334 return 0;
2335 }
2336
2337 static int
2338 netdev_linux_get_qos(const struct netdev *netdev_,
2339 const char **typep, struct smap *details)
2340 {
2341 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2342 int error;
2343
2344 ovs_mutex_lock(&netdev->mutex);
2345 error = tc_query_qdisc(netdev_);
2346 if (!error) {
2347 *typep = netdev->tc->ops->ovs_name;
2348 error = (netdev->tc->ops->qdisc_get
2349 ? netdev->tc->ops->qdisc_get(netdev_, details)
2350 : 0);
2351 }
2352 ovs_mutex_unlock(&netdev->mutex);
2353
2354 return error;
2355 }
2356
/* Configures QoS discipline 'type' with parameters 'details' on 'netdev_',
 * replacing any discipline currently installed.  Returns 0 on success,
 * EOPNOTSUPP if 'type' is unknown or cannot be installed, or another
 * positive errno value on failure. */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    if (new_ops == &tc_ops_noop) {
        /* "noop" is special-cased: install it directly without querying or
         * deleting the kernel qdisc. */
        return new_ops->tc_install(netdev_, details);
    }

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same discipline already installed: just update its parameters. */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            goto exit;
        }
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc.  tc_install must set 'netdev->tc' exactly on
         * success, which the assertion below checks. */
        error = new_ops->tc_install(netdev_, details);
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2399
2400 static int
2401 netdev_linux_get_queue(const struct netdev *netdev_,
2402 unsigned int queue_id, struct smap *details)
2403 {
2404 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2405 int error;
2406
2407 ovs_mutex_lock(&netdev->mutex);
2408 error = tc_query_qdisc(netdev_);
2409 if (!error) {
2410 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2411 error = (queue
2412 ? netdev->tc->ops->class_get(netdev_, queue, details)
2413 : ENOENT);
2414 }
2415 ovs_mutex_unlock(&netdev->mutex);
2416
2417 return error;
2418 }
2419
2420 static int
2421 netdev_linux_set_queue(struct netdev *netdev_,
2422 unsigned int queue_id, const struct smap *details)
2423 {
2424 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2425 int error;
2426
2427 ovs_mutex_lock(&netdev->mutex);
2428 error = tc_query_qdisc(netdev_);
2429 if (!error) {
2430 error = (queue_id < netdev->tc->ops->n_queues
2431 && netdev->tc->ops->class_set
2432 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2433 : EINVAL);
2434 }
2435 ovs_mutex_unlock(&netdev->mutex);
2436
2437 return error;
2438 }
2439
2440 static int
2441 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2442 {
2443 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2444 int error;
2445
2446 ovs_mutex_lock(&netdev->mutex);
2447 error = tc_query_qdisc(netdev_);
2448 if (!error) {
2449 if (netdev->tc->ops->class_delete) {
2450 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2451 error = (queue
2452 ? netdev->tc->ops->class_delete(netdev_, queue)
2453 : ENOENT);
2454 } else {
2455 error = EINVAL;
2456 }
2457 }
2458 ovs_mutex_unlock(&netdev->mutex);
2459
2460 return error;
2461 }
2462
2463 static int
2464 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2465 unsigned int queue_id,
2466 struct netdev_queue_stats *stats)
2467 {
2468 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2469 int error;
2470
2471 ovs_mutex_lock(&netdev->mutex);
2472 error = tc_query_qdisc(netdev_);
2473 if (!error) {
2474 if (netdev->tc->ops->class_get_stats) {
2475 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2476 if (queue) {
2477 stats->created = queue->created;
2478 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2479 stats);
2480 } else {
2481 error = ENOENT;
2482 }
2483 } else {
2484 error = EOPNOTSUPP;
2485 }
2486 }
2487 ovs_mutex_unlock(&netdev->mutex);
2488
2489 return error;
2490 }
2491
/* State for an rtnetlink dump of tc classes; see start_queue_dump() and
 * finish_queue_dump(). */
struct queue_dump_state {
    struct nl_dump dump;    /* The ongoing rtnetlink dump. */
    struct ofpbuf buf;      /* Receive buffer reused across nl_dump_next(). */
};
2496
/* Begins an rtnetlink dump (RTM_GETTCLASS) of the traffic classes on
 * 'netdev', initializing '*state' for use with nl_dump_next() and
 * finish_queue_dump().  Returns true on success; returns false, leaving
 * '*state' uninitialized, if the request could not be built (e.g. the
 * device could not be resolved — see netdev_linux_tc_make_request()). */
static bool
start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
    if (!tcmsg) {
        return false;
    }
    tcmsg->tcm_parent = 0;
    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
    ofpbuf_uninit(&request);

    ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
    return true;
}
2514
/* Completes a dump begun with start_queue_dump(), releasing '*state's
 * buffer.  Returns 0 if the dump completed cleanly, otherwise a positive
 * errno value from nl_dump_done(). */
static int
finish_queue_dump(struct queue_dump_state *state)
{
    ofpbuf_uninit(&state->buf);
    return nl_dump_done(&state->dump);
}
2521
/* Iteration state for netdev_linux_queue_dump_next(). */
struct netdev_linux_queue_state {
    unsigned int *queues;   /* Snapshot of queue IDs, taken at dump start. */
    size_t cur_queue;       /* Index of the next element to visit. */
    size_t n_queues;        /* Number of elements in 'queues'. */
};
2527
/* Begins iteration over 'netdev_'s queues, storing state for
 * netdev_linux_queue_dump_next()/_done() in '*statep'.  The queue IDs are
 * snapshotted into an array up front, so iteration does not depend on the
 * queue hmap remaining unchanged.  Returns 0 on success, EOPNOTSUPP if the
 * installed discipline cannot report queue configuration, or another
 * positive errno value on failure. */
static int
netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
{
    const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get) {
            struct netdev_linux_queue_state *state;
            struct tc_queue *queue;
            size_t i;

            *statep = state = xmalloc(sizeof *state);
            state->n_queues = hmap_count(&netdev->tc->queues);
            state->cur_queue = 0;
            state->queues = xmalloc(state->n_queues * sizeof *state->queues);

            i = 0;
            HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
                state->queues[i++] = queue->queue_id;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2559
2560 static int
2561 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2562 unsigned int *queue_idp, struct smap *details)
2563 {
2564 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2565 struct netdev_linux_queue_state *state = state_;
2566 int error = EOF;
2567
2568 ovs_mutex_lock(&netdev->mutex);
2569 while (state->cur_queue < state->n_queues) {
2570 unsigned int queue_id = state->queues[state->cur_queue++];
2571 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2572
2573 if (queue) {
2574 *queue_idp = queue_id;
2575 error = netdev->tc->ops->class_get(netdev_, queue, details);
2576 break;
2577 }
2578 }
2579 ovs_mutex_unlock(&netdev->mutex);
2580
2581 return error;
2582 }
2583
2584 static int
2585 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2586 void *state_)
2587 {
2588 struct netdev_linux_queue_state *state = state_;
2589
2590 free(state->queues);
2591 free(state);
2592 return 0;
2593 }
2594
/* Invokes 'cb' (with 'aux') once per queue, with that queue's statistics,
 * by dumping the tc classes of 'netdev_'s qdisc over rtnetlink.  Returns 0
 * on success, otherwise a positive errno value; if several messages fail,
 * only the last error is kept, and 'cb' is still invoked for every message
 * processed successfully. */
static int
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
                              netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct queue_dump_state state;

        if (!netdev->tc->ops->class_dump_stats) {
            error = EOPNOTSUPP;
        } else if (!start_queue_dump(netdev_, &state)) {
            error = ENODEV;
        } else {
            struct ofpbuf msg;
            int retval;

            while (nl_dump_next(&state.dump, &msg, &state.buf)) {
                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
                                                           cb, aux);
                if (retval) {
                    error = retval;
                }
            }

            retval = finish_queue_dump(&state);
            if (retval) {
                error = retval;
            }
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2633
2634 static int
2635 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2636 struct in_addr netmask)
2637 {
2638 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2639 int error;
2640
2641 ovs_mutex_lock(&netdev->mutex);
2642 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2643 if (!error) {
2644 if (address.s_addr != INADDR_ANY) {
2645 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2646 "SIOCSIFNETMASK", netmask);
2647 }
2648 }
2649
2650 ovs_mutex_unlock(&netdev->mutex);
2651
2652 return error;
2653 }
2654
/* Retrieves the list of addresses assigned to 'netdev_'.  On success,
 * stores an array of addresses in '*addr' and the corresponding masks in
 * '*mask', with the element count in '*n_cnt', and returns 0; on failure
 * returns a positive errno value.  (The exact output contract — ownership
 * of the arrays and which address families are included — is defined by
 * netdev_get_addrs(); presumably the caller frees the arrays.  TODO:
 * confirm against netdev_get_addrs().) */
static int
netdev_linux_get_addr_list(const struct netdev *netdev_,
                          struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2671
/* Initializes '*sa' as an AF_INET socket address that holds 'addr', with a
 * zero port number and all remaining bytes cleared. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2684
2685 static int
2686 do_set_addr(struct netdev *netdev,
2687 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2688 {
2689 struct ifreq ifr;
2690
2691 make_in4_sockaddr(&ifr.ifr_addr, addr);
2692 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2693 ioctl_name);
2694 }
2695
/* Adds 'router' as a default IP gateway: installs a route for 0.0.0.0/0 via
 * 'router' using the SIOCADDRT ioctl.  No egress device is specified in the
 * request ('netdev' is unused).  Returns 0 if successful, otherwise a
 * positive errno value. */
static int
netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
{
    struct in_addr any = { INADDR_ANY };
    struct rtentry rt;
    int error;

    memset(&rt, 0, sizeof rt);
    make_in4_sockaddr(&rt.rt_dst, any);
    make_in4_sockaddr(&rt.rt_gateway, router);
    make_in4_sockaddr(&rt.rt_genmask, any);
    rt.rt_flags = RTF_UP | RTF_GATEWAY;
    error = af_inet_ioctl(SIOCADDRT, &rt);
    if (error) {
        VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
    }
    return error;
}
2715
/* Finds the route that matches 'host' by scanning /proc/net/route.  On
 * success, stores the next hop in '*next_hop' (0 if 'host' is directly
 * reachable), stores a malloc'd copy of the egress interface name in
 * '*netdev_name' (caller must free), and returns 0.  Returns ENXIO if no
 * route matches, or another positive errno value if the route table cannot
 * be read.  Note: routes are matched in file order, not by longest prefix. */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        /* Skip the header on line 1; data starts on line 2. */
        if (++ln >= 2) {
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                        fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* The output of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need any endian
             * conversions here. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    fclose(stream);
    return ENXIO;
}
2775
/* get_status() implementation for kernel devices: reports the driver name,
 * driver version and firmware version obtained via the ETHTOOL_GDRVINFO
 * ioctl into 'smap'.  The ioctl result is cached under VALID_DRVINFO, so the
 * device is only queried once.  Returns 0 on success, otherwise a positive
 * errno value (in which case nothing is added to 'smap'). */
static int
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* netdev_linux_do_ethtool() takes a struct ethtool_cmd *, so the
         * drvinfo buffer is passed through that pointer type — presumably
         * only the leading command word is interpreted generically; TODO
         * confirm against netdev_linux_do_ethtool(). */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        cmd,
                                        ETHTOOL_GDRVINFO,
                                        "ETHTOOL_GDRVINFO");
        if (!error) {
            netdev->cache_valid |= VALID_DRVINFO;
        }
    }

    if (!error) {
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
        smap_add(smap, "driver_version", netdev->drvinfo.version);
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2806
/* get_status() implementation for "internal" devices: there is no ethtool
 * driver information to report, so only a fixed driver name is added. */
static int
netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
                           struct smap *smap)
{
    smap_add(smap, "driver_name", "openvswitch");
    return 0;
}
2814
/* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
 * returns 0.  Otherwise, it returns a positive errno value; in particular,
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
static int
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, struct eth_addr *mac)
{
    struct arpreq r;
    struct sockaddr_in sin;
    int retval;

    /* Build the SIOCGARP request: protocol address, hardware type, and the
     * device name to scope the lookup. */
    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    sin.sin_port = 0;
    memcpy(&r.arp_pa, &sin, sizeof sin);
    r.arp_ha.sa_family = ARPHRD_ETHER;
    r.arp_flags = 0;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
    if (!retval) {
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO simply means "no entry", so only other errors are logged. */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
    }
    return retval;
}
2847
2848 static int
2849 nd_to_iff_flags(enum netdev_flags nd)
2850 {
2851 int iff = 0;
2852 if (nd & NETDEV_UP) {
2853 iff |= IFF_UP;
2854 }
2855 if (nd & NETDEV_PROMISC) {
2856 iff |= IFF_PROMISC;
2857 }
2858 if (nd & NETDEV_LOOPBACK) {
2859 iff |= IFF_LOOPBACK;
2860 }
2861 return iff;
2862 }
2863
2864 static int
2865 iff_to_nd_flags(int iff)
2866 {
2867 enum netdev_flags nd = 0;
2868 if (iff & IFF_UP) {
2869 nd |= NETDEV_UP;
2870 }
2871 if (iff & IFF_PROMISC) {
2872 nd |= NETDEV_PROMISC;
2873 }
2874 if (iff & IFF_LOOPBACK) {
2875 nd |= NETDEV_LOOPBACK;
2876 }
2877 return nd;
2878 }
2879
/* Turns off the NETDEV_* flags in 'off' and turns on those in 'on' for the
 * kernel device behind 'netdev', storing the device's previous flags
 * (converted to NETDEV_* form) in '*old_flagsp'.  Returns 0 if successful,
 * otherwise a positive errno value. */
static int
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
{
    int old_flags, new_flags;
    int error = 0;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* Refresh the cached flags; the return value of get_flags() is
         * ignored here — presumably a best-effort refresh, since the
         * set_flags() result is what matters to the caller.  TODO confirm. */
        get_flags(&netdev->up, &netdev->ifi_flags);
    }

    return error;
}
2898
/* update_flags() entry in the netdev class: reads and/or changes 'netdev_'s
 * flags.  When only reading (both 'on' and 'off' empty), a netlink query is
 * preferred, falling back to the ioctl path; changes always go through the
 * ioctl path.  Returns 0 if successful, otherwise a positive errno value. */
static int
netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
                          enum netdev_flags on, enum netdev_flags *old_flagsp)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (on || off) {
        /* Changing flags over netlink isn't supported yet. */
        error = update_flags(netdev, off, on, old_flagsp);
    } else {
        /* Try reading flags over netlink, or fall back to ioctl. */
        if (!netdev_linux_update_via_netlink(netdev)) {
            *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
        } else {
            error = update_flags(netdev, off, on, old_flagsp);
        }
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2921
/* Expands to an initializer for a struct netdev_class that fills in the
 * netdev-linux implementations shared by all variants.  NAME is the netdev
 * type string; CONSTRUCT, GET_STATS, GET_FEATURES, GET_STATUS and
 * FLOW_OFFLOAD_API are the members that differ between the "system", "tap"
 * and "internal" classes defined below.  The member order must match the
 * declaration of struct netdev_class in netdev-provider.h. */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
                           GET_FEATURES, GET_STATUS,            \
                           FLOW_OFFLOAD_API)                    \
{                                                               \
    NAME,                                                       \
    false,                      /* is_pmd */                    \
                                                                \
    NULL,                                                       \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    netdev_linux_alloc,                                         \
    CONSTRUCT,                                                  \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
    NULL,                       /* build header */              \
    NULL,                       /* push header */               \
    NULL,                       /* pop header */                \
    NULL,                       /* get_numa_id */               \
    NULL,                       /* set_tx_multiq */             \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    NULL,                                                       \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
    NULL,                       /* get_pt_mode */               \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_addr_list,                                 \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
    NULL,                       /* reconfigure */               \
                                                                \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \
                                                                \
    FLOW_OFFLOAD_API                                            \
}
2998
/* "system": ordinary kernel network devices, with the Linux TC flow offload
 * API. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_construct,
        netdev_linux_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status,
        LINUX_FLOW_OFFLOAD_API);

/* "tap": TAP devices; uses tap-specific construction and stats, and no flow
 * offload. */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_construct_tap,
        netdev_tap_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status,
        NO_OFFLOAD_API);

/* "internal": OVS internal ports; internal-specific stats/status, no
 * get_features and no flow offload. */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_construct,
        netdev_internal_get_stats,
        NULL,                  /* get_features */
        netdev_internal_get_status,
        NO_OFFLOAD_API);
3025 \f
3026
/* CoDel (Controlled Delay) qdisc support.  CoDel has no configurable
 * queues, hence a queue count of zero. */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET 1
#define TCA_CODEL_LIMIT 2
#define TCA_CODEL_INTERVAL 3

/* Per-netdev state for an installed codel qdisc.  'tc' is the generic base
 * that 'netdev->tc' points at; see codel_get__(). */
struct codel {
    struct tc tc;
    uint32_t target;     /* codel "target" parameter. */
    uint32_t limit;      /* codel "limit" parameter. */
    uint32_t interval;   /* codel "interval" parameter. */
};
3043
3044 static struct codel *
3045 codel_get__(const struct netdev *netdev_)
3046 {
3047 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3048 return CONTAINER_OF(netdev->tc, struct codel, tc);
3049 }
3050
3051 static void
3052 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3053 uint32_t interval)
3054 {
3055 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3056 struct codel *codel;
3057
3058 codel = xmalloc(sizeof *codel);
3059 tc_init(&codel->tc, &tc_ops_codel);
3060 codel->target = target;
3061 codel->limit = limit;
3062 codel->interval = interval;
3063
3064 netdev->tc = &codel->tc;
3065 }
3066
/* Replaces 'netdev's root qdisc with a codel qdisc configured with 'target',
 * 'limit' and 'interval'.  A value of 0 selects the built-in default (5000,
 * 10240 and 100000 respectively — the same defaults applied by
 * codel_parse_qdisc_details__()).  Returns 0 on success, otherwise a
 * positive errno value. */
static int
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                    uint32_t interval)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval;
    int error;

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;

    nl_msg_put_string(&request, TCA_KIND, "codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval,
                     error, ovs_strerror(error));
    }
    return error;
}
3108
3109 static void
3110 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3111 const struct smap *details, struct codel *codel)
3112 {
3113 codel->target = smap_get_ullong(details, "target", 0);
3114 codel->limit = smap_get_ullong(details, "limit", 0);
3115 codel->interval = smap_get_ullong(details, "interval", 0);
3116
3117 if (!codel->target) {
3118 codel->target = 5000;
3119 }
3120 if (!codel->limit) {
3121 codel->limit = 10240;
3122 }
3123 if (!codel->interval) {
3124 codel->interval = 100000;
3125 }
3126 }
3127
3128 static int
3129 codel_tc_install(struct netdev *netdev, const struct smap *details)
3130 {
3131 int error;
3132 struct codel codel;
3133
3134 codel_parse_qdisc_details__(netdev, details, &codel);
3135 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3136 codel.interval);
3137 if (!error) {
3138 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3139 }
3140 return error;
3141 }
3142
/* Parses the TCA_OPTIONS Netlink attribute 'nl_options' of a "codel" qdisc
 * and stores the target, limit, and interval values into 'codel'.
 *
 * Returns 0 if successful, EPROTO if 'nl_options' does not have the expected
 * form (all three attributes are required by the policy). */
static int
codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
{
    static const struct nl_policy tca_codel_policy[] = {
        [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
        [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
        [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];

    if (!nl_parse_nested(nl_options, tca_codel_policy,
                         attrs, ARRAY_SIZE(tca_codel_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
        return EPROTO;
    }

    codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
    codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
    codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
    return 0;
}
3165
3166 static int
3167 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3168 {
3169 struct nlattr *nlattr;
3170 const char * kind;
3171 int error;
3172 struct codel codel;
3173
3174 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3175 if (error != 0) {
3176 return error;
3177 }
3178
3179 error = codel_parse_tca_options__(nlattr, &codel);
3180 if (error != 0) {
3181 return error;
3182 }
3183
3184 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3185 return 0;
3186 }
3187
3188
3189 static void
3190 codel_tc_destroy(struct tc *tc)
3191 {
3192 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3193 tc_destroy(tc);
3194 free(codel);
3195 }
3196
3197 static int
3198 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3199 {
3200 const struct codel *codel = codel_get__(netdev);
3201 smap_add_format(details, "target", "%u", codel->target);
3202 smap_add_format(details, "limit", "%u", codel->limit);
3203 smap_add_format(details, "interval", "%u", codel->interval);
3204 return 0;
3205 }
3206
3207 static int
3208 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3209 {
3210 struct codel codel;
3211
3212 codel_parse_qdisc_details__(netdev, details, &codel);
3213 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3214 codel_get__(netdev)->target = codel.target;
3215 codel_get__(netdev)->limit = codel.limit;
3216 codel_get__(netdev)->interval = codel.interval;
3217 return 0;
3218 }
3219
static const struct tc_ops tc_ops_codel = {
    "codel",                      /* linux_name */
    "linux-codel",                /* ovs_name */
    CODEL_N_QUEUES,               /* n_queues */
    codel_tc_install,
    codel_tc_load,
    codel_tc_destroy,
    codel_qdisc_get,
    codel_qdisc_set,
    NULL,                         /* class_get: codel has no classes. */
    NULL,                         /* class_set */
    NULL,                         /* class_delete */
    NULL,                         /* class_get_stats */
    NULL                          /* class_dump_stats */
};
3235 \f
3236 /* FQ-CoDel traffic control class. */
3237
3238 #define FQCODEL_N_QUEUES 0x0000
3239
3240 /* In sufficiently new kernel headers these are defined as enums in
3241 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3242 * kernels. (This overrides any enum definition in the header file but that's
3243 * harmless.) */
3244 #define TCA_FQ_CODEL_TARGET 1
3245 #define TCA_FQ_CODEL_LIMIT 2
3246 #define TCA_FQ_CODEL_INTERVAL 3
3247 #define TCA_FQ_CODEL_ECN 4
3248 #define TCA_FQ_CODEL_FLOWS 5
3249 #define TCA_FQ_CODEL_QUANTUM 6
3250
/* In-memory representation of an "fq_codel" root qdisc and the parameters
 * programmed into it (see tc-fq_codel(8) for their semantics). */
struct fqcodel {
    struct tc tc;          /* Generic traffic-control state; located via
                            * CONTAINER_OF from netdev->tc. */
    uint32_t target;
    uint32_t limit;
    uint32_t interval;
    uint32_t flows;
    uint32_t quantum;
};
3259
3260 static struct fqcodel *
3261 fqcodel_get__(const struct netdev *netdev_)
3262 {
3263 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3264 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3265 }
3266
3267 static void
3268 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3269 uint32_t interval, uint32_t flows, uint32_t quantum)
3270 {
3271 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3272 struct fqcodel *fqcodel;
3273
3274 fqcodel = xmalloc(sizeof *fqcodel);
3275 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3276 fqcodel->target = target;
3277 fqcodel->limit = limit;
3278 fqcodel->interval = interval;
3279 fqcodel->flows = flows;
3280 fqcodel->quantum = quantum;
3281
3282 netdev->tc = &fqcodel->tc;
3283 }
3284
3285 static int
3286 fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3287 uint32_t interval, uint32_t flows, uint32_t quantum)
3288 {
3289 size_t opt_offset;
3290 struct ofpbuf request;
3291 struct tcmsg *tcmsg;
3292 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3293 int error;
3294
3295 tc_del_qdisc(netdev);
3296
3297 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3298 NLM_F_EXCL | NLM_F_CREATE, &request);
3299 if (!tcmsg) {
3300 return ENODEV;
3301 }
3302 tcmsg->tcm_handle = tc_make_handle(1, 0);
3303 tcmsg->tcm_parent = TC_H_ROOT;
3304
3305 otarget = target ? target : 5000;
3306 olimit = limit ? limit : 10240;
3307 ointerval = interval ? interval : 100000;
3308 oflows = flows ? flows : 1024;
3309 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3310 not mtu */
3311
3312 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3313 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3314 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3315 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3316 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3317 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3318 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3319 nl_msg_end_nested(&request, opt_offset);
3320
3321 error = tc_transact(&request, NULL);
3322 if (error) {
3323 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3324 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3325 netdev_get_name(netdev),
3326 otarget, olimit, ointerval, oflows, oquantum,
3327 error, ovs_strerror(error));
3328 }
3329 return error;
3330 }
3331
3332 static void
3333 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3334 const struct smap *details, struct fqcodel *fqcodel)
3335 {
3336 fqcodel->target = smap_get_ullong(details, "target", 0);
3337 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3338 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3339 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3340 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3341
3342 if (!fqcodel->target) {
3343 fqcodel->target = 5000;
3344 }
3345 if (!fqcodel->limit) {
3346 fqcodel->limit = 10240;
3347 }
3348 if (!fqcodel->interval) {
3349 fqcodel->interval = 1000000;
3350 }
3351 if (!fqcodel->flows) {
3352 fqcodel->flows = 1024;
3353 }
3354 if (!fqcodel->quantum) {
3355 fqcodel->quantum = 1514;
3356 }
3357 }
3358
3359 static int
3360 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3361 {
3362 int error;
3363 struct fqcodel fqcodel;
3364
3365 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3366 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3367 fqcodel.interval, fqcodel.flows,
3368 fqcodel.quantum);
3369 if (!error) {
3370 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3371 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3372 }
3373 return error;
3374 }
3375
3376 static int
3377 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3378 {
3379 static const struct nl_policy tca_fqcodel_policy[] = {
3380 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3381 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3382 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3383 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3384 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3385 };
3386
3387 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3388
3389 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3390 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3391 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3392 return EPROTO;
3393 }
3394
3395 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3396 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3397 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3398 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3399 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3400 return 0;
3401 }
3402
3403 static int
3404 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3405 {
3406 struct nlattr *nlattr;
3407 const char * kind;
3408 int error;
3409 struct fqcodel fqcodel;
3410
3411 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3412 if (error != 0) {
3413 return error;
3414 }
3415
3416 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3417 if (error != 0) {
3418 return error;
3419 }
3420
3421 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3422 fqcodel.flows, fqcodel.quantum);
3423 return 0;
3424 }
3425
3426 static void
3427 fqcodel_tc_destroy(struct tc *tc)
3428 {
3429 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3430 tc_destroy(tc);
3431 free(fqcodel);
3432 }
3433
3434 static int
3435 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3436 {
3437 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3438 smap_add_format(details, "target", "%u", fqcodel->target);
3439 smap_add_format(details, "limit", "%u", fqcodel->limit);
3440 smap_add_format(details, "interval", "%u", fqcodel->interval);
3441 smap_add_format(details, "flows", "%u", fqcodel->flows);
3442 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3443 return 0;
3444 }
3445
3446 static int
3447 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3448 {
3449 struct fqcodel fqcodel;
3450
3451 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3452 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3453 fqcodel.flows, fqcodel.quantum);
3454 fqcodel_get__(netdev)->target = fqcodel.target;
3455 fqcodel_get__(netdev)->limit = fqcodel.limit;
3456 fqcodel_get__(netdev)->interval = fqcodel.interval;
3457 fqcodel_get__(netdev)->flows = fqcodel.flows;
3458 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3459 return 0;
3460 }
3461
static const struct tc_ops tc_ops_fqcodel = {
    "fq_codel",                   /* linux_name */
    "linux-fq_codel",             /* ovs_name */
    FQCODEL_N_QUEUES,             /* n_queues */
    fqcodel_tc_install,
    fqcodel_tc_load,
    fqcodel_tc_destroy,
    fqcodel_qdisc_get,
    fqcodel_qdisc_set,
    NULL,                         /* class_get: fq_codel has no classes. */
    NULL,                         /* class_set */
    NULL,                         /* class_delete */
    NULL,                         /* class_get_stats */
    NULL                          /* class_dump_stats */
};
3477 \f
3478 /* SFQ traffic control class. */
3479
3480 #define SFQ_N_QUEUES 0x0000
3481
/* In-memory representation of an "sfq" root qdisc (see tc-sfq(8)). */
struct sfq {
    struct tc tc;       /* Generic traffic-control state; located via
                         * CONTAINER_OF from netdev->tc. */
    uint32_t quantum;   /* Bytes a flow may dequeue per round. */
    uint32_t perturb;   /* Hash perturbation period. */
};
3487
3488 static struct sfq *
3489 sfq_get__(const struct netdev *netdev_)
3490 {
3491 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3492 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3493 }
3494
3495 static void
3496 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3497 {
3498 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3499 struct sfq *sfq;
3500
3501 sfq = xmalloc(sizeof *sfq);
3502 tc_init(&sfq->tc, &tc_ops_sfq);
3503 sfq->perturb = perturb;
3504 sfq->quantum = quantum;
3505
3506 netdev->tc = &sfq->tc;
3507 }
3508
3509 static int
3510 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3511 {
3512 struct tc_sfq_qopt opt;
3513 struct ofpbuf request;
3514 struct tcmsg *tcmsg;
3515 int mtu;
3516 int mtu_error, error;
3517 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3518
3519 tc_del_qdisc(netdev);
3520
3521 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3522 NLM_F_EXCL | NLM_F_CREATE, &request);
3523 if (!tcmsg) {
3524 return ENODEV;
3525 }
3526 tcmsg->tcm_handle = tc_make_handle(1, 0);
3527 tcmsg->tcm_parent = TC_H_ROOT;
3528
3529 memset(&opt, 0, sizeof opt);
3530 if (!quantum) {
3531 if (!mtu_error) {
3532 opt.quantum = mtu; /* if we cannot find mtu, use default */
3533 }
3534 } else {
3535 opt.quantum = quantum;
3536 }
3537
3538 if (!perturb) {
3539 opt.perturb_period = 10;
3540 } else {
3541 opt.perturb_period = perturb;
3542 }
3543
3544 nl_msg_put_string(&request, TCA_KIND, "sfq");
3545 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3546
3547 error = tc_transact(&request, NULL);
3548 if (error) {
3549 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3550 "quantum %u, perturb %u error %d(%s)",
3551 netdev_get_name(netdev),
3552 opt.quantum, opt.perturb_period,
3553 error, ovs_strerror(error));
3554 }
3555 return error;
3556 }
3557
3558 static void
3559 sfq_parse_qdisc_details__(struct netdev *netdev,
3560 const struct smap *details, struct sfq *sfq)
3561 {
3562 sfq->perturb = smap_get_ullong(details, "perturb", 0);
3563 sfq->quantum = smap_get_ullong(details, "quantum", 0);
3564
3565 if (!sfq->perturb) {
3566 sfq->perturb = 10;
3567 }
3568
3569 if (!sfq->quantum) {
3570 int mtu;
3571 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
3572 sfq->quantum = mtu;
3573 } else {
3574 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3575 "device without mtu");
3576 }
3577 }
3578 }
3579
3580 static int
3581 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3582 {
3583 int error;
3584 struct sfq sfq;
3585
3586 sfq_parse_qdisc_details__(netdev, details, &sfq);
3587 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3588 if (!error) {
3589 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3590 }
3591 return error;
3592 }
3593
3594 static int
3595 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3596 {
3597 const struct tc_sfq_qopt *sfq;
3598 struct nlattr *nlattr;
3599 const char * kind;
3600 int error;
3601
3602 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3603 if (error == 0) {
3604 sfq = nl_attr_get(nlattr);
3605 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
3606 return 0;
3607 }
3608
3609 return error;
3610 }
3611
3612 static void
3613 sfq_tc_destroy(struct tc *tc)
3614 {
3615 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3616 tc_destroy(tc);
3617 free(sfq);
3618 }
3619
3620 static int
3621 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3622 {
3623 const struct sfq *sfq = sfq_get__(netdev);
3624 smap_add_format(details, "quantum", "%u", sfq->quantum);
3625 smap_add_format(details, "perturb", "%u", sfq->perturb);
3626 return 0;
3627 }
3628
3629 static int
3630 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3631 {
3632 struct sfq sfq;
3633
3634 sfq_parse_qdisc_details__(netdev, details, &sfq);
3635 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3636 sfq_get__(netdev)->quantum = sfq.quantum;
3637 sfq_get__(netdev)->perturb = sfq.perturb;
3638 return 0;
3639 }
3640
static const struct tc_ops tc_ops_sfq = {
    "sfq",                        /* linux_name */
    "linux-sfq",                  /* ovs_name */
    SFQ_N_QUEUES,                 /* n_queues */
    sfq_tc_install,
    sfq_tc_load,
    sfq_tc_destroy,
    sfq_qdisc_get,
    sfq_qdisc_set,
    NULL,                         /* class_get: sfq has no classes. */
    NULL,                         /* class_set */
    NULL,                         /* class_delete */
    NULL,                         /* class_get_stats */
    NULL                          /* class_dump_stats */
};
3656 \f
3657 /* HTB traffic control class. */
3658
3659 #define HTB_N_QUEUES 0xf000
3660 #define HTB_RATE2QUANTUM 10
3661
/* In-memory representation of an HTB ("hierarchy token bucket") root
 * qdisc. */
struct htb {
    struct tc tc;              /* Generic traffic-control state; located via
                                * CONTAINER_OF from netdev->tc. */
    unsigned int max_rate;     /* In bytes/s. */
};

/* Per-queue (HTB class) configuration. */
struct htb_class {
    struct tc_queue tc_queue;  /* Generic per-queue state. */
    unsigned int min_rate;     /* In bytes/s. */
    unsigned int max_rate;     /* In bytes/s. */
    unsigned int burst;        /* In bytes. */
    unsigned int priority;     /* Lower values are higher priorities. */
};
3674
3675 static struct htb *
3676 htb_get__(const struct netdev *netdev_)
3677 {
3678 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3679 return CONTAINER_OF(netdev->tc, struct htb, tc);
3680 }
3681
3682 static void
3683 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3684 {
3685 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3686 struct htb *htb;
3687
3688 htb = xmalloc(sizeof *htb);
3689 tc_init(&htb->tc, &tc_ops_htb);
3690 htb->max_rate = max_rate;
3691
3692 netdev->tc = &htb->tc;
3693 }
3694
3695 /* Create an HTB qdisc.
3696 *
3697 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3698 static int
3699 htb_setup_qdisc__(struct netdev *netdev)
3700 {
3701 size_t opt_offset;
3702 struct tc_htb_glob opt;
3703 struct ofpbuf request;
3704 struct tcmsg *tcmsg;
3705
3706 tc_del_qdisc(netdev);
3707
3708 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3709 NLM_F_EXCL | NLM_F_CREATE, &request);
3710 if (!tcmsg) {
3711 return ENODEV;
3712 }
3713 tcmsg->tcm_handle = tc_make_handle(1, 0);
3714 tcmsg->tcm_parent = TC_H_ROOT;
3715
3716 nl_msg_put_string(&request, TCA_KIND, "htb");
3717
3718 memset(&opt, 0, sizeof opt);
3719 opt.rate2quantum = HTB_RATE2QUANTUM;
3720 opt.version = 3;
3721 opt.defcls = 1;
3722
3723 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3724 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3725 nl_msg_end_nested(&request, opt_offset);
3726
3727 return tc_transact(&request, NULL);
3728 }
3729
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * Returns 0 on success, otherwise a positive errno value: the MTU-query
 * error if the device's MTU cannot be determined, ENODEV if the tc request
 * cannot be built, or a tc_transact() error. */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    /* The rate-table and buffer computations below all need the MTU. */
    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
        opt.quantum = mtu;
    }
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    /* Lookup tables for the configured rate and ceiling. */
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
3789
/* Parses Netlink attributes in 'nl_options' for HTB parameters and stores
 * them into 'class'.  (The old comment said results went "into 'details'",
 * but this function fills in a struct htb_class, not an smap.)
 *
 * Returns 0 on success, EPROTO if the TCA_HTB_PARMS attribute is missing or
 * malformed. */
static int
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
{
    static const struct nl_policy tca_htb_policy[] = {
        [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
                            .min_len = sizeof(struct tc_htb_opt) },
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
    const struct tc_htb_opt *htb;

    if (!nl_parse_nested(nl_options, tca_htb_policy,
                         attrs, ARRAY_SIZE(tca_htb_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HTB class options");
        return EPROTO;
    }

    htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
    class->min_rate = htb->rate.rate;
    class->max_rate = htb->ceil.rate;
    /* The kernel reports the burst in timer ticks; convert back to bytes. */
    class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
    class->priority = htb->prio;
    return 0;
}
3818
3819 static int
3820 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3821 struct htb_class *options,
3822 struct netdev_queue_stats *stats)
3823 {
3824 struct nlattr *nl_options;
3825 unsigned int handle;
3826 int error;
3827
3828 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3829 if (!error && queue_id) {
3830 unsigned int major = tc_get_major(handle);
3831 unsigned int minor = tc_get_minor(handle);
3832 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3833 *queue_id = minor - 1;
3834 } else {
3835 error = EPROTO;
3836 }
3837 }
3838 if (!error && options) {
3839 error = htb_parse_tca_options__(nl_options, options);
3840 }
3841 return error;
3842 }
3843
3844 static void
3845 htb_parse_qdisc_details__(struct netdev *netdev_,
3846 const struct smap *details, struct htb_class *hc)
3847 {
3848 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3849
3850 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
3851 if (!hc->max_rate) {
3852 enum netdev_features current;
3853
3854 netdev_linux_read_features(netdev);
3855 current = !netdev->get_features_error ? netdev->current : 0;
3856 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3857 }
3858 hc->min_rate = hc->max_rate;
3859 hc->burst = 0;
3860 hc->priority = 0;
3861 }
3862
3863 static int
3864 htb_parse_class_details__(struct netdev *netdev,
3865 const struct smap *details, struct htb_class *hc)
3866 {
3867 const struct htb *htb = htb_get__(netdev);
3868 int mtu, error;
3869 unsigned long long int max_rate_bit;
3870
3871 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3872 if (error) {
3873 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3874 netdev_get_name(netdev));
3875 return error;
3876 }
3877
3878 /* HTB requires at least an mtu sized min-rate to send any traffic even
3879 * on uncongested links. */
3880 hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
3881 hc->min_rate = MAX(hc->min_rate, mtu);
3882 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3883
3884 /* max-rate */
3885 max_rate_bit = smap_get_ullong(details, "max-rate", 0);
3886 hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
3887 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3888 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3889
3890 /* burst
3891 *
3892 * According to hints in the documentation that I've read, it is important
3893 * that 'burst' be at least as big as the largest frame that might be
3894 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3895 * but having it a bit too small is a problem. Since netdev_get_mtu()
3896 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3897 * the MTU. We actually add 64, instead of 14, as a guard against
3898 * additional headers get tacked on somewhere that we're not aware of. */
3899 hc->burst = smap_get_ullong(details, "burst", 0) / 8;
3900 hc->burst = MAX(hc->burst, mtu + 64);
3901
3902 /* priority */
3903 hc->priority = smap_get_ullong(details, "priority", 0);
3904
3905 return 0;
3906 }
3907
3908 static int
3909 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3910 unsigned int parent, struct htb_class *options,
3911 struct netdev_queue_stats *stats)
3912 {
3913 struct ofpbuf *reply;
3914 int error;
3915
3916 error = tc_query_class(netdev, handle, parent, &reply);
3917 if (!error) {
3918 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3919 ofpbuf_delete(reply);
3920 }
3921 return error;
3922 }
3923
3924 static int
3925 htb_tc_install(struct netdev *netdev, const struct smap *details)
3926 {
3927 int error;
3928
3929 error = htb_setup_qdisc__(netdev);
3930 if (!error) {
3931 struct htb_class hc;
3932
3933 htb_parse_qdisc_details__(netdev, details, &hc);
3934 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3935 tc_make_handle(1, 0), &hc);
3936 if (!error) {
3937 htb_install__(netdev, hc.max_rate);
3938 }
3939 }
3940 return error;
3941 }
3942
/* Returns the htb_class embedding 'queue'. */
static struct htb_class *
htb_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct htb_class, tc_queue);
}
3948
3949 static void
3950 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3951 const struct htb_class *hc)
3952 {
3953 struct htb *htb = htb_get__(netdev);
3954 size_t hash = hash_int(queue_id, 0);
3955 struct tc_queue *queue;
3956 struct htb_class *hcp;
3957
3958 queue = tc_find_queue__(netdev, queue_id, hash);
3959 if (queue) {
3960 hcp = htb_class_cast__(queue);
3961 } else {
3962 hcp = xmalloc(sizeof *hcp);
3963 queue = &hcp->tc_queue;
3964 queue->queue_id = queue_id;
3965 queue->created = time_msec();
3966 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3967 }
3968
3969 hcp->min_rate = hc->min_rate;
3970 hcp->max_rate = hc->max_rate;
3971 hcp->burst = hc->burst;
3972 hcp->priority = hc->priority;
3973 }
3974
3975 static int
3976 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3977 {
3978 struct ofpbuf msg;
3979 struct queue_dump_state state;
3980 struct htb_class hc;
3981
3982 /* Get qdisc options. */
3983 hc.max_rate = 0;
3984 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3985 htb_install__(netdev, hc.max_rate);
3986
3987 /* Get queues. */
3988 if (!start_queue_dump(netdev, &state)) {
3989 return ENODEV;
3990 }
3991 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3992 unsigned int queue_id;
3993
3994 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3995 htb_update_queue__(netdev, queue_id, &hc);
3996 }
3997 }
3998 finish_queue_dump(&state);
3999
4000 return 0;
4001 }
4002
4003 static void
4004 htb_tc_destroy(struct tc *tc)
4005 {
4006 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4007 struct htb_class *hc;
4008
4009 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
4010 free(hc);
4011 }
4012 tc_destroy(tc);
4013 free(htb);
4014 }
4015
4016 static int
4017 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
4018 {
4019 const struct htb *htb = htb_get__(netdev);
4020 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
4021 return 0;
4022 }
4023
4024 static int
4025 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
4026 {
4027 struct htb_class hc;
4028 int error;
4029
4030 htb_parse_qdisc_details__(netdev, details, &hc);
4031 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4032 tc_make_handle(1, 0), &hc);
4033 if (!error) {
4034 htb_get__(netdev)->max_rate = hc.max_rate;
4035 }
4036 return error;
4037 }
4038
4039 static int
4040 htb_class_get(const struct netdev *netdev OVS_UNUSED,
4041 const struct tc_queue *queue, struct smap *details)
4042 {
4043 const struct htb_class *hc = htb_class_cast__(queue);
4044
4045 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4046 if (hc->min_rate != hc->max_rate) {
4047 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4048 }
4049 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
4050 if (hc->priority) {
4051 smap_add_format(details, "priority", "%u", hc->priority);
4052 }
4053 return 0;
4054 }
4055
4056 static int
4057 htb_class_set(struct netdev *netdev, unsigned int queue_id,
4058 const struct smap *details)
4059 {
4060 struct htb_class hc;
4061 int error;
4062
4063 error = htb_parse_class_details__(netdev, details, &hc);
4064 if (error) {
4065 return error;
4066 }
4067
4068 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4069 tc_make_handle(1, 0xfffe), &hc);
4070 if (error) {
4071 return error;
4072 }
4073
4074 htb_update_queue__(netdev, queue_id, &hc);
4075 return 0;
4076 }
4077
4078 static int
4079 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
4080 {
4081 struct htb_class *hc = htb_class_cast__(queue);
4082 struct htb *htb = htb_get__(netdev);
4083 int error;
4084
4085 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4086 if (!error) {
4087 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
4088 free(hc);
4089 }
4090 return error;
4091 }
4092
4093 static int
4094 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4095 struct netdev_queue_stats *stats)
4096 {
4097 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4098 tc_make_handle(1, 0xfffe), NULL, stats);
4099 }
4100
4101 static int
4102 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4103 const struct ofpbuf *nlmsg,
4104 netdev_dump_queue_stats_cb *cb, void *aux)
4105 {
4106 struct netdev_queue_stats stats;
4107 unsigned int handle, major, minor;
4108 int error;
4109
4110 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4111 if (error) {
4112 return error;
4113 }
4114
4115 major = tc_get_major(handle);
4116 minor = tc_get_minor(handle);
4117 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4118 (*cb)(minor - 1, &stats, aux);
4119 }
4120 return 0;
4121 }
4122
static const struct tc_ops tc_ops_htb = {
    "htb",                        /* linux_name */
    "linux-htb",                  /* ovs_name */
    HTB_N_QUEUES,                 /* n_queues */
    htb_tc_install,
    htb_tc_load,
    htb_tc_destroy,
    htb_qdisc_get,
    htb_qdisc_set,
    htb_class_get,
    htb_class_set,
    htb_class_delete,
    htb_class_get_stats,
    htb_class_dump_stats
};
4138 \f
4139 /* "linux-hfsc" traffic control class. */
4140
4141 #define HFSC_N_QUEUES 0xf000
4142
/* In-memory representation of an HFSC root qdisc. */
struct hfsc {
    struct tc tc;          /* Generic traffic-control state; located via
                            * CONTAINER_OF from netdev->tc. */
    uint32_t max_rate;     /* NOTE(review): presumably bytes/s, mirroring
                            * the HTB structs above -- confirm. */
};

/* Per-queue (HFSC class) configuration. */
struct hfsc_class {
    struct tc_queue tc_queue;  /* Generic per-queue state. */
    uint32_t min_rate;
    uint32_t max_rate;
};
4153
4154 static struct hfsc *
4155 hfsc_get__(const struct netdev *netdev_)
4156 {
4157 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4158 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
4159 }
4160
/* Returns the hfsc_class embedding 'queue'. */
static struct hfsc_class *
hfsc_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
}
4166
4167 static void
4168 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4169 {
4170 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4171 struct hfsc *hfsc;
4172
4173 hfsc = xmalloc(sizeof *hfsc);
4174 tc_init(&hfsc->tc, &tc_ops_hfsc);
4175 hfsc->max_rate = max_rate;
4176 netdev->tc = &hfsc->tc;
4177 }
4178
4179 static void
4180 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4181 const struct hfsc_class *hc)
4182 {
4183 size_t hash;
4184 struct hfsc *hfsc;
4185 struct hfsc_class *hcp;
4186 struct tc_queue *queue;
4187
4188 hfsc = hfsc_get__(netdev);
4189 hash = hash_int(queue_id, 0);
4190
4191 queue = tc_find_queue__(netdev, queue_id, hash);
4192 if (queue) {
4193 hcp = hfsc_class_cast__(queue);
4194 } else {
4195 hcp = xmalloc(sizeof *hcp);
4196 queue = &hcp->tc_queue;
4197 queue->queue_id = queue_id;
4198 queue->created = time_msec();
4199 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4200 }
4201
4202 hcp->min_rate = hc->min_rate;
4203 hcp->max_rate = hc->max_rate;
4204 }
4205
/* Parses the TCA_OPTIONS Netlink attribute 'nl_options' of an HFSC class
 * into 'class'.
 *
 * HFSC classes are described by three service curves (real-time, link-share,
 * upper-limit), each given as (m1, d, m2).  OVS only supports the simple
 * linear case: m1 == d == 0, real-time == link-share, and link-share <=
 * upper-limit.  Anything else is rejected with EPROTO. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    /* Only linear curves (no initial slope m1 over duration d) are
     * supported. */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    /* The real-time curve must match the link-share curve. */
    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    /* The guaranteed rate cannot exceed the upper limit. */
    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
4264
/* Parses an RTM_NEWTCLASS-style message 'tcmsg' describing an HFSC class.
 * Any of 'queue_id', 'options', and 'stats' may be null to skip that output.
 * The queue ID is derived from the class handle's minor number (queue N maps
 * to class 1:N+1).
 *
 * Returns 0 if successful, otherwise a positive errno value (EPROTO for a
 * handle outside the range OVS manages). */
static int
hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                   struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    int error;
    unsigned int handle;
    struct nlattr *nl_options;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (error) {
        return error;
    }

    if (queue_id) {
        unsigned int major, minor;

        major = tc_get_major(handle);
        minor = tc_get_minor(handle);
        /* Only classes 1:1 through 1:HFSC_N_QUEUES belong to OVS. */
        if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
            *queue_id = minor - 1;
        } else {
            return EPROTO;
        }
    }

    if (options) {
        error = hfsc_parse_tca_options__(nl_options, options);
    }

    return error;
}
4297
4298 static int
4299 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4300 unsigned int parent, struct hfsc_class *options,
4301 struct netdev_queue_stats *stats)
4302 {
4303 int error;
4304 struct ofpbuf *reply;
4305
4306 error = tc_query_class(netdev, handle, parent, &reply);
4307 if (error) {
4308 return error;
4309 }
4310
4311 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4312 ofpbuf_delete(reply);
4313 return error;
4314 }
4315
4316 static void
4317 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4318 struct hfsc_class *class)
4319 {
4320 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4321
4322 uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
4323 if (!max_rate) {
4324 enum netdev_features current;
4325
4326 netdev_linux_read_features(netdev);
4327 current = !netdev->get_features_error ? netdev->current : 0;
4328 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4329 }
4330
4331 class->min_rate = max_rate;
4332 class->max_rate = max_rate;
4333 }
4334
4335 static int
4336 hfsc_parse_class_details__(struct netdev *netdev,
4337 const struct smap *details,
4338 struct hfsc_class * class)
4339 {
4340 const struct hfsc *hfsc;
4341 uint32_t min_rate, max_rate;
4342
4343 hfsc = hfsc_get__(netdev);
4344
4345 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4346 min_rate = MAX(min_rate, 1);
4347 min_rate = MIN(min_rate, hfsc->max_rate);
4348
4349 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
4350 max_rate = MAX(max_rate, min_rate);
4351 max_rate = MIN(max_rate, hfsc->max_rate);
4352
4353 class->min_rate = min_rate;
4354 class->max_rate = max_rate;
4355
4356 return 0;
4357 }
4358
4359 /* Create an HFSC qdisc.
4360 *
4361 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4362 static int
4363 hfsc_setup_qdisc__(struct netdev * netdev)
4364 {
4365 struct tcmsg *tcmsg;
4366 struct ofpbuf request;
4367 struct tc_hfsc_qopt opt;
4368
4369 tc_del_qdisc(netdev);
4370
4371 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4372 NLM_F_EXCL | NLM_F_CREATE, &request);
4373
4374 if (!tcmsg) {
4375 return ENODEV;
4376 }
4377
4378 tcmsg->tcm_handle = tc_make_handle(1, 0);
4379 tcmsg->tcm_parent = TC_H_ROOT;
4380
4381 memset(&opt, 0, sizeof opt);
4382 opt.defcls = 1;
4383
4384 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4385 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4386
4387 return tc_transact(&request, NULL);
4388 }
4389
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>" */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    /* NLM_F_CREATE without NLM_F_EXCL: replaces the class if it exists. */
    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Linear curves only: m1 = d = 0, slope in m2 (bytes/s).  This mirrors
     * the restrictions enforced by hfsc_parse_tca_options__(). */
    min.m1 = 0;
    min.d = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d = 0;
    max.m2 = class->max_rate;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    /* RSC and FSC both carry the min-rate curve; USC carries max-rate. */
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
4441
4442 static int
4443 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4444 {
4445 int error;
4446 struct hfsc_class class;
4447
4448 error = hfsc_setup_qdisc__(netdev);
4449
4450 if (error) {
4451 return error;
4452 }
4453
4454 hfsc_parse_qdisc_details__(netdev, details, &class);
4455 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4456 tc_make_handle(1, 0), &class);
4457
4458 if (error) {
4459 return error;
4460 }
4461
4462 hfsc_install__(netdev, class.max_rate);
4463 return 0;
4464 }
4465
/* "tc_load" callback: reconstructs our in-memory view of an HFSC qdisc that
 * already exists in the kernel, by querying the default class for the
 * qdisc-wide max-rate and then dumping all classes to discover the queues. */
static int
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct hfsc_class hc;

    /* Query failure is tolerated: max_rate then stays 0. */
    hc.max_rate = 0;
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    hfsc_install__(netdev, hc.max_rate);

    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }

    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Classes that fail to parse (e.g. not OVS-managed) are skipped. */
        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            hfsc_update_queue__(netdev, queue_id, &hc);
        }
    }

    finish_queue_dump(&state);
    return 0;
}
4492
/* "tc_destroy" callback: frees every queued class and then the hfsc object
 * itself.  Uses the _SAFE iterator because each node is removed while
 * iterating. */
static void
hfsc_tc_destroy(struct tc *tc)
{
    struct hfsc *hfsc;
    struct hfsc_class *hc, *next;

    hfsc = CONTAINER_OF(tc, struct hfsc, tc);

    HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
        hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
        free(hc);
    }

    tc_destroy(tc);
    free(hfsc);
}
4509
4510 static int
4511 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4512 {
4513 const struct hfsc *hfsc;
4514 hfsc = hfsc_get__(netdev);
4515 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
4516 return 0;
4517 }
4518
4519 static int
4520 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4521 {
4522 int error;
4523 struct hfsc_class class;
4524
4525 hfsc_parse_qdisc_details__(netdev, details, &class);
4526 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4527 tc_make_handle(1, 0), &class);
4528
4529 if (!error) {
4530 hfsc_get__(netdev)->max_rate = class.max_rate;
4531 }
4532
4533 return error;
4534 }
4535
4536 static int
4537 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4538 const struct tc_queue *queue, struct smap *details)
4539 {
4540 const struct hfsc_class *hc;
4541
4542 hc = hfsc_class_cast__(queue);
4543 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4544 if (hc->min_rate != hc->max_rate) {
4545 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4546 }
4547 return 0;
4548 }
4549
4550 static int
4551 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4552 const struct smap *details)
4553 {
4554 int error;
4555 struct hfsc_class class;
4556
4557 error = hfsc_parse_class_details__(netdev, details, &class);
4558 if (error) {
4559 return error;
4560 }
4561
4562 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4563 tc_make_handle(1, 0xfffe), &class);
4564 if (error) {
4565 return error;
4566 }
4567
4568 hfsc_update_queue__(netdev, queue_id, &class);
4569 return 0;
4570 }
4571
4572 static int
4573 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4574 {
4575 int error;
4576 struct hfsc *hfsc;
4577 struct hfsc_class *hc;
4578
4579 hc = hfsc_class_cast__(queue);
4580 hfsc = hfsc_get__(netdev);
4581
4582 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4583 if (!error) {
4584 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4585 free(hc);
4586 }
4587 return error;
4588 }
4589
4590 static int
4591 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4592 struct netdev_queue_stats *stats)
4593 {
4594 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4595 tc_make_handle(1, 0xfffe), NULL, stats);
4596 }
4597
4598 static int
4599 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4600 const struct ofpbuf *nlmsg,
4601 netdev_dump_queue_stats_cb *cb, void *aux)
4602 {
4603 struct netdev_queue_stats stats;
4604 unsigned int handle, major, minor;
4605 int error;
4606
4607 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4608 if (error) {
4609 return error;
4610 }
4611
4612 major = tc_get_major(handle);
4613 minor = tc_get_minor(handle);
4614 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4615 (*cb)(minor - 1, &stats, aux);
4616 }
4617 return 0;
4618 }
4619
/* Callback table binding the "linux-hfsc" OVS QoS type to the kernel "hfsc"
 * qdisc. */
static const struct tc_ops tc_ops_hfsc = {
    "hfsc",                     /* linux_name */
    "linux-hfsc",               /* ovs_name */
    HFSC_N_QUEUES,              /* n_queues */
    hfsc_tc_install,            /* tc_install */
    hfsc_tc_load,               /* tc_load */
    hfsc_tc_destroy,            /* tc_destroy */
    hfsc_qdisc_get,             /* qdisc_get */
    hfsc_qdisc_set,             /* qdisc_set */
    hfsc_class_get,             /* class_get */
    hfsc_class_set,             /* class_set */
    hfsc_class_delete,          /* class_delete */
    hfsc_class_get_stats,       /* class_get_stats */
    hfsc_class_dump_stats       /* class_dump_stats */
};
4635 \f
4636 /* "linux-noop" traffic control class. */
4637
4638 static void
4639 noop_install__(struct netdev *netdev_)
4640 {
4641 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4642 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4643
4644 netdev->tc = CONST_CAST(struct tc *, &tc);
4645 }
4646
4647 static int
4648 noop_tc_install(struct netdev *netdev,
4649 const struct smap *details OVS_UNUSED)
4650 {
4651 noop_install__(netdev);
4652 return 0;
4653 }
4654
4655 static int
4656 noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4657 {
4658 noop_install__(netdev);
4659 return 0;
4660 }
4661
/* Callback table for "linux-noop": OVS leaves the device's qdisc entirely
 * alone, so all optional callbacks are absent. */
static const struct tc_ops tc_ops_noop = {
    NULL,                       /* linux_name */
    "linux-noop",               /* ovs_name */
    0,                          /* n_queues */
    noop_tc_install,
    noop_tc_load,
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4677 \f
4678 /* "linux-default" traffic control class.
4679 *
4680 * This class represents the default, unnamed Linux qdisc. It corresponds to
4681 * the "" (empty string) QoS type in the OVS database. */
4682
4683 static void
4684 default_install__(struct netdev *netdev_)
4685 {
4686 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4687 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4688
4689 /* Nothing but a tc class implementation is allowed to write to a tc. This
4690 * class never does that, so we can legitimately use a const tc object. */
4691 netdev->tc = CONST_CAST(struct tc *, &tc);
4692 }
4693
4694 static int
4695 default_tc_install(struct netdev *netdev,
4696 const struct smap *details OVS_UNUSED)
4697 {
4698 default_install__(netdev);
4699 return 0;
4700 }
4701
4702 static int
4703 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4704 {
4705 default_install__(netdev);
4706 return 0;
4707 }
4708
/* Callback table for the default, unnamed Linux qdisc ("" QoS type in the
 * database). */
static const struct tc_ops tc_ops_default = {
    NULL,                       /* linux_name */
    "",                         /* ovs_name */
    0,                          /* n_queues */
    default_tc_install,
    default_tc_load,
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4724 \f
4725 /* "linux-other" traffic control class.
4726 *
4727 * */
4728
4729 static int
4730 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4731 {
4732 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4733 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4734
4735 /* Nothing but a tc class implementation is allowed to write to a tc. This
4736 * class never does that, so we can legitimately use a const tc object. */
4737 netdev->tc = CONST_CAST(struct tc *, &tc);
4738 return 0;
4739 }
4740
/* Callback table for "linux-other": an unrecognized qdisc that OVS observes
 * but does not manage, so no install or class manipulation is offered. */
static const struct tc_ops tc_ops_other = {
    NULL,                       /* linux_name */
    "linux-other",              /* ovs_name */
    0,                          /* n_queues */
    NULL,                       /* tc_install */
    other_tc_load,
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4756 \f
4757 /* Traffic control. */
4758
4759 /* Number of kernel "tc" ticks per second. */
4760 static double ticks_per_s;
4761
4762 /* Number of kernel "jiffies" per second. This is used for the purpose of
4763 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4764 * one jiffy's worth of data.
4765 *
4766 * There are two possibilities here:
4767 *
4768 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4769 * approximate range of 100 to 1024. That means that we really need to
4770 * make sure that the qdisc can buffer that much data.
4771 *
4772 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4773 * has finely granular timers and there's no need to fudge additional room
4774 * for buffers. (There's no extra effort needed to implement that: the
4775 * large 'buffer_hz' is used as a divisor, so practically any number will
4776 * come out as 0 in the division. Small integer results in the case of
4777 * really high dividends won't have any real effect anyhow.)
4778 */
4779 static unsigned int buffer_hz;
4780
4781 static struct tcmsg *
4782 netdev_linux_tc_make_request(const struct netdev *netdev, int type,
4783 unsigned int flags, struct ofpbuf *request)
4784 {
4785 int ifindex;
4786 int error;
4787
4788 error = get_ifindex(netdev, &ifindex);
4789 if (error) {
4790 return NULL;
4791 }
4792
4793 return tc_make_request(ifindex, type, flags, request);
4794 }
4795
/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
 * of 'kbits_burst'.
 *
 * This function is equivalent to running:
 *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
 *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
 *              mtu 65535 drop
 *
 * The configuration and stats may be seen with the following command:
 *     /sbin/tc -s filter show dev <devname> parent ffff:
 *
 * Returns 0 if successful, otherwise a positive errno value.
 */
static int
tc_add_policer(struct netdev *netdev,
               uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct tc_police tc_police;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    size_t basic_offset;
    size_t police_offset;
    int error;
    int mtu = 65535;

    memset(&tc_police, 0, sizeof tc_police);
    tc_police.action = TC_POLICE_SHOT;      /* Drop packets over the rate. */
    tc_police.mtu = mtu;
    /* Rate is configured in kbit/s; the kernel wants bytes/s. */
    tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);

    /* The following appears wrong in one way: In networking a kilobit is
     * usually 1000 bits but this uses 1024 bits.
     *
     * However if you "fix" those problems then "tc filter show ..." shows
     * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
     * 1,000,000 bits, whereas this actually ends up doing the right thing from
     * tc's point of view.  Whatever. */
    tc_police.burst = tc_bytes_to_ticks(
        tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* Attach to the ffff: ingress qdisc, priority 49, all protocols. */
    tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
    tcmsg->tcm_info = tc_make_handle(49,
                                     (OVS_FORCE uint16_t) htons(ETH_P_ALL));

    nl_msg_put_string(&request, TCA_KIND, "basic");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
    nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
    tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
    nl_msg_end_nested(&request, police_offset);
    nl_msg_end_nested(&request, basic_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        return error;
    }

    return 0;
}
4860
/* Reads /proc/net/psched once per process and initializes the global
 * 'ticks_per_s' and 'buffer_hz' from it.  On any failure the conservative
 * defaults ticks_per_s = 1.0, buffer_hz = 100 remain in effect. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *     (Before that, there are hints that it was 1000000000.)
     *
     *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *     above.
     *
     *                  /proc/net/psched
     *      -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d ticks_per_s     buffer_hz
     *     ------- --------- ---------- ------------- ----------- -------------
     * [1] 819,200 1,000,000  1,000,000           100     819,200           100
     * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [4]   1,000     1,024  1,000,000           100     976,562           100
     * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
     * [6]   1,000        64  1,000,000           249  15,625,000           249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    if (!ovsthread_once_start(&once)) {
        return;                 /* Another thread already did (or is doing)
                                 * the work. */
    }

    /* Defaults used if anything below fails. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    if (!a || !c) {
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
4943
4944 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4945 * rate of 'rate' bytes per second. */
4946 static unsigned int
4947 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4948 {
4949 read_psched();
4950 return (rate * ticks) / ticks_per_s;
4951 }
4952
4953 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4954 * rate of 'rate' bytes per second. */
4955 static unsigned int
4956 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4957 {
4958 read_psched();
4959 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4960 }
4961
4962 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4963 * a transmission rate of 'rate' bytes per second. */
4964 static unsigned int
4965 tc_buffer_per_jiffy(unsigned int rate)
4966 {
4967 read_psched();
4968 return rate / buffer_hz;
4969 }
4970
4971 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4972 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4973 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4974 * stores NULL into it if it is absent.
4975 *
4976 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4977 * 'msg'.
4978 *
4979 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
               struct nlattr **options)
{
    /* TCA_KIND is required; TCA_OPTIONS may legitimately be absent. */
    static const struct nl_policy tca_policy[] = {
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    /* Attributes follow the nlmsghdr and the tcmsg header. */
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");
        goto error;
    }

    if (kind) {
        *kind = nl_attr_get_string(ta[TCA_KIND]);
    }

    if (options) {
        *options = ta[TCA_OPTIONS];     /* Possibly NULL (optional attr). */
    }

    return 0;

error:
    /* On failure, null out every requested output so callers cannot read
     * stale pointers. */
    if (kind) {
        *kind = NULL;
    }
    if (options) {
        *options = NULL;
    }
    return EPROTO;
}
5015
5016 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5017 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5018 * into '*options', and its queue statistics into '*stats'. Any of the output
5019 * arguments may be null.
5020 *
5021 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    /* Both the options and the statistics blocks are required here. */
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    /* Attributes follow the nlmsghdr and the tcmsg header. */
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        /* The class handle lives in the tcmsg header, not an attribute. */
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    /* On failure, clear every requested output so callers see no stale
     * data. */
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
5090
5091 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5092 * on 'netdev'. */
5093 static int
5094 tc_query_class(const struct netdev *netdev,
5095 unsigned int handle, unsigned int parent,
5096 struct ofpbuf **replyp)
5097 {
5098 struct ofpbuf request;
5099 struct tcmsg *tcmsg;
5100 int error;
5101
5102 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
5103 &request);
5104 if (!tcmsg) {
5105 return ENODEV;
5106 }
5107 tcmsg->tcm_handle = handle;
5108 tcmsg->tcm_parent = parent;
5109
5110 error = tc_transact(&request, replyp);
5111 if (error) {
5112 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5113 netdev_get_name(netdev),
5114 tc_get_major(handle), tc_get_minor(handle),
5115 tc_get_major(parent), tc_get_minor(parent),
5116 ovs_strerror(error));
5117 }
5118 return error;
5119 }
5120
5121 /* Equivalent to "tc class del dev <name> handle <handle>". */
5122 static int
5123 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5124 {
5125 struct ofpbuf request;
5126 struct tcmsg *tcmsg;
5127 int error;
5128
5129 tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5130 if (!tcmsg) {
5131 return ENODEV;
5132 }
5133 tcmsg->tcm_handle = handle;
5134 tcmsg->tcm_parent = 0;
5135
5136 error = tc_transact(&request, NULL);
5137 if (error) {
5138 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5139 netdev_get_name(netdev),
5140 tc_get_major(handle), tc_get_minor(handle),
5141 ovs_strerror(error));
5142 }
5143 return error;
5144 }
5145
/* Equivalent to "tc qdisc del dev <name> root". */
static int
tc_del_qdisc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    error = tc_transact(&request, NULL);
    if (error == EINVAL) {
        /* EINVAL probably means that the default qdisc was in use, in which
         * case we've accomplished our purpose. */
        error = 0;
    }
    if (!error && netdev->tc) {
        /* The kernel qdisc is gone; tear down our in-memory mirror too. */
        if (netdev->tc->ops->tc_destroy) {
            netdev->tc->ops->tc_destroy(netdev->tc);
        }
        netdev->tc = NULL;
    }
    return error;
}
5176
/* Returns true if it is safe to issue an unrestricted RTM_GETQDISC on this
 * kernel (Linux 2.6.35 or later; see the comment in tc_query_qdisc()).
 * The uname-based check runs once; the result is cached for the process. */
static bool
getqdisc_is_safe(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static bool safe = false;

    if (ovsthread_once_start(&once)) {
        struct utsname utsname;
        int major, minor;

        /* Any failure to determine the version leaves 'safe' false, which
         * is the conservative choice. */
        if (uname(&utsname) == -1) {
            VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
        } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
            VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
        } else if (major < 2 || (major == 2 && minor < 35)) {
            VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
                      utsname.release);
        } else {
            safe = true;
        }
        ovsthread_once_done(&once);
    }
    return safe;
}
5201
5202 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5203 * kernel to determine what they are. Returns 0 if successful, otherwise a
5204 * positive errno value. */
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are, then instantiates the matching tc class
 * implementation on the netdev.  Returns 0 if successful, otherwise a
 * positive errno value. */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    if (netdev->tc) {
        return 0;               /* Already known; nothing to do. */
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
5281
5282 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5283 approximate the time to transmit packets of various lengths. For an MTU of
5284 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5285 represents two possible packet lengths; for a MTU of 513 through 1024, four
5286 possible lengths; and so on.
5287
5288 Returns, for the specified 'mtu', the number of bits that packet lengths
5289 need to be shifted right to fit within such a 256-entry table. */
5290 static int
5291 tc_calc_cell_log(unsigned int mtu)
5292 {
5293 int cell_log;
5294
5295 if (!mtu) {
5296 mtu = ETH_PAYLOAD_MAX;
5297 }
5298 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5299
5300 for (cell_log = 0; mtu >= 256; cell_log++) {
5301 mtu >>= 1;
5302 }
5303
5304 return cell_log;
5305 }
5306
5307 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5308 * of 'mtu'. */
5309 static void
5310 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5311 {
5312 memset(rate, 0, sizeof *rate);
5313 rate->cell_log = tc_calc_cell_log(mtu);
5314 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5315 /* rate->cell_align = 0; */ /* distro headers. */
5316 rate->mpu = ETH_TOTAL_MIN;
5317 rate->rate = Bps;
5318 }
5319
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    const unsigned int n_entries = TC_RTAB_SIZE / sizeof(uint32_t);
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < n_entries; i++) {
        /* Entry i covers packet lengths up to (i + 1) << cell_log, but never
         * less than the minimum packet unit. */
        unsigned int size = (i + 1) << rate->cell_log;

        rtab[i] = tc_bytes_to_ticks(rate->rate, MAX(size, rate->mpu));
    }
}
5339
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Never let the burst fall below one jiffy's worth of traffic plus one
     * MTU, or the qdisc cannot keep up with its own configured rate. */
    uint64_t burst = burst_bytes;
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;

    if (burst < min_burst) {
        burst = min_burst;
    }
    return tc_bytes_to_ticks(Bps, burst);
}
5350 \f
5351 /* Linux-only functions declared in netdev-linux.h */
5352
5353 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5354 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5355 int
5356 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5357 const char *flag_name, bool enable)
5358 {
5359 const char *netdev_name = netdev_get_name(netdev);
5360 struct ethtool_value evalue;
5361 uint32_t new_flags;
5362 int error;
5363
5364 COVERAGE_INC(netdev_get_ethtool);
5365 memset(&evalue, 0, sizeof evalue);
5366 error = netdev_linux_do_ethtool(netdev_name,
5367 (struct ethtool_cmd *)&evalue,
5368 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5369 if (error) {
5370 return error;
5371 }
5372
5373 COVERAGE_INC(netdev_set_ethtool);
5374 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5375 if (new_flags == evalue.data) {
5376 return 0;
5377 }
5378 evalue.data = new_flags;
5379 error = netdev_linux_do_ethtool(netdev_name,
5380 (struct ethtool_cmd *)&evalue,
5381 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5382 if (error) {
5383 return error;
5384 }
5385
5386 COVERAGE_INC(netdev_get_ethtool);
5387 memset(&evalue, 0, sizeof evalue);
5388 error = netdev_linux_do_ethtool(netdev_name,
5389 (struct ethtool_cmd *)&evalue,
5390 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5391 if (error) {
5392 return error;
5393 }
5394
5395 if (new_flags != evalue.data) {
5396 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5397 "device %s failed", enable ? "enable" : "disable",
5398 flag_name, netdev_name);
5399 return EOPNOTSUPP;
5400 }
5401
5402 return 0;
5403 }
5404 \f
5405 /* Utility functions. */
5406
5407 /* Copies 'src' into 'dst', performing format conversion in the process. */
5408 static void
5409 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5410 const struct rtnl_link_stats *src)
5411 {
5412 dst->rx_packets = src->rx_packets;
5413 dst->tx_packets = src->tx_packets;
5414 dst->rx_bytes = src->rx_bytes;
5415 dst->tx_bytes = src->tx_bytes;
5416 dst->rx_errors = src->rx_errors;
5417 dst->tx_errors = src->tx_errors;
5418 dst->rx_dropped = src->rx_dropped;
5419 dst->tx_dropped = src->tx_dropped;
5420 dst->multicast = src->multicast;
5421 dst->collisions = src->collisions;
5422 dst->rx_length_errors = src->rx_length_errors;
5423 dst->rx_over_errors = src->rx_over_errors;
5424 dst->rx_crc_errors = src->rx_crc_errors;
5425 dst->rx_frame_errors = src->rx_frame_errors;
5426 dst->rx_fifo_errors = src->rx_fifo_errors;
5427 dst->rx_missed_errors = src->rx_missed_errors;
5428 dst->tx_aborted_errors = src->tx_aborted_errors;
5429 dst->tx_carrier_errors = src->tx_carrier_errors;
5430 dst->tx_fifo_errors = src->tx_fifo_errors;
5431 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5432 dst->tx_window_errors = src->tx_window_errors;
5433 }
5434
5435 /* Copies 'src' into 'dst', performing format conversion in the process. */
5436 static void
5437 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5438 const struct rtnl_link_stats64 *src)
5439 {
5440 dst->rx_packets = src->rx_packets;
5441 dst->tx_packets = src->tx_packets;
5442 dst->rx_bytes = src->rx_bytes;
5443 dst->tx_bytes = src->tx_bytes;
5444 dst->rx_errors = src->rx_errors;
5445 dst->tx_errors = src->tx_errors;
5446 dst->rx_dropped = src->rx_dropped;
5447 dst->tx_dropped = src->tx_dropped;
5448 dst->multicast = src->multicast;
5449 dst->collisions = src->collisions;
5450 dst->rx_length_errors = src->rx_length_errors;
5451 dst->rx_over_errors = src->rx_over_errors;
5452 dst->rx_crc_errors = src->rx_crc_errors;
5453 dst->rx_frame_errors = src->rx_frame_errors;
5454 dst->rx_fifo_errors = src->rx_fifo_errors;
5455 dst->rx_missed_errors = src->rx_missed_errors;
5456 dst->tx_aborted_errors = src->tx_aborted_errors;
5457 dst->tx_carrier_errors = src->tx_carrier_errors;
5458 dst->tx_fifo_errors = src->tx_fifo_errors;
5459 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5460 dst->tx_window_errors = src->tx_window_errors;
5461 }
5462
5463 static int
5464 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5465 {
5466 struct ofpbuf request;
5467 struct ofpbuf *reply;
5468 int error;
5469
5470 /* Filtering all counters by default */
5471 memset(stats, 0xFF, sizeof(struct netdev_stats));
5472
5473 ofpbuf_init(&request, 0);
5474 nl_msg_put_nlmsghdr(&request,
5475 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5476 RTM_GETLINK, NLM_F_REQUEST);
5477 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5478 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5479 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5480 ofpbuf_uninit(&request);
5481 if (error) {
5482 return error;
5483 }
5484
5485 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5486 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5487 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5488 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5489 error = 0;
5490 } else {
5491 a = nl_attr_find(reply, 0, IFLA_STATS);
5492 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5493 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5494 error = 0;
5495 } else {
5496 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5497 error = EPROTO;
5498 }
5499 }
5500 } else {
5501 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5502 error = EPROTO;
5503 }
5504
5505
5506 ofpbuf_delete(reply);
5507 return error;
5508 }
5509
5510 static int
5511 get_flags(const struct netdev *dev, unsigned int *flags)
5512 {
5513 struct ifreq ifr;
5514 int error;
5515
5516 *flags = 0;
5517 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5518 if (!error) {
5519 *flags = ifr.ifr_flags;
5520 }
5521 return error;
5522 }
5523
5524 static int
5525 set_flags(const char *name, unsigned int flags)
5526 {
5527 struct ifreq ifr;
5528
5529 ifr.ifr_flags = flags;
5530 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5531 }
5532
5533 int
5534 linux_get_ifindex(const char *netdev_name)
5535 {
5536 struct ifreq ifr;
5537 int error;
5538
5539 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5540 COVERAGE_INC(netdev_get_ifindex);
5541
5542 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5543 if (error) {
5544 /* ENODEV probably means that a vif disappeared asynchronously and
5545 * hasn't been removed from the database yet, so reduce the log level
5546 * to INFO for that case. */
5547 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
5548 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5549 netdev_name, ovs_strerror(error));
5550 return -error;
5551 }
5552 return ifr.ifr_ifindex;
5553 }
5554
5555 static int
5556 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5557 {
5558 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5559
5560 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5561 netdev_linux_update_via_netlink(netdev);
5562 }
5563
5564 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5565 /* Fall back to ioctl if netlink fails */
5566 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
5567
5568 if (ifindex < 0) {
5569 netdev->get_ifindex_error = -ifindex;
5570 netdev->ifindex = 0;
5571 } else {
5572 netdev->get_ifindex_error = 0;
5573 netdev->ifindex = ifindex;
5574 }
5575 netdev->cache_valid |= VALID_IFINDEX;
5576 }
5577
5578 *ifindexp = netdev->ifindex;
5579 return netdev->get_ifindex_error;
5580 }
5581
5582 static int
5583 netdev_linux_update_via_netlink(struct netdev_linux *netdev)
5584 {
5585 struct ofpbuf request;
5586 struct ofpbuf *reply;
5587 struct rtnetlink_change chg;
5588 struct rtnetlink_change *change = &chg;
5589 int error;
5590
5591 ofpbuf_init(&request, 0);
5592 nl_msg_put_nlmsghdr(&request,
5593 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5594 RTM_GETLINK, NLM_F_REQUEST);
5595 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5596
5597 /* The correct identifiers for a Linux device are netnsid and ifindex,
5598 * but ifindex changes as the port is moved to another network namespace
5599 * and the interface name statically stored in ovsdb. */
5600 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
5601 if (netdev_linux_netnsid_is_remote(netdev)) {
5602 nl_msg_push_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
5603 }
5604 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5605 ofpbuf_uninit(&request);
5606 if (error) {
5607 ofpbuf_delete(reply);
5608 return error;
5609 }
5610
5611 if (rtnetlink_parse(reply, change)
5612 && change->nlmsg_type == RTM_NEWLINK) {
5613 bool changed = false;
5614 error = 0;
5615
5616 /* Update netdev from rtnl msg and increment its seq if needed. */
5617 if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
5618 netdev->carrier_resets++;
5619 changed = true;
5620 }
5621 if (change->ifi_flags != netdev->ifi_flags) {
5622 netdev->ifi_flags = change->ifi_flags;
5623 changed = true;
5624 }
5625 if (change->mtu && change->mtu != netdev->mtu) {
5626 netdev->mtu = change->mtu;
5627 netdev->cache_valid |= VALID_MTU;
5628 netdev->netdev_mtu_error = 0;
5629 changed = true;
5630 }
5631 if (!eth_addr_is_zero(change->mac)
5632 && !eth_addr_equals(change->mac, netdev->etheraddr)) {
5633 netdev->etheraddr = change->mac;
5634 netdev->cache_valid |= VALID_ETHERADDR;
5635 netdev->ether_addr_error = 0;
5636 changed = true;
5637 }
5638 if (change->if_index != netdev->ifindex) {
5639 netdev->ifindex = change->if_index;
5640 netdev->cache_valid |= VALID_IFINDEX;
5641 netdev->get_ifindex_error = 0;
5642 changed = true;
5643 }
5644 if (changed) {
5645 netdev_change_seq_changed(&netdev->up);
5646 }
5647 } else {
5648 error = EINVAL;
5649 }
5650
5651 ofpbuf_delete(reply);
5652 return error;
5653 }
5654
5655 static int
5656 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5657 {
5658 struct ifreq ifr;
5659 int hwaddr_family;
5660 int error;
5661
5662 memset(&ifr, 0, sizeof ifr);
5663 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5664 COVERAGE_INC(netdev_get_hwaddr);
5665 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5666 if (error) {
5667 /* ENODEV probably means that a vif disappeared asynchronously and
5668 * hasn't been removed from the database yet, so reduce the log level
5669 * to INFO for that case. */
5670 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5671 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5672 netdev_name, ovs_strerror(error));
5673 return error;
5674 }
5675 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5676 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
5677 hwaddr_family != ARPHRD_NONE) {
5678 VLOG_INFO("%s device has unknown hardware address family %d",
5679 netdev_name, hwaddr_family);
5680 return EINVAL;
5681 }
5682 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5683 return 0;
5684 }
5685
5686 static int
5687 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5688 {
5689 struct ifreq ifr;
5690 int error;
5691
5692 memset(&ifr, 0, sizeof ifr);
5693 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5694 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5695 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5696 COVERAGE_INC(netdev_set_hwaddr);
5697 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5698 if (error) {
5699 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5700 netdev_name, ovs_strerror(error));
5701 }
5702 return error;
5703 }
5704
5705 static int
5706 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5707 int cmd, const char *cmd_name)
5708 {
5709 struct ifreq ifr;
5710 int error;
5711
5712 memset(&ifr, 0, sizeof ifr);
5713 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5714 ifr.ifr_data = (caddr_t) ecmd;
5715
5716 ecmd->cmd = cmd;
5717 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5718 if (error) {
5719 if (error != EOPNOTSUPP) {
5720 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5721 "failed: %s", cmd_name, name, ovs_strerror(error));
5722 } else {
5723 /* The device doesn't support this operation. That's pretty
5724 * common, so there's no point in logging anything. */
5725 }
5726 }
5727 return error;
5728 }
5729
5730 /* Returns an AF_PACKET raw socket or a negative errno value. */
5731 static int
5732 af_packet_sock(void)
5733 {
5734 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5735 static int sock;
5736
5737 if (ovsthread_once_start(&once)) {
5738 sock = socket(AF_PACKET, SOCK_RAW, 0);
5739 if (sock >= 0) {
5740 int error = set_nonblocking(sock);
5741 if (error) {
5742 close(sock);
5743 sock = -error;
5744 }
5745 } else {
5746 sock = -errno;
5747 VLOG_ERR("failed to create packet socket: %s",
5748 ovs_strerror(errno));
5749 }
5750 ovsthread_once_done(&once);
5751 }
5752
5753 return sock;
5754 }