]> git.proxmox.com Git - mirror_ovs.git/blob - lib/netdev-linux.c
tests: windows ovsdb online compact
[mirror_ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <arpa/inet.h>
24 #include <inttypes.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
41 #include <net/if.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
46 #include <poll.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <unistd.h>
50
51 #include "coverage.h"
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "openvswitch/dynamic-string.h"
56 #include "fatal-signal.h"
57 #include "hash.h"
58 #include "openvswitch/hmap.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
63 #include "netlink.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
67 #include "packets.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
70 #include "openvswitch/shash.h"
71 #include "socket-util.h"
72 #include "sset.h"
73 #include "timer.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
76 #include "util.h"
77
78 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79
80 COVERAGE_DEFINE(netdev_set_policing);
81 COVERAGE_DEFINE(netdev_arp_lookup);
82 COVERAGE_DEFINE(netdev_get_ifindex);
83 COVERAGE_DEFINE(netdev_get_hwaddr);
84 COVERAGE_DEFINE(netdev_set_hwaddr);
85 COVERAGE_DEFINE(netdev_get_ethtool);
86 COVERAGE_DEFINE(netdev_set_ethtool);
87
88 \f
89 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 * old headers. */
91 #ifndef ADVERTISED_Pause
92 #define ADVERTISED_Pause (1 << 13)
93 #endif
94 #ifndef ADVERTISED_Asym_Pause
95 #define ADVERTISED_Asym_Pause (1 << 14)
96 #endif
97
98 /* These were introduced in Linux 2.6.24, so they might be missing if we
99 * have old headers. */
100 #ifndef ETHTOOL_GFLAGS
101 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #endif
103 #ifndef ETHTOOL_SFLAGS
104 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
105 #endif
106
107 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
108 * headers. */
109 #ifndef TC_RTAB_SIZE
110 #define TC_RTAB_SIZE 1024
111 #endif
112
113 /* Linux 2.6.21 introduced struct tpacket_auxdata.
114 * Linux 2.6.27 added the tp_vlan_tci member.
115 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
116 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
117 * TP_STATUS_VLAN_TPID_VALID.
118 *
119 * With all this churn it's easiest to unconditionally define a replacement
120 * structure that has everything we want.
121 */
122 #ifndef PACKET_AUXDATA
123 #define PACKET_AUXDATA 8
124 #endif
125 #ifndef TP_STATUS_VLAN_VALID
126 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #endif
128 #ifndef TP_STATUS_VLAN_TPID_VALID
129 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #endif
131 #undef tpacket_auxdata
132 #define tpacket_auxdata rpl_tpacket_auxdata
133 struct tpacket_auxdata {
134 uint32_t tp_status;
135 uint32_t tp_len;
136 uint32_t tp_snaplen;
137 uint16_t tp_mac;
138 uint16_t tp_net;
139 uint16_t tp_vlan_tci;
140 uint16_t tp_vlan_tpid;
141 };
142
143 /* Linux 2.6.27 introduced ethtool_cmd_speed
144 *
145 * To avoid revisiting problems reported with using configure to detect
146 * compatibility (see report at
147 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
148 * unconditionally replace ethtool_cmd_speed. */
149 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
150 static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
151 {
152 return ep->speed | (ep->speed_hi << 16);
153 }
154
155 /* Linux 2.6.30 introduced supported and advertised flags for
156 * 1G base KX, and 10G base KX4, KR and R. */
157 #ifndef SUPPORTED_1000baseKX_Full
158 #define SUPPORTED_1000baseKX_Full (1 << 17)
159 #define SUPPORTED_10000baseKX4_Full (1 << 18)
160 #define SUPPORTED_10000baseKR_Full (1 << 19)
161 #define SUPPORTED_10000baseR_FEC (1 << 20)
162 #define ADVERTISED_1000baseKX_Full (1 << 17)
163 #define ADVERTISED_10000baseKX4_Full (1 << 18)
164 #define ADVERTISED_10000baseKR_Full (1 << 19)
165 #define ADVERTISED_10000baseR_FEC (1 << 20)
166 #endif
167
168 /* Linux 3.5 introduced supported and advertised flags for
169 * 40G base KR4, CR4, SR4 and LR4. */
170 #ifndef SUPPORTED_40000baseKR4_Full
171 #define SUPPORTED_40000baseKR4_Full (1 << 23)
172 #define SUPPORTED_40000baseCR4_Full (1 << 24)
173 #define SUPPORTED_40000baseSR4_Full (1 << 25)
174 #define SUPPORTED_40000baseLR4_Full (1 << 26)
175 #define ADVERTISED_40000baseKR4_Full (1 << 23)
176 #define ADVERTISED_40000baseCR4_Full (1 << 24)
177 #define ADVERTISED_40000baseSR4_Full (1 << 25)
178 #define ADVERTISED_40000baseLR4_Full (1 << 26)
179 #endif
180
181 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
182 *
183 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
184 * 2.6.32-431.29.2.el6.x86_64 (see report at
185 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
186 * if_link.h is not self-contained on those kernels. It is easiest to
187 * unconditionally define a replacement. */
188 #ifndef IFLA_STATS64
189 #define IFLA_STATS64 23
190 #endif
191 #define rtnl_link_stats64 rpl_rtnl_link_stats64
192 struct rtnl_link_stats64 {
193 uint64_t rx_packets;
194 uint64_t tx_packets;
195 uint64_t rx_bytes;
196 uint64_t tx_bytes;
197 uint64_t rx_errors;
198 uint64_t tx_errors;
199 uint64_t rx_dropped;
200 uint64_t tx_dropped;
201 uint64_t multicast;
202 uint64_t collisions;
203
204 uint64_t rx_length_errors;
205 uint64_t rx_over_errors;
206 uint64_t rx_crc_errors;
207 uint64_t rx_frame_errors;
208 uint64_t rx_fifo_errors;
209 uint64_t rx_missed_errors;
210
211 uint64_t tx_aborted_errors;
212 uint64_t tx_carrier_errors;
213 uint64_t tx_fifo_errors;
214 uint64_t tx_heartbeat_errors;
215 uint64_t tx_window_errors;
216
217 uint64_t rx_compressed;
218 uint64_t tx_compressed;
219 };
220
/* Bits for 'cache_valid' in struct netdev_linux.  Each bit indicates that the
 * corresponding lazily-computed member (or cached error code) below is
 * up to date; netdev_linux_changed() clears bits to invalidate. */
enum {
    VALID_IFINDEX = 1 << 0,             /* 'ifindex', 'get_ifindex_error'. */
    VALID_ETHERADDR = 1 << 1,           /* 'etheraddr', 'ether_addr_error'. */
    VALID_IN = 1 << 2,                  /* Cached IP addresses. */
    VALID_MTU = 1 << 3,                 /* 'mtu', 'netdev_mtu_error'. */
    VALID_POLICING = 1 << 4,            /* 'kbits_rate', 'kbits_burst'. */
    VALID_VPORT_STAT_ERROR = 1 << 5,    /* 'vport_stats_error'. */
    VALID_DRVINFO = 1 << 6,             /* 'drvinfo'. */
    VALID_FEATURES = 1 << 7,            /* 'current', 'advertised', 'supported'. */
};
231 \f
232 /* Traffic control. */
233
234 /* An instance of a traffic control class. Always associated with a particular
235 * network device.
236 *
237 * Each TC implementation subclasses this with whatever additional data it
238 * needs. */
239 struct tc {
240 const struct tc_ops *ops;
241 struct hmap queues; /* Contains "struct tc_queue"s.
242 * Read by generic TC layer.
243 * Written only by TC implementation. */
244 };
245
246 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
247
248 /* One traffic control queue.
249 *
250 * Each TC implementation subclasses this with whatever additional data it
251 * needs. */
252 struct tc_queue {
253 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
254 unsigned int queue_id; /* OpenFlow queue ID. */
255 long long int created; /* Time queue was created, in msecs. */
256 };
257
258 /* A particular kind of traffic control. Each implementation generally maps to
259 * one particular Linux qdisc class.
260 *
261 * The functions below return 0 if successful or a positive errno value on
262 * failure, except where otherwise noted. All of them must be provided, except
263 * where otherwise noted. */
264 struct tc_ops {
265 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
266 * This is null for tc_ops_default and tc_ops_other, for which there are no
267 * appropriate values. */
268 const char *linux_name;
269
270 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
271 const char *ovs_name;
272
273 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
274 * queues. The queues are numbered 0 through n_queues - 1. */
275 unsigned int n_queues;
276
277 /* Called to install this TC class on 'netdev'. The implementation should
278 * make the Netlink calls required to set up 'netdev' with the right qdisc
279 * and configure it according to 'details'. The implementation may assume
280 * that the current qdisc is the default; that is, there is no need for it
281 * to delete the current qdisc before installing itself.
282 *
283 * The contents of 'details' should be documented as valid for 'ovs_name'
284 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
285 * (which is built as ovs-vswitchd.conf.db(8)).
286 *
287 * This function must return 0 if and only if it sets 'netdev->tc' to an
288 * initialized 'struct tc'.
289 *
290 * (This function is null for tc_ops_other, which cannot be installed. For
291 * other TC classes it should always be nonnull.) */
292 int (*tc_install)(struct netdev *netdev, const struct smap *details);
293
294 /* Called when the netdev code determines (through a Netlink query) that
295 * this TC class's qdisc is installed on 'netdev', but we didn't install
296 * it ourselves and so don't know any of the details.
297 *
298 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
299 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
300 * implementation should parse the other attributes of 'nlmsg' as
301 * necessary to determine its configuration. If necessary it should also
302 * use Netlink queries to determine the configuration of queues on
303 * 'netdev'.
304 *
305 * This function must return 0 if and only if it sets 'netdev->tc' to an
306 * initialized 'struct tc'. */
307 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
308
309 /* Destroys the data structures allocated by the implementation as part of
310 * 'tc'. (This includes destroying 'tc->queues' by calling
311 * tc_destroy(tc).
312 *
313 * The implementation should not need to perform any Netlink calls. If
314 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
315 * (But it may not be desirable.)
316 *
317 * This function may be null if 'tc' is trivial. */
318 void (*tc_destroy)(struct tc *tc);
319
320 /* Retrieves details of 'netdev->tc' configuration into 'details'.
321 *
322 * The implementation should not need to perform any Netlink calls, because
323 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
324 * cached the configuration.
325 *
326 * The contents of 'details' should be documented as valid for 'ovs_name'
327 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
328 * (which is built as ovs-vswitchd.conf.db(8)).
329 *
330 * This function may be null if 'tc' is not configurable.
331 */
332 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
333
334 /* Reconfigures 'netdev->tc' according to 'details', performing any
335 * required Netlink calls to complete the reconfiguration.
336 *
337 * The contents of 'details' should be documented as valid for 'ovs_name'
338 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
339 * (which is built as ovs-vswitchd.conf.db(8)).
340 *
341 * This function may be null if 'tc' is not configurable.
342 */
343 int (*qdisc_set)(struct netdev *, const struct smap *details);
344
345 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
346 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
347 *
348 * The contents of 'details' should be documented as valid for 'ovs_name'
349 * in the "other_config" column in the "Queue" table in
350 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
351 *
352 * The implementation should not need to perform any Netlink calls, because
353 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
354 * cached the queue configuration.
355 *
356 * This function may be null if 'tc' does not have queues ('n_queues' is
357 * 0). */
358 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
359 struct smap *details);
360
361 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
362 * 'details', perfoming any required Netlink calls to complete the
363 * reconfiguration. The caller ensures that 'queue_id' is less than
364 * 'n_queues'.
365 *
366 * The contents of 'details' should be documented as valid for 'ovs_name'
367 * in the "other_config" column in the "Queue" table in
368 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
369 *
370 * This function may be null if 'tc' does not have queues or its queues are
371 * not configurable. */
372 int (*class_set)(struct netdev *, unsigned int queue_id,
373 const struct smap *details);
374
375 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
376 * tc_queue's within 'netdev->tc->queues'.
377 *
378 * This function may be null if 'tc' does not have queues or its queues
379 * cannot be deleted. */
380 int (*class_delete)(struct netdev *, struct tc_queue *queue);
381
382 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
383 * 'struct tc_queue's within 'netdev->tc->queues'.
384 *
385 * On success, initializes '*stats'.
386 *
387 * This function may be null if 'tc' does not have queues or if it cannot
388 * report queue statistics. */
389 int (*class_get_stats)(const struct netdev *netdev,
390 const struct tc_queue *queue,
391 struct netdev_queue_stats *stats);
392
393 /* Extracts queue stats from 'nlmsg', which is a response to a
394 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
395 *
396 * This function may be null if 'tc' does not have queues or if it cannot
397 * report queue statistics. */
398 int (*class_dump_stats)(const struct netdev *netdev,
399 const struct ofpbuf *nlmsg,
400 netdev_dump_queue_stats_cb *cb, void *aux);
401 };
402
403 static void
404 tc_init(struct tc *tc, const struct tc_ops *ops)
405 {
406 tc->ops = ops;
407 hmap_init(&tc->queues);
408 }
409
/* Releases the resources owned by the generic portion of 'tc' (its 'queues'
 * hmap).  Does not free 'tc' itself and performs no Netlink calls; TC
 * implementations are expected to call this from their 'tc_destroy'
 * callback (see struct tc_ops). */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
415
416 static const struct tc_ops tc_ops_htb;
417 static const struct tc_ops tc_ops_hfsc;
418 static const struct tc_ops tc_ops_codel;
419 static const struct tc_ops tc_ops_fqcodel;
420 static const struct tc_ops tc_ops_sfq;
421 static const struct tc_ops tc_ops_default;
422 static const struct tc_ops tc_ops_noop;
423 static const struct tc_ops tc_ops_other;
424
425 static const struct tc_ops *const tcs[] = {
426 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
427 &tc_ops_hfsc, /* Hierarchical fair service curve. */
428 &tc_ops_codel, /* Controlled delay */
429 &tc_ops_fqcodel, /* Fair queue controlled delay */
430 &tc_ops_sfq, /* Stochastic fair queueing */
431 &tc_ops_noop, /* Non operating qos type. */
432 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
433 &tc_ops_other, /* Some other qdisc. */
434 NULL
435 };
436
437 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
438 static unsigned int tc_get_major(unsigned int handle);
439 static unsigned int tc_get_minor(unsigned int handle);
440
441 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
442 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
443 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
444
445 static struct tcmsg *tc_make_request(const struct netdev *, int type,
446 unsigned int flags, struct ofpbuf *);
447 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
448 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
449 static int tc_add_policer(struct netdev *,
450 uint32_t kbits_rate, uint32_t kbits_burst);
451
452 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
453 struct nlattr **options);
454 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
455 struct nlattr **options,
456 struct netdev_queue_stats *);
457 static int tc_query_class(const struct netdev *,
458 unsigned int handle, unsigned int parent,
459 struct ofpbuf **replyp);
460 static int tc_delete_class(const struct netdev *, unsigned int handle);
461
462 static int tc_del_qdisc(struct netdev *netdev);
463 static int tc_query_qdisc(const struct netdev *netdev);
464
465 static int tc_calc_cell_log(unsigned int mtu);
466 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
467 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
468 const struct tc_ratespec *rate);
469 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
470 \f
/* State for a Linux network device.  'mutex' guards everything below 'up';
 * the members in the "on demand" section are computed lazily and are only
 * meaningful when the matching VALID_* bit is set in 'cache_valid'. */
struct netdev_linux {
    struct netdev up;

    /* Protects all members below. */
    struct ovs_mutex mutex;

    unsigned int cache_valid;           /* Bitmap of VALID_* flags. */

    bool miimon;                        /* Link status of last poll. */
    long long int miimon_interval;      /* Miimon Poll rate. Disabled if <= 0. */
    struct timer miimon_timer;

    /* The following are figured out "on demand" only.  They are only valid
     * when the corresponding VALID_* bit in 'cache_valid' is set. */
    int ifindex;
    struct eth_addr etheraddr;
    int mtu;
    unsigned int ifi_flags;             /* Kernel IFF_* flags. */
    long long int carrier_resets;       /* Count of IFF_RUNNING transitions. */
    uint32_t kbits_rate;                /* Policing data. */
    uint32_t kbits_burst;
    int vport_stats_error;              /* Cached error code from vport_get_stats().
                                           0 or an errno value. */
    int netdev_mtu_error;               /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
    int ether_addr_error;               /* Cached error code from set/get etheraddr. */
    int netdev_policing_error;          /* Cached error code from set policing. */
    int get_features_error;             /* Cached error code from ETHTOOL_GSET. */
    int get_ifindex_error;              /* Cached error code from SIOCGIFINDEX. */

    enum netdev_features current;       /* Cached from ETHTOOL_GSET. */
    enum netdev_features advertised;    /* Cached from ETHTOOL_GSET. */
    enum netdev_features supported;     /* Cached from ETHTOOL_GSET. */

    struct ethtool_drvinfo drvinfo;     /* Cached from ETHTOOL_GDRVINFO. */
    struct tc *tc;                      /* Traffic control state, if any. */

    /* For devices of class netdev_tap_class only. */
    int tap_fd;                         /* Shared fd for /dev/net/tun. */
};
510
/* Per-queue receive state: either the shared tap fd (not owned by the rxq)
 * or an AF_PACKET socket bound to the device (owned; see
 * netdev_linux_rxq_construct()). */
struct netdev_rxq_linux {
    struct netdev_rxq up;
    bool is_tap;    /* True for netdev_tap_class devices. */
    int fd;         /* Tap fd (borrowed) or raw packet socket (owned). */
};
516
517 /* This is set pretty low because we probably won't learn anything from the
518 * additional log messages. */
519 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
520
521 /* Polling miimon status for all ports causes performance degradation when
522 * handling a large number of ports. If there are no devices using miimon, then
523 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
524 *
525 * Readers do not depend on this variable synchronizing with the related
526 * changes in the device miimon status, so we can use atomic_count. */
527 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
528
529 static void netdev_linux_run(const struct netdev_class *);
530
531 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
532 int cmd, const char *cmd_name);
533 static int get_flags(const struct netdev *, unsigned int *flags);
534 static int set_flags(const char *, unsigned int flags);
535 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
536 enum netdev_flags on, enum netdev_flags *old_flagsp)
537 OVS_REQUIRES(netdev->mutex);
538 static int do_get_ifindex(const char *netdev_name);
539 static int get_ifindex(const struct netdev *, int *ifindexp);
540 static int do_set_addr(struct netdev *netdev,
541 int ioctl_nr, const char *ioctl_name,
542 struct in_addr addr);
543 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
544 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
545 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
546 static int af_packet_sock(void);
547 static bool netdev_linux_miimon_enabled(void);
548 static void netdev_linux_miimon_run(void);
549 static void netdev_linux_miimon_wait(void);
550 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
551
552 static bool
553 is_netdev_linux_class(const struct netdev_class *netdev_class)
554 {
555 return netdev_class->run == netdev_linux_run;
556 }
557
558 static bool
559 is_tap_netdev(const struct netdev *netdev)
560 {
561 return netdev_get_class(netdev) == &netdev_tap_class;
562 }
563
564 static struct netdev_linux *
565 netdev_linux_cast(const struct netdev *netdev)
566 {
567 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
568
569 return CONTAINER_OF(netdev, struct netdev_linux, up);
570 }
571
572 static struct netdev_rxq_linux *
573 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
574 {
575 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
576 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
577 }
578 \f
579 static void netdev_linux_update(struct netdev_linux *netdev,
580 const struct rtnetlink_change *)
581 OVS_REQUIRES(netdev->mutex);
582 static void netdev_linux_changed(struct netdev_linux *netdev,
583 unsigned int ifi_flags, unsigned int mask)
584 OVS_REQUIRES(netdev->mutex);
585
/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
 * RTNLGRP_IPV4_IFADDR, RTNLGRP_IPV6_IFADDR and RTNLGRP_IPV6_IFINFO changes,
 * or NULL if no such socket could be created.
 *
 * The socket is created once, on the first call, and shared by all callers
 * thereafter (including across threads). */
static struct nl_sock *
netdev_linux_notify_sock(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static struct nl_sock *sock;
    unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
                               RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};

    if (ovsthread_once_start(&once)) {
        int error;

        error = nl_sock_create(NETLINK_ROUTE, &sock);
        if (!error) {
            size_t i;

            for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
                error = nl_sock_join_mcgroup(sock, mcgroups[i]);
                if (error) {
                    /* Joining a group failed: destroy the socket rather than
                     * leave it partially subscribed, so every caller sees
                     * NULL. */
                    nl_sock_destroy(sock);
                    sock = NULL;
                    break;
                }
            }
        }
        ovsthread_once_done(&once);
    }

    return sock;
}
618
619 static bool
620 netdev_linux_miimon_enabled(void)
621 {
622 return atomic_count_get(&miimon_cnt) > 0;
623 }
624
625 static void
626 netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
627 {
628 struct nl_sock *sock;
629 int error;
630
631 if (netdev_linux_miimon_enabled()) {
632 netdev_linux_miimon_run();
633 }
634
635 sock = netdev_linux_notify_sock();
636 if (!sock) {
637 return;
638 }
639
640 do {
641 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
642 uint64_t buf_stub[4096 / 8];
643 struct ofpbuf buf;
644
645 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
646 error = nl_sock_recv(sock, &buf, false);
647 if (!error) {
648 struct rtnetlink_change change;
649
650 if (rtnetlink_parse(&buf, &change)) {
651 struct netdev *netdev_ = NULL;
652 char dev_name[IFNAMSIZ];
653
654 if (!change.ifname) {
655 change.ifname = if_indextoname(change.if_index, dev_name);
656 }
657
658 if (change.ifname) {
659 netdev_ = netdev_from_name(change.ifname);
660 }
661 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
662 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
663
664 ovs_mutex_lock(&netdev->mutex);
665 netdev_linux_update(netdev, &change);
666 ovs_mutex_unlock(&netdev->mutex);
667 }
668 netdev_close(netdev_);
669 }
670 } else if (error == ENOBUFS) {
671 struct shash device_shash;
672 struct shash_node *node;
673
674 nl_sock_drain(sock);
675
676 shash_init(&device_shash);
677 netdev_get_devices(&netdev_linux_class, &device_shash);
678 SHASH_FOR_EACH (node, &device_shash) {
679 struct netdev *netdev_ = node->data;
680 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
681 unsigned int flags;
682
683 ovs_mutex_lock(&netdev->mutex);
684 get_flags(netdev_, &flags);
685 netdev_linux_changed(netdev, flags, 0);
686 ovs_mutex_unlock(&netdev->mutex);
687
688 netdev_close(netdev_);
689 }
690 shash_destroy(&device_shash);
691 } else if (error != EAGAIN) {
692 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
693 ovs_strerror(error));
694 }
695 ofpbuf_uninit(&buf);
696 } while (!error);
697 }
698
699 static void
700 netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
701 {
702 struct nl_sock *sock;
703
704 if (netdev_linux_miimon_enabled()) {
705 netdev_linux_miimon_wait();
706 }
707 sock = netdev_linux_notify_sock();
708 if (sock) {
709 nl_sock_wait(sock, POLLIN);
710 }
711 }
712
/* Records a change to 'dev': bumps its change sequence number, tracks
 * carrier resets, and invalidates cached state.
 *
 * 'ifi_flags' is the new set of kernel interface flags (IFF_*).  'mask'
 * selects which VALID_* cache bits to KEEP: any bit clear in 'mask' is
 * invalidated, so pass 0 to invalidate everything.  Caller must hold
 * 'dev->mutex'. */
static void
netdev_linux_changed(struct netdev_linux *dev,
                     unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(dev->mutex)
{
    netdev_change_seq_changed(&dev->up);

    /* Any IFF_RUNNING transition (up or down) counts as a carrier reset. */
    if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
        dev->carrier_resets++;
    }
    dev->ifi_flags = ifi_flags;

    dev->cache_valid &= mask;
    if (!(mask & VALID_IN)) {
        /* Cached IP addresses were invalidated; flush the global address
         * list as well. */
        netdev_get_addrs_list_flush();
    }
}
730
/* Applies a parsed rtnetlink notification 'change' to 'dev', updating cached
 * state from the message where possible and invalidating the rest.  Called
 * from netdev_linux_run() with 'dev->mutex' held.  'change' must be a link
 * or address group message (anything else aborts). */
static void
netdev_linux_update(struct netdev_linux *dev,
                    const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, and ip addresses. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
        } else {
            /* Presumably RTM_DELLINK (the device went away): invalidate
             * every cached field. */
            netdev_linux_changed(dev, change->ifi_flags, 0);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        OVS_NOT_REACHED();
    }
}
768
769 static struct netdev *
770 netdev_linux_alloc(void)
771 {
772 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
773 return &netdev->up;
774 }
775
/* Initialization shared by all Linux netdev classes: currently just the
 * mutex that guards the device's cached state. */
static void
netdev_linux_common_construct(struct netdev_linux *netdev)
{
    ovs_mutex_init(&netdev->mutex);
}
781
782 /* Creates system and internal devices. */
783 static int
784 netdev_linux_construct(struct netdev *netdev_)
785 {
786 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
787 int error;
788
789 netdev_linux_common_construct(netdev);
790
791 error = get_flags(&netdev->up, &netdev->ifi_flags);
792 if (error == ENODEV) {
793 if (netdev->up.netdev_class != &netdev_internal_class) {
794 /* The device does not exist, so don't allow it to be opened. */
795 return ENODEV;
796 } else {
797 /* "Internal" netdevs have to be created as netdev objects before
798 * they exist in the kernel, because creating them in the kernel
799 * happens by passing a netdev object to dpif_port_add().
800 * Therefore, ignore the error. */
801 }
802 }
803
804 return 0;
805 }
806
807 /* For most types of netdevs we open the device for each call of
808 * netdev_open(). However, this is not the case with tap devices,
809 * since it is only possible to open the device once. In this
810 * situation we share a single file descriptor, and consequently
811 * buffers, across all readers. Therefore once data is read it will
812 * be unavailable to other reads for tap devices. */
813 static int
814 netdev_linux_construct_tap(struct netdev *netdev_)
815 {
816 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
817 static const char tap_dev[] = "/dev/net/tun";
818 const char *name = netdev_->name;
819 struct ifreq ifr;
820 int error;
821
822 netdev_linux_common_construct(netdev);
823
824 /* Open tap device. */
825 netdev->tap_fd = open(tap_dev, O_RDWR);
826 if (netdev->tap_fd < 0) {
827 error = errno;
828 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
829 return error;
830 }
831
832 /* Create tap device. */
833 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
834 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
835 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
836 VLOG_WARN("%s: creating tap device failed: %s", name,
837 ovs_strerror(errno));
838 error = errno;
839 goto error_close;
840 }
841
842 /* Make non-blocking. */
843 error = set_nonblocking(netdev->tap_fd);
844 if (error) {
845 goto error_close;
846 }
847
848 return 0;
849
850 error_close:
851 close(netdev->tap_fd);
852 return error;
853 }
854
/* Tears down 'netdev_''s state: destroys any traffic-control bookkeeping,
 * closes the tap fd for tap devices, balances the global miimon refcount,
 * and destroys the mutex.  Freeing the structure itself is left to
 * netdev_linux_dealloc(). */
static void
netdev_linux_destruct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (netdev->tc && netdev->tc->ops->tc_destroy) {
        netdev->tc->ops->tc_destroy(netdev->tc);
    }

    if (netdev_get_class(netdev_) == &netdev_tap_class
        && netdev->tap_fd >= 0)
    {
        close(netdev->tap_fd);
    }

    if (netdev->miimon_interval > 0) {
        /* This device held a reference counted in 'miimon_cnt'. */
        atomic_count_dec(&miimon_cnt);
    }

    ovs_mutex_destroy(&netdev->mutex);
}
876
/* Releases the memory for a netdev allocated by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
883
884 static struct netdev_rxq *
885 netdev_linux_rxq_alloc(void)
886 {
887 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
888 return &rx->up;
889 }
890
/* Sets up the receive path for 'rxq_'.  For tap devices the queue borrows
 * the netdev's shared tap fd.  For everything else it opens an AF_PACKET raw
 * socket, enables PACKET_AUXDATA (for VLAN reconstruction), makes it
 * non-blocking, binds it to the device, and attaches a BPF filter that
 * accepts only inbound packets.
 *
 * Returns 0 on success, otherwise a positive errno value. */
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        /* Shared fd owned by the netdev; do not close it in destruct. */
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* Result of tcpdump -dd inbound */
        /* Loads the packet-type auxiliary field and returns 0 (drop) when it
         * equals 4, which tcpdump emits for outgoing packets. */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4     jt 2  jf 3 */
            { 0x6, 0, 0, 0x00000000 },  /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff }   /* ret #65535 */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        /* Request per-packet auxdata so VLAN tags stripped by the kernel can
         * be recovered (see netdev_linux_rxq_recv_sock()). */
        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets.  Note: setsockopt() returns -1 on
         * failure, not an errno value; errno is fetched just below. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
978
979 static void
980 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
981 {
982 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
983
984 if (!rx->is_tap) {
985 close(rx->fd);
986 }
987 }
988
/* netdev_provider 'rxq_dealloc' callback: releases the rxq memory. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
996
997 static ovs_be16
998 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
999 {
1000 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1001 return htons(aux->tp_vlan_tpid);
1002 } else if (double_tagged) {
1003 return htons(ETH_TYPE_VLAN_8021AD);
1004 } else {
1005 return htons(ETH_TYPE_VLAN_8021Q);
1006 }
1007 }
1008
1009 static bool
1010 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
1011 {
1012 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
1013 }
1014
/* Receives one packet from packet socket 'fd' into 'buffer', re-inserting
 * any VLAN tag the kernel stripped (conveyed via PACKET_AUXDATA ancillary
 * data).  Returns 0 on success, EMSGSIZE if the packet was truncated,
 * EINVAL if a tagged packet is shorter than an Ethernet header, or another
 * positive errno value on receive failure. */
static int
netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
{
    size_t size;
    ssize_t retval;
    struct iovec iov;
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffer;
    struct msghdr msgh;

    /* Reserve headroom for a single VLAN tag */
    dp_packet_reserve(buffer, VLAN_HEADER_LEN);
    size = dp_packet_tailroom(buffer);

    iov.iov_base = dp_packet_data(buffer);
    iov.iov_len = size;
    msgh.msg_name = NULL;
    msgh.msg_namelen = 0;
    msgh.msg_iov = &iov;
    msgh.msg_iovlen = 1;
    msgh.msg_control = &cmsg_buffer;
    msgh.msg_controllen = sizeof cmsg_buffer;
    msgh.msg_flags = 0;

    /* MSG_TRUNC makes recvmsg() return the packet's full length even when
     * it exceeds 'iov', so truncation can be detected below.  Restart on
     * signal interruption. */
    do {
        retval = recvmsg(fd, &msgh, MSG_TRUNC);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    } else if (retval > size) {
        return EMSGSIZE;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);

    /* Scan ancillary data for the kernel's tpacket_auxdata; if it carries a
     * stripped VLAN tag, push the tag back into the packet data. */
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
        const struct tpacket_auxdata *aux;

        if (cmsg->cmsg_level != SOL_PACKET
            || cmsg->cmsg_type != PACKET_AUXDATA
            || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
            continue;
        }

        aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
        if (auxdata_has_vlan_tci(aux)) {
            struct eth_header *eth;
            bool double_tagged;

            if (retval < ETH_HEADER_LEN) {
                return EINVAL;
            }

            /* If the frame already starts with an 802.1Q tag, the stripped
             * outer tag must have been 802.1ad (QinQ). */
            eth = dp_packet_data(buffer);
            double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);

            eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
                          htons(aux->tp_vlan_tci));
            break;
        }
    }

    return 0;
}
1083
1084 static int
1085 netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
1086 {
1087 ssize_t retval;
1088 size_t size = dp_packet_tailroom(buffer);
1089
1090 do {
1091 retval = read(fd, dp_packet_data(buffer), size);
1092 } while (retval < 0 && errno == EINTR);
1093
1094 if (retval < 0) {
1095 return errno;
1096 }
1097
1098 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1099 return 0;
1100 }
1101
1102 static int
1103 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch)
1104 {
1105 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1106 struct netdev *netdev = rx->up.netdev;
1107 struct dp_packet *buffer;
1108 ssize_t retval;
1109 int mtu;
1110
1111 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1112 mtu = ETH_PAYLOAD_MAX;
1113 }
1114
1115 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1116 DP_NETDEV_HEADROOM);
1117 retval = (rx->is_tap
1118 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1119 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1120
1121 if (retval) {
1122 if (retval != EAGAIN && retval != EMSGSIZE) {
1123 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1124 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1125 }
1126 dp_packet_delete(buffer);
1127 } else {
1128 batch->packets[0] = buffer;
1129 batch->count = 1;
1130 }
1131
1132 return retval;
1133 }
1134
1135 static void
1136 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1137 {
1138 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1139 poll_fd_wait(rx->fd, POLLIN);
1140 }
1141
1142 static int
1143 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1144 {
1145 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1146 if (rx->is_tap) {
1147 struct ifreq ifr;
1148 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1149 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1150 if (error) {
1151 return error;
1152 }
1153 drain_fd(rx->fd, ifr.ifr_qlen);
1154 return 0;
1155 } else {
1156 return drain_rcvbuf(rx->fd);
1157 }
1158 }
1159
1160 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1161 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1162 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1163 * the packet is too big or too small to transmit on the device.
1164 *
1165 * The caller retains ownership of 'buffer' in all cases.
1166 *
1167 * The kernel maintains a packet transmission queue, so the caller is not
1168 * expected to do additional queuing of packets. */
1169 static int
1170 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1171 struct dp_packet_batch *batch, bool may_steal,
1172 bool concurrent_txq OVS_UNUSED)
1173 {
1174 int i;
1175 int error = 0;
1176
1177 /* 'i' is incremented only if there's no error */
1178 for (i = 0; i < batch->count;) {
1179 const void *data = dp_packet_data(batch->packets[i]);
1180 size_t size = dp_packet_size(batch->packets[i]);
1181 ssize_t retval;
1182
1183 /* Truncate the packet if it is configured. */
1184 size -= dp_packet_get_cutlen(batch->packets[i]);
1185
1186 if (!is_tap_netdev(netdev_)) {
1187 /* Use our AF_PACKET socket to send to this device. */
1188 struct sockaddr_ll sll;
1189 struct msghdr msg;
1190 struct iovec iov;
1191 int ifindex;
1192 int sock;
1193
1194 sock = af_packet_sock();
1195 if (sock < 0) {
1196 return -sock;
1197 }
1198
1199 ifindex = netdev_get_ifindex(netdev_);
1200 if (ifindex < 0) {
1201 return -ifindex;
1202 }
1203
1204 /* We don't bother setting most fields in sockaddr_ll because the
1205 * kernel ignores them for SOCK_RAW. */
1206 memset(&sll, 0, sizeof sll);
1207 sll.sll_family = AF_PACKET;
1208 sll.sll_ifindex = ifindex;
1209
1210 iov.iov_base = CONST_CAST(void *, data);
1211 iov.iov_len = size;
1212
1213 msg.msg_name = &sll;
1214 msg.msg_namelen = sizeof sll;
1215 msg.msg_iov = &iov;
1216 msg.msg_iovlen = 1;
1217 msg.msg_control = NULL;
1218 msg.msg_controllen = 0;
1219 msg.msg_flags = 0;
1220
1221 retval = sendmsg(sock, &msg, 0);
1222 } else {
1223 /* Use the tap fd to send to this device. This is essential for
1224 * tap devices, because packets sent to a tap device with an
1225 * AF_PACKET socket will loop back to be *received* again on the
1226 * tap device. This doesn't occur on other interface types
1227 * because we attach a socket filter to the rx socket. */
1228 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1229
1230 retval = write(netdev->tap_fd, data, size);
1231 }
1232
1233 if (retval < 0) {
1234 if (errno == EINTR) {
1235 /* The send was interrupted by a signal. Retry the packet by
1236 * continuing without incrementing 'i'.*/
1237 continue;
1238 } else if (errno == EIO && is_tap_netdev(netdev_)) {
1239 /* The Linux tap driver returns EIO if the device is not up.
1240 * From the OVS side this is not an error, so ignore it. */
1241 } else {
1242 /* The Linux AF_PACKET implementation never blocks waiting for
1243 * room for packets, instead returning ENOBUFS. Translate this
1244 * into EAGAIN for the caller. */
1245 error = errno == ENOBUFS ? EAGAIN : errno;
1246 break;
1247 }
1248 } else if (retval != size) {
1249 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1250 " of %"PRIuSIZE") on %s", retval, size,
1251 netdev_get_name(netdev_));
1252 error = EMSGSIZE;
1253 break;
1254 }
1255
1256 /* Process the next packet in the batch */
1257 i++;
1258 }
1259
1260 dp_packet_delete_batch(batch, may_steal);
1261
1262 if (error && error != EAGAIN) {
1263 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1264 netdev_get_name(netdev_), ovs_strerror(error));
1265 }
1266
1267 return error;
1268
1269 }
1270
1271 /* Registers with the poll loop to wake up from the next call to poll_block()
1272 * when the packet transmission queue has sufficient room to transmit a packet
1273 * with netdev_send().
1274 *
1275 * The kernel maintains a packet transmission queue, so the client is not
1276 * expected to do additional queuing of packets. Thus, this function is
1277 * unlikely to ever be used. It is included for completeness. */
1278 static void
1279 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1280 {
1281 if (is_tap_netdev(netdev)) {
1282 /* TAP device always accepts packets.*/
1283 poll_immediate_wake();
1284 }
1285 }
1286
/* Attempts to set 'netdev''s MAC address to 'mac'.  Returns 0 if successful,
 * otherwise a positive errno value. */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    /* Consult the cache first: skip the syscall if the address is already
     * 'mac' or a previous attempt is known to have failed. */
    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    /* Cache the outcome; ENODEV is cached too so a vanished device is not
     * re-probed on every call. */
    if (!error || error == ENODEV) {
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            netdev->etheraddr = mac;
        }
    }

    /* Restore the tap device's previous up state, if it was up. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1327
1328 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1329 static int
1330 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1331 {
1332 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1333 int error;
1334
1335 ovs_mutex_lock(&netdev->mutex);
1336 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1337 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1338 &netdev->etheraddr);
1339 netdev->cache_valid |= VALID_ETHERADDR;
1340 }
1341
1342 error = netdev->ether_addr_error;
1343 if (!error) {
1344 *mac = netdev->etheraddr;
1345 }
1346 ovs_mutex_unlock(&netdev->mutex);
1347
1348 return error;
1349 }
1350
1351 static int
1352 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1353 {
1354 int error;
1355
1356 if (!(netdev->cache_valid & VALID_MTU)) {
1357 struct ifreq ifr;
1358
1359 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1360 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1361 netdev->mtu = ifr.ifr_mtu;
1362 netdev->cache_valid |= VALID_MTU;
1363 }
1364
1365 error = netdev->netdev_mtu_error;
1366 if (!error) {
1367 *mtup = netdev->mtu;
1368 }
1369
1370 return error;
1371 }
1372
1373 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1374 * in bytes, not including the hardware header; thus, this is typically 1500
1375 * bytes for Ethernet devices. */
1376 static int
1377 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1378 {
1379 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1380 int error;
1381
1382 ovs_mutex_lock(&netdev->mutex);
1383 error = netdev_linux_get_mtu__(netdev, mtup);
1384 ovs_mutex_unlock(&netdev->mutex);
1385
1386 return error;
1387 }
1388
1389 /* Sets the maximum size of transmitted (MTU) for given device using linux
1390 * networking ioctl interface.
1391 */
1392 static int
1393 netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
1394 {
1395 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1396 struct ifreq ifr;
1397 int error;
1398
1399 ovs_mutex_lock(&netdev->mutex);
1400 if (netdev->cache_valid & VALID_MTU) {
1401 error = netdev->netdev_mtu_error;
1402 if (error || netdev->mtu == mtu) {
1403 goto exit;
1404 }
1405 netdev->cache_valid &= ~VALID_MTU;
1406 }
1407 ifr.ifr_mtu = mtu;
1408 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1409 SIOCSIFMTU, "SIOCSIFMTU");
1410 if (!error || error == ENODEV) {
1411 netdev->netdev_mtu_error = error;
1412 netdev->mtu = ifr.ifr_mtu;
1413 netdev->cache_valid |= VALID_MTU;
1414 }
1415 exit:
1416 ovs_mutex_unlock(&netdev->mutex);
1417 return error;
1418 }
1419
1420 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1421 * On failure, returns a negative errno value. */
1422 static int
1423 netdev_linux_get_ifindex(const struct netdev *netdev_)
1424 {
1425 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1426 int ifindex, error;
1427
1428 ovs_mutex_lock(&netdev->mutex);
1429 error = get_ifindex(netdev_, &ifindex);
1430 ovs_mutex_unlock(&netdev->mutex);
1431
1432 return error ? -error : ifindex;
1433 }
1434
1435 static int
1436 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1437 {
1438 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1439
1440 ovs_mutex_lock(&netdev->mutex);
1441 if (netdev->miimon_interval > 0) {
1442 *carrier = netdev->miimon;
1443 } else {
1444 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1445 }
1446 ovs_mutex_unlock(&netdev->mutex);
1447
1448 return 0;
1449 }
1450
1451 static long long int
1452 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1453 {
1454 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1455 long long int carrier_resets;
1456
1457 ovs_mutex_lock(&netdev->mutex);
1458 carrier_resets = netdev->carrier_resets;
1459 ovs_mutex_unlock(&netdev->mutex);
1460
1461 return carrier_resets;
1462 }
1463
/* Issues MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name',
 * copying '*data' into the request and the kernel's reply back out.
 * Returns 0 on success, otherwise a positive errno value.
 *
 * NOTE(review): 'struct mii_ioctl_data' is memcpy'd over 'ifr.ifr_data' (a
 * pointer-sized union member) rather than passed by pointer; this follows
 * the traditional SIOCGMII* calling convention and relies on the struct
 * fitting within the ifreq union — confirm against the kernel's
 * if_mii()/mii handling before restructuring. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
1478
/* Queries link status for device 'name' via the MII registers, falling back
 * to the ETHTOOL_GLINK ioctl when MII is unavailable.  Sets '*miimon' to
 * true if the link is up, false otherwise.  Returns 0 on success, otherwise
 * a positive errno value (from the last method attempted). */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            /* BMSR_LSTATUS is the link-status bit of the MII basic mode
             * status register. */
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        }
    }
    if (error) {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* ETHTOOL_GLINK replies with a struct ethtool_value overlaid on
             * the same buffer; copy it out to read the link flag. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
1521
1522 static int
1523 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1524 long long int interval)
1525 {
1526 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1527
1528 ovs_mutex_lock(&netdev->mutex);
1529 interval = interval > 0 ? MAX(interval, 100) : 0;
1530 if (netdev->miimon_interval != interval) {
1531 if (interval && !netdev->miimon_interval) {
1532 atomic_count_inc(&miimon_cnt);
1533 } else if (!interval && netdev->miimon_interval) {
1534 atomic_count_dec(&miimon_cnt);
1535 }
1536
1537 netdev->miimon_interval = interval;
1538 timer_set_expired(&netdev->miimon_timer);
1539 }
1540 ovs_mutex_unlock(&netdev->mutex);
1541
1542 return 0;
1543 }
1544
/* Periodic MII monitor: for every netdev-linux device whose miimon polling
 * is enabled and whose timer has expired, re-reads link state and, on a
 * change, records it and notifies via netdev_linux_changed(). */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                dev->miimon = miimon;
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            /* Re-arm the timer for the next poll. */
            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* netdev_get_devices() took a reference on each device; drop it. */
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
1574
1575 static void
1576 netdev_linux_miimon_wait(void)
1577 {
1578 struct shash device_shash;
1579 struct shash_node *node;
1580
1581 shash_init(&device_shash);
1582 netdev_get_devices(&netdev_linux_class, &device_shash);
1583 SHASH_FOR_EACH (node, &device_shash) {
1584 struct netdev *netdev = node->data;
1585 struct netdev_linux *dev = netdev_linux_cast(netdev);
1586
1587 ovs_mutex_lock(&dev->mutex);
1588 if (dev->miimon_interval > 0) {
1589 timer_wait(&dev->miimon_timer);
1590 }
1591 ovs_mutex_unlock(&dev->mutex);
1592 netdev_close(netdev);
1593 }
1594 shash_destroy(&device_shash);
1595 }
1596
/* Exchanges the values stored at '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved_a = *a;

    *a = *b;
    *b = saved_a;
}
1604
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is allowed to be misaligned (hence the get_32aligned_u64()
 * accessors).  The vport stats carry only the eight basic counters; every
 * other netdev_stats field is explicitly zeroed. */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct ovs_vport_stats *src)
{
    dst->rx_packets = get_32aligned_u64(&src->rx_packets);
    dst->tx_packets = get_32aligned_u64(&src->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&src->rx_errors);
    dst->tx_errors = get_32aligned_u64(&src->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
    /* Counters that the vport layer does not track. */
    dst->multicast = 0;
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
}
1634
1635 static int
1636 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1637 {
1638 struct dpif_netlink_vport reply;
1639 struct ofpbuf *buf;
1640 int error;
1641
1642 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1643 if (error) {
1644 return error;
1645 } else if (!reply.stats) {
1646 ofpbuf_delete(buf);
1647 return EOPNOTSUPP;
1648 }
1649
1650 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1651
1652 ofpbuf_delete(buf);
1653
1654 return 0;
1655 }
1656
1657 static void
1658 get_stats_via_vport(const struct netdev *netdev_,
1659 struct netdev_stats *stats)
1660 {
1661 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1662
1663 if (!netdev->vport_stats_error ||
1664 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1665 int error;
1666
1667 error = get_stats_via_vport__(netdev_, stats);
1668 if (error && error != ENOENT && error != ENODEV) {
1669 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1670 "(%s)",
1671 netdev_get_name(netdev_), ovs_strerror(error));
1672 }
1673 netdev->vport_stats_error = error;
1674 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1675 }
1676 }
1677
/* Retrieves current device stats for 'netdev-linux'.
 *
 * Merges two sources: vport-layer stats from the datapath (into '*stats')
 * and kernel netdev stats via netlink ('dev_stats').  If only one source is
 * available, that one is used alone.  When both are available, the kernel's
 * packet/byte counts replace the vport's, and the error/drop counters are
 * accumulated on top of the vport values. */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; if the vport stats succeeded, '*stats' already
         * holds usable data, so report success. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Use kernel netdev's packet and byte counts since vport's counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO are
         * enabled. */
        stats->rx_packets = dev_stats.rx_packets;
        stats->rx_bytes = dev_stats.rx_bytes;
        stats->tx_packets = dev_stats.tx_packets;
        stats->tx_bytes = dev_stats.tx_bytes;

        stats->rx_errors           += dev_stats.rx_errors;
        stats->tx_errors           += dev_stats.tx_errors;
        stats->rx_dropped          += dev_stats.rx_dropped;
        stats->tx_dropped          += dev_stats.tx_dropped;
        stats->multicast           += dev_stats.multicast;
        stats->collisions          += dev_stats.collisions;
        stats->rx_length_errors    += dev_stats.rx_length_errors;
        stats->rx_over_errors      += dev_stats.rx_over_errors;
        stats->rx_crc_errors       += dev_stats.rx_crc_errors;
        stats->rx_frame_errors     += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors      += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors    += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors   += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors   += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors      += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors    += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1728
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal.
 *
 * Like netdev_linux_get_stats(), but from the switch's perspective a tap
 * device's transmit counters are receives and vice versa, so the kernel's
 * rx/tx counters are swapped before being merged. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; fall back to the vport stats if they succeeded. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer.  For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        /* The detailed error breakdown is not meaningful after the swap. */
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled.  Note the rx/tx swap, as above. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped += dev_stats.tx_dropped;
        stats->tx_dropped += dev_stats.rx_dropped;

        stats->rx_errors += dev_stats.tx_errors;
        stats->tx_errors += dev_stats.rx_errors;

        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1790
1791 static int
1792 netdev_internal_get_stats(const struct netdev *netdev_,
1793 struct netdev_stats *stats)
1794 {
1795 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1796 int error;
1797
1798 ovs_mutex_lock(&netdev->mutex);
1799 get_stats_via_vport(netdev_, stats);
1800 error = netdev->vport_stats_error;
1801 ovs_mutex_unlock(&netdev->mutex);
1802
1803 return error;
1804 }
1805
/* Populates 'netdev''s cached feature bitmaps ('supported', 'advertised',
 * 'current') from an ETHTOOL_GSET query, translating ethtool SUPPORTED_*/
/* ADVERTISED_* bits into NETDEV_F_* bits.  The result (including a query
 * failure, recorded in 'get_features_error') is cached under
 * VALID_FEATURES and not refreshed until the cache is invalidated.
 * Caller holds 'netdev->mutex'. */
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
    struct ethtool_cmd ecmd;
    uint32_t speed;
    int error;

    if (netdev->cache_valid & VALID_FEATURES) {
        return;
    }

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto out;
    }

    /* Supported features. */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    }
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Half)  {
        netdev->supported |= NETDEV_F_100MB_HD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
        netdev->supported |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
        netdev->supported |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
        netdev->supported |= NETDEV_F_40GB_FD;
    }
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    }
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    }
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    }
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    }
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    }

    /* Advertised features. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
        netdev->advertised |= NETDEV_F_40GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    }
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    }
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    }
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    }
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    }

    /* Current settings. */
    speed = ethtool_cmd_speed(&ecmd);
    if (speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    } else if (speed == 40000) {
        /* Literal speeds here, presumably because older kernel headers lack
         * the SPEED_40000/SPEED_100000/SPEED_1000000 macros -- TODO confirm
         * minimum supported kernel headers before switching to macros. */
        netdev->current = NETDEV_F_40GB_FD;
    } else if (speed == 100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (speed == 1000000) {
        netdev->current = NETDEV_F_1TB_FD;
    } else {
        netdev->current = 0;
    }

    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
    }

    if (ecmd.autoneg) {
        netdev->current |= NETDEV_F_AUTONEG;
    }

out:
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
}
1957
1958 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1959 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1960 * Returns 0 if successful, otherwise a positive errno value. */
1961 static int
1962 netdev_linux_get_features(const struct netdev *netdev_,
1963 enum netdev_features *current,
1964 enum netdev_features *advertised,
1965 enum netdev_features *supported,
1966 enum netdev_features *peer)
1967 {
1968 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1969 int error;
1970
1971 ovs_mutex_lock(&netdev->mutex);
1972 netdev_linux_read_features(netdev);
1973 if (!netdev->get_features_error) {
1974 *current = netdev->current;
1975 *advertised = netdev->advertised;
1976 *supported = netdev->supported;
1977 *peer = 0; /* XXX */
1978 }
1979 error = netdev->get_features_error;
1980 ovs_mutex_unlock(&netdev->mutex);
1981
1982 return error;
1983 }
1984
/* Set the features advertised by 'netdev' to 'advertise'.
 *
 * Reads the device's current ethtool settings, replaces the advertising
 * mask with the translation of 'advertise', and writes the settings back
 * with ETHTOOL_SSET.  Returns 0 on success, otherwise a positive errno
 * value.
 *
 * NOTE(review): NETDEV_F_40GB_FD is not mapped here even though
 * netdev_linux_read_features() decodes the ADVERTISED_40000base* bits --
 * verify whether 40G advertising was omitted intentionally. */
static int
netdev_linux_set_advertisements(struct netdev *netdev_,
                                enum netdev_features advertise)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ethtool_cmd ecmd;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    /* Fetch the current settings first; only the advertising mask below is
     * modified. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto exit;
    }

    ecmd.advertising = 0;
    if (advertise & NETDEV_F_10MB_HD) {
        ecmd.advertising |= ADVERTISED_10baseT_Half;
    }
    if (advertise & NETDEV_F_10MB_FD) {
        ecmd.advertising |= ADVERTISED_10baseT_Full;
    }
    if (advertise & NETDEV_F_100MB_HD) {
        ecmd.advertising |= ADVERTISED_100baseT_Half;
    }
    if (advertise & NETDEV_F_100MB_FD) {
        ecmd.advertising |= ADVERTISED_100baseT_Full;
    }
    if (advertise & NETDEV_F_1GB_HD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Half;
    }
    if (advertise & NETDEV_F_1GB_FD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Full;
    }
    if (advertise & NETDEV_F_10GB_FD) {
        ecmd.advertising |= ADVERTISED_10000baseT_Full;
    }
    if (advertise & NETDEV_F_COPPER) {
        ecmd.advertising |= ADVERTISED_TP;
    }
    if (advertise & NETDEV_F_FIBER) {
        ecmd.advertising |= ADVERTISED_FIBRE;
    }
    if (advertise & NETDEV_F_AUTONEG) {
        ecmd.advertising |= ADVERTISED_Autoneg;
    }
    if (advertise & NETDEV_F_PAUSE) {
        ecmd.advertising |= ADVERTISED_Pause;
    }
    if (advertise & NETDEV_F_PAUSE_ASYM) {
        ecmd.advertising |= ADVERTISED_Asym_Pause;
    }
    COVERAGE_INC(netdev_set_ethtool);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_SSET, "ETHTOOL_SSET");

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2049
/* Attempts to set input rate limiting (policing) policy.  Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * Policing is implemented with an ingress qdisc plus a policer action, so
 * setting a zero 'kbits_rate' simply removes any existing ingress qdisc. */
static int
netdev_linux_set_policing(struct netdev *netdev_,
                          uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int error;

    kbits_burst = (!kbits_rate ? 0       /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
                   : kbits_burst);       /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev->cache_valid & VALID_POLICING) {
        /* A cached error, or an unchanged request, short-circuits without
         * touching the kernel. */
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kbits_burst == kbits_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        netdev->cache_valid &= ~VALID_POLICING;
    }

    COVERAGE_INC(netdev_set_policing);
    /* Remove any existing ingress qdisc. */
    error = tc_add_del_ingress_qdisc(netdev_, false);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate) {
        /* Install a fresh ingress qdisc and attach the policer to it. */
        error = tc_add_del_ingress_qdisc(netdev_, true);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }
    }

    /* Remember the applied settings so identical requests can be skipped. */
    netdev->kbits_rate = kbits_rate;
    netdev->kbits_burst = kbits_burst;

out:
    /* Cache only "success" and "device gone" (ENODEV) outcomes; other
     * errors may be transient and are retried on the next call. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2111
2112 static int
2113 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2114 struct sset *types)
2115 {
2116 const struct tc_ops *const *opsp;
2117 for (opsp = tcs; *opsp != NULL; opsp++) {
2118 const struct tc_ops *ops = *opsp;
2119 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2120 sset_add(types, ops->ovs_name);
2121 }
2122 }
2123 return 0;
2124 }
2125
2126 static const struct tc_ops *
2127 tc_lookup_ovs_name(const char *name)
2128 {
2129 const struct tc_ops *const *opsp;
2130
2131 for (opsp = tcs; *opsp != NULL; opsp++) {
2132 const struct tc_ops *ops = *opsp;
2133 if (!strcmp(name, ops->ovs_name)) {
2134 return ops;
2135 }
2136 }
2137 return NULL;
2138 }
2139
2140 static const struct tc_ops *
2141 tc_lookup_linux_name(const char *name)
2142 {
2143 const struct tc_ops *const *opsp;
2144
2145 for (opsp = tcs; *opsp != NULL; opsp++) {
2146 const struct tc_ops *ops = *opsp;
2147 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2148 return ops;
2149 }
2150 }
2151 return NULL;
2152 }
2153
2154 static struct tc_queue *
2155 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2156 size_t hash)
2157 {
2158 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2159 struct tc_queue *queue;
2160
2161 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2162 if (queue->queue_id == queue_id) {
2163 return queue;
2164 }
2165 }
2166 return NULL;
2167 }
2168
2169 static struct tc_queue *
2170 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2171 {
2172 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
2173 }
2174
2175 static int
2176 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2177 const char *type,
2178 struct netdev_qos_capabilities *caps)
2179 {
2180 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2181 if (!ops) {
2182 return EOPNOTSUPP;
2183 }
2184 caps->n_queues = ops->n_queues;
2185 return 0;
2186 }
2187
2188 static int
2189 netdev_linux_get_qos(const struct netdev *netdev_,
2190 const char **typep, struct smap *details)
2191 {
2192 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2193 int error;
2194
2195 ovs_mutex_lock(&netdev->mutex);
2196 error = tc_query_qdisc(netdev_);
2197 if (!error) {
2198 *typep = netdev->tc->ops->ovs_name;
2199 error = (netdev->tc->ops->qdisc_get
2200 ? netdev->tc->ops->qdisc_get(netdev_, details)
2201 : 0);
2202 }
2203 ovs_mutex_unlock(&netdev->mutex);
2204
2205 return error;
2206 }
2207
/* Installs QoS type 'type' with parameters 'details' on 'netdev_', replacing
 * any qdisc of a different type.  Returns 0 on success, otherwise a positive
 * errno value. */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        /* Unknown QoS type, or one that cannot be installed from OVS. */
        return EOPNOTSUPP;
    }

    if (new_ops == &tc_ops_noop) {
        /* The "noop" type does not need the kernel qdisc query below. */
        return new_ops->tc_install(netdev_, details);
    }

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same qdisc type already installed: just update its parameters,
         * if the implementation supports that. */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            goto exit;
        }
        /* tc_del_qdisc() is expected to clear the local tc state. */
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc. */
        error = new_ops->tc_install(netdev_, details);
        /* On success the implementation must have set up local tc state;
         * on failure it must have left none behind. */
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2250
2251 static int
2252 netdev_linux_get_queue(const struct netdev *netdev_,
2253 unsigned int queue_id, struct smap *details)
2254 {
2255 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2256 int error;
2257
2258 ovs_mutex_lock(&netdev->mutex);
2259 error = tc_query_qdisc(netdev_);
2260 if (!error) {
2261 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2262 error = (queue
2263 ? netdev->tc->ops->class_get(netdev_, queue, details)
2264 : ENOENT);
2265 }
2266 ovs_mutex_unlock(&netdev->mutex);
2267
2268 return error;
2269 }
2270
2271 static int
2272 netdev_linux_set_queue(struct netdev *netdev_,
2273 unsigned int queue_id, const struct smap *details)
2274 {
2275 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2276 int error;
2277
2278 ovs_mutex_lock(&netdev->mutex);
2279 error = tc_query_qdisc(netdev_);
2280 if (!error) {
2281 error = (queue_id < netdev->tc->ops->n_queues
2282 && netdev->tc->ops->class_set
2283 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2284 : EINVAL);
2285 }
2286 ovs_mutex_unlock(&netdev->mutex);
2287
2288 return error;
2289 }
2290
2291 static int
2292 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2293 {
2294 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2295 int error;
2296
2297 ovs_mutex_lock(&netdev->mutex);
2298 error = tc_query_qdisc(netdev_);
2299 if (!error) {
2300 if (netdev->tc->ops->class_delete) {
2301 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2302 error = (queue
2303 ? netdev->tc->ops->class_delete(netdev_, queue)
2304 : ENOENT);
2305 } else {
2306 error = EINVAL;
2307 }
2308 }
2309 ovs_mutex_unlock(&netdev->mutex);
2310
2311 return error;
2312 }
2313
2314 static int
2315 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2316 unsigned int queue_id,
2317 struct netdev_queue_stats *stats)
2318 {
2319 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2320 int error;
2321
2322 ovs_mutex_lock(&netdev->mutex);
2323 error = tc_query_qdisc(netdev_);
2324 if (!error) {
2325 if (netdev->tc->ops->class_get_stats) {
2326 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2327 if (queue) {
2328 stats->created = queue->created;
2329 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2330 stats);
2331 } else {
2332 error = ENOENT;
2333 }
2334 } else {
2335 error = EOPNOTSUPP;
2336 }
2337 }
2338 ovs_mutex_unlock(&netdev->mutex);
2339
2340 return error;
2341 }
2342
/* State for dumping tc class (queue) information from the kernel: the
 * ongoing netlink dump plus the buffer that its replies point into. */
struct queue_dump_state {
    struct nl_dump dump;    /* In-progress RTM_GETTCLASS dump. */
    struct ofpbuf buf;      /* Receive buffer for individual replies. */
};
2347
/* Starts a netlink dump of the tc classes on 'netdev' into '*state'.
 * Returns true on success, in which case the caller must eventually call
 * finish_queue_dump(); returns false if the request could not be built
 * (presumably the device's ifindex could not be found — see
 * tc_make_request()). */
static bool
start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
    if (!tcmsg) {
        return false;
    }
    tcmsg->tcm_parent = 0;
    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
    /* nl_dump_start() copies the request, so it can be released now. */
    ofpbuf_uninit(&request);

    ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
    return true;
}
2365
2366 static int
2367 finish_queue_dump(struct queue_dump_state *state)
2368 {
2369 ofpbuf_uninit(&state->buf);
2370 return nl_dump_done(&state->dump);
2371 }
2372
/* Iterator state for netdev_linux_queue_dump_{start,next,done}(): a
 * snapshot of the queue IDs taken when the dump started. */
struct netdev_linux_queue_state {
    unsigned int *queues;   /* Array of queue IDs captured at start time. */
    size_t cur_queue;       /* Index of the next element of 'queues'. */
    size_t n_queues;        /* Number of elements in 'queues'. */
};
2378
static int
netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
{
    const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get) {
            struct netdev_linux_queue_state *state;
            struct tc_queue *queue;
            size_t i;

            /* Snapshot the current queue IDs so the dump stays well
             * defined even if queues are added or deleted while it is in
             * progress. */
            *statep = state = xmalloc(sizeof *state);
            state->n_queues = hmap_count(&netdev->tc->queues);
            state->cur_queue = 0;
            state->queues = xmalloc(state->n_queues * sizeof *state->queues);

            i = 0;
            HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
                state->queues[i++] = queue->queue_id;
            }
        } else {
            /* This qdisc type cannot report per-queue configuration. */
            error = EOPNOTSUPP;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2410
2411 static int
2412 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2413 unsigned int *queue_idp, struct smap *details)
2414 {
2415 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2416 struct netdev_linux_queue_state *state = state_;
2417 int error = EOF;
2418
2419 ovs_mutex_lock(&netdev->mutex);
2420 while (state->cur_queue < state->n_queues) {
2421 unsigned int queue_id = state->queues[state->cur_queue++];
2422 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2423
2424 if (queue) {
2425 *queue_idp = queue_id;
2426 error = netdev->tc->ops->class_get(netdev_, queue, details);
2427 break;
2428 }
2429 }
2430 ovs_mutex_unlock(&netdev->mutex);
2431
2432 return error;
2433 }
2434
2435 static int
2436 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2437 void *state_)
2438 {
2439 struct netdev_linux_queue_state *state = state_;
2440
2441 free(state->queues);
2442 free(state);
2443 return 0;
2444 }
2445
static int
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
                              netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct queue_dump_state state;

        if (!netdev->tc->ops->class_dump_stats) {
            error = EOPNOTSUPP;
        } else if (!start_queue_dump(netdev_, &state)) {
            error = ENODEV;
        } else {
            struct ofpbuf msg;
            int retval;

            /* Invoke 'cb' for every tc class the kernel reports.  A
             * per-class failure does not stop the dump; the last failure
             * wins as the overall error. */
            while (nl_dump_next(&state.dump, &msg, &state.buf)) {
                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
                                                           cb, aux);
                if (retval) {
                    error = retval;
                }
            }

            /* A failure of the dump itself also overrides any per-class
             * error recorded above. */
            retval = finish_queue_dump(&state);
            if (retval) {
                error = retval;
            }
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2484
2485 static int
2486 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2487 struct in_addr netmask)
2488 {
2489 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2490 int error;
2491
2492 ovs_mutex_lock(&netdev->mutex);
2493 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2494 if (!error) {
2495 if (address.s_addr != INADDR_ANY) {
2496 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2497 "SIOCSIFNETMASK", netmask);
2498 }
2499 }
2500
2501 ovs_mutex_unlock(&netdev->mutex);
2502
2503 return error;
2504 }
2505
/* Retrieves the IP addresses assigned to 'netdev_'.  On success, stores
 * allocated arrays of addresses and netmasks in '*addr' and '*mask' and
 * their count in '*n_cnt' (ownership semantics are those of
 * netdev_get_addrs(); presumably the caller frees the arrays — verify
 * against that helper).  Returns 0 if successful, otherwise a positive
 * errno value. */
static int
netdev_linux_get_addr_list(const struct netdev *netdev_,
                          struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2522
/* Fills '*sa' with an AF_INET sockaddr for 'addr' with a zero port. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    /* Zero the full sockaddr first in case it is larger than sockaddr_in. */
    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2535
2536 static int
2537 do_set_addr(struct netdev *netdev,
2538 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2539 {
2540 struct ifreq ifr;
2541
2542 make_in4_sockaddr(&ifr.ifr_addr, addr);
2543 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2544 ioctl_name);
2545 }
2546
2547 /* Adds 'router' as a default IP gateway. */
2548 static int
2549 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2550 {
2551 struct in_addr any = { INADDR_ANY };
2552 struct rtentry rt;
2553 int error;
2554
2555 memset(&rt, 0, sizeof rt);
2556 make_in4_sockaddr(&rt.rt_dst, any);
2557 make_in4_sockaddr(&rt.rt_gateway, router);
2558 make_in4_sockaddr(&rt.rt_genmask, any);
2559 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2560 error = af_inet_ioctl(SIOCADDRT, &rt);
2561 if (error) {
2562 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2563 }
2564 return error;
2565 }
2566
/* Looks up in /proc/net/route the route that would be used to reach 'host'.
 * On success, returns 0, stores the next-hop gateway in '*next_hop' (0.0.0.0
 * if the host is directly reachable), and stores the malloc()'d name of the
 * output device in '*netdev_name'.  Returns ENXIO if no matching route
 * exists, or another positive errno value on failure. */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        /* The first line of the file is a column header; every later line
         * is one routing table entry. */
        if (++ln >= 2) {
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                             fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* The output of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need any endian
             * conversions here. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    fclose(stream);
    return ENXIO;
}
2626
/* Fills 'smap' with driver name/version and firmware version obtained via
 * ETHTOOL_GDRVINFO, caching the result.  Returns 0 on success, otherwise a
 * positive errno value. */
static int
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* netdev_linux_do_ethtool() takes a 'struct ethtool_cmd *', so
         * alias the drvinfo buffer through that type; the ETHTOOL_GDRVINFO
         * command code tells the kernel how to interpret the buffer. */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        cmd,
                                        ETHTOOL_GDRVINFO,
                                        "ETHTOOL_GDRVINFO");
        if (!error) {
            netdev->cache_valid |= VALID_DRVINFO;
        }
    }

    if (!error) {
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
        smap_add(smap, "driver_version", netdev->drvinfo.version);
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2657
/* get_status implementation for "internal" devices: they are implemented by
 * Open vSwitch itself, so there is no hardware driver to query. */
static int
netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
                           struct smap *smap)
{
    smap_add(smap, "driver_name", "openvswitch");
    return 0;
}
2665
/* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
 * returns 0.  Otherwise, it returns a positive errno value; in particular,
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
static int
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, struct eth_addr *mac)
{
    struct arpreq r;
    struct sockaddr_in sin;
    int retval;

    /* SIOCGARP takes the protocol address in 'arp_pa' (here built from a
     * sockaddr_in) and the device name in 'arp_dev'. */
    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    sin.sin_port = 0;
    memcpy(&r.arp_pa, &sin, sizeof sin);
    r.arp_ha.sa_family = ARPHRD_ETHER;
    r.arp_flags = 0;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
    if (!retval) {
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO just means "no entry", which callers handle; log only
         * unexpected failures. */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
    }
    return retval;
}
2698
2699 static int
2700 nd_to_iff_flags(enum netdev_flags nd)
2701 {
2702 int iff = 0;
2703 if (nd & NETDEV_UP) {
2704 iff |= IFF_UP;
2705 }
2706 if (nd & NETDEV_PROMISC) {
2707 iff |= IFF_PROMISC;
2708 }
2709 if (nd & NETDEV_LOOPBACK) {
2710 iff |= IFF_LOOPBACK;
2711 }
2712 return iff;
2713 }
2714
2715 static int
2716 iff_to_nd_flags(int iff)
2717 {
2718 enum netdev_flags nd = 0;
2719 if (iff & IFF_UP) {
2720 nd |= NETDEV_UP;
2721 }
2722 if (iff & IFF_PROMISC) {
2723 nd |= NETDEV_PROMISC;
2724 }
2725 if (iff & IFF_LOOPBACK) {
2726 nd |= NETDEV_LOOPBACK;
2727 }
2728 return nd;
2729 }
2730
/* Turns off the NETDEV_* flags in 'off' and turns on those in 'on' for
 * 'netdev', storing the previous flags in '*old_flagsp'.  Returns 0 on
 * success, otherwise a positive errno value from set_flags(). */
static int
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
{
    int old_flags, new_flags;
    int error = 0;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    /* 'off' bits are cleared first, then 'on' bits set, so a flag present
     * in both sets ends up on. */
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* Re-read the flags even if set_flags() failed, so the cached
         * value tracks the kernel; an error from get_flags() is
         * deliberately ignored here (best-effort refresh). */
        get_flags(&netdev->up, &netdev->ifi_flags);
    }

    return error;
}
2749
2750 static int
2751 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2752 enum netdev_flags on, enum netdev_flags *old_flagsp)
2753 {
2754 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2755 int error;
2756
2757 ovs_mutex_lock(&netdev->mutex);
2758 error = update_flags(netdev, off, on, old_flagsp);
2759 ovs_mutex_unlock(&netdev->mutex);
2760
2761 return error;
2762 }
2763
/* Template for the 'struct netdev_class' shared by the "system", "tap", and
 * "internal" classes defined below.  The initializer is positional, so the
 * entries must stay in exactly the order in which 'struct netdev_class'
 * declares its members (see netdev-provider.h); NULL marks callbacks these
 * classes do not implement. */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
    false,                      /* is_pmd */                    \
                                                                \
    NULL,                                                       \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    netdev_linux_alloc,                                         \
    CONSTRUCT,                                                  \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
    NULL,                       /* build header */              \
    NULL,                       /* push header */               \
    NULL,                       /* pop header */                \
    NULL,                       /* get_numa_id */               \
    NULL,                       /* set_tx_multiq */             \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_addr_list,                                 \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
    NULL,                       /* reconfigure */               \
                                                                \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \
}
2835
/* The three Linux-backed netdev classes differ only in their construction,
 * statistics, features, and status callbacks; everything else comes from
 * NETDEV_LINUX_CLASS. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_construct,
        netdev_linux_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);

const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_construct_tap,
        netdev_tap_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);

const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_construct,
        netdev_internal_get_stats,
        NULL,                  /* get_features */
        netdev_internal_get_status);
2859 \f
2860
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET 1
#define TCA_CODEL_LIMIT 2
#define TCA_CODEL_INTERVAL 3

/* Local state for a CoDel qdisc, embedding the generic tc state. */
struct codel {
    struct tc tc;
    uint32_t target;        /* "target" parameter passed to the kernel. */
    uint32_t limit;         /* "limit" parameter passed to the kernel. */
    uint32_t interval;      /* "interval" parameter passed to the kernel. */
};
2877
2878 static struct codel *
2879 codel_get__(const struct netdev *netdev_)
2880 {
2881 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2882 return CONTAINER_OF(netdev->tc, struct codel, tc);
2883 }
2884
2885 static void
2886 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2887 uint32_t interval)
2888 {
2889 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2890 struct codel *codel;
2891
2892 codel = xmalloc(sizeof *codel);
2893 tc_init(&codel->tc, &tc_ops_codel);
2894 codel->target = target;
2895 codel->limit = limit;
2896 codel->interval = interval;
2897
2898 netdev->tc = &codel->tc;
2899 }
2900
/* Installs a codel root qdisc on 'netdev' via rtnetlink, applying defaults
 * for any zero parameter.  Returns 0 on success, otherwise a positive errno
 * value. */
static int
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                    uint32_t interval)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval;
    int error;

    /* Remove any existing root qdisc first; the request below uses
     * NLM_F_EXCL. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Zero means "use the default". */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;

    nl_msg_put_string(&request, TCA_KIND, "codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval,
                     error, ovs_strerror(error));
    }
    return error;
}
2942
2943 static void
2944 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2945 const struct smap *details, struct codel *codel)
2946 {
2947 codel->target = smap_get_ullong(details, "target", 0);
2948 codel->limit = smap_get_ullong(details, "limit", 0);
2949 codel->interval = smap_get_ullong(details, "interval", 0);
2950
2951 if (!codel->target) {
2952 codel->target = 5000;
2953 }
2954 if (!codel->limit) {
2955 codel->limit = 10240;
2956 }
2957 if (!codel->interval) {
2958 codel->interval = 100000;
2959 }
2960 }
2961
2962 static int
2963 codel_tc_install(struct netdev *netdev, const struct smap *details)
2964 {
2965 int error;
2966 struct codel codel;
2967
2968 codel_parse_qdisc_details__(netdev, details, &codel);
2969 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2970 codel.interval);
2971 if (!error) {
2972 codel_install__(netdev, codel.target, codel.limit, codel.interval);
2973 }
2974 return error;
2975 }
2976
/* Parses a codel TCA_OPTIONS nest into '*codel'.  Returns 0 on success,
 * EPROTO if the attributes do not match the expected layout. */
static int
codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
{
    /* All three attributes are required for the parse to succeed. */
    static const struct nl_policy tca_codel_policy[] = {
        [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
        [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
        [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];

    if (!nl_parse_nested(nl_options, tca_codel_policy,
                         attrs, ARRAY_SIZE(tca_codel_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
        return EPROTO;
    }

    codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
    codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
    codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
    return 0;
}
2999
3000 static int
3001 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3002 {
3003 struct nlattr *nlattr;
3004 const char * kind;
3005 int error;
3006 struct codel codel;
3007
3008 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3009 if (error != 0) {
3010 return error;
3011 }
3012
3013 error = codel_parse_tca_options__(nlattr, &codel);
3014 if (error != 0) {
3015 return error;
3016 }
3017
3018 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3019 return 0;
3020 }
3021
3022
3023 static void
3024 codel_tc_destroy(struct tc *tc)
3025 {
3026 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3027 tc_destroy(tc);
3028 free(codel);
3029 }
3030
3031 static int
3032 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3033 {
3034 const struct codel *codel = codel_get__(netdev);
3035 smap_add_format(details, "target", "%u", codel->target);
3036 smap_add_format(details, "limit", "%u", codel->limit);
3037 smap_add_format(details, "interval", "%u", codel->interval);
3038 return 0;
3039 }
3040
3041 static int
3042 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3043 {
3044 struct codel codel;
3045
3046 codel_parse_qdisc_details__(netdev, details, &codel);
3047 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3048 codel_get__(netdev)->target = codel.target;
3049 codel_get__(netdev)->limit = codel.limit;
3050 codel_get__(netdev)->interval = codel.interval;
3051 return 0;
3052 }
3053
/* tc implementation for the Linux "codel" qdisc.  Classless, so all of the
 * per-queue (class) callbacks are unimplemented. */
static const struct tc_ops tc_ops_codel = {
    "codel",                      /* linux_name */
    "linux-codel",                /* ovs_name */
    CODEL_N_QUEUES,               /* n_queues */
    codel_tc_install,
    codel_tc_load,
    codel_tc_destroy,
    codel_qdisc_get,
    codel_qdisc_set,
    NULL,                         /* class_get */
    NULL,                         /* class_set */
    NULL,                         /* class_delete */
    NULL,                         /* class_get_stats */
    NULL                          /* class_dump_stats */
};
3069 \f
/* FQ-CoDel traffic control class. */

#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET 1
#define TCA_FQ_CODEL_LIMIT 2
#define TCA_FQ_CODEL_INTERVAL 3
#define TCA_FQ_CODEL_ECN 4
#define TCA_FQ_CODEL_FLOWS 5
#define TCA_FQ_CODEL_QUANTUM 6

/* Local state for an fq_codel qdisc, embedding the generic tc state. */
struct fqcodel {
    struct tc tc;
    uint32_t target;        /* "target" parameter passed to the kernel. */
    uint32_t limit;         /* "limit" parameter passed to the kernel. */
    uint32_t interval;      /* "interval" parameter passed to the kernel. */
    uint32_t flows;         /* "flows" parameter passed to the kernel. */
    uint32_t quantum;       /* "quantum" parameter passed to the kernel. */
};
3093
3094 static struct fqcodel *
3095 fqcodel_get__(const struct netdev *netdev_)
3096 {
3097 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3098 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3099 }
3100
3101 static void
3102 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3103 uint32_t interval, uint32_t flows, uint32_t quantum)
3104 {
3105 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3106 struct fqcodel *fqcodel;
3107
3108 fqcodel = xmalloc(sizeof *fqcodel);
3109 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3110 fqcodel->target = target;
3111 fqcodel->limit = limit;
3112 fqcodel->interval = interval;
3113 fqcodel->flows = flows;
3114 fqcodel->quantum = quantum;
3115
3116 netdev->tc = &fqcodel->tc;
3117 }
3118
/* Installs an fq_codel root qdisc on 'netdev' via rtnetlink, applying
 * defaults for any zero parameter.  Returns 0 on success, otherwise a
 * positive errno value. */
static int
fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                      uint32_t interval, uint32_t flows, uint32_t quantum)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval, oflows, oquantum;
    int error;

    /* Remove any existing root qdisc first; the request below uses
     * NLM_F_EXCL. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Zero means "use the default". */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;
    oflows = flows ? flows : 1024;
    oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
                                            not mtu */

    nl_msg_put_string(&request, TCA_KIND, "fq_codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval, oflows, oquantum,
                     error, ovs_strerror(error));
    }
    return error;
}
3165
3166 static void
3167 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3168 const struct smap *details, struct fqcodel *fqcodel)
3169 {
3170 fqcodel->target = smap_get_ullong(details, "target", 0);
3171 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3172 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3173 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3174 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3175
3176 if (!fqcodel->target) {
3177 fqcodel->target = 5000;
3178 }
3179 if (!fqcodel->limit) {
3180 fqcodel->limit = 10240;
3181 }
3182 if (!fqcodel->interval) {
3183 fqcodel->interval = 1000000;
3184 }
3185 if (!fqcodel->flows) {
3186 fqcodel->flows = 1024;
3187 }
3188 if (!fqcodel->quantum) {
3189 fqcodel->quantum = 1514;
3190 }
3191 }
3192
3193 static int
3194 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3195 {
3196 int error;
3197 struct fqcodel fqcodel;
3198
3199 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3200 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3201 fqcodel.interval, fqcodel.flows,
3202 fqcodel.quantum);
3203 if (!error) {
3204 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3205 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3206 }
3207 return error;
3208 }
3209
3210 static int
3211 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3212 {
3213 static const struct nl_policy tca_fqcodel_policy[] = {
3214 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3215 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3216 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3217 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3218 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3219 };
3220
3221 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3222
3223 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3224 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3225 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3226 return EPROTO;
3227 }
3228
3229 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3230 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3231 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3232 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3233 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3234 return 0;
3235 }
3236
3237 static int
3238 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3239 {
3240 struct nlattr *nlattr;
3241 const char * kind;
3242 int error;
3243 struct fqcodel fqcodel;
3244
3245 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3246 if (error != 0) {
3247 return error;
3248 }
3249
3250 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3251 if (error != 0) {
3252 return error;
3253 }
3254
3255 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3256 fqcodel.flows, fqcodel.quantum);
3257 return 0;
3258 }
3259
3260 static void
3261 fqcodel_tc_destroy(struct tc *tc)
3262 {
3263 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3264 tc_destroy(tc);
3265 free(fqcodel);
3266 }
3267
3268 static int
3269 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3270 {
3271 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3272 smap_add_format(details, "target", "%u", fqcodel->target);
3273 smap_add_format(details, "limit", "%u", fqcodel->limit);
3274 smap_add_format(details, "interval", "%u", fqcodel->interval);
3275 smap_add_format(details, "flows", "%u", fqcodel->flows);
3276 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3277 return 0;
3278 }
3279
3280 static int
3281 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3282 {
3283 struct fqcodel fqcodel;
3284
3285 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3286 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3287 fqcodel.flows, fqcodel.quantum);
3288 fqcodel_get__(netdev)->target = fqcodel.target;
3289 fqcodel_get__(netdev)->limit = fqcodel.limit;
3290 fqcodel_get__(netdev)->interval = fqcodel.interval;
3291 fqcodel_get__(netdev)->flows = fqcodel.flows;
3292 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3293 return 0;
3294 }
3295
/* tc_ops vtable for the classless linux-fq_codel QoS type.  The five NULL
 * slots are the per-class callbacks (class_get, class_set, class_delete,
 * class_get_stats, class_dump_stats — cf. tc_ops_htb), which do not apply
 * to a classless qdisc. */
static const struct tc_ops tc_ops_fqcodel = {
    "fq_codel",                 /* linux_name */
    "linux-fq_codel",           /* ovs_name */
    FQCODEL_N_QUEUES,           /* n_queues */
    fqcodel_tc_install,
    fqcodel_tc_load,
    fqcodel_tc_destroy,
    fqcodel_qdisc_get,
    fqcodel_qdisc_set,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3311 \f
3312 /* SFQ traffic control class. */
3313
#define SFQ_N_QUEUES 0x0000     /* SFQ is classless: no configurable queues. */

/* Userspace mirror of an installed SFQ qdisc's configuration.  Units follow
 * the kernel tc_sfq_qopt fields: quantum in bytes, perturb in seconds
 * (perturb_period) — see sfq_setup_qdisc__(). */
struct sfq {
    struct tc tc;          /* Must be first; embedded generic tc state. */
    uint32_t quantum;      /* Bytes a flow may dequeue per round. */
    uint32_t perturb;      /* Hash perturbation period. */
};
3321
3322 static struct sfq *
3323 sfq_get__(const struct netdev *netdev_)
3324 {
3325 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3326 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3327 }
3328
3329 static void
3330 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3331 {
3332 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3333 struct sfq *sfq;
3334
3335 sfq = xmalloc(sizeof *sfq);
3336 tc_init(&sfq->tc, &tc_ops_sfq);
3337 sfq->perturb = perturb;
3338 sfq->quantum = quantum;
3339
3340 netdev->tc = &sfq->tc;
3341 }
3342
/* Installs an SFQ root qdisc on 'netdev', replacing any existing root qdisc.
 * A 'quantum' of 0 selects the device MTU (or the kernel default if the MTU
 * cannot be read); a 'perturb' of 0 selects 10.  Returns 0 on success,
 * otherwise a positive errno value (ENODEV if the device has
 * disappeared). */
static int
sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
{
    struct tc_sfq_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int mtu;
    int mtu_error, error;
    /* Fetch the MTU up front; only consulted if 'quantum' is unset. */
    mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);

    /* Any previous root qdisc must go first; errors are ignored because
     * there may simply be nothing to delete. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    if (!quantum) {
        if (!mtu_error) {
            opt.quantum = mtu; /* if we cannot find mtu, use default */
        }
        /* On MTU error, opt.quantum stays 0 and the kernel picks its own
         * default. */
    } else {
        opt.quantum = quantum;
    }

    if (!perturb) {
        opt.perturb_period = 10;
    } else {
        opt.perturb_period = perturb;
    }

    nl_msg_put_string(&request, TCA_KIND, "sfq");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "quantum %u, perturb %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.quantum, opt.perturb_period,
                     error, ovs_strerror(error));
    }
    return error;
}
3391
3392 static void
3393 sfq_parse_qdisc_details__(struct netdev *netdev,
3394 const struct smap *details, struct sfq *sfq)
3395 {
3396 sfq->perturb = smap_get_ullong(details, "perturb", 0);
3397 sfq->quantum = smap_get_ullong(details, "quantum", 0);
3398
3399 if (!sfq->perturb) {
3400 sfq->perturb = 10;
3401 }
3402
3403 if (!sfq->quantum) {
3404 int mtu;
3405 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
3406 sfq->quantum = mtu;
3407 } else {
3408 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3409 "device without mtu");
3410 }
3411 }
3412 }
3413
3414 static int
3415 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3416 {
3417 int error;
3418 struct sfq sfq;
3419
3420 sfq_parse_qdisc_details__(netdev, details, &sfq);
3421 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3422 if (!error) {
3423 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3424 }
3425 return error;
3426 }
3427
3428 static int
3429 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3430 {
3431 const struct tc_sfq_qopt *sfq;
3432 struct nlattr *nlattr;
3433 const char * kind;
3434 int error;
3435
3436 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3437 if (error == 0) {
3438 sfq = nl_attr_get(nlattr);
3439 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
3440 return 0;
3441 }
3442
3443 return error;
3444 }
3445
3446 static void
3447 sfq_tc_destroy(struct tc *tc)
3448 {
3449 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3450 tc_destroy(tc);
3451 free(sfq);
3452 }
3453
3454 static int
3455 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3456 {
3457 const struct sfq *sfq = sfq_get__(netdev);
3458 smap_add_format(details, "quantum", "%u", sfq->quantum);
3459 smap_add_format(details, "perturb", "%u", sfq->perturb);
3460 return 0;
3461 }
3462
3463 static int
3464 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3465 {
3466 struct sfq sfq;
3467
3468 sfq_parse_qdisc_details__(netdev, details, &sfq);
3469 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3470 sfq_get__(netdev)->quantum = sfq.quantum;
3471 sfq_get__(netdev)->perturb = sfq.perturb;
3472 return 0;
3473 }
3474
/* tc_ops vtable for the classless linux-sfq QoS type.  The five NULL slots
 * are the per-class callbacks (class_get, class_set, class_delete,
 * class_get_stats, class_dump_stats — cf. tc_ops_htb), which do not apply
 * to a classless qdisc. */
static const struct tc_ops tc_ops_sfq = {
    "sfq",                      /* linux_name */
    "linux-sfq",                /* ovs_name */
    SFQ_N_QUEUES,               /* n_queues */
    sfq_tc_install,
    sfq_tc_load,
    sfq_tc_destroy,
    sfq_qdisc_get,
    sfq_qdisc_set,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3490 \f
3491 /* HTB traffic control class. */
3492
#define HTB_N_QUEUES 0xf000         /* Maximum number of HTB queues (classes). */
#define HTB_RATE2QUANTUM 10         /* r2q divisor passed to the kernel. */

/* Userspace mirror of an installed HTB root qdisc. */
struct htb {
    struct tc tc;               /* Must be first; embedded generic tc state. */
    unsigned int max_rate;      /* In bytes/s. */
};

/* Userspace mirror of one HTB class (one OVS queue). */
struct htb_class {
    struct tc_queue tc_queue;   /* Must be first; embedded generic queue. */
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
3508
3509 static struct htb *
3510 htb_get__(const struct netdev *netdev_)
3511 {
3512 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3513 return CONTAINER_OF(netdev->tc, struct htb, tc);
3514 }
3515
3516 static void
3517 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3518 {
3519 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3520 struct htb *htb;
3521
3522 htb = xmalloc(sizeof *htb);
3523 tc_init(&htb->tc, &tc_ops_htb);
3524 htb->max_rate = max_rate;
3525
3526 netdev->tc = &htb->tc;
3527 }
3528
/* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 * Replaces any existing root qdisc.  Returns 0 on success, otherwise a
 * positive errno value (ENODEV if the device has disappeared). */
static int
htb_setup_qdisc__(struct netdev *netdev)
{
    size_t opt_offset;
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    /* Any previous root qdisc must go first; errors are ignored because
     * there may simply be nothing to delete. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = HTB_RATE2QUANTUM;
    opt.version = 3;
    opt.defcls = 1;     /* Unclassified traffic goes to class 1:1. */

    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    return tc_transact(&request, NULL);
}
3563
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * Returns 0 on success, otherwise a positive errno value.  Requires a
 * readable MTU (used to size the rate tables and quantum). */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
        opt.quantum = mtu;
    }
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    /* Rate tables for 'rate' and 'ceil'; the kernel requires both. */
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
3622
3623 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3624 * description of them into 'details'. The description complies with the
3625 * specification given in the vswitch database documentation for linux-htb
3626 * queue details. */
3627 static int
3628 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3629 {
3630 static const struct nl_policy tca_htb_policy[] = {
3631 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3632 .min_len = sizeof(struct tc_htb_opt) },
3633 };
3634
3635 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3636 const struct tc_htb_opt *htb;
3637
3638 if (!nl_parse_nested(nl_options, tca_htb_policy,
3639 attrs, ARRAY_SIZE(tca_htb_policy))) {
3640 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3641 return EPROTO;
3642 }
3643
3644 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3645 class->min_rate = htb->rate.rate;
3646 class->max_rate = htb->ceil.rate;
3647 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3648 class->priority = htb->prio;
3649 return 0;
3650 }
3651
3652 static int
3653 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3654 struct htb_class *options,
3655 struct netdev_queue_stats *stats)
3656 {
3657 struct nlattr *nl_options;
3658 unsigned int handle;
3659 int error;
3660
3661 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3662 if (!error && queue_id) {
3663 unsigned int major = tc_get_major(handle);
3664 unsigned int minor = tc_get_minor(handle);
3665 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3666 *queue_id = minor - 1;
3667 } else {
3668 error = EPROTO;
3669 }
3670 }
3671 if (!error && options) {
3672 error = htb_parse_tca_options__(nl_options, options);
3673 }
3674 return error;
3675 }
3676
3677 static void
3678 htb_parse_qdisc_details__(struct netdev *netdev_,
3679 const struct smap *details, struct htb_class *hc)
3680 {
3681 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3682
3683 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
3684 if (!hc->max_rate) {
3685 enum netdev_features current;
3686
3687 netdev_linux_read_features(netdev);
3688 current = !netdev->get_features_error ? netdev->current : 0;
3689 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3690 }
3691 hc->min_rate = hc->max_rate;
3692 hc->burst = 0;
3693 hc->priority = 0;
3694 }
3695
/* Fills 'hc' with per-queue HTB configuration taken from 'details', clamping
 * values against the device MTU and the root class's max_rate.  Returns 0 on
 * success or a positive errno value if the MTU cannot be read (HTB cannot be
 * configured without it). */
static int
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
{
    const struct htb *htb = htb_get__(netdev);
    int mtu, error;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate: defaults to the root class rate; clamped to
     * [min_rate, htb->max_rate]. */
    hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
    if (!hc->max_rate) {
        hc->max_rate = htb->max_rate;
    }
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = smap_get_ullong(details, "burst", 0) / 8;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority: defaults to 0 (highest). */
    hc->priority = smap_get_ullong(details, "priority", 0);

    return 0;
}
3741
3742 static int
3743 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3744 unsigned int parent, struct htb_class *options,
3745 struct netdev_queue_stats *stats)
3746 {
3747 struct ofpbuf *reply;
3748 int error;
3749
3750 error = tc_query_class(netdev, handle, parent, &reply);
3751 if (!error) {
3752 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3753 ofpbuf_delete(reply);
3754 }
3755 return error;
3756 }
3757
3758 static int
3759 htb_tc_install(struct netdev *netdev, const struct smap *details)
3760 {
3761 int error;
3762
3763 error = htb_setup_qdisc__(netdev);
3764 if (!error) {
3765 struct htb_class hc;
3766
3767 htb_parse_qdisc_details__(netdev, details, &hc);
3768 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3769 tc_make_handle(1, 0), &hc);
3770 if (!error) {
3771 htb_install__(netdev, hc.max_rate);
3772 }
3773 }
3774 return error;
3775 }
3776
3777 static struct htb_class *
3778 htb_class_cast__(const struct tc_queue *queue)
3779 {
3780 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3781 }
3782
3783 static void
3784 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3785 const struct htb_class *hc)
3786 {
3787 struct htb *htb = htb_get__(netdev);
3788 size_t hash = hash_int(queue_id, 0);
3789 struct tc_queue *queue;
3790 struct htb_class *hcp;
3791
3792 queue = tc_find_queue__(netdev, queue_id, hash);
3793 if (queue) {
3794 hcp = htb_class_cast__(queue);
3795 } else {
3796 hcp = xmalloc(sizeof *hcp);
3797 queue = &hcp->tc_queue;
3798 queue->queue_id = queue_id;
3799 queue->created = time_msec();
3800 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3801 }
3802
3803 hcp->min_rate = hc->min_rate;
3804 hcp->max_rate = hc->max_rate;
3805 hcp->burst = hc->burst;
3806 hcp->priority = hc->priority;
3807 }
3808
/* tc_ops "tc_load" callback for linux-htb: reconstructs userspace state by
 * querying the default class for the qdisc-level rate and then dumping all
 * classes for the individual queues.  Returns 0 or ENODEV if the class dump
 * cannot be started. */
static int
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct htb_class hc;

    /* Get qdisc options. */
    hc.max_rate = 0;
    /* Errors are ignored: on failure hc.max_rate stays 0. */
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    htb_install__(netdev, hc.max_rate);

    /* Get queues. */
    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Skip classes that are not valid OVS queues. */
        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            htb_update_queue__(netdev, queue_id, &hc);
        }
    }
    finish_queue_dump(&state);

    return 0;
}
3836
3837 static void
3838 htb_tc_destroy(struct tc *tc)
3839 {
3840 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3841 struct htb_class *hc;
3842
3843 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
3844 free(hc);
3845 }
3846 tc_destroy(tc);
3847 free(htb);
3848 }
3849
3850 static int
3851 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3852 {
3853 const struct htb *htb = htb_get__(netdev);
3854 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
3855 return 0;
3856 }
3857
3858 static int
3859 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3860 {
3861 struct htb_class hc;
3862 int error;
3863
3864 htb_parse_qdisc_details__(netdev, details, &hc);
3865 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3866 tc_make_handle(1, 0), &hc);
3867 if (!error) {
3868 htb_get__(netdev)->max_rate = hc.max_rate;
3869 }
3870 return error;
3871 }
3872
3873 static int
3874 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3875 const struct tc_queue *queue, struct smap *details)
3876 {
3877 const struct htb_class *hc = htb_class_cast__(queue);
3878
3879 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3880 if (hc->min_rate != hc->max_rate) {
3881 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3882 }
3883 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3884 if (hc->priority) {
3885 smap_add_format(details, "priority", "%u", hc->priority);
3886 }
3887 return 0;
3888 }
3889
3890 static int
3891 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3892 const struct smap *details)
3893 {
3894 struct htb_class hc;
3895 int error;
3896
3897 error = htb_parse_class_details__(netdev, details, &hc);
3898 if (error) {
3899 return error;
3900 }
3901
3902 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3903 tc_make_handle(1, 0xfffe), &hc);
3904 if (error) {
3905 return error;
3906 }
3907
3908 htb_update_queue__(netdev, queue_id, &hc);
3909 return 0;
3910 }
3911
3912 static int
3913 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3914 {
3915 struct htb_class *hc = htb_class_cast__(queue);
3916 struct htb *htb = htb_get__(netdev);
3917 int error;
3918
3919 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3920 if (!error) {
3921 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3922 free(hc);
3923 }
3924 return error;
3925 }
3926
3927 static int
3928 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3929 struct netdev_queue_stats *stats)
3930 {
3931 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3932 tc_make_handle(1, 0xfffe), NULL, stats);
3933 }
3934
3935 static int
3936 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3937 const struct ofpbuf *nlmsg,
3938 netdev_dump_queue_stats_cb *cb, void *aux)
3939 {
3940 struct netdev_queue_stats stats;
3941 unsigned int handle, major, minor;
3942 int error;
3943
3944 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3945 if (error) {
3946 return error;
3947 }
3948
3949 major = tc_get_major(handle);
3950 minor = tc_get_minor(handle);
3951 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3952 (*cb)(minor - 1, &stats, aux);
3953 }
3954 return 0;
3955 }
3956
/* tc_ops vtable for the classful linux-htb QoS type; unlike the classless
 * qdiscs above, all five per-class callbacks are provided. */
static const struct tc_ops tc_ops_htb = {
    "htb",                      /* linux_name */
    "linux-htb",                /* ovs_name */
    HTB_N_QUEUES,               /* n_queues */
    htb_tc_install,
    htb_tc_load,
    htb_tc_destroy,
    htb_qdisc_get,
    htb_qdisc_set,
    htb_class_get,
    htb_class_set,
    htb_class_delete,
    htb_class_get_stats,
    htb_class_dump_stats
};
3972 \f
3973 /* "linux-hfsc" traffic control class. */
3974
#define HFSC_N_QUEUES 0xf000    /* Maximum number of HFSC queues (classes). */

/* Userspace mirror of an installed HFSC root qdisc.
 * Rates appear to be in bytes/s, matching HTB's convention — confirm. */
struct hfsc {
    struct tc tc;               /* Must be first; embedded generic tc state. */
    uint32_t max_rate;
};

/* Userspace mirror of one HFSC class (one OVS queue). */
struct hfsc_class {
    struct tc_queue tc_queue;   /* Must be first; embedded generic queue. */
    uint32_t min_rate;
    uint32_t max_rate;
};
3987
3988 static struct hfsc *
3989 hfsc_get__(const struct netdev *netdev_)
3990 {
3991 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3992 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3993 }
3994
3995 static struct hfsc_class *
3996 hfsc_class_cast__(const struct tc_queue *queue)
3997 {
3998 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3999 }
4000
4001 static void
4002 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4003 {
4004 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4005 struct hfsc *hfsc;
4006
4007 hfsc = xmalloc(sizeof *hfsc);
4008 tc_init(&hfsc->tc, &tc_ops_hfsc);
4009 hfsc->max_rate = max_rate;
4010 netdev->tc = &hfsc->tc;
4011 }
4012
4013 static void
4014 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4015 const struct hfsc_class *hc)
4016 {
4017 size_t hash;
4018 struct hfsc *hfsc;
4019 struct hfsc_class *hcp;
4020 struct tc_queue *queue;
4021
4022 hfsc = hfsc_get__(netdev);
4023 hash = hash_int(queue_id, 0);
4024
4025 queue = tc_find_queue__(netdev, queue_id, hash);
4026 if (queue) {
4027 hcp = hfsc_class_cast__(queue);
4028 } else {
4029 hcp = xmalloc(sizeof *hcp);
4030 queue = &hcp->tc_queue;
4031 queue->queue_id = queue_id;
4032 queue->created = time_msec();
4033 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4034 }
4035
4036 hcp->min_rate = hc->min_rate;
4037 hcp->max_rate = hc->max_rate;
4038 }
4039
/* Parses the nested TCA_OPTIONS attributes of an HFSC class into 'class'.
 * Only the restricted configuration that OVS itself installs is accepted
 * (linear service curves with rsc == fsc, i.e. "sc rate X ul rate Y" — see
 * hfsc_setup_class__()); anything else yields EPROTO with a warning. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    /* m1/d nonzero would mean a two-slope (non-linear) curve. */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    /* OVS always installs rsc == fsc (hfsc_setup_class__() puts the same
     * curve in both attributes). */
    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
4098
4099 static int
4100 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4101 struct hfsc_class *options,
4102 struct netdev_queue_stats *stats)
4103 {
4104 int error;
4105 unsigned int handle;
4106 struct nlattr *nl_options;
4107
4108 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4109 if (error) {
4110 return error;
4111 }
4112
4113 if (queue_id) {
4114 unsigned int major, minor;
4115
4116 major = tc_get_major(handle);
4117 minor = tc_get_minor(handle);
4118 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4119 *queue_id = minor - 1;
4120 } else {
4121 return EPROTO;
4122 }
4123 }
4124
4125 if (options) {
4126 error = hfsc_parse_tca_options__(nl_options, options);
4127 }
4128
4129 return error;
4130 }
4131
4132 static int
4133 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4134 unsigned int parent, struct hfsc_class *options,
4135 struct netdev_queue_stats *stats)
4136 {
4137 int error;
4138 struct ofpbuf *reply;
4139
4140 error = tc_query_class(netdev, handle, parent, &reply);
4141 if (error) {
4142 return error;
4143 }
4144
4145 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4146 ofpbuf_delete(reply);
4147 return error;
4148 }
4149
4150 static void
4151 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4152 struct hfsc_class *class)
4153 {
4154 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4155
4156 uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
4157 if (!max_rate) {
4158 enum netdev_features current;
4159
4160 netdev_linux_read_features(netdev);
4161 current = !netdev->get_features_error ? netdev->current : 0;
4162 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4163 }
4164
4165 class->min_rate = max_rate;
4166 class->max_rate = max_rate;
4167 }
4168
4169 static int
4170 hfsc_parse_class_details__(struct netdev *netdev,
4171 const struct smap *details,
4172 struct hfsc_class * class)
4173 {
4174 const struct hfsc *hfsc;
4175 uint32_t min_rate, max_rate;
4176
4177 hfsc = hfsc_get__(netdev);
4178
4179 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4180 min_rate = MAX(min_rate, 1);
4181 min_rate = MIN(min_rate, hfsc->max_rate);
4182
4183 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
4184 max_rate = MAX(max_rate, min_rate);
4185 max_rate = MIN(max_rate, hfsc->max_rate);
4186
4187 class->min_rate = min_rate;
4188 class->max_rate = max_rate;
4189
4190 return 0;
4191 }
4192
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1".
 * Replaces any existing root qdisc.  Returns 0 on success, otherwise a
 * positive errno value (ENODEV if the device has disappeared). */
static int
hfsc_setup_qdisc__(struct netdev * netdev)
{
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    /* Any previous root qdisc must go first; errors are ignored because
     * there may simply be nothing to delete. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    opt.defcls = 1;     /* Unclassified traffic goes to class 1:1. */

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    return tc_transact(&request, NULL);
}
4223
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>".
 *
 * Both the real-time and link-sharing curves (TCA_HFSC_RSC/FSC) get the
 * same linear 'min' curve — that is what tc's "sc" shorthand does — and the
 * upper-limit curve (TCA_HFSC_USC) gets 'max'.  Returns 0 on success,
 * otherwise a positive errno value. */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Linear curves: m1 (initial slope) and d (duration) are zero. */
    min.m1 = 0;
    min.d = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d = 0;
    max.m2 = class->max_rate;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
4274
4275 static int
4276 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4277 {
4278 int error;
4279 struct hfsc_class class;
4280
4281 error = hfsc_setup_qdisc__(netdev);
4282
4283 if (error) {
4284 return error;
4285 }
4286
4287 hfsc_parse_qdisc_details__(netdev, details, &class);
4288 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4289 tc_make_handle(1, 0), &class);
4290
4291 if (error) {
4292 return error;
4293 }
4294
4295 hfsc_install__(netdev, class.max_rate);
4296 return 0;
4297 }
4298
/* Loads the state of an existing HFSC qdisc on 'netdev' into this module's
 * tc representation: queries the root class for the overall max-rate, then
 * dumps all classes from the kernel and records one queue per class.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct hfsc_class hc;

    /* Fall back to max_rate == 0 if the root class query fails. */
    hc.max_rate = 0;
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    hfsc_install__(netdev, hc.max_rate);

    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }

    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Classes that fail to parse (e.g. not HFSC) are skipped. */
        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            hfsc_update_queue__(netdev, queue_id, &hc);
        }
    }

    finish_queue_dump(&state);
    return 0;
}
4325
4326 static void
4327 hfsc_tc_destroy(struct tc *tc)
4328 {
4329 struct hfsc *hfsc;
4330 struct hfsc_class *hc, *next;
4331
4332 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4333
4334 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4335 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4336 free(hc);
4337 }
4338
4339 tc_destroy(tc);
4340 free(hfsc);
4341 }
4342
4343 static int
4344 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4345 {
4346 const struct hfsc *hfsc;
4347 hfsc = hfsc_get__(netdev);
4348 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
4349 return 0;
4350 }
4351
4352 static int
4353 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4354 {
4355 int error;
4356 struct hfsc_class class;
4357
4358 hfsc_parse_qdisc_details__(netdev, details, &class);
4359 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4360 tc_make_handle(1, 0), &class);
4361
4362 if (!error) {
4363 hfsc_get__(netdev)->max_rate = class.max_rate;
4364 }
4365
4366 return error;
4367 }
4368
4369 static int
4370 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4371 const struct tc_queue *queue, struct smap *details)
4372 {
4373 const struct hfsc_class *hc;
4374
4375 hc = hfsc_class_cast__(queue);
4376 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4377 if (hc->min_rate != hc->max_rate) {
4378 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4379 }
4380 return 0;
4381 }
4382
4383 static int
4384 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4385 const struct smap *details)
4386 {
4387 int error;
4388 struct hfsc_class class;
4389
4390 error = hfsc_parse_class_details__(netdev, details, &class);
4391 if (error) {
4392 return error;
4393 }
4394
4395 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4396 tc_make_handle(1, 0xfffe), &class);
4397 if (error) {
4398 return error;
4399 }
4400
4401 hfsc_update_queue__(netdev, queue_id, &class);
4402 return 0;
4403 }
4404
4405 static int
4406 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4407 {
4408 int error;
4409 struct hfsc *hfsc;
4410 struct hfsc_class *hc;
4411
4412 hc = hfsc_class_cast__(queue);
4413 hfsc = hfsc_get__(netdev);
4414
4415 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4416 if (!error) {
4417 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4418 free(hc);
4419 }
4420 return error;
4421 }
4422
/* Fetches kernel statistics for the class backing 'queue' (handle
 * 1:(queue_id + 1), parent 1:0xfffe) into 'stats'.  Returns 0 if
 * successful, otherwise a positive errno value. */
static int
hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
                     struct netdev_queue_stats *stats)
{
    return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
                              tc_make_handle(1, 0xfffe), NULL, stats);
}
4430
4431 static int
4432 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4433 const struct ofpbuf *nlmsg,
4434 netdev_dump_queue_stats_cb *cb, void *aux)
4435 {
4436 struct netdev_queue_stats stats;
4437 unsigned int handle, major, minor;
4438 int error;
4439
4440 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4441 if (error) {
4442 return error;
4443 }
4444
4445 major = tc_get_major(handle);
4446 minor = tc_get_minor(handle);
4447 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4448 (*cb)(minor - 1, &stats, aux);
4449 }
4450 return 0;
4451 }
4452
4453 static const struct tc_ops tc_ops_hfsc = {
4454 "hfsc", /* linux_name */
4455 "linux-hfsc", /* ovs_name */
4456 HFSC_N_QUEUES, /* n_queues */
4457 hfsc_tc_install, /* tc_install */
4458 hfsc_tc_load, /* tc_load */
4459 hfsc_tc_destroy, /* tc_destroy */
4460 hfsc_qdisc_get, /* qdisc_get */
4461 hfsc_qdisc_set, /* qdisc_set */
4462 hfsc_class_get, /* class_get */
4463 hfsc_class_set, /* class_set */
4464 hfsc_class_delete, /* class_delete */
4465 hfsc_class_get_stats, /* class_get_stats */
4466 hfsc_class_dump_stats /* class_dump_stats */
4467 };
4468 \f
4469 /* "linux-noop" traffic control class. */
4470
/* Points 'netdev_' at a shared, immutable tc instance so that no qdisc
 * state is tracked for it.  NOTE(review): this initializes the static tc
 * with tc_ops_default rather than tc_ops_noop, matching default_install__;
 * presumably intentional since the noop class has no per-device state. */
static void
noop_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Only tc class implementations may write to a tc, and this one never
     * does, so the shared const object is safe. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
4479
/* tc_install callback for "linux-noop": no kernel configuration is made;
 * only the local tc pointer is set.  Always returns 0. */
static int
noop_tc_install(struct netdev *netdev,
                const struct smap *details OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
4487
/* tc_load callback for "linux-noop": ignores the kernel qdisc message and
 * just installs the shared noop tc state.  Always returns 0. */
static int
noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
4494
4495 static const struct tc_ops tc_ops_noop = {
4496 NULL, /* linux_name */
4497 "linux-noop", /* ovs_name */
4498 0, /* n_queues */
4499 noop_tc_install,
4500 noop_tc_load,
4501 NULL, /* tc_destroy */
4502 NULL, /* qdisc_get */
4503 NULL, /* qdisc_set */
4504 NULL, /* class_get */
4505 NULL, /* class_set */
4506 NULL, /* class_delete */
4507 NULL, /* class_get_stats */
4508 NULL /* class_dump_stats */
4509 };
4510 \f
4511 /* "linux-default" traffic control class.
4512 *
4513 * This class represents the default, unnamed Linux qdisc. It corresponds to
4514 * the "" (empty string) QoS type in the OVS database. */
4515
/* Points 'netdev_' at the shared, immutable tc instance that represents the
 * kernel's default (unnamed) qdisc. */
static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc. This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
4526
/* tc_install callback for the default qdisc: nothing to configure in the
 * kernel, only local state is set.  Always returns 0. */
static int
default_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
4534
/* tc_load callback for the default qdisc: ignores the kernel message and
 * installs the shared default tc state.  Always returns 0. */
static int
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
4541
4542 static const struct tc_ops tc_ops_default = {
4543 NULL, /* linux_name */
4544 "", /* ovs_name */
4545 0, /* n_queues */
4546 default_tc_install,
4547 default_tc_load,
4548 NULL, /* tc_destroy */
4549 NULL, /* qdisc_get */
4550 NULL, /* qdisc_set */
4551 NULL, /* class_get */
4552 NULL, /* class_set */
4553 NULL, /* class_delete */
4554 NULL, /* class_get_stats */
4555 NULL /* class_dump_stats */
4556 };
4557 \f
4558 /* "linux-other" traffic control class.
4559 *
4560 * */
4561
/* tc_load callback for "linux-other", the catch-all used for qdiscs this
 * module does not recognize.  Installs shared, read-only tc state and
 * always returns 0. */
static int
other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);

    /* Nothing but a tc class implementation is allowed to write to a tc. This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
    return 0;
}
4573
4574 static const struct tc_ops tc_ops_other = {
4575 NULL, /* linux_name */
4576 "linux-other", /* ovs_name */
4577 0, /* n_queues */
4578 NULL, /* tc_install */
4579 other_tc_load,
4580 NULL, /* tc_destroy */
4581 NULL, /* qdisc_get */
4582 NULL, /* qdisc_set */
4583 NULL, /* class_get */
4584 NULL, /* class_set */
4585 NULL, /* class_delete */
4586 NULL, /* class_get_stats */
4587 NULL /* class_dump_stats */
4588 };
4589 \f
4590 /* Traffic control. */
4591
4592 /* Number of kernel "tc" ticks per second. */
4593 static double ticks_per_s;
4594
4595 /* Number of kernel "jiffies" per second. This is used for the purpose of
4596 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4597 * one jiffy's worth of data.
4598 *
4599 * There are two possibilities here:
4600 *
4601 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4602 * approximate range of 100 to 1024. That means that we really need to
4603 * make sure that the qdisc can buffer that much data.
4604 *
4605 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4606 * has finely granular timers and there's no need to fudge additional room
4607 * for buffers. (There's no extra effort needed to implement that: the
4608 * large 'buffer_hz' is used as a divisor, so practically any number will
4609 * come out as 0 in the division. Small integer results in the case of
4610 * really high dividends won't have any real effect anyhow.)
4611 */
4612 static unsigned int buffer_hz;
4613
4614 /* Returns tc handle 'major':'minor'. */
4615 static unsigned int
4616 tc_make_handle(unsigned int major, unsigned int minor)
4617 {
4618 return TC_H_MAKE(major << 16, minor);
4619 }
4620
4621 /* Returns the major number from 'handle'. */
4622 static unsigned int
4623 tc_get_major(unsigned int handle)
4624 {
4625 return TC_H_MAJ(handle) >> 16;
4626 }
4627
4628 /* Returns the minor number from 'handle'. */
4629 static unsigned int
4630 tc_get_minor(unsigned int handle)
4631 {
4632 return TC_H_MIN(handle);
4633 }
4634
/* Starts a rtnetlink tc request of the given 'type' (e.g. RTM_NEWQDISC) for
 * 'netdev' in 'request', which the caller must eventually free via
 * tc_transact() or ofpbuf_uninit().  'flags' are additional nlmsg flags
 * OR'd with NLM_F_REQUEST.
 *
 * Returns the embedded tcmsg header on success (with tcm_handle and
 * tcm_parent left zero for the caller to fill in), or NULL if the device's
 * ifindex could not be determined; in the NULL case 'request' is left
 * uninitialized. */
static struct tcmsg *
tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
                struct ofpbuf *request)
{
    struct tcmsg *tcmsg;
    int ifindex;
    int error;

    error = get_ifindex(netdev, &ifindex);
    if (error) {
        return NULL;
    }

    ofpbuf_init(request, 512);
    nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
    tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
    tcmsg->tcm_family = AF_UNSPEC;
    tcmsg->tcm_ifindex = ifindex;
    /* Caller should fill in tcmsg->tcm_handle. */
    /* Caller should fill in tcmsg->tcm_parent. */

    return tcmsg;
}
4658
/* Sends 'request' on the NETLINK_ROUTE socket and waits for the reply.
 * Always uninitializes 'request', so the caller must not reuse it.  If
 * 'replyp' is nonnull, the reply (owned by the caller on success) is stored
 * there.  Returns 0 if successful, otherwise a positive errno value. */
static int
tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
{
    int error = nl_transact(NETLINK_ROUTE, request, replyp);
    ofpbuf_uninit(request);
    return error;
}
4666
4667 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4668 * policing configuration.
4669 *
4670 * This function is equivalent to running the following when 'add' is true:
4671 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4672 *
4673 * This function is equivalent to running the following when 'add' is false:
4674 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4675 *
4676 * The configuration and stats may be seen with the following command:
4677 * /sbin/tc -s qdisc show dev <devname>
4678 *
4679 * Returns 0 if successful, otherwise a positive errno value.
4680 */
4681 static int
4682 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4683 {
4684 struct ofpbuf request;
4685 struct tcmsg *tcmsg;
4686 int error;
4687 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4688 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4689
4690 tcmsg = tc_make_request(netdev, type, flags, &request);
4691 if (!tcmsg) {
4692 return ENODEV;
4693 }
4694 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4695 tcmsg->tcm_parent = TC_H_INGRESS;
4696 nl_msg_put_string(&request, TCA_KIND, "ingress");
4697 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4698
4699 error = tc_transact(&request, NULL);
4700 if (error) {
4701 /* If we're deleting the qdisc, don't worry about some of the
4702 * error conditions. */
4703 if (!add && (error == ENOENT || error == EINVAL)) {
4704 return 0;
4705 }
4706 return error;
4707 }
4708
4709 return 0;
4710 }
4711
/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
 * of 'kbits_burst'.
 *
 * This function is equivalent to running:
 *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
 *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
 *              mtu 65535 drop
 *
 * The configuration and stats may be seen with the following command:
 *     /sbin/tc -s filter show dev <devname> parent ffff:
 *
 * The filter is attached under the ingress qdisc (parent ffff:), which the
 * caller is expected to have set up via tc_add_del_ingress_qdisc().
 *
 * Returns 0 if successful, otherwise a positive errno value.
 */
static int
tc_add_policer(struct netdev *netdev,
               uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct tc_police tc_police;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    size_t basic_offset;
    size_t police_offset;
    int error;
    int mtu = 65535;

    memset(&tc_police, 0, sizeof tc_police);
    tc_police.action = TC_POLICE_SHOT;       /* Drop packets over the rate. */
    tc_police.mtu = mtu;
    /* kbits_rate is in kilobits/s; the kernel rate is in bytes/s. */
    tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);

    /* The following appears wrong in one way: In networking a kilobit is
     * usually 1000 bits but this uses 1024 bits.
     *
     * However if you "fix" those problems then "tc filter show ..." shows
     * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
     * 1,000,000 bits, whereas this actually ends up doing the right thing from
     * tc's point of view.  Whatever. */
    tc_police.burst = tc_bytes_to_ticks(
        tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);

    tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
    /* tcm_info packs the priority (49) and the protocol (all). */
    tcmsg->tcm_info = tc_make_handle(49,
                                     (OVS_FORCE uint16_t) htons(ETH_P_ALL));

    nl_msg_put_string(&request, TCA_KIND, "basic");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
    nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
    tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
    nl_msg_end_nested(&request, police_offset);
    nl_msg_end_nested(&request, basic_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        return error;
    }

    return 0;
}
4776
/* Reads /proc/net/psched once (thread-safe via ovsthread_once) to populate
 * the file-scope 'ticks_per_s' and 'buffer_hz' used by the tc conversion
 * helpers below.  On any failure the conservative defaults ticks_per_s = 1.0
 * and buffer_hz = 100 remain in effect. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *     (Before that, there are hints that it was 1000000000.)
     *
     *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *     above.
     *
     *                        /proc/net/psched
     *     -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d ticks_per_s     buffer_hz
     *     ------- --------- ---------- ------------- ----------- -------------
     * [1] 819,200 1,000,000  1,000,000           100     819,200           100
     * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [4]   1,000     1,024  1,000,000           100     976,562           100
     * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
     * [6]   1,000        64  1,000,000           249  15,625,000           249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    if (!ovsthread_once_start(&once)) {
        return;
    }

    /* Fallback defaults, used if the file is missing or malformed. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    /* a == 0 or c == 0 would make the computation below meaningless (and
     * divide by zero for b == 0 is avoided because a and b come as a pair). */
    if (!a || !c) {
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
4859
/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
 * rate of 'rate' bytes per second.  The double arithmetic is truncated to
 * an unsigned int on return. */
static unsigned int
tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
{
    read_psched();              /* Ensures ticks_per_s is initialized. */
    return (rate * ticks) / ticks_per_s;
}
4868
/* Returns the number of ticks that it would take to transmit 'size' bytes at a
 * rate of 'rate' bytes per second.  Returns 0 for a zero 'rate' rather than
 * dividing by zero. */
static unsigned int
tc_bytes_to_ticks(unsigned int rate, unsigned int size)
{
    read_psched();              /* Ensures ticks_per_s is initialized. */
    return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
}
4877
/* Returns the number of bytes that need to be reserved for qdisc buffering at
 * a transmission rate of 'rate' bytes per second, i.e. one jiffy's worth of
 * data (see the comment on 'buffer_hz' above). */
static unsigned int
tc_buffer_per_jiffy(unsigned int rate)
{
    read_psched();              /* Ensures buffer_hz is initialized. */
    return rate / buffer_hz;
}
4886
/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
 * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
 * stores NULL into it if it is absent.
 *
 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
 * On failure, '*kind' and '*options' (when nonnull) are set to NULL.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
               struct nlattr **options)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    /* Attributes start after the netlink and tcmsg headers. */
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");
        goto error;
    }

    if (kind) {
        *kind = nl_attr_get_string(ta[TCA_KIND]);
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    return 0;

error:
    if (kind) {
        *kind = NULL;
    }
    if (options) {
        *options = NULL;
    }
    return EPROTO;
}
4931
/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
 * into '*options', and its queue statistics into '*stats'.  Any of the output
 * arguments may be null.
 *
 * '*options' points into 'msg', so it is owned by whoever owns 'msg'.  On
 * failure '*options' is set to NULL and '*stats' is zeroed.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        /* The full class handle (major:minor) comes from the tcmsg header,
         * not from an attribute. */
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        /* Only drop counts are reported as tx_errors; other members of
         * 'stats' are left untouched here. */
        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
5006
/* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev'.  On success the reply (owned by the caller) is stored in
 * '*replyp'.  Returns 0 if successful, otherwise a positive errno value. */
static int
tc_query_class(const struct netdev *netdev,
               unsigned int handle, unsigned int parent,
               struct ofpbuf **replyp)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    /* NLM_F_ECHO asks the kernel to send the class back in the reply. */
    tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    error = tc_transact(&request, replyp);
    if (error) {
        VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     ovs_strerror(error));
    }
    return error;
}
5035
/* Equivalent to "tc class del dev <name> handle <handle>".
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_delete_class(const struct netdev *netdev, unsigned int handle)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = 0;

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     ovs_strerror(error));
    }
    return error;
}
5060
/* Equivalent to "tc qdisc del dev <name> root".
 *
 * Also destroys the local tc state attached to 'netdev_', if any, so that
 * the next tc_query_qdisc() re-queries the kernel.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_del_qdisc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    error = tc_transact(&request, NULL);
    if (error == EINVAL) {
        /* EINVAL probably means that the default qdisc was in use, in which
         * case we've accomplished our purpose. */
        error = 0;
    }
    if (!error && netdev->tc) {
        if (netdev->tc->ops->tc_destroy) {
            netdev->tc->ops->tc_destroy(netdev->tc);
        }
        netdev->tc = NULL;
    }
    return error;
}
5091
/* Returns true if it is safe to issue an unrestricted RTM_GETQDISC on this
 * kernel, i.e. the running kernel is 2.6.35 or later (see the long comment
 * in tc_query_qdisc() for why earlier kernels can OOPS).  The check is done
 * once via uname() and cached; parse failures conservatively report
 * "unsafe" (false). */
static bool
getqdisc_is_safe(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static bool safe = false;

    if (ovsthread_once_start(&once)) {
        struct utsname utsname;
        int major, minor;

        if (uname(&utsname) == -1) {
            VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
        } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
            VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
        } else if (major < 2 || (major == 2 && minor < 35)) {
            VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
                      utsname.release);
        } else {
            safe = true;
        }
        ovsthread_once_done(&once);
    }
    return safe;
}
5116
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value.
 *
 * On success, 'netdev''s tc state is instantiated (netdev->tc != NULL) via
 * the matched tc class's tc_load callback; unrecognized qdiscs get
 * tc_ops_other and built-in qdiscs get tc_ops_default. */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    /* Already known; nothing to do. */
    if (netdev->tc) {
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
5195
5196 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5197 approximate the time to transmit packets of various lengths. For an MTU of
5198 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5199 represents two possible packet lengths; for a MTU of 513 through 1024, four
5200 possible lengths; and so on.
5201
5202 Returns, for the specified 'mtu', the number of bits that packet lengths
5203 need to be shifted right to fit within such a 256-entry table. */
5204 static int
5205 tc_calc_cell_log(unsigned int mtu)
5206 {
5207 int cell_log;
5208
5209 if (!mtu) {
5210 mtu = ETH_PAYLOAD_MAX;
5211 }
5212 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5213
5214 for (cell_log = 0; mtu >= 256; cell_log++) {
5215 mtu >>= 1;
5216 }
5217
5218 return cell_log;
5219 }
5220
/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'.  All other tc_ratespec members are zeroed. */
static void
tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
{
    memset(rate, 0, sizeof *rate);
    rate->cell_log = tc_calc_cell_log(mtu);
    /* rate->overhead = 0; */           /* New in 2.6.24, not yet in some */
    /* rate->cell_align = 0; */         /* distro headers. */
    rate->mpu = ETH_TOTAL_MIN;          /* Minimum packet unit: smallest
                                         * on-wire Ethernet frame size. */
    rate->rate = Bps;
}
5233
5234 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5235 * attribute of the specified "type".
5236 *
5237 * See tc_calc_cell_log() above for a description of "rtab"s. */
5238 static void
5239 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5240 {
5241 uint32_t *rtab;
5242 unsigned int i;
5243
5244 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5245 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5246 unsigned packet_size = (i + 1) << rate->cell_log;
5247 if (packet_size < rate->mpu) {
5248 packet_size = rate->mpu;
5249 }
5250 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5251 }
5252 }
5253
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The result is in ticks; the burst is never smaller than one jiffy's worth
 * of data plus one MTU. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
5264 \f
5265 /* Linux-only functions declared in netdev-linux.h */
5266
5267 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5268 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5269 int
5270 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5271 const char *flag_name, bool enable)
5272 {
5273 const char *netdev_name = netdev_get_name(netdev);
5274 struct ethtool_value evalue;
5275 uint32_t new_flags;
5276 int error;
5277
5278 COVERAGE_INC(netdev_get_ethtool);
5279 memset(&evalue, 0, sizeof evalue);
5280 error = netdev_linux_do_ethtool(netdev_name,
5281 (struct ethtool_cmd *)&evalue,
5282 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5283 if (error) {
5284 return error;
5285 }
5286
5287 COVERAGE_INC(netdev_set_ethtool);
5288 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5289 if (new_flags == evalue.data) {
5290 return 0;
5291 }
5292 evalue.data = new_flags;
5293 error = netdev_linux_do_ethtool(netdev_name,
5294 (struct ethtool_cmd *)&evalue,
5295 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5296 if (error) {
5297 return error;
5298 }
5299
5300 COVERAGE_INC(netdev_get_ethtool);
5301 memset(&evalue, 0, sizeof evalue);
5302 error = netdev_linux_do_ethtool(netdev_name,
5303 (struct ethtool_cmd *)&evalue,
5304 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5305 if (error) {
5306 return error;
5307 }
5308
5309 if (new_flags != evalue.data) {
5310 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5311 "device %s failed", enable ? "enable" : "disable",
5312 flag_name, netdev_name);
5313 return EOPNOTSUPP;
5314 }
5315
5316 return 0;
5317 }
5318 \f
5319 /* Utility functions. */
5320
5321 /* Copies 'src' into 'dst', performing format conversion in the process. */
5322 static void
5323 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5324 const struct rtnl_link_stats *src)
5325 {
5326 dst->rx_packets = src->rx_packets;
5327 dst->tx_packets = src->tx_packets;
5328 dst->rx_bytes = src->rx_bytes;
5329 dst->tx_bytes = src->tx_bytes;
5330 dst->rx_errors = src->rx_errors;
5331 dst->tx_errors = src->tx_errors;
5332 dst->rx_dropped = src->rx_dropped;
5333 dst->tx_dropped = src->tx_dropped;
5334 dst->multicast = src->multicast;
5335 dst->collisions = src->collisions;
5336 dst->rx_length_errors = src->rx_length_errors;
5337 dst->rx_over_errors = src->rx_over_errors;
5338 dst->rx_crc_errors = src->rx_crc_errors;
5339 dst->rx_frame_errors = src->rx_frame_errors;
5340 dst->rx_fifo_errors = src->rx_fifo_errors;
5341 dst->rx_missed_errors = src->rx_missed_errors;
5342 dst->tx_aborted_errors = src->tx_aborted_errors;
5343 dst->tx_carrier_errors = src->tx_carrier_errors;
5344 dst->tx_fifo_errors = src->tx_fifo_errors;
5345 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5346 dst->tx_window_errors = src->tx_window_errors;
5347 }
5348
5349 /* Copies 'src' into 'dst', performing format conversion in the process. */
5350 static void
5351 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5352 const struct rtnl_link_stats64 *src)
5353 {
5354 dst->rx_packets = src->rx_packets;
5355 dst->tx_packets = src->tx_packets;
5356 dst->rx_bytes = src->rx_bytes;
5357 dst->tx_bytes = src->tx_bytes;
5358 dst->rx_errors = src->rx_errors;
5359 dst->tx_errors = src->tx_errors;
5360 dst->rx_dropped = src->rx_dropped;
5361 dst->tx_dropped = src->tx_dropped;
5362 dst->multicast = src->multicast;
5363 dst->collisions = src->collisions;
5364 dst->rx_length_errors = src->rx_length_errors;
5365 dst->rx_over_errors = src->rx_over_errors;
5366 dst->rx_crc_errors = src->rx_crc_errors;
5367 dst->rx_frame_errors = src->rx_frame_errors;
5368 dst->rx_fifo_errors = src->rx_fifo_errors;
5369 dst->rx_missed_errors = src->rx_missed_errors;
5370 dst->tx_aborted_errors = src->tx_aborted_errors;
5371 dst->tx_carrier_errors = src->tx_carrier_errors;
5372 dst->tx_fifo_errors = src->tx_fifo_errors;
5373 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5374 dst->tx_window_errors = src->tx_window_errors;
5375 }
5376
5377 static int
5378 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5379 {
5380 struct ofpbuf request;
5381 struct ofpbuf *reply;
5382 int error;
5383
5384 /* Filtering all counters by default */
5385 memset(stats, 0xFF, sizeof(struct netdev_stats));
5386
5387 ofpbuf_init(&request, 0);
5388 nl_msg_put_nlmsghdr(&request,
5389 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5390 RTM_GETLINK, NLM_F_REQUEST);
5391 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5392 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5393 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5394 ofpbuf_uninit(&request);
5395 if (error) {
5396 return error;
5397 }
5398
5399 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5400 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5401 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5402 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5403 error = 0;
5404 } else {
5405 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5406 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5407 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5408 error = 0;
5409 } else {
5410 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5411 error = EPROTO;
5412 }
5413 }
5414 } else {
5415 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5416 error = EPROTO;
5417 }
5418
5419
5420 ofpbuf_delete(reply);
5421 return error;
5422 }
5423
5424 static int
5425 get_flags(const struct netdev *dev, unsigned int *flags)
5426 {
5427 struct ifreq ifr;
5428 int error;
5429
5430 *flags = 0;
5431 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5432 if (!error) {
5433 *flags = ifr.ifr_flags;
5434 }
5435 return error;
5436 }
5437
5438 static int
5439 set_flags(const char *name, unsigned int flags)
5440 {
5441 struct ifreq ifr;
5442
5443 ifr.ifr_flags = flags;
5444 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5445 }
5446
5447 static int
5448 do_get_ifindex(const char *netdev_name)
5449 {
5450 struct ifreq ifr;
5451 int error;
5452
5453 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5454 COVERAGE_INC(netdev_get_ifindex);
5455
5456 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5457 if (error) {
5458 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5459 netdev_name, ovs_strerror(error));
5460 return -error;
5461 }
5462 return ifr.ifr_ifindex;
5463 }
5464
5465 static int
5466 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5467 {
5468 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5469
5470 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5471 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
5472
5473 if (ifindex < 0) {
5474 netdev->get_ifindex_error = -ifindex;
5475 netdev->ifindex = 0;
5476 } else {
5477 netdev->get_ifindex_error = 0;
5478 netdev->ifindex = ifindex;
5479 }
5480 netdev->cache_valid |= VALID_IFINDEX;
5481 }
5482
5483 *ifindexp = netdev->ifindex;
5484 return netdev->get_ifindex_error;
5485 }
5486
5487 static int
5488 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5489 {
5490 struct ifreq ifr;
5491 int hwaddr_family;
5492 int error;
5493
5494 memset(&ifr, 0, sizeof ifr);
5495 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5496 COVERAGE_INC(netdev_get_hwaddr);
5497 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5498 if (error) {
5499 /* ENODEV probably means that a vif disappeared asynchronously and
5500 * hasn't been removed from the database yet, so reduce the log level
5501 * to INFO for that case. */
5502 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5503 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5504 netdev_name, ovs_strerror(error));
5505 return error;
5506 }
5507 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5508 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5509 VLOG_INFO("%s device has unknown hardware address family %d",
5510 netdev_name, hwaddr_family);
5511 return EINVAL;
5512 }
5513 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5514 return 0;
5515 }
5516
5517 static int
5518 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5519 {
5520 struct ifreq ifr;
5521 int error;
5522
5523 memset(&ifr, 0, sizeof ifr);
5524 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5525 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5526 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5527 COVERAGE_INC(netdev_set_hwaddr);
5528 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5529 if (error) {
5530 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5531 netdev_name, ovs_strerror(error));
5532 }
5533 return error;
5534 }
5535
5536 static int
5537 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5538 int cmd, const char *cmd_name)
5539 {
5540 struct ifreq ifr;
5541 int error;
5542
5543 memset(&ifr, 0, sizeof ifr);
5544 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5545 ifr.ifr_data = (caddr_t) ecmd;
5546
5547 ecmd->cmd = cmd;
5548 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5549 if (error) {
5550 if (error != EOPNOTSUPP) {
5551 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5552 "failed: %s", cmd_name, name, ovs_strerror(error));
5553 } else {
5554 /* The device doesn't support this operation. That's pretty
5555 * common, so there's no point in logging anything. */
5556 }
5557 }
5558 return error;
5559 }
5560
5561 /* Returns an AF_PACKET raw socket or a negative errno value. */
5562 static int
5563 af_packet_sock(void)
5564 {
5565 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5566 static int sock;
5567
5568 if (ovsthread_once_start(&once)) {
5569 sock = socket(AF_PACKET, SOCK_RAW, 0);
5570 if (sock >= 0) {
5571 int error = set_nonblocking(sock);
5572 if (error) {
5573 close(sock);
5574 sock = -error;
5575 }
5576 } else {
5577 sock = -errno;
5578 VLOG_ERR("failed to create packet socket: %s",
5579 ovs_strerror(errno));
5580 }
5581 ovsthread_once_done(&once);
5582 }
5583
5584 return sock;
5585 }