/* lib/netdev-linux.c — from the git.proxmox.com mirror_ovs.git mirror of
 * Open vSwitch (blob viewed at commit "dp-packet: Fix use of uninitialised
 * value at emc_lookup."). */
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <arpa/inet.h>
24 #include <inttypes.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
41 #include <net/if.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
46 #include <poll.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <unistd.h>
50
51 #include "coverage.h"
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "openvswitch/dynamic-string.h"
56 #include "fatal-signal.h"
57 #include "hash.h"
58 #include "hmap.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
63 #include "netlink.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
67 #include "packets.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
70 #include "shash.h"
71 #include "socket-util.h"
72 #include "sset.h"
73 #include "timer.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
76
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
78
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
86
87 \f
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
89 * old headers. */
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
92 #endif
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
95 #endif
96
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
101 #endif
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
104 #endif
105
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
107 * headers. */
108 #ifndef TC_RTAB_SIZE
109 #define TC_RTAB_SIZE 1024
110 #endif
111
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
117 *
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
120 */
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
123 #endif
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
126 #endif
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
129 #endif
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
/* Replacement for the kernel's struct tpacket_auxdata, carrying every member
 * added across the kernel versions listed in the comment above. */
struct tpacket_auxdata {
    uint32_t tp_status;         /* TP_STATUS_* bits. */
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;       /* VLAN TCI; validity indicated by tp_status. */
    uint16_t tp_vlan_tpid;      /* VLAN TPID (Linux 3.13+; see comment above). */
};
141
142 /* Linux 2.6.27 introduced ethtool_cmd_speed
143 *
144 * To avoid revisiting problems reported with using configure to detect
145 * compatibility (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
147 * unconditionally replace ethtool_cmd_speed. */
148 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Returns the speed in 'ep' as a single 32-bit value, combining the low
 * 16 bits in 'speed' with the high 16 bits in 'speed_hi'. */
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    /* Cast before shifting: 'speed_hi' promotes to (signed) int, and shifting
     * a set top bit into the sign position would be undefined behavior. */
    return ((uint32_t) ep->speed_hi << 16) | ep->speed;
}
153
154 /* Linux 2.6.30 introduced supported and advertised flags for
155 * 1G base KX, and 10G base KX4, KR and R. */
156 #ifndef SUPPORTED_1000baseKX_Full
157 #define SUPPORTED_1000baseKX_Full (1 << 17)
158 #define SUPPORTED_10000baseKX4_Full (1 << 18)
159 #define SUPPORTED_10000baseKR_Full (1 << 19)
160 #define SUPPORTED_10000baseR_FEC (1 << 20)
161 #define ADVERTISED_1000baseKX_Full (1 << 17)
162 #define ADVERTISED_10000baseKX4_Full (1 << 18)
163 #define ADVERTISED_10000baseKR_Full (1 << 19)
164 #define ADVERTISED_10000baseR_FEC (1 << 20)
165 #endif
166
167 /* Linux 3.5 introduced supported and advertised flags for
168 * 40G base KR4, CR4, SR4 and LR4. */
169 #ifndef SUPPORTED_40000baseKR4_Full
170 #define SUPPORTED_40000baseKR4_Full (1 << 23)
171 #define SUPPORTED_40000baseCR4_Full (1 << 24)
172 #define SUPPORTED_40000baseSR4_Full (1 << 25)
173 #define SUPPORTED_40000baseLR4_Full (1 << 26)
174 #define ADVERTISED_40000baseKR4_Full (1 << 23)
175 #define ADVERTISED_40000baseCR4_Full (1 << 24)
176 #define ADVERTISED_40000baseSR4_Full (1 << 25)
177 #define ADVERTISED_40000baseLR4_Full (1 << 26)
178 #endif
179
180 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
181 *
182 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
183 * 2.6.32-431.29.2.el6.x86_64 (see report at
184 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
185 * if_link.h is not self-contained on those kernels. It is easiest to
186 * unconditionally define a replacement. */
187 #ifndef IFLA_STATS64
188 #define IFLA_STATS64 23
189 #endif
#define rtnl_link_stats64 rpl_rtnl_link_stats64
/* Replacement for the kernel's struct rtnl_link_stats64; layout mirrors the
 * kernel definition in include/uapi/linux/if_link.h. */
struct rtnl_link_stats64 {
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Detailed rx_errors. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Detailed tx_errors. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
219
/* Bits for 'cache_valid' in struct netdev_linux (below), indicating which of
 * the "on demand" cached members are currently valid. */
enum {
    VALID_IFINDEX = 1 << 0,             /* 'ifindex'. */
    VALID_ETHERADDR = 1 << 1,           /* 'etheraddr'. */
    VALID_IN = 1 << 2,                  /* Cached IP addresses. */
    VALID_MTU = 1 << 3,                 /* 'mtu'. */
    VALID_POLICING = 1 << 4,            /* 'kbits_rate', 'kbits_burst'. */
    VALID_VPORT_STAT_ERROR = 1 << 5,    /* 'vport_stats_error'. */
    VALID_DRVINFO = 1 << 6,             /* 'drvinfo'. */
    VALID_FEATURES = 1 << 7,            /* 'current'/'advertised'/'supported'. */
};
230 \f
231 /* Traffic control. */
232
233 /* An instance of a traffic control class. Always associated with a particular
234 * network device.
235 *
236 * Each TC implementation subclasses this with whatever additional data it
237 * needs. */
238 struct tc {
239 const struct tc_ops *ops;
240 struct hmap queues; /* Contains "struct tc_queue"s.
241 * Read by generic TC layer.
242 * Written only by TC implementation. */
243 };
244
245 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
246
247 /* One traffic control queue.
248 *
249 * Each TC implementation subclasses this with whatever additional data it
250 * needs. */
251 struct tc_queue {
252 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
253 unsigned int queue_id; /* OpenFlow queue ID. */
254 long long int created; /* Time queue was created, in msecs. */
255 };
256
257 /* A particular kind of traffic control. Each implementation generally maps to
258 * one particular Linux qdisc class.
259 *
260 * The functions below return 0 if successful or a positive errno value on
261 * failure, except where otherwise noted. All of them must be provided, except
262 * where otherwise noted. */
263 struct tc_ops {
264 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
265 * This is null for tc_ops_default and tc_ops_other, for which there are no
266 * appropriate values. */
267 const char *linux_name;
268
269 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
270 const char *ovs_name;
271
272 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
273 * queues. The queues are numbered 0 through n_queues - 1. */
274 unsigned int n_queues;
275
276 /* Called to install this TC class on 'netdev'. The implementation should
277 * make the Netlink calls required to set up 'netdev' with the right qdisc
278 * and configure it according to 'details'. The implementation may assume
279 * that the current qdisc is the default; that is, there is no need for it
280 * to delete the current qdisc before installing itself.
281 *
282 * The contents of 'details' should be documented as valid for 'ovs_name'
283 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
284 * (which is built as ovs-vswitchd.conf.db(8)).
285 *
286 * This function must return 0 if and only if it sets 'netdev->tc' to an
287 * initialized 'struct tc'.
288 *
289 * (This function is null for tc_ops_other, which cannot be installed. For
290 * other TC classes it should always be nonnull.) */
291 int (*tc_install)(struct netdev *netdev, const struct smap *details);
292
293 /* Called when the netdev code determines (through a Netlink query) that
294 * this TC class's qdisc is installed on 'netdev', but we didn't install
295 * it ourselves and so don't know any of the details.
296 *
297 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
298 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
299 * implementation should parse the other attributes of 'nlmsg' as
300 * necessary to determine its configuration. If necessary it should also
301 * use Netlink queries to determine the configuration of queues on
302 * 'netdev'.
303 *
304 * This function must return 0 if and only if it sets 'netdev->tc' to an
305 * initialized 'struct tc'. */
306 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
307
308 /* Destroys the data structures allocated by the implementation as part of
309 * 'tc'. (This includes destroying 'tc->queues' by calling
310 * tc_destroy(tc).
311 *
312 * The implementation should not need to perform any Netlink calls. If
313 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
314 * (But it may not be desirable.)
315 *
316 * This function may be null if 'tc' is trivial. */
317 void (*tc_destroy)(struct tc *tc);
318
319 /* Retrieves details of 'netdev->tc' configuration into 'details'.
320 *
321 * The implementation should not need to perform any Netlink calls, because
322 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
323 * cached the configuration.
324 *
325 * The contents of 'details' should be documented as valid for 'ovs_name'
326 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
327 * (which is built as ovs-vswitchd.conf.db(8)).
328 *
329 * This function may be null if 'tc' is not configurable.
330 */
331 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
332
333 /* Reconfigures 'netdev->tc' according to 'details', performing any
334 * required Netlink calls to complete the reconfiguration.
335 *
336 * The contents of 'details' should be documented as valid for 'ovs_name'
337 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
338 * (which is built as ovs-vswitchd.conf.db(8)).
339 *
340 * This function may be null if 'tc' is not configurable.
341 */
342 int (*qdisc_set)(struct netdev *, const struct smap *details);
343
344 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
345 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
346 *
347 * The contents of 'details' should be documented as valid for 'ovs_name'
348 * in the "other_config" column in the "Queue" table in
349 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
350 *
351 * The implementation should not need to perform any Netlink calls, because
352 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
353 * cached the queue configuration.
354 *
355 * This function may be null if 'tc' does not have queues ('n_queues' is
356 * 0). */
357 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
358 struct smap *details);
359
360 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
361 * 'details', perfoming any required Netlink calls to complete the
362 * reconfiguration. The caller ensures that 'queue_id' is less than
363 * 'n_queues'.
364 *
365 * The contents of 'details' should be documented as valid for 'ovs_name'
366 * in the "other_config" column in the "Queue" table in
367 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
368 *
369 * This function may be null if 'tc' does not have queues or its queues are
370 * not configurable. */
371 int (*class_set)(struct netdev *, unsigned int queue_id,
372 const struct smap *details);
373
374 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
375 * tc_queue's within 'netdev->tc->queues'.
376 *
377 * This function may be null if 'tc' does not have queues or its queues
378 * cannot be deleted. */
379 int (*class_delete)(struct netdev *, struct tc_queue *queue);
380
381 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
382 * 'struct tc_queue's within 'netdev->tc->queues'.
383 *
384 * On success, initializes '*stats'.
385 *
386 * This function may be null if 'tc' does not have queues or if it cannot
387 * report queue statistics. */
388 int (*class_get_stats)(const struct netdev *netdev,
389 const struct tc_queue *queue,
390 struct netdev_queue_stats *stats);
391
392 /* Extracts queue stats from 'nlmsg', which is a response to a
393 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
394 *
395 * This function may be null if 'tc' does not have queues or if it cannot
396 * report queue statistics. */
397 int (*class_dump_stats)(const struct netdev *netdev,
398 const struct ofpbuf *nlmsg,
399 netdev_dump_queue_stats_cb *cb, void *aux);
400 };
401
402 static void
403 tc_init(struct tc *tc, const struct tc_ops *ops)
404 {
405 tc->ops = ops;
406 hmap_init(&tc->queues);
407 }
408
/* Destroys the generic part of 'tc': its queue map.  Does not free 'tc'
 * itself (see the 'tc_destroy' member of struct tc_ops). */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
414
415 static const struct tc_ops tc_ops_htb;
416 static const struct tc_ops tc_ops_hfsc;
417 static const struct tc_ops tc_ops_codel;
418 static const struct tc_ops tc_ops_fqcodel;
419 static const struct tc_ops tc_ops_sfq;
420 static const struct tc_ops tc_ops_default;
421 static const struct tc_ops tc_ops_other;
422
423 static const struct tc_ops *const tcs[] = {
424 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
425 &tc_ops_hfsc, /* Hierarchical fair service curve. */
426 &tc_ops_codel, /* Controlled delay */
427 &tc_ops_fqcodel, /* Fair queue controlled delay */
428 &tc_ops_sfq, /* Stochastic fair queueing */
429 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
430 &tc_ops_other, /* Some other qdisc. */
431 NULL
432 };
433
434 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
435 static unsigned int tc_get_major(unsigned int handle);
436 static unsigned int tc_get_minor(unsigned int handle);
437
438 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
439 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
440 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
441
442 static struct tcmsg *tc_make_request(const struct netdev *, int type,
443 unsigned int flags, struct ofpbuf *);
444 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
445 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
446 static int tc_add_policer(struct netdev *,
447 uint32_t kbits_rate, uint32_t kbits_burst);
448
449 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
450 struct nlattr **options);
451 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
452 struct nlattr **options,
453 struct netdev_queue_stats *);
454 static int tc_query_class(const struct netdev *,
455 unsigned int handle, unsigned int parent,
456 struct ofpbuf **replyp);
457 static int tc_delete_class(const struct netdev *, unsigned int handle);
458
459 static int tc_del_qdisc(struct netdev *netdev);
460 static int tc_query_qdisc(const struct netdev *netdev);
461
462 static int tc_calc_cell_log(unsigned int mtu);
463 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
464 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
465 const struct tc_ratespec *rate);
466 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
467 \f
468 struct netdev_linux {
469 struct netdev up;
470
471 /* Protects all members below. */
472 struct ovs_mutex mutex;
473
474 unsigned int cache_valid;
475
476 bool miimon; /* Link status of last poll. */
477 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
478 struct timer miimon_timer;
479
480 /* The following are figured out "on demand" only. They are only valid
481 * when the corresponding VALID_* bit in 'cache_valid' is set. */
482 int ifindex;
483 struct eth_addr etheraddr;
484 int mtu;
485 unsigned int ifi_flags;
486 long long int carrier_resets;
487 uint32_t kbits_rate; /* Policing data. */
488 uint32_t kbits_burst;
489 int vport_stats_error; /* Cached error code from vport_get_stats().
490 0 or an errno value. */
491 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
492 int ether_addr_error; /* Cached error code from set/get etheraddr. */
493 int netdev_policing_error; /* Cached error code from set policing. */
494 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
495 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
496
497 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
498 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
499 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
500
501 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
502 struct tc *tc;
503
504 /* For devices of class netdev_tap_class only. */
505 int tap_fd;
506 };
507
struct netdev_rxq_linux {
    struct netdev_rxq up;       /* Generic rxq that this struct subclasses. */
    bool is_tap;                /* True for devices of netdev_tap_class. */
    int fd;                     /* The netdev's shared tap fd if 'is_tap',
                                 * otherwise a packet socket owned (and later
                                 * closed) by this rxq. */
};
513
514 /* This is set pretty low because we probably won't learn anything from the
515 * additional log messages. */
516 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
517
518 /* Polling miimon status for all ports causes performance degradation when
519 * handling a large number of ports. If there are no devices using miimon, then
520 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
521 *
522 * Readers do not depend on this variable synchronizing with the related
523 * changes in the device miimon status, so we can use atomic_count. */
524 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
525
526 static void netdev_linux_run(void);
527
528 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
529 int cmd, const char *cmd_name);
530 static int get_flags(const struct netdev *, unsigned int *flags);
531 static int set_flags(const char *, unsigned int flags);
532 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
533 enum netdev_flags on, enum netdev_flags *old_flagsp)
534 OVS_REQUIRES(netdev->mutex);
535 static int do_get_ifindex(const char *netdev_name);
536 static int get_ifindex(const struct netdev *, int *ifindexp);
537 static int do_set_addr(struct netdev *netdev,
538 int ioctl_nr, const char *ioctl_name,
539 struct in_addr addr);
540 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
541 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
542 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
543 static int af_packet_sock(void);
544 static bool netdev_linux_miimon_enabled(void);
545 static void netdev_linux_miimon_run(void);
546 static void netdev_linux_miimon_wait(void);
547 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
548
549 static bool
550 is_netdev_linux_class(const struct netdev_class *netdev_class)
551 {
552 return netdev_class->run == netdev_linux_run;
553 }
554
555 static bool
556 is_tap_netdev(const struct netdev *netdev)
557 {
558 return netdev_get_class(netdev) == &netdev_tap_class;
559 }
560
561 static struct netdev_linux *
562 netdev_linux_cast(const struct netdev *netdev)
563 {
564 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
565
566 return CONTAINER_OF(netdev, struct netdev_linux, up);
567 }
568
569 static struct netdev_rxq_linux *
570 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
571 {
572 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
573 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
574 }
575 \f
576 static void netdev_linux_update(struct netdev_linux *netdev,
577 const struct rtnetlink_change *)
578 OVS_REQUIRES(netdev->mutex);
579 static void netdev_linux_changed(struct netdev_linux *netdev,
580 unsigned int ifi_flags, unsigned int mask)
581 OVS_REQUIRES(netdev->mutex);
582
583 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
584 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
585 * if no such socket could be created. */
static struct nl_sock *
netdev_linux_notify_sock(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static struct nl_sock *sock;
    unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
                               RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};

    if (ovsthread_once_start(&once)) {
        int error;

        error = nl_sock_create(NETLINK_ROUTE, &sock);
        if (!error) {
            size_t i;

            for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
                error = nl_sock_join_mcgroup(sock, mcgroups[i]);
                if (error) {
                    /* Partial group membership is useless; tear down the
                     * socket and report failure by leaving 'sock' NULL. */
                    nl_sock_destroy(sock);
                    sock = NULL;
                    break;
                }
            }
        }
        /* Creation is attempted exactly once; later calls return the same
         * socket (or NULL) without retrying. */
        ovsthread_once_done(&once);
    }

    return sock;
}
615
616 static bool
617 netdev_linux_miimon_enabled(void)
618 {
619 return atomic_count_get(&miimon_cnt) > 0;
620 }
621
622 static void
623 netdev_linux_run(void)
624 {
625 struct nl_sock *sock;
626 int error;
627
628 if (netdev_linux_miimon_enabled()) {
629 netdev_linux_miimon_run();
630 }
631
632 sock = netdev_linux_notify_sock();
633 if (!sock) {
634 return;
635 }
636
637 do {
638 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
639 uint64_t buf_stub[4096 / 8];
640 struct ofpbuf buf;
641
642 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
643 error = nl_sock_recv(sock, &buf, false);
644 if (!error) {
645 struct rtnetlink_change change;
646
647 if (rtnetlink_parse(&buf, &change)) {
648 struct netdev *netdev_ = NULL;
649 char dev_name[IFNAMSIZ];
650
651 if (!change.ifname) {
652 change.ifname = if_indextoname(change.if_index, dev_name);
653 }
654
655 if (change.ifname) {
656 netdev_ = netdev_from_name(change.ifname);
657 }
658 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
659 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
660
661 ovs_mutex_lock(&netdev->mutex);
662 netdev_linux_update(netdev, &change);
663 ovs_mutex_unlock(&netdev->mutex);
664 }
665 netdev_close(netdev_);
666 }
667 } else if (error == ENOBUFS) {
668 struct shash device_shash;
669 struct shash_node *node;
670
671 nl_sock_drain(sock);
672
673 shash_init(&device_shash);
674 netdev_get_devices(&netdev_linux_class, &device_shash);
675 SHASH_FOR_EACH (node, &device_shash) {
676 struct netdev *netdev_ = node->data;
677 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
678 unsigned int flags;
679
680 ovs_mutex_lock(&netdev->mutex);
681 get_flags(netdev_, &flags);
682 netdev_linux_changed(netdev, flags, 0);
683 ovs_mutex_unlock(&netdev->mutex);
684
685 netdev_close(netdev_);
686 }
687 shash_destroy(&device_shash);
688 } else if (error != EAGAIN) {
689 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
690 ovs_strerror(error));
691 }
692 ofpbuf_uninit(&buf);
693 } while (!error);
694 }
695
696 static void
697 netdev_linux_wait(void)
698 {
699 struct nl_sock *sock;
700
701 if (netdev_linux_miimon_enabled()) {
702 netdev_linux_miimon_wait();
703 }
704 sock = netdev_linux_notify_sock();
705 if (sock) {
706 nl_sock_wait(sock, POLLIN);
707 }
708 }
709
/* Records that 'dev' changed: bumps its change sequence number, updates the
 * cached interface flags, and invalidates cached state.
 *
 * 'mask' holds the VALID_* bits to KEEP in 'cache_valid'; everything else is
 * invalidated.  Passing 0 invalidates all cached state. */
static void
netdev_linux_changed(struct netdev_linux *dev,
                     unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(dev->mutex)
{
    netdev_change_seq_changed(&dev->up);

    /* A transition of IFF_RUNNING in either direction counts as a carrier
     * reset. */
    if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
        dev->carrier_resets++;
    }
    dev->ifi_flags = ifi_flags;

    dev->cache_valid &= mask;
    if (!(mask & VALID_IN)) {
        /* Cached IP addresses were invalidated; flush the global list too. */
        netdev_get_addrs_list_flush();
    }
}
727
/* Applies the parsed rtnetlink message 'change' to 'dev', invalidating or
 * refreshing cached state as appropriate.  'change' must come from a link or
 * address multicast group (anything else is a caller bug). */
static void
netdev_linux_update(struct netdev_linux *dev,
                    const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, and ip addresses. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            /* An all-zeros MAC means the message did not report one. */
            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
        } else {
            /* Link message other than RTM_NEWLINK (e.g. RTM_DELLINK):
             * invalidate all cached state. */
            netdev_linux_changed(dev, change->ifi_flags, 0);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        OVS_NOT_REACHED();
    }
}
765
766 static struct netdev *
767 netdev_linux_alloc(void)
768 {
769 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
770 return &netdev->up;
771 }
772
/* Initialization common to all kinds of netdevs defined in this file. */
static void
netdev_linux_common_construct(struct netdev_linux *netdev)
{
    ovs_mutex_init(&netdev->mutex);
}
778
779 /* Creates system and internal devices. */
780 static int
781 netdev_linux_construct(struct netdev *netdev_)
782 {
783 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
784 int error;
785
786 netdev_linux_common_construct(netdev);
787
788 error = get_flags(&netdev->up, &netdev->ifi_flags);
789 if (error == ENODEV) {
790 if (netdev->up.netdev_class != &netdev_internal_class) {
791 /* The device does not exist, so don't allow it to be opened. */
792 return ENODEV;
793 } else {
794 /* "Internal" netdevs have to be created as netdev objects before
795 * they exist in the kernel, because creating them in the kernel
796 * happens by passing a netdev object to dpif_port_add().
797 * Therefore, ignore the error. */
798 }
799 }
800
801 return 0;
802 }
803
804 /* For most types of netdevs we open the device for each call of
805 * netdev_open(). However, this is not the case with tap devices,
806 * since it is only possible to open the device once. In this
807 * situation we share a single file descriptor, and consequently
808 * buffers, across all readers. Therefore once data is read it will
809 * be unavailable to other reads for tap devices. */
810 static int
811 netdev_linux_construct_tap(struct netdev *netdev_)
812 {
813 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
814 static const char tap_dev[] = "/dev/net/tun";
815 const char *name = netdev_->name;
816 struct ifreq ifr;
817 int error;
818
819 netdev_linux_common_construct(netdev);
820
821 /* Open tap device. */
822 netdev->tap_fd = open(tap_dev, O_RDWR);
823 if (netdev->tap_fd < 0) {
824 error = errno;
825 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
826 return error;
827 }
828
829 /* Create tap device. */
830 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
831 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
832 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
833 VLOG_WARN("%s: creating tap device failed: %s", name,
834 ovs_strerror(errno));
835 error = errno;
836 goto error_close;
837 }
838
839 /* Make non-blocking. */
840 error = set_nonblocking(netdev->tap_fd);
841 if (error) {
842 goto error_close;
843 }
844
845 return 0;
846
847 error_close:
848 close(netdev->tap_fd);
849 return error;
850 }
851
static void
netdev_linux_destruct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    /* Let the TC implementation release its per-qdisc state, if any. */
    if (netdev->tc && netdev->tc->ops->tc_destroy) {
        netdev->tc->ops->tc_destroy(netdev->tc);
    }

    /* Tap devices own their fd (see netdev_linux_construct_tap()); other
     * classes do not use 'tap_fd'. */
    if (netdev_get_class(netdev_) == &netdev_tap_class
        && netdev->tap_fd >= 0)
    {
        close(netdev->tap_fd);
    }

    /* Keep the global count of miimon users in sync. */
    if (netdev->miimon_interval > 0) {
        atomic_count_dec(&miimon_cnt);
    }

    ovs_mutex_destroy(&netdev->mutex);
}
873
/* Frees the netdev_linux of which 'netdev_' is the embedded 'up' member. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
880
881 static struct netdev_rxq *
882 netdev_linux_rxq_alloc(void)
883 {
884 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
885 return &rx->up;
886 }
887
/* Sets up an rxq for 'rxq_'s netdev.  For tap devices this reuses the shared
 * tap fd; otherwise it opens a packet socket bound to the device, enables
 * auxdata (for VLAN info), and attaches a BPF filter that accepts only
 * inbound packets.  Returns 0 on success, a positive errno value on error. */
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        /* All rxqs of a tap device share the single fd opened at
         * construction time (see the comment above
         * netdev_linux_construct_tap()). */
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
            { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        /* Request tpacket_auxdata control messages, which carry the VLAN TCI
         * and TPID for received packets. */
        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets.  (setsockopt() returns 0 or -1;
         * on failure the real error code is in errno.) */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
975
976 static void
977 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
978 {
979 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
980
981 if (!rx->is_tap) {
982 close(rx->fd);
983 }
984 }
985
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    /* Release the wrapper allocated by netdev_linux_rxq_alloc(). */
    free(netdev_rxq_linux_cast(rxq_));
}
993
994 static ovs_be16
995 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
996 {
997 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
998 return htons(aux->tp_vlan_tpid);
999 } else {
1000 return htons(ETH_TYPE_VLAN);
1001 }
1002 }
1003
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    /* A nonzero TCI alone covers older kernels without the status bit; a
     * zero TCI with TP_STATUS_VLAN_VALID set still denotes a VLAN tag
     * (e.g. priority tagging). */
    if (aux->tp_vlan_tci) {
        return true;
    }
    return (aux->tp_status & TP_STATUS_VLAN_VALID) != 0;
}
1009
/* Receives one packet from packet socket 'fd' into 'buffer', reinserting any
 * VLAN tag the kernel stripped (reported via PACKET_AUXDATA).  Returns 0 on
 * success, otherwise a positive errno value (EMSGSIZE if the packet was
 * larger than the buffer). */
static int
netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
{
    size_t size;
    ssize_t retval;
    struct iovec iov;
    struct cmsghdr *cmsg;
    /* Aligned control-message buffer sized for one tpacket_auxdata. */
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffer;
    struct msghdr msgh;

    /* Reserve headroom for a single VLAN tag */
    dp_packet_reserve(buffer, VLAN_HEADER_LEN);
    size = dp_packet_tailroom(buffer);

    iov.iov_base = dp_packet_data(buffer);
    iov.iov_len = size;
    msgh.msg_name = NULL;
    msgh.msg_namelen = 0;
    msgh.msg_iov = &iov;
    msgh.msg_iovlen = 1;
    msgh.msg_control = &cmsg_buffer;
    msgh.msg_controllen = sizeof cmsg_buffer;
    msgh.msg_flags = 0;

    /* Restart the receive if interrupted by a signal. */
    do {
        retval = recvmsg(fd, &msgh, MSG_TRUNC);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    } else if (retval > size) {
        /* MSG_TRUNC makes recvmsg() return the full packet length even when
         * it did not fit, so this detects truncation. */
        return EMSGSIZE;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);

    /* Scan control messages for VLAN auxdata and, if the frame carried a
     * tag, push it back into the packet data. */
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
        const struct tpacket_auxdata *aux;

        if (cmsg->cmsg_level != SOL_PACKET
            || cmsg->cmsg_type != PACKET_AUXDATA
            || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
            continue;
        }

        aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
        if (auxdata_has_vlan_tci(aux)) {
            /* A tagged frame must at least contain an Ethernet header. */
            if (retval < ETH_HEADER_LEN) {
                return EINVAL;
            }

            /* Uses the headroom reserved above. */
            eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
                          htons(aux->tp_vlan_tci));
            break;
        }
    }

    return 0;
}
1072
/* Reads one packet from tap fd 'fd' into 'buffer'.  Returns 0 on success,
 * otherwise a positive errno value. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t room = dp_packet_tailroom(buffer);
    ssize_t n;

    /* Restart the read if it is interrupted by a signal. */
    do {
        n = read(fd, dp_packet_data(buffer), room);
    } while (n < 0 && errno == EINTR);

    if (n < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n);
    return 0;
}
1090
1091 static int
1092 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1093 int *c)
1094 {
1095 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1096 struct netdev *netdev = rx->up.netdev;
1097 struct dp_packet *buffer;
1098 ssize_t retval;
1099 int mtu;
1100
1101 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1102 mtu = ETH_PAYLOAD_MAX;
1103 }
1104
1105 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1106 DP_NETDEV_HEADROOM);
1107 retval = (rx->is_tap
1108 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1109 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1110
1111 if (retval) {
1112 if (retval != EAGAIN && retval != EMSGSIZE) {
1113 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1114 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1115 }
1116 dp_packet_delete(buffer);
1117 } else {
1118 dp_packet_pad(buffer);
1119 packets[0] = buffer;
1120 *c = 1;
1121 }
1122
1123 return retval;
1124 }
1125
1126 static void
1127 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1128 {
1129 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1130 poll_fd_wait(rx->fd, POLLIN);
1131 }
1132
1133 static int
1134 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1135 {
1136 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1137 if (rx->is_tap) {
1138 struct ifreq ifr;
1139 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1140 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1141 if (error) {
1142 return error;
1143 }
1144 drain_fd(rx->fd, ifr.ifr_qlen);
1145 return 0;
1146 } else {
1147 return drain_rcvbuf(rx->fd);
1148 }
1149 }
1150
1151 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1152 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1153 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1154 * the packet is too big or too small to transmit on the device.
1155 *
1156 * The caller retains ownership of 'buffer' in all cases.
1157 *
1158 * The kernel maintains a packet transmission queue, so the caller is not
1159 * expected to do additional queuing of packets. */
1160 static int
1161 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1162 struct dp_packet **pkts, int cnt, bool may_steal)
1163 {
1164 int i;
1165 int error = 0;
1166
1167 /* 'i' is incremented only if there's no error */
1168 for (i = 0; i < cnt;) {
1169 const void *data = dp_packet_data(pkts[i]);
1170 size_t size = dp_packet_size(pkts[i]);
1171 ssize_t retval;
1172
1173 if (!is_tap_netdev(netdev_)) {
1174 /* Use our AF_PACKET socket to send to this device. */
1175 struct sockaddr_ll sll;
1176 struct msghdr msg;
1177 struct iovec iov;
1178 int ifindex;
1179 int sock;
1180
1181 sock = af_packet_sock();
1182 if (sock < 0) {
1183 return -sock;
1184 }
1185
1186 ifindex = netdev_get_ifindex(netdev_);
1187 if (ifindex < 0) {
1188 return -ifindex;
1189 }
1190
1191 /* We don't bother setting most fields in sockaddr_ll because the
1192 * kernel ignores them for SOCK_RAW. */
1193 memset(&sll, 0, sizeof sll);
1194 sll.sll_family = AF_PACKET;
1195 sll.sll_ifindex = ifindex;
1196
1197 iov.iov_base = CONST_CAST(void *, data);
1198 iov.iov_len = size;
1199
1200 msg.msg_name = &sll;
1201 msg.msg_namelen = sizeof sll;
1202 msg.msg_iov = &iov;
1203 msg.msg_iovlen = 1;
1204 msg.msg_control = NULL;
1205 msg.msg_controllen = 0;
1206 msg.msg_flags = 0;
1207
1208 retval = sendmsg(sock, &msg, 0);
1209 } else {
1210 /* Use the tap fd to send to this device. This is essential for
1211 * tap devices, because packets sent to a tap device with an
1212 * AF_PACKET socket will loop back to be *received* again on the
1213 * tap device. This doesn't occur on other interface types
1214 * because we attach a socket filter to the rx socket. */
1215 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1216
1217 retval = write(netdev->tap_fd, data, size);
1218 }
1219
1220 if (retval < 0) {
1221 /* The Linux AF_PACKET implementation never blocks waiting for room
1222 * for packets, instead returning ENOBUFS. Translate this into
1223 * EAGAIN for the caller. */
1224 error = errno == ENOBUFS ? EAGAIN : errno;
1225 if (error == EINTR) {
1226 /* continue without incrementing 'i', i.e. retry this packet */
1227 continue;
1228 }
1229 break;
1230 } else if (retval != size) {
1231 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1232 " of %"PRIuSIZE") on %s", retval, size,
1233 netdev_get_name(netdev_));
1234 error = EMSGSIZE;
1235 break;
1236 }
1237
1238 /* Process the next packet in the batch */
1239 i++;
1240 }
1241
1242 if (may_steal) {
1243 for (i = 0; i < cnt; i++) {
1244 dp_packet_delete(pkts[i]);
1245 }
1246 }
1247
1248 if (error && error != EAGAIN) {
1249 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1250 netdev_get_name(netdev_), ovs_strerror(error));
1251 }
1252
1253 return error;
1254
1255 }
1256
1257 /* Registers with the poll loop to wake up from the next call to poll_block()
1258 * when the packet transmission queue has sufficient room to transmit a packet
1259 * with netdev_send().
1260 *
1261 * The kernel maintains a packet transmission queue, so the client is not
1262 * expected to do additional queuing of packets. Thus, this function is
1263 * unlikely to ever be used. It is included for completeness. */
1264 static void
1265 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1266 {
1267 if (is_tap_netdev(netdev)) {
1268 /* TAP device always accepts packets.*/
1269 poll_immediate_wake();
1270 }
1271 }
1272
/* Attempts to set 'netdev''s MAC address to 'mac'.  Returns 0 if successful,
 * otherwise a positive errno value.  The result (success or failure) is
 * cached so repeated calls with the same address are cheap. */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    if (netdev->cache_valid & VALID_ETHERADDR) {
        /* Short-circuit if the last attempt failed or the address is
         * already set; otherwise invalidate the cache and fall through. */
        error = netdev->ether_addr_error;
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    /* Cache the outcome; ENODEV is cached too so a missing device is not
     * retried on every call. */
    if (!error || error == ENODEV) {
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            netdev->etheraddr = mac;
        }
    }

    /* Restore the tap device's up state if we brought it down above. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1313
1314 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1315 static int
1316 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1317 {
1318 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1319 int error;
1320
1321 ovs_mutex_lock(&netdev->mutex);
1322 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1323 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1324 &netdev->etheraddr);
1325 netdev->cache_valid |= VALID_ETHERADDR;
1326 }
1327
1328 error = netdev->ether_addr_error;
1329 if (!error) {
1330 *mac = netdev->etheraddr;
1331 }
1332 ovs_mutex_unlock(&netdev->mutex);
1333
1334 return error;
1335 }
1336
1337 static int
1338 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1339 {
1340 int error;
1341
1342 if (!(netdev->cache_valid & VALID_MTU)) {
1343 struct ifreq ifr;
1344
1345 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1346 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1347 netdev->mtu = ifr.ifr_mtu;
1348 netdev->cache_valid |= VALID_MTU;
1349 }
1350
1351 error = netdev->netdev_mtu_error;
1352 if (!error) {
1353 *mtup = netdev->mtu;
1354 }
1355
1356 return error;
1357 }
1358
1359 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1360 * in bytes, not including the hardware header; thus, this is typically 1500
1361 * bytes for Ethernet devices. */
1362 static int
1363 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1364 {
1365 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1366 int error;
1367
1368 ovs_mutex_lock(&netdev->mutex);
1369 error = netdev_linux_get_mtu__(netdev, mtup);
1370 ovs_mutex_unlock(&netdev->mutex);
1371
1372 return error;
1373 }
1374
1375 /* Sets the maximum size of transmitted (MTU) for given device using linux
1376 * networking ioctl interface.
1377 */
1378 static int
1379 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1380 {
1381 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1382 struct ifreq ifr;
1383 int error;
1384
1385 ovs_mutex_lock(&netdev->mutex);
1386 if (netdev->cache_valid & VALID_MTU) {
1387 error = netdev->netdev_mtu_error;
1388 if (error || netdev->mtu == mtu) {
1389 goto exit;
1390 }
1391 netdev->cache_valid &= ~VALID_MTU;
1392 }
1393 ifr.ifr_mtu = mtu;
1394 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1395 SIOCSIFMTU, "SIOCSIFMTU");
1396 if (!error || error == ENODEV) {
1397 netdev->netdev_mtu_error = error;
1398 netdev->mtu = ifr.ifr_mtu;
1399 netdev->cache_valid |= VALID_MTU;
1400 }
1401 exit:
1402 ovs_mutex_unlock(&netdev->mutex);
1403 return error;
1404 }
1405
1406 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1407 * On failure, returns a negative errno value. */
1408 static int
1409 netdev_linux_get_ifindex(const struct netdev *netdev_)
1410 {
1411 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1412 int ifindex, error;
1413
1414 ovs_mutex_lock(&netdev->mutex);
1415 error = get_ifindex(netdev_, &ifindex);
1416 ovs_mutex_unlock(&netdev->mutex);
1417
1418 return error ? -error : ifindex;
1419 }
1420
1421 static int
1422 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1423 {
1424 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1425
1426 ovs_mutex_lock(&netdev->mutex);
1427 if (netdev->miimon_interval > 0) {
1428 *carrier = netdev->miimon;
1429 } else {
1430 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1431 }
1432 ovs_mutex_unlock(&netdev->mutex);
1433
1434 return 0;
1435 }
1436
1437 static long long int
1438 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1439 {
1440 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1441 long long int carrier_resets;
1442
1443 ovs_mutex_lock(&netdev->mutex);
1444 carrier_resets = netdev->carrier_resets;
1445 ovs_mutex_unlock(&netdev->mutex);
1446
1447 return carrier_resets;
1448 }
1449
/* Issues MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name',
 * copying 'data' into and back out of the request.  Returns 0 on success,
 * otherwise a positive errno value. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    /* The MII ioctls overlay 'struct mii_ioctl_data' on the ifr_data area,
     * so copy the caller's data in by value rather than storing a pointer. */
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    /* Copy back whatever the kernel wrote (e.g. phy_id for SIOCGMIIPHY,
     * val_out for SIOCGMIIREG). */
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
1464
1465 static int
1466 netdev_linux_get_miimon(const char *name, bool *miimon)
1467 {
1468 struct mii_ioctl_data data;
1469 int error;
1470
1471 *miimon = false;
1472
1473 memset(&data, 0, sizeof data);
1474 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1475 if (!error) {
1476 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1477 data.reg_num = MII_BMSR;
1478 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1479 &data);
1480
1481 if (!error) {
1482 *miimon = !!(data.val_out & BMSR_LSTATUS);
1483 } else {
1484 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1485 }
1486 } else {
1487 struct ethtool_cmd ecmd;
1488
1489 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1490 name);
1491
1492 COVERAGE_INC(netdev_get_ethtool);
1493 memset(&ecmd, 0, sizeof ecmd);
1494 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1495 "ETHTOOL_GLINK");
1496 if (!error) {
1497 struct ethtool_value eval;
1498
1499 memcpy(&eval, &ecmd, sizeof eval);
1500 *miimon = !!eval.data;
1501 } else {
1502 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1503 }
1504 }
1505
1506 return error;
1507 }
1508
1509 static int
1510 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1511 long long int interval)
1512 {
1513 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1514
1515 ovs_mutex_lock(&netdev->mutex);
1516 interval = interval > 0 ? MAX(interval, 100) : 0;
1517 if (netdev->miimon_interval != interval) {
1518 if (interval && !netdev->miimon_interval) {
1519 atomic_count_inc(&miimon_cnt);
1520 } else if (!interval && netdev->miimon_interval) {
1521 atomic_count_dec(&miimon_cnt);
1522 }
1523
1524 netdev->miimon_interval = interval;
1525 timer_set_expired(&netdev->miimon_timer);
1526 }
1527 ovs_mutex_unlock(&netdev->mutex);
1528
1529 return 0;
1530 }
1531
1532 static void
1533 netdev_linux_miimon_run(void)
1534 {
1535 struct shash device_shash;
1536 struct shash_node *node;
1537
1538 shash_init(&device_shash);
1539 netdev_get_devices(&netdev_linux_class, &device_shash);
1540 SHASH_FOR_EACH (node, &device_shash) {
1541 struct netdev *netdev = node->data;
1542 struct netdev_linux *dev = netdev_linux_cast(netdev);
1543 bool miimon;
1544
1545 ovs_mutex_lock(&dev->mutex);
1546 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1547 netdev_linux_get_miimon(dev->up.name, &miimon);
1548 if (miimon != dev->miimon) {
1549 dev->miimon = miimon;
1550 netdev_linux_changed(dev, dev->ifi_flags, 0);
1551 }
1552
1553 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1554 }
1555 ovs_mutex_unlock(&dev->mutex);
1556 netdev_close(netdev);
1557 }
1558
1559 shash_destroy(&device_shash);
1560 }
1561
1562 static void
1563 netdev_linux_miimon_wait(void)
1564 {
1565 struct shash device_shash;
1566 struct shash_node *node;
1567
1568 shash_init(&device_shash);
1569 netdev_get_devices(&netdev_linux_class, &device_shash);
1570 SHASH_FOR_EACH (node, &device_shash) {
1571 struct netdev *netdev = node->data;
1572 struct netdev_linux *dev = netdev_linux_cast(netdev);
1573
1574 ovs_mutex_lock(&dev->mutex);
1575 if (dev->miimon_interval > 0) {
1576 timer_wait(&dev->miimon_timer);
1577 }
1578 ovs_mutex_unlock(&dev->mutex);
1579 netdev_close(netdev);
1580 }
1581 shash_destroy(&device_shash);
1582 }
1583
/* Exchanges the values of '*a' and '*b' in place. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved_a = *a;

    *a = *b;
    *b = saved_a;
}
1591
1592 /* Copies 'src' into 'dst', performing format conversion in the process.
1593 *
1594 * 'src' is allowed to be misaligned. */
1595 static void
1596 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1597 const struct ovs_vport_stats *src)
1598 {
1599 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1600 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1601 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1602 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1603 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1604 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1605 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1606 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
1607 dst->multicast = 0;
1608 dst->collisions = 0;
1609 dst->rx_length_errors = 0;
1610 dst->rx_over_errors = 0;
1611 dst->rx_crc_errors = 0;
1612 dst->rx_frame_errors = 0;
1613 dst->rx_fifo_errors = 0;
1614 dst->rx_missed_errors = 0;
1615 dst->tx_aborted_errors = 0;
1616 dst->tx_carrier_errors = 0;
1617 dst->tx_fifo_errors = 0;
1618 dst->tx_heartbeat_errors = 0;
1619 dst->tx_window_errors = 0;
1620 }
1621
1622 static int
1623 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1624 {
1625 struct dpif_netlink_vport reply;
1626 struct ofpbuf *buf;
1627 int error;
1628
1629 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1630 if (error) {
1631 return error;
1632 } else if (!reply.stats) {
1633 ofpbuf_delete(buf);
1634 return EOPNOTSUPP;
1635 }
1636
1637 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1638
1639 ofpbuf_delete(buf);
1640
1641 return 0;
1642 }
1643
1644 static void
1645 get_stats_via_vport(const struct netdev *netdev_,
1646 struct netdev_stats *stats)
1647 {
1648 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1649
1650 if (!netdev->vport_stats_error ||
1651 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1652 int error;
1653
1654 error = get_stats_via_vport__(netdev_, stats);
1655 if (error && error != ENOENT && error != ENODEV) {
1656 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1657 "(%s)",
1658 netdev_get_name(netdev_), ovs_strerror(error));
1659 }
1660 netdev->vport_stats_error = error;
1661 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1662 }
1663 }
1664
/* Retrieves current device stats for 'netdev-linux' into '*stats', merging
 * vport-layer counters with kernel netdev counters.  Returns 0 on success,
 * otherwise a positive errno value (only when neither source is usable). */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    /* Try both sources: the vport layer (result cached in
     * netdev->vport_stats_error) and the kernel via netlink. */
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed, but if the vport stats succeeded '*stats' is
         * already filled in, so report success. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Use kernel netdev's packet and byte counts since vport's counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO are
         * enabled. */
        stats->rx_packets = dev_stats.rx_packets;
        stats->rx_bytes = dev_stats.rx_bytes;
        stats->tx_packets = dev_stats.tx_packets;
        stats->tx_bytes = dev_stats.tx_bytes;

        /* The error/drop counters are accumulated on top of the vport
         * values rather than replaced. */
        stats->rx_errors += dev_stats.rx_errors;
        stats->tx_errors += dev_stats.tx_errors;
        stats->rx_dropped += dev_stats.rx_dropped;
        stats->tx_dropped += dev_stats.tx_dropped;
        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
        stats->rx_length_errors += dev_stats.rx_length_errors;
        stats->rx_over_errors += dev_stats.rx_over_errors;
        stats->rx_crc_errors += dev_stats.rx_crc_errors;
        stats->rx_frame_errors += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1715
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal into '*stats'.  Unlike netdev_linux_get_stats(), kernel
 * counters are swapped rx<->tx so the numbers reflect the switch's
 * perspective rather than the host's.  Returns 0 on success, otherwise a
 * positive errno value. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    /* Try both sources, as in netdev_linux_get_stats(). */
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; succeed anyway if the vport stats filled
         * '*stats'. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer.  For consistency, we swap them back here.  This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        /* Detailed per-direction error breakdowns are meaningless once
         * swapped, so zero them. */
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled.  Note the rx<->tx swap (see the comment above). */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped += dev_stats.tx_dropped;
        stats->tx_dropped += dev_stats.rx_dropped;

        stats->rx_errors += dev_stats.tx_errors;
        stats->tx_errors += dev_stats.rx_errors;

        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1777
1778 static int
1779 netdev_internal_get_stats(const struct netdev *netdev_,
1780 struct netdev_stats *stats)
1781 {
1782 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1783 int error;
1784
1785 ovs_mutex_lock(&netdev->mutex);
1786 get_stats_via_vport(netdev_, stats);
1787 error = netdev->vport_stats_error;
1788 ovs_mutex_unlock(&netdev->mutex);
1789
1790 return error;
1791 }
1792
/* Populates 'netdev''s supported/advertised/current feature sets from
 * ethtool, caching both the result and any error in the device (see
 * VALID_FEATURES and 'get_features_error').  No-op when already cached.
 * Caller must hold 'netdev->mutex'. */
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
    struct ethtool_cmd ecmd;
    uint32_t speed;
    int error;

    if (netdev->cache_valid & VALID_FEATURES) {
        return;
    }

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto out;
    }

    /* Supported features: translate each ethtool SUPPORTED_* bit into the
     * corresponding NETDEV_F_* bit. */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    }
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Half) {
        netdev->supported |= NETDEV_F_100MB_HD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
        netdev->supported |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
        netdev->supported |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
        netdev->supported |= NETDEV_F_40GB_FD;
    }
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    }
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    }
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    }
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    }
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    }

    /* Advertised features: same mapping for ADVERTISED_* bits. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
        netdev->advertised |= NETDEV_F_40GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    }
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    }
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    }
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    }
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    }

    /* Current settings. */
    speed = ethtool_cmd_speed(&ecmd);
    if (speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    } else if (speed == 40000) {
        /* Raw Mb/s literals here and below — presumably because the
         * SPEED_40000 etc. macros are missing from older kernel headers;
         * TODO confirm before replacing them with macros. */
        netdev->current = NETDEV_F_40GB_FD;
    } else if (speed == 100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (speed == 1000000) {
        netdev->current = NETDEV_F_1TB_FD;
    } else {
        netdev->current = 0;
    }

    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
    }

    if (ecmd.autoneg) {
        netdev->current |= NETDEV_F_AUTONEG;
    }

out:
    /* Cache the outcome either way so a failing device is not re-queried
     * on every call. */
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
}
1944
1945 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1946 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1947 * Returns 0 if successful, otherwise a positive errno value. */
1948 static int
1949 netdev_linux_get_features(const struct netdev *netdev_,
1950 enum netdev_features *current,
1951 enum netdev_features *advertised,
1952 enum netdev_features *supported,
1953 enum netdev_features *peer)
1954 {
1955 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1956 int error;
1957
1958 ovs_mutex_lock(&netdev->mutex);
1959 netdev_linux_read_features(netdev);
1960 if (!netdev->get_features_error) {
1961 *current = netdev->current;
1962 *advertised = netdev->advertised;
1963 *supported = netdev->supported;
1964 *peer = 0; /* XXX */
1965 }
1966 error = netdev->get_features_error;
1967 ovs_mutex_unlock(&netdev->mutex);
1968
1969 return error;
1970 }
1971
/* Set the features advertised by 'netdev' to 'advertise', a bitmap of
 * NETDEV_F_* bits.  Reads the device's current ethtool settings, replaces
 * only the advertising mask, and writes the settings back.  Returns 0 if
 * successful, otherwise a positive errno value. */
static int
netdev_linux_set_advertisements(struct netdev *netdev_,
                                enum netdev_features advertise)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ethtool_cmd ecmd;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    /* Read-modify-write: fetch current settings first so that everything
     * other than 'advertising' is preserved by the ETHTOOL_SSET below. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto exit;
    }

    /* Translate each NETDEV_F_* bit into the corresponding ADVERTISED_*
     * ethtool bit. */
    ecmd.advertising = 0;
    if (advertise & NETDEV_F_10MB_HD) {
        ecmd.advertising |= ADVERTISED_10baseT_Half;
    }
    if (advertise & NETDEV_F_10MB_FD) {
        ecmd.advertising |= ADVERTISED_10baseT_Full;
    }
    if (advertise & NETDEV_F_100MB_HD) {
        ecmd.advertising |= ADVERTISED_100baseT_Half;
    }
    if (advertise & NETDEV_F_100MB_FD) {
        ecmd.advertising |= ADVERTISED_100baseT_Full;
    }
    if (advertise & NETDEV_F_1GB_HD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Half;
    }
    if (advertise & NETDEV_F_1GB_FD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Full;
    }
    if (advertise & NETDEV_F_10GB_FD) {
        ecmd.advertising |= ADVERTISED_10000baseT_Full;
    }
    if (advertise & NETDEV_F_COPPER) {
        ecmd.advertising |= ADVERTISED_TP;
    }
    if (advertise & NETDEV_F_FIBER) {
        ecmd.advertising |= ADVERTISED_FIBRE;
    }
    if (advertise & NETDEV_F_AUTONEG) {
        ecmd.advertising |= ADVERTISED_Autoneg;
    }
    if (advertise & NETDEV_F_PAUSE) {
        ecmd.advertising |= ADVERTISED_Pause;
    }
    if (advertise & NETDEV_F_PAUSE_ASYM) {
        ecmd.advertising |= ADVERTISED_Asym_Pause;
    }
    COVERAGE_INC(netdev_set_ethtool);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_SSET, "ETHTOOL_SSET");

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2036
/* Attempts to set input rate limiting (policing) policy.  Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * A 'kbits_rate' of 0 disables policing.  With a nonzero rate, a
 * 'kbits_burst' of 0 is replaced by a 1000-kbit default.  Results are cached
 * so that repeated calls with unchanged parameters do not touch the kernel. */
static int
netdev_linux_set_policing(struct netdev *netdev_,
                          uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int error;

    kbits_burst = (!kbits_rate ? 0       /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
                   : kbits_burst);       /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev->cache_valid & VALID_POLICING) {
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kbits_burst == kbits_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        netdev->cache_valid &= ~VALID_POLICING;
    }

    COVERAGE_INC(netdev_set_policing);
    /* Remove any existing ingress qdisc. */
    error = tc_add_del_ingress_qdisc(netdev_, false);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate) {
        /* Install a fresh ingress qdisc, then attach the policer to it. */
        error = tc_add_del_ingress_qdisc(netdev_, true);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }
    }

    netdev->kbits_rate = kbits_rate;
    netdev->kbits_burst = kbits_burst;

out:
    /* Cache the outcome only for success or "device gone" (ENODEV), so that
     * transient failures are retried on the next call. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2098
2099 static int
2100 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2101 struct sset *types)
2102 {
2103 const struct tc_ops *const *opsp;
2104
2105 for (opsp = tcs; *opsp != NULL; opsp++) {
2106 const struct tc_ops *ops = *opsp;
2107 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2108 sset_add(types, ops->ovs_name);
2109 }
2110 }
2111 return 0;
2112 }
2113
2114 static const struct tc_ops *
2115 tc_lookup_ovs_name(const char *name)
2116 {
2117 const struct tc_ops *const *opsp;
2118
2119 for (opsp = tcs; *opsp != NULL; opsp++) {
2120 const struct tc_ops *ops = *opsp;
2121 if (!strcmp(name, ops->ovs_name)) {
2122 return ops;
2123 }
2124 }
2125 return NULL;
2126 }
2127
2128 static const struct tc_ops *
2129 tc_lookup_linux_name(const char *name)
2130 {
2131 const struct tc_ops *const *opsp;
2132
2133 for (opsp = tcs; *opsp != NULL; opsp++) {
2134 const struct tc_ops *ops = *opsp;
2135 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2136 return ops;
2137 }
2138 }
2139 return NULL;
2140 }
2141
/* Looks up the queue with 'queue_id' in 'netdev_''s current tc state, given
 * the precomputed 'hash' of 'queue_id'.  Returns the queue or NULL if none
 * matches.  The caller must hold the netdev mutex and the tc state must be
 * current (see tc_query_qdisc()). */
static struct tc_queue *
tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
                size_t hash)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct tc_queue *queue;

    /* Only the bucket for 'hash' needs to be scanned; ids may still collide
     * within a bucket, hence the explicit comparison. */
    HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
        if (queue->queue_id == queue_id) {
            return queue;
        }
    }
    return NULL;
}
2156
2157 static struct tc_queue *
2158 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2159 {
2160 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
2161 }
2162
2163 static int
2164 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2165 const char *type,
2166 struct netdev_qos_capabilities *caps)
2167 {
2168 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2169 if (!ops) {
2170 return EOPNOTSUPP;
2171 }
2172 caps->n_queues = ops->n_queues;
2173 return 0;
2174 }
2175
/* Retrieves the current QoS configuration of 'netdev_': the type name in
 * '*typep' and type-specific settings in 'details'.  Returns 0 on success,
 * otherwise a positive errno value. */
static int
netdev_linux_get_qos(const struct netdev *netdev_,
                     const char **typep, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    /* Make sure netdev->tc reflects the kernel's actual qdisc first. */
    error = tc_query_qdisc(netdev_);
    if (!error) {
        *typep = netdev->tc->ops->ovs_name;
        /* Disciplines without per-qdisc settings have no qdisc_get hook. */
        error = (netdev->tc->ops->qdisc_get
                 ? netdev->tc->ops->qdisc_get(netdev_, details)
                 : 0);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2195
/* Sets the QoS type of 'netdev_' to 'type' with settings from 'details'.
 * If the device already uses discipline 'type', updates it in place;
 * otherwise deletes the current qdisc and installs a new one.  Returns 0 on
 * success, EOPNOTSUPP for an unknown or uninstallable type, otherwise a
 * positive errno value. */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same discipline: delegate to its (optional) in-place update. */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            goto exit;
        }
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc.  tc_install must set netdev->tc exactly when it
         * succeeds. */
        error = new_ops->tc_install(netdev_, details);
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2234
/* Retrieves the settings of queue 'queue_id' on 'netdev_' into 'details'.
 * Returns 0 on success, ENOENT if the queue does not exist, otherwise a
 * positive errno value. */
static int
netdev_linux_get_queue(const struct netdev *netdev_,
                       unsigned int queue_id, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
        error = (queue
                 ? netdev->tc->ops->class_get(netdev_, queue, details)
                 : ENOENT);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2254
/* Configures queue 'queue_id' on 'netdev_' from 'details'.  Returns 0 on
 * success, EINVAL if the id is out of range or the discipline does not
 * support per-queue configuration, otherwise a positive errno value. */
static int
netdev_linux_set_queue(struct netdev *netdev_,
                       unsigned int queue_id, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        error = (queue_id < netdev->tc->ops->n_queues
                 && netdev->tc->ops->class_set
                 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
                 : EINVAL);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2274
/* Deletes queue 'queue_id' from 'netdev_'.  Returns 0 on success, ENOENT if
 * no such queue exists, EINVAL if the discipline does not support deletion,
 * otherwise a positive errno value. */
static int
netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_delete) {
            struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
            error = (queue
                     ? netdev->tc->ops->class_delete(netdev_, queue)
                     : ENOENT);
        } else {
            error = EINVAL;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2297
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into 'stats'.
 * Returns 0 on success, ENOENT if no such queue exists, EOPNOTSUPP if the
 * discipline cannot report per-queue stats, otherwise a positive errno
 * value. */
static int
netdev_linux_get_queue_stats(const struct netdev *netdev_,
                             unsigned int queue_id,
                             struct netdev_queue_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get_stats) {
            const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
            if (queue) {
                /* Creation time is tracked locally; the callback fills in
                 * the kernel-reported counters. */
                stats->created = queue->created;
                error = netdev->tc->ops->class_get_stats(netdev_, queue,
                                                         stats);
            } else {
                error = ENOENT;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2326
/* State for an in-progress netlink dump of a netdev's tc classes (queues). */
struct queue_dump_state {
    struct nl_dump dump;   /* The RTM_GETTCLASS dump in progress. */
    struct ofpbuf buf;     /* Receive buffer reused across nl_dump_next(). */
};
2331
/* Begins a netlink dump of all tc classes on 'netdev', initializing '*state'.
 * Returns true on success (the caller must later call finish_queue_dump()),
 * false if the request could not be built (e.g. device gone). */
static bool
start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
    if (!tcmsg) {
        return false;
    }
    /* tcm_parent of 0 requests classes of the root qdisc. */
    tcmsg->tcm_parent = 0;
    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
    ofpbuf_uninit(&request);

    ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
    return true;
}
2349
/* Releases the resources of a queue dump started by start_queue_dump() and
 * returns the dump's final status (0, or a positive errno value). */
static int
finish_queue_dump(struct queue_dump_state *state)
{
    ofpbuf_uninit(&state->buf);
    return nl_dump_done(&state->dump);
}
2356
/* Iterator state for netdev_linux_queue_dump_{start,next,done}(): a snapshot
 * of the queue ids taken at dump start. */
struct netdev_linux_queue_state {
    unsigned int *queues;  /* Array of queue ids, owned by this struct. */
    size_t cur_queue;      /* Index of the next id to visit. */
    size_t n_queues;       /* Number of elements in 'queues'. */
};
2362
/* Begins iterating over 'netdev_''s queues, storing iterator state in
 * '*statep'.  Snapshots the queue ids under the mutex so that later
 * _dump_next() calls tolerate concurrent queue changes.  Returns 0 on
 * success, EOPNOTSUPP if the discipline has no per-queue data, otherwise a
 * positive errno value.  '*statep' is only set on success. */
static int
netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
{
    const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get) {
            struct netdev_linux_queue_state *state;
            struct tc_queue *queue;
            size_t i;

            *statep = state = xmalloc(sizeof *state);
            state->n_queues = hmap_count(&netdev->tc->queues);
            state->cur_queue = 0;
            state->queues = xmalloc(state->n_queues * sizeof *state->queues);

            i = 0;
            HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
                state->queues[i++] = queue->queue_id;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2394
/* Advances the queue iterator 'state_', storing the next queue's id in
 * '*queue_idp' and its settings in 'details'.  Queue ids from the snapshot
 * whose queue has since disappeared are silently skipped.  Returns 0 on
 * success, EOF when the iteration is complete, otherwise a positive errno
 * value from class_get(). */
static int
netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
                             unsigned int *queue_idp, struct smap *details)
{
    const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_linux_queue_state *state = state_;
    int error = EOF;

    ovs_mutex_lock(&netdev->mutex);
    while (state->cur_queue < state->n_queues) {
        unsigned int queue_id = state->queues[state->cur_queue++];
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);

        if (queue) {
            *queue_idp = queue_id;
            error = netdev->tc->ops->class_get(netdev_, queue, details);
            break;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2418
2419 static int
2420 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2421 void *state_)
2422 {
2423 struct netdev_linux_queue_state *state = state_;
2424
2425 free(state->queues);
2426 free(state);
2427 return 0;
2428 }
2429
/* Invokes 'cb' with 'aux' once per queue on 'netdev_', passing each queue's
 * statistics, by dumping the device's tc classes over netlink.  Returns 0 on
 * success, EOPNOTSUPP if the discipline cannot report per-class stats,
 * ENODEV if the dump could not be started, otherwise a positive errno value
 * (the last error encountered; iteration continues past per-class errors). */
static int
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
                              netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct queue_dump_state state;

        if (!netdev->tc->ops->class_dump_stats) {
            error = EOPNOTSUPP;
        } else if (!start_queue_dump(netdev_, &state)) {
            error = ENODEV;
        } else {
            struct ofpbuf msg;
            int retval;

            while (nl_dump_next(&state.dump, &msg, &state.buf)) {
                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
                                                           cb, aux);
                if (retval) {
                    error = retval;
                }
            }

            /* The dump's own completion status also counts as an error. */
            retval = finish_queue_dump(&state);
            if (retval) {
                error = retval;
            }
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2468
/* Assigns IPv4 'address' with 'netmask' to 'netdev_' via SIOCSIFADDR and
 * SIOCSIFNETMASK.  An 'address' of INADDR_ANY clears the address, in which
 * case no netmask is set.  Returns 0 on success, otherwise a positive errno
 * value. */
static int
netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
                     struct in_addr netmask)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
    if (!error) {
        if (address.s_addr != INADDR_ANY) {
            error = do_set_addr(netdev_, SIOCSIFNETMASK,
                                "SIOCSIFNETMASK", netmask);
        }
    }

    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2489
/* Retrieves all addresses assigned to 'netdev_' into '*addr', with their
 * masks in '*mask' and the count in '*n_cnt', by delegating to
 * netdev_get_addrs().  Returns 0 if successful, otherwise a positive errno
 * value.  (Allocation/ownership of the returned arrays follows
 * netdev_get_addrs().) */
static int
netdev_linux_get_addr_list(const struct netdev *netdev_,
                          struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2506
/* Fills '*sa' with an AF_INET sockaddr for 'addr' (port 0).  Any bytes of
 * '*sa' beyond the copied sockaddr_in are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;

    /* memset (rather than designated initialization) guarantees that padding
     * bytes are zero before they are copied into '*sa'. */
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_port = 0;
    sin.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2519
2520 static int
2521 do_set_addr(struct netdev *netdev,
2522 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2523 {
2524 struct ifreq ifr;
2525
2526 make_in4_sockaddr(&ifr.ifr_addr, addr);
2527 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2528 ioctl_name);
2529 }
2530
/* Adds 'router' as a default IP gateway via the SIOCADDRT ioctl (destination
 * and genmask of 0.0.0.0).  Returns 0 on success, otherwise a positive errno
 * value (also logged). */
static int
netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
{
    struct in_addr any = { INADDR_ANY };
    struct rtentry rt;
    int error;

    memset(&rt, 0, sizeof rt);
    make_in4_sockaddr(&rt.rt_dst, any);
    make_in4_sockaddr(&rt.rt_gateway, router);
    make_in4_sockaddr(&rt.rt_genmask, any);
    rt.rt_flags = RTF_UP | RTF_GATEWAY;
    error = af_inet_ioctl(SIOCADDRT, &rt);
    if (error) {
        VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
    }
    return error;
}
2550
/* Finds the next hop toward 'host' by scanning /proc/net/route.  On success
 * stores the gateway (or 0.0.0.0 if 'host' is directly reachable) in
 * '*next_hop', a malloc'd interface name in '*netdev_name' (caller frees),
 * and returns 0.  Returns ENXIO if no matching route is up, or another
 * positive errno value on error; '*netdev_name' is NULL on failure. */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        /* Skip the first line, which is the column-header row. */
        if (++ln >= 2) {
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            /* Field layout of /proc/net/route:
             * Iface Destination Gateway Flags RefCnt Use Metric Mask MTU
             * Window IRTT (addresses and mask in hex). */
            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                        fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* The output of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need need any endian
             * conversions here. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                /* First matching route wins; /proc/net/route lists routes in
                 * the kernel's preference order. */
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    fclose(stream);
    return ENXIO;
}
2610
/* Adds driver name/version and firmware version of 'netdev_' to 'smap',
 * querying ETHTOOL_GDRVINFO once and caching the result.  Returns 0 on
 * success, otherwise a positive errno value. */
static int
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* The ethtool helper takes a 'struct ethtool_cmd *'; 'drvinfo' is
         * overlaid via this cast and selected by the GDRVINFO command. */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        cmd,
                                        ETHTOOL_GDRVINFO,
                                        "ETHTOOL_GDRVINFO");
        if (!error) {
            netdev->cache_valid |= VALID_DRVINFO;
        }
    }

    if (!error) {
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
        smap_add(smap, "driver_version", netdev->drvinfo.version);
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2641
2642 static int
2643 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2644 struct smap *smap)
2645 {
2646 smap_add(smap, "driver_name", "openvswitch");
2647 return 0;
2648 }
2649
/* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
 * returns 0.  Otherwise, it returns a positive errno value; in particular,
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
static int
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, struct eth_addr *mac)
{
    struct arpreq r;
    struct sockaddr_in sin;
    int retval;

    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    sin.sin_port = 0;
    memcpy(&r.arp_pa, &sin, sizeof sin);
    r.arp_ha.sa_family = ARPHRD_ETHER;
    r.arp_flags = 0;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
    if (!retval) {
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO just means "no entry" and is reported quietly; anything else
         * is unexpected and worth logging. */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
    }
    return retval;
}
2682
2683 static int
2684 nd_to_iff_flags(enum netdev_flags nd)
2685 {
2686 int iff = 0;
2687 if (nd & NETDEV_UP) {
2688 iff |= IFF_UP;
2689 }
2690 if (nd & NETDEV_PROMISC) {
2691 iff |= IFF_PROMISC;
2692 }
2693 if (nd & NETDEV_LOOPBACK) {
2694 iff |= IFF_LOOPBACK;
2695 }
2696 return iff;
2697 }
2698
2699 static int
2700 iff_to_nd_flags(int iff)
2701 {
2702 enum netdev_flags nd = 0;
2703 if (iff & IFF_UP) {
2704 nd |= NETDEV_UP;
2705 }
2706 if (iff & IFF_PROMISC) {
2707 nd |= NETDEV_PROMISC;
2708 }
2709 if (iff & IFF_LOOPBACK) {
2710 nd |= NETDEV_LOOPBACK;
2711 }
2712 return nd;
2713 }
2714
/* Turns off the NETDEV_* flags in 'off' and turns on those in 'on' for
 * 'netdev', storing the previous flags in '*old_flagsp'.  The kernel is only
 * contacted when the effective flags actually change.  Returns 0 on success,
 * otherwise a positive errno value from set_flags(). */
static int
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
{
    int old_flags, new_flags;
    int error = 0;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* Re-read the flags so the cache reflects what the kernel actually
         * applied.  NOTE(review): get_flags()'s return value is ignored, so
         * 'ifi_flags' may be stale on failure — confirm this is intended. */
        get_flags(&netdev->up, &netdev->ifi_flags);
    }

    return error;
}
2733
/* Mutex-acquiring wrapper around update_flags(); implements the netdev
 * provider's update_flags operation. */
static int
netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
                          enum netdev_flags on, enum netdev_flags *old_flagsp)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = update_flags(netdev, off, on, old_flagsp);
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2747
/* Expands to an initializer for a 'struct netdev_class' named NAME whose
 * per-class behavior is supplied by the CONSTRUCT, GET_STATS, GET_FEATURES,
 * and GET_STATUS callbacks; all other operations use the shared
 * netdev_linux_* implementations.  NULL entries are operations these classes
 * do not provide. */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
    false,                      /* is_pmd */                    \
                                                                \
    NULL,                                                       \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    netdev_linux_alloc,                                         \
    CONSTRUCT,                                                  \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
    NULL,                       /* build header */              \
    NULL,                       /* push header */               \
    NULL,                       /* pop header */                \
    NULL,                       /* get_numa_id */               \
    NULL,                       /* set_multiq */                \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_addr_list,                                 \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \
}
2818
/* Netdev class for ordinary Linux system network devices. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_construct,
        netdev_linux_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);

/* Netdev class for Linux TAP devices; differs from "system" only in its
 * constructor and stats callback. */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_construct_tap,
        netdev_tap_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);

/* Netdev class for OVS "internal" devices; these report no ethtool
 * features. */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_construct,
        netdev_internal_get_stats,
        NULL,                  /* get_features */
        netdev_internal_get_status);
2842 \f
2843
/* CoDel traffic control class. */

/* CoDel is classless as far as OVS is concerned: it exposes no queues. */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET 1
#define TCA_CODEL_LIMIT 2
#define TCA_CODEL_INTERVAL 3

struct codel {
    struct tc tc;          /* Base tc state; CONTAINER_OF() recovers this. */
    uint32_t target;       /* Target delay; 0 selects the default (5000). */
    uint32_t limit;        /* Queue limit; 0 selects the default (10240). */
    uint32_t interval;     /* Interval; 0 selects the default (100000). */
};
2860
2861 static struct codel *
2862 codel_get__(const struct netdev *netdev_)
2863 {
2864 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2865 return CONTAINER_OF(netdev->tc, struct codel, tc);
2866 }
2867
/* Records a CoDel configuration ('target', 'limit', 'interval') as
 * 'netdev_''s current tc state.  Only updates the local bookkeeping; does
 * not talk to the kernel (see codel_setup_qdisc__() for that). */
static void
codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
                uint32_t interval)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct codel *codel;

    codel = xmalloc(sizeof *codel);
    tc_init(&codel->tc, &tc_ops_codel);
    codel->target = target;
    codel->limit = limit;
    codel->interval = interval;

    netdev->tc = &codel->tc;
}
2883
/* Replaces 'netdev''s root qdisc with a CoDel qdisc configured with 'target',
 * 'limit', and 'interval' (each 0 selects a default).  Returns 0 on success,
 * ENODEV if the request could not be built, otherwise a positive errno
 * value. */
static int
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                    uint32_t interval)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval;
    int error;

    /* Remove any existing root qdisc before installing ours. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Defaults applied when a parameter is 0 (matching
     * codel_parse_qdisc_details__()). */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;

    nl_msg_put_string(&request, TCA_KIND, "codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
        "target %u, limit %u, interval %u error %d(%s)",
        netdev_get_name(netdev),
        otarget, olimit, ointerval,
        error, ovs_strerror(error));
    }
    return error;
}
2925
2926 static void
2927 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2928 const struct smap *details, struct codel *codel)
2929 {
2930 const char *target_s;
2931 const char *limit_s;
2932 const char *interval_s;
2933
2934 target_s = smap_get(details, "target");
2935 limit_s = smap_get(details, "limit");
2936 interval_s = smap_get(details, "interval");
2937
2938 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2939 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2940 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2941
2942 if (!codel->target) {
2943 codel->target = 5000;
2944 }
2945 if (!codel->limit) {
2946 codel->limit = 10240;
2947 }
2948 if (!codel->interval) {
2949 codel->interval = 100000;
2950 }
2951 }
2952
/* tc_install callback for CoDel: configures the kernel qdisc from 'details'
 * and, on success, records the installed state in 'netdev'.  Returns 0 on
 * success, otherwise a positive errno value. */
static int
codel_tc_install(struct netdev *netdev, const struct smap *details)
{
    int error;
    struct codel codel;

    codel_parse_qdisc_details__(netdev, details, &codel);
    error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
                                codel.interval);
    if (!error) {
        codel_install__(netdev, codel.target, codel.limit, codel.interval);
    }
    return error;
}
2967
/* Parses the nested TCA_OPTIONS attribute 'nl_options' of a kernel CoDel
 * qdisc message into '*codel'.  Returns 0 on success or EPROTO if the
 * attributes do not match the expected policy. */
static int
codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
{
    static const struct nl_policy tca_codel_policy[] = {
        [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
        [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
        [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];

    if (!nl_parse_nested(nl_options, tca_codel_policy,
                         attrs, ARRAY_SIZE(tca_codel_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
        return EPROTO;
    }

    codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
    codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
    codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
    return 0;
}
2990
2991 static int
2992 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
2993 {
2994 struct nlattr *nlattr;
2995 const char * kind;
2996 int error;
2997 struct codel codel;
2998
2999 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3000 if (error != 0) {
3001 return error;
3002 }
3003
3004 error = codel_parse_tca_options__(nlattr, &codel);
3005 if (error != 0) {
3006 return error;
3007 }
3008
3009 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3010 return 0;
3011 }
3012
3013
/* tc_destroy callback for CoDel: releases the 'struct codel' embedding
 * 'tc'. */
static void
codel_tc_destroy(struct tc *tc)
{
    struct codel *codel = CONTAINER_OF(tc, struct codel, tc);

    /* Tear down the base tc state before freeing the containing struct. */
    tc_destroy(tc);
    free(codel);
}
3021
3022 static int
3023 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3024 {
3025 const struct codel *codel = codel_get__(netdev);
3026 smap_add_format(details, "target", "%u", codel->target);
3027 smap_add_format(details, "limit", "%u", codel->limit);
3028 smap_add_format(details, "interval", "%u", codel->interval);
3029 return 0;
3030 }
3031
3032 static int
3033 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3034 {
3035 struct codel codel;
3036
3037 codel_parse_qdisc_details__(netdev, details, &codel);
3038 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3039 codel_get__(netdev)->target = codel.target;
3040 codel_get__(netdev)->limit = codel.limit;
3041 codel_get__(netdev)->interval = codel.interval;
3042 return 0;
3043 }
3044
static const struct tc_ops tc_ops_codel = {
    "codel",                      /* linux_name */
    "linux-codel",                /* ovs_name */
    CODEL_N_QUEUES,               /* n_queues */
    codel_tc_install,
    codel_tc_load,
    codel_tc_destroy,
    codel_qdisc_get,
    codel_qdisc_set,
    /* CoDel exposes no queues to OVS (n_queues is 0), so the per-class
     * operations below are not implemented. */
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3060 \f
/* FQ-CoDel traffic control class. */

/* FQ-CoDel is classless as far as OVS is concerned: no queues are exposed. */
#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET 1
#define TCA_FQ_CODEL_LIMIT 2
#define TCA_FQ_CODEL_INTERVAL 3
#define TCA_FQ_CODEL_ECN 4
#define TCA_FQ_CODEL_FLOWS 5
#define TCA_FQ_CODEL_QUANTUM 6

struct fqcodel {
    struct tc tc;          /* Base tc state; CONTAINER_OF() recovers this. */
    uint32_t target;       /* Target delay; 0 selects the default. */
    uint32_t limit;        /* Queue limit; 0 selects the default. */
    uint32_t interval;     /* Interval; 0 selects the default. */
    uint32_t flows;        /* Number of flow queues; 0 selects the default. */
    uint32_t quantum;      /* Dequeue quantum; 0 selects the default. */
};
3084
3085 static struct fqcodel *
3086 fqcodel_get__(const struct netdev *netdev_)
3087 {
3088 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3089 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3090 }
3091
/* Records an FQ-CoDel configuration as 'netdev_''s current tc state.  Only
 * updates the local bookkeeping; does not talk to the kernel (see
 * fqcodel_setup_qdisc__() for that). */
static void
fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
                  uint32_t interval, uint32_t flows, uint32_t quantum)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct fqcodel *fqcodel;

    fqcodel = xmalloc(sizeof *fqcodel);
    tc_init(&fqcodel->tc, &tc_ops_fqcodel);
    fqcodel->target = target;
    fqcodel->limit = limit;
    fqcodel->interval = interval;
    fqcodel->flows = flows;
    fqcodel->quantum = quantum;

    netdev->tc = &fqcodel->tc;
}
3109
/* Replaces 'netdev''s root qdisc with an FQ-CoDel qdisc configured with the
 * given parameters (each 0 selects a default).  Returns 0 on success, ENODEV
 * if the request could not be built, otherwise a positive errno value. */
static int
fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                      uint32_t interval, uint32_t flows, uint32_t quantum)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval, oflows,  oquantum;
    int error;

    /* Remove any existing root qdisc before installing ours. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Defaults applied when a parameter is 0.  NOTE(review): the interval
     * fallback here is 100000, while fqcodel_parse_qdisc_details__() uses
     * 1000000 — confirm which value is intended. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;
    oflows = flows ? flows : 1024;
    oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
                                            not mtu */

    nl_msg_put_string(&request, TCA_KIND, "fq_codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
        "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
        netdev_get_name(netdev),
        otarget, olimit, ointerval, oflows, oquantum,
        error, ovs_strerror(error));
    }
    return error;
}
3156
3157 static void
3158 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3159 const struct smap *details, struct fqcodel *fqcodel)
3160 {
3161 const char *target_s;
3162 const char *limit_s;
3163 const char *interval_s;
3164 const char *flows_s;
3165 const char *quantum_s;
3166
3167 target_s = smap_get(details, "target");
3168 limit_s = smap_get(details, "limit");
3169 interval_s = smap_get(details, "interval");
3170 flows_s = smap_get(details, "flows");
3171 quantum_s = smap_get(details, "quantum");
3172 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3173 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3174 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3175 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3176 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3177 if (!fqcodel->target) {
3178 fqcodel->target = 5000;
3179 }
3180 if (!fqcodel->limit) {
3181 fqcodel->limit = 10240;
3182 }
3183 if (!fqcodel->interval) {
3184 fqcodel->interval = 1000000;
3185 }
3186 if (!fqcodel->flows) {
3187 fqcodel->flows = 1024;
3188 }
3189 if (!fqcodel->quantum) {
3190 fqcodel->quantum = 1514;
3191 }
3192 }
3193
3194 static int
3195 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3196 {
3197 int error;
3198 struct fqcodel fqcodel;
3199
3200 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3201 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3202 fqcodel.interval, fqcodel.flows,
3203 fqcodel.quantum);
3204 if (!error) {
3205 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3206 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3207 }
3208 return error;
3209 }
3210
3211 static int
3212 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3213 {
3214 static const struct nl_policy tca_fqcodel_policy[] = {
3215 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3216 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3217 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3218 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3219 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3220 };
3221
3222 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3223
3224 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3225 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3226 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3227 return EPROTO;
3228 }
3229
3230 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3231 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3232 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3233 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3234 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3235 return 0;
3236 }
3237
3238 static int
3239 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3240 {
3241 struct nlattr *nlattr;
3242 const char * kind;
3243 int error;
3244 struct fqcodel fqcodel;
3245
3246 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3247 if (error != 0) {
3248 return error;
3249 }
3250
3251 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3252 if (error != 0) {
3253 return error;
3254 }
3255
3256 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3257 fqcodel.flows, fqcodel.quantum);
3258 return 0;
3259 }
3260
3261 static void
3262 fqcodel_tc_destroy(struct tc *tc)
3263 {
3264 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3265 tc_destroy(tc);
3266 free(fqcodel);
3267 }
3268
3269 static int
3270 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3271 {
3272 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3273 smap_add_format(details, "target", "%u", fqcodel->target);
3274 smap_add_format(details, "limit", "%u", fqcodel->limit);
3275 smap_add_format(details, "interval", "%u", fqcodel->interval);
3276 smap_add_format(details, "flows", "%u", fqcodel->flows);
3277 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3278 return 0;
3279 }
3280
3281 static int
3282 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3283 {
3284 struct fqcodel fqcodel;
3285
3286 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3287 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3288 fqcodel.flows, fqcodel.quantum);
3289 fqcodel_get__(netdev)->target = fqcodel.target;
3290 fqcodel_get__(netdev)->limit = fqcodel.limit;
3291 fqcodel_get__(netdev)->interval = fqcodel.interval;
3292 fqcodel_get__(netdev)->flows = fqcodel.flows;
3293 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3294 return 0;
3295 }
3296
/* tc_ops implementation for the "linux-fq_codel" qdisc.  fq_codel is
 * classless, so the trailing class-related callbacks are all NULL
 * (slot names per tc_ops_htb below: class_get, class_set, class_delete,
 * class_get_stats, class_dump_stats). */
static const struct tc_ops tc_ops_fqcodel = {
    "fq_codel",                 /* linux_name */
    "linux-fq_codel",           /* ovs_name */
    FQCODEL_N_QUEUES,           /* n_queues */
    fqcodel_tc_install,
    fqcodel_tc_load,
    fqcodel_tc_destroy,
    fqcodel_qdisc_get,
    fqcodel_qdisc_set,
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
3312 \f
3313 /* SFQ traffic control class. */
3314
3315 #define SFQ_N_QUEUES 0x0000
3316
/* Cached state for a "linux-sfq" root qdisc.  See sfq_setup_qdisc__() for
 * how zero values are defaulted. */
struct sfq {
    struct tc tc;        /* Embedded generic tc state (see sfq_get__()). */
    uint32_t quantum;    /* Bytes a flow may dequeue per round; defaults to
                          * the device MTU when unset. */
    uint32_t perturb;    /* Hash perturbation period (tc-sfq "perturb"). */
};
3322
3323 static struct sfq *
3324 sfq_get__(const struct netdev *netdev_)
3325 {
3326 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3327 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3328 }
3329
3330 static void
3331 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3332 {
3333 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3334 struct sfq *sfq;
3335
3336 sfq = xmalloc(sizeof *sfq);
3337 tc_init(&sfq->tc, &tc_ops_sfq);
3338 sfq->perturb = perturb;
3339 sfq->quantum = quantum;
3340
3341 netdev->tc = &sfq->tc;
3342 }
3343
/* Installs an sfq qdisc as the root qdisc of 'netdev', replacing any
 * existing root qdisc.  A zero 'quantum' falls back to the device MTU (or,
 * if the MTU is unavailable, to the kernel default by leaving the field 0);
 * a zero 'perturb' falls back to a period of 10.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
{
    struct tc_sfq_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int mtu;
    int mtu_error, error;
    mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);

    /* Delete any existing root qdisc; RTM_NEWQDISC below uses NLM_F_EXCL and
     * would otherwise fail if one is present. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    if (!quantum) {
        if (!mtu_error) {
            opt.quantum = mtu; /* if we cannot find mtu, use default */
        }
    } else {
        opt.quantum = quantum;
    }

    if (!perturb) {
        opt.perturb_period = 10;
    } else {
        opt.perturb_period = perturb;
    }

    nl_msg_put_string(&request, TCA_KIND, "sfq");
    /* sfq takes its options as a flat struct tc_sfq_qopt rather than nested
     * attributes. */
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "quantum %u, perturb %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.quantum, opt.perturb_period,
                     error, ovs_strerror(error));
    }
    return error;
}
3392
3393 static void
3394 sfq_parse_qdisc_details__(struct netdev *netdev,
3395 const struct smap *details, struct sfq *sfq)
3396 {
3397 const char *perturb_s;
3398 const char *quantum_s;
3399 int mtu;
3400 int mtu_error;
3401
3402 perturb_s = smap_get(details, "perturb");
3403 quantum_s = smap_get(details, "quantum");
3404 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3405 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3406 if (!sfq->perturb) {
3407 sfq->perturb = 10;
3408 }
3409
3410 if (!sfq->quantum) {
3411 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3412 if (!mtu_error) {
3413 sfq->quantum = mtu;
3414 } else {
3415 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3416 "device without mtu");
3417 return;
3418 }
3419 }
3420 }
3421
3422 static int
3423 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3424 {
3425 int error;
3426 struct sfq sfq;
3427
3428 sfq_parse_qdisc_details__(netdev, details, &sfq);
3429 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3430 if (!error) {
3431 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3432 }
3433 return error;
3434 }
3435
3436 static int
3437 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3438 {
3439 const struct tc_sfq_qopt *sfq;
3440 struct nlattr *nlattr;
3441 const char * kind;
3442 int error;
3443
3444 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3445 if (error == 0) {
3446 sfq = nl_attr_get(nlattr);
3447 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
3448 return 0;
3449 }
3450
3451 return error;
3452 }
3453
3454 static void
3455 sfq_tc_destroy(struct tc *tc)
3456 {
3457 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3458 tc_destroy(tc);
3459 free(sfq);
3460 }
3461
3462 static int
3463 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3464 {
3465 const struct sfq *sfq = sfq_get__(netdev);
3466 smap_add_format(details, "quantum", "%u", sfq->quantum);
3467 smap_add_format(details, "perturb", "%u", sfq->perturb);
3468 return 0;
3469 }
3470
3471 static int
3472 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3473 {
3474 struct sfq sfq;
3475
3476 sfq_parse_qdisc_details__(netdev, details, &sfq);
3477 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3478 sfq_get__(netdev)->quantum = sfq.quantum;
3479 sfq_get__(netdev)->perturb = sfq.perturb;
3480 return 0;
3481 }
3482
/* tc_ops implementation for the "linux-sfq" qdisc.  sfq exposes no
 * OVS-configurable classes, so the class-related callbacks are NULL
 * (slot names per tc_ops_htb below). */
static const struct tc_ops tc_ops_sfq = {
    "sfq",                      /* linux_name */
    "linux-sfq",                /* ovs_name */
    SFQ_N_QUEUES,               /* n_queues */
    sfq_tc_install,
    sfq_tc_load,
    sfq_tc_destroy,
    sfq_qdisc_get,
    sfq_qdisc_set,
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
3498 \f
3499 /* HTB traffic control class. */
3500
3501 #define HTB_N_QUEUES 0xf000
3502 #define HTB_RATE2QUANTUM 10
3503
/* Cached state for a "linux-htb" root qdisc. */
struct htb {
    struct tc tc;              /* Embedded generic tc state (see htb_get__()). */
    unsigned int max_rate;     /* In bytes/s. */
};
3508
/* One HTB class (queue).  Also used as a plain value type to carry parsed
 * qdisc/class settings between the parse/setup helpers below. */
struct htb_class {
    struct tc_queue tc_queue;  /* Embedded generic queue state. */
    unsigned int min_rate;     /* In bytes/s. */
    unsigned int max_rate;     /* In bytes/s. */
    unsigned int burst;        /* In bytes. */
    unsigned int priority;     /* Lower values are higher priorities. */
};
3516
3517 static struct htb *
3518 htb_get__(const struct netdev *netdev_)
3519 {
3520 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3521 return CONTAINER_OF(netdev->tc, struct htb, tc);
3522 }
3523
3524 static void
3525 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3526 {
3527 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3528 struct htb *htb;
3529
3530 htb = xmalloc(sizeof *htb);
3531 tc_init(&htb->tc, &tc_ops_htb);
3532 htb->max_rate = max_rate;
3533
3534 netdev->tc = &htb->tc;
3535 }
3536
/* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_setup_qdisc__(struct netdev *netdev)
{
    size_t opt_offset;
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    /* Delete any existing root qdisc; RTM_NEWQDISC below uses NLM_F_EXCL and
     * would otherwise fail if one is present. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = HTB_RATE2QUANTUM;
    opt.version = 3;
    opt.defcls = 1;             /* Unclassified traffic goes to class 1:1. */

    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    return tc_transact(&request, NULL);
}
3571
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * Returns 0 if successful, otherwise a positive errno value.  Fails early
 * (without touching the kernel) if the device MTU cannot be determined,
 * since the rate tables depend on it. */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
        opt.quantum = mtu;
    }
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    /* Rate and ceiling transmission-time lookup tables for the kernel. */
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
3630
/* Parses Netlink attributes in 'nl_options' for HTB parameters and stores
 * them into 'class'.  (The values correspond to the linux-htb queue details
 * described in the vswitch database documentation.)
 *
 * Returns 0 if successful, EPROTO if the options cannot be parsed. */
static int
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
{
    static const struct nl_policy tca_htb_policy[] = {
        [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
                            .min_len = sizeof(struct tc_htb_opt) },
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
    const struct tc_htb_opt *htb;

    if (!nl_parse_nested(nl_options, tca_htb_policy,
                         attrs, ARRAY_SIZE(tca_htb_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HTB class options");
        return EPROTO;
    }

    htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
    class->min_rate = htb->rate.rate;
    class->max_rate = htb->ceil.rate;
    /* The kernel reports burst as a transmission time; convert it back to
     * bytes at the class rate. */
    class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
    class->priority = htb->prio;
    return 0;
}
3659
3660 static int
3661 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3662 struct htb_class *options,
3663 struct netdev_queue_stats *stats)
3664 {
3665 struct nlattr *nl_options;
3666 unsigned int handle;
3667 int error;
3668
3669 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3670 if (!error && queue_id) {
3671 unsigned int major = tc_get_major(handle);
3672 unsigned int minor = tc_get_minor(handle);
3673 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3674 *queue_id = minor - 1;
3675 } else {
3676 error = EPROTO;
3677 }
3678 }
3679 if (!error && options) {
3680 error = htb_parse_tca_options__(nl_options, options);
3681 }
3682 return error;
3683 }
3684
3685 static void
3686 htb_parse_qdisc_details__(struct netdev *netdev_,
3687 const struct smap *details, struct htb_class *hc)
3688 {
3689 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3690 const char *max_rate_s;
3691
3692 max_rate_s = smap_get(details, "max-rate");
3693 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3694 if (!hc->max_rate) {
3695 enum netdev_features current;
3696
3697 netdev_linux_read_features(netdev);
3698 current = !netdev->get_features_error ? netdev->current : 0;
3699 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3700 }
3701 hc->min_rate = hc->max_rate;
3702 hc->burst = 0;
3703 hc->priority = 0;
3704 }
3705
/* Parses per-class HTB settings from 'details' into 'hc', clamping them to
 * ranges the qdisc configuration allows.  Returns 0 if successful, otherwise
 * a positive errno value (when the device MTU cannot be determined). */
static int
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
{
    const struct htb *htb = htb_get__(netdev);
    const char *min_rate_s = smap_get(details, "min-rate");
    const char *max_rate_s = smap_get(details, "max-rate");
    const char *burst_s = smap_get(details, "burst");
    const char *priority_s = smap_get(details, "priority");
    int mtu, error;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate: defaults to the qdisc ceiling and may not exceed it. */
    hc->max_rate = (max_rate_s
                    ? strtoull(max_rate_s, NULL, 10) / 8
                    : htb->max_rate);
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority: defaults to 0 (highest). */
    hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;

    return 0;
}
3754
3755 static int
3756 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3757 unsigned int parent, struct htb_class *options,
3758 struct netdev_queue_stats *stats)
3759 {
3760 struct ofpbuf *reply;
3761 int error;
3762
3763 error = tc_query_class(netdev, handle, parent, &reply);
3764 if (!error) {
3765 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3766 ofpbuf_delete(reply);
3767 }
3768 return error;
3769 }
3770
3771 static int
3772 htb_tc_install(struct netdev *netdev, const struct smap *details)
3773 {
3774 int error;
3775
3776 error = htb_setup_qdisc__(netdev);
3777 if (!error) {
3778 struct htb_class hc;
3779
3780 htb_parse_qdisc_details__(netdev, details, &hc);
3781 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3782 tc_make_handle(1, 0), &hc);
3783 if (!error) {
3784 htb_install__(netdev, hc.max_rate);
3785 }
3786 }
3787 return error;
3788 }
3789
3790 static struct htb_class *
3791 htb_class_cast__(const struct tc_queue *queue)
3792 {
3793 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3794 }
3795
3796 static void
3797 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3798 const struct htb_class *hc)
3799 {
3800 struct htb *htb = htb_get__(netdev);
3801 size_t hash = hash_int(queue_id, 0);
3802 struct tc_queue *queue;
3803 struct htb_class *hcp;
3804
3805 queue = tc_find_queue__(netdev, queue_id, hash);
3806 if (queue) {
3807 hcp = htb_class_cast__(queue);
3808 } else {
3809 hcp = xmalloc(sizeof *hcp);
3810 queue = &hcp->tc_queue;
3811 queue->queue_id = queue_id;
3812 queue->created = time_msec();
3813 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3814 }
3815
3816 hcp->min_rate = hc->min_rate;
3817 hcp->max_rate = hc->max_rate;
3818 hcp->burst = hc->burst;
3819 hcp->priority = hc->priority;
3820 }
3821
3822 static int
3823 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3824 {
3825 struct ofpbuf msg;
3826 struct queue_dump_state state;
3827 struct htb_class hc;
3828
3829 /* Get qdisc options. */
3830 hc.max_rate = 0;
3831 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3832 htb_install__(netdev, hc.max_rate);
3833
3834 /* Get queues. */
3835 if (!start_queue_dump(netdev, &state)) {
3836 return ENODEV;
3837 }
3838 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3839 unsigned int queue_id;
3840
3841 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3842 htb_update_queue__(netdev, queue_id, &hc);
3843 }
3844 }
3845 finish_queue_dump(&state);
3846
3847 return 0;
3848 }
3849
/* tc_destroy callback: frees every cached htb_class and the htb itself. */
static void
htb_tc_destroy(struct tc *tc)
{
    struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
    struct htb_class *hc, *next;

    /* SAFE iteration because each node is removed and freed as we go. */
    HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
        hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
        free(hc);
    }
    tc_destroy(tc);
    free(htb);
}
3863
3864 static int
3865 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3866 {
3867 const struct htb *htb = htb_get__(netdev);
3868 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
3869 return 0;
3870 }
3871
3872 static int
3873 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3874 {
3875 struct htb_class hc;
3876 int error;
3877
3878 htb_parse_qdisc_details__(netdev, details, &hc);
3879 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3880 tc_make_handle(1, 0), &hc);
3881 if (!error) {
3882 htb_get__(netdev)->max_rate = hc.max_rate;
3883 }
3884 return error;
3885 }
3886
3887 static int
3888 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3889 const struct tc_queue *queue, struct smap *details)
3890 {
3891 const struct htb_class *hc = htb_class_cast__(queue);
3892
3893 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3894 if (hc->min_rate != hc->max_rate) {
3895 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3896 }
3897 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3898 if (hc->priority) {
3899 smap_add_format(details, "priority", "%u", hc->priority);
3900 }
3901 return 0;
3902 }
3903
3904 static int
3905 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3906 const struct smap *details)
3907 {
3908 struct htb_class hc;
3909 int error;
3910
3911 error = htb_parse_class_details__(netdev, details, &hc);
3912 if (error) {
3913 return error;
3914 }
3915
3916 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3917 tc_make_handle(1, 0xfffe), &hc);
3918 if (error) {
3919 return error;
3920 }
3921
3922 htb_update_queue__(netdev, queue_id, &hc);
3923 return 0;
3924 }
3925
3926 static int
3927 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3928 {
3929 struct htb_class *hc = htb_class_cast__(queue);
3930 struct htb *htb = htb_get__(netdev);
3931 int error;
3932
3933 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3934 if (!error) {
3935 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3936 free(hc);
3937 }
3938 return error;
3939 }
3940
3941 static int
3942 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3943 struct netdev_queue_stats *stats)
3944 {
3945 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3946 tc_make_handle(1, 0xfffe), NULL, stats);
3947 }
3948
3949 static int
3950 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3951 const struct ofpbuf *nlmsg,
3952 netdev_dump_queue_stats_cb *cb, void *aux)
3953 {
3954 struct netdev_queue_stats stats;
3955 unsigned int handle, major, minor;
3956 int error;
3957
3958 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3959 if (error) {
3960 return error;
3961 }
3962
3963 major = tc_get_major(handle);
3964 minor = tc_get_minor(handle);
3965 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3966 (*cb)(minor - 1, &stats, aux);
3967 }
3968 return 0;
3969 }
3970
/* tc_ops implementation for the "linux-htb" qdisc, which does support
 * per-queue classes, so every callback slot is populated. */
static const struct tc_ops tc_ops_htb = {
    "htb",                      /* linux_name */
    "linux-htb",                /* ovs_name */
    HTB_N_QUEUES,               /* n_queues */
    htb_tc_install,
    htb_tc_load,
    htb_tc_destroy,
    htb_qdisc_get,
    htb_qdisc_set,
    htb_class_get,
    htb_class_set,
    htb_class_delete,
    htb_class_get_stats,
    htb_class_dump_stats
};
3986 \f
3987 /* "linux-hfsc" traffic control class. */
3988
3989 #define HFSC_N_QUEUES 0xf000
3990
/* Cached state for a "linux-hfsc" root qdisc.  'max_rate' is in bytes/s
 * (see hfsc_parse_qdisc_details__()). */
struct hfsc {
    struct tc tc;          /* Embedded generic tc state (see hfsc_get__()). */
    uint32_t max_rate;     /* Upper-limit rate, bytes/s. */
};
3995
/* One HFSC class (queue); also used as a value type for parsed settings.
 * Rates are in bytes/s. */
struct hfsc_class {
    struct tc_queue tc_queue;  /* Embedded generic queue state. */
    uint32_t min_rate;         /* Guaranteed (service-curve) rate. */
    uint32_t max_rate;         /* Upper-limit rate. */
};
4001
4002 static struct hfsc *
4003 hfsc_get__(const struct netdev *netdev_)
4004 {
4005 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4006 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
4007 }
4008
4009 static struct hfsc_class *
4010 hfsc_class_cast__(const struct tc_queue *queue)
4011 {
4012 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4013 }
4014
4015 static void
4016 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4017 {
4018 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4019 struct hfsc *hfsc;
4020
4021 hfsc = xmalloc(sizeof *hfsc);
4022 tc_init(&hfsc->tc, &tc_ops_hfsc);
4023 hfsc->max_rate = max_rate;
4024 netdev->tc = &hfsc->tc;
4025 }
4026
4027 static void
4028 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4029 const struct hfsc_class *hc)
4030 {
4031 size_t hash;
4032 struct hfsc *hfsc;
4033 struct hfsc_class *hcp;
4034 struct tc_queue *queue;
4035
4036 hfsc = hfsc_get__(netdev);
4037 hash = hash_int(queue_id, 0);
4038
4039 queue = tc_find_queue__(netdev, queue_id, hash);
4040 if (queue) {
4041 hcp = hfsc_class_cast__(queue);
4042 } else {
4043 hcp = xmalloc(sizeof *hcp);
4044 queue = &hcp->tc_queue;
4045 queue->queue_id = queue_id;
4046 queue->created = time_msec();
4047 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4048 }
4049
4050 hcp->min_rate = hc->min_rate;
4051 hcp->max_rate = hc->max_rate;
4052 }
4053
/* Parses the netlink attributes in 'nl_options' for an HFSC class into
 * 'class'.  Only the restricted configurations that OVS itself programs
 * (linear service curves, identical real-time and link-share curves, and an
 * upper limit at least as large) are accepted; anything else returns
 * EPROTO with a warning.  Returns 0 if successful. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    /* A nonzero m1 or d means a two-slope (non-linear) curve, which OVS
     * never programs and cannot represent. */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    /* OVS always writes the same curve for RSC and FSC (see
     * hfsc_setup_class__()), so a mismatch means foreign configuration. */
    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
4112
4113 static int
4114 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4115 struct hfsc_class *options,
4116 struct netdev_queue_stats *stats)
4117 {
4118 int error;
4119 unsigned int handle;
4120 struct nlattr *nl_options;
4121
4122 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4123 if (error) {
4124 return error;
4125 }
4126
4127 if (queue_id) {
4128 unsigned int major, minor;
4129
4130 major = tc_get_major(handle);
4131 minor = tc_get_minor(handle);
4132 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4133 *queue_id = minor - 1;
4134 } else {
4135 return EPROTO;
4136 }
4137 }
4138
4139 if (options) {
4140 error = hfsc_parse_tca_options__(nl_options, options);
4141 }
4142
4143 return error;
4144 }
4145
4146 static int
4147 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4148 unsigned int parent, struct hfsc_class *options,
4149 struct netdev_queue_stats *stats)
4150 {
4151 int error;
4152 struct ofpbuf *reply;
4153
4154 error = tc_query_class(netdev, handle, parent, &reply);
4155 if (error) {
4156 return error;
4157 }
4158
4159 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4160 ofpbuf_delete(reply);
4161 return error;
4162 }
4163
4164 static void
4165 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4166 struct hfsc_class *class)
4167 {
4168 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4169 uint32_t max_rate;
4170 const char *max_rate_s;
4171
4172 max_rate_s = smap_get(details, "max-rate");
4173 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4174
4175 if (!max_rate) {
4176 enum netdev_features current;
4177
4178 netdev_linux_read_features(netdev);
4179 current = !netdev->get_features_error ? netdev->current : 0;
4180 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4181 }
4182
4183 class->min_rate = max_rate;
4184 class->max_rate = max_rate;
4185 }
4186
4187 static int
4188 hfsc_parse_class_details__(struct netdev *netdev,
4189 const struct smap *details,
4190 struct hfsc_class * class)
4191 {
4192 const struct hfsc *hfsc;
4193 uint32_t min_rate, max_rate;
4194 const char *min_rate_s, *max_rate_s;
4195
4196 hfsc = hfsc_get__(netdev);
4197 min_rate_s = smap_get(details, "min-rate");
4198 max_rate_s = smap_get(details, "max-rate");
4199
4200 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4201 min_rate = MAX(min_rate, 1);
4202 min_rate = MIN(min_rate, hfsc->max_rate);
4203
4204 max_rate = (max_rate_s
4205 ? strtoull(max_rate_s, NULL, 10) / 8
4206 : hfsc->max_rate);
4207 max_rate = MAX(max_rate, min_rate);
4208 max_rate = MIN(max_rate, hfsc->max_rate);
4209
4210 class->min_rate = min_rate;
4211 class->max_rate = max_rate;
4212
4213 return 0;
4214 }
4215
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1".
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_setup_qdisc__(struct netdev * netdev)
{
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    /* Delete any existing root qdisc; RTM_NEWQDISC below uses NLM_F_EXCL and
     * would otherwise fail if one is present. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    opt.defcls = 1;             /* Unclassified traffic goes to class 1:1. */

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    return tc_transact(&request, NULL);
}
4246
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>" */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Flat service curves (m1 = d = 0): a constant slope of m2 bytes/s. */
    min.m1 = 0;
    min.d = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d = 0;
    max.m2 = class->max_rate;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    /* "sc" in the tc command above stands for both the real-time (RSC) and
     * link-share (FSC) curves, so 'min' is deliberately supplied for both;
     * "ul" is the upper-limit (USC) curve and gets 'max'. */
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
4297
4298 static int
4299 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4300 {
4301 int error;
4302 struct hfsc_class class;
4303
4304 error = hfsc_setup_qdisc__(netdev);
4305
4306 if (error) {
4307 return error;
4308 }
4309
4310 hfsc_parse_qdisc_details__(netdev, details, &class);
4311 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4312 tc_make_handle(1, 0), &class);
4313
4314 if (error) {
4315 return error;
4316 }
4317
4318 hfsc_install__(netdev, class.max_rate);
4319 return 0;
4320 }
4321
4322 static int
4323 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4324 {
4325 struct ofpbuf msg;
4326 struct queue_dump_state state;
4327 struct hfsc_class hc;
4328
4329 hc.max_rate = 0;
4330 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4331 hfsc_install__(netdev, hc.max_rate);
4332
4333 if (!start_queue_dump(netdev, &state)) {
4334 return ENODEV;
4335 }
4336
4337 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4338 unsigned int queue_id;
4339
4340 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4341 hfsc_update_queue__(netdev, queue_id, &hc);
4342 }
4343 }
4344
4345 finish_queue_dump(&state);
4346 return 0;
4347 }
4348
4349 static void
4350 hfsc_tc_destroy(struct tc *tc)
4351 {
4352 struct hfsc *hfsc;
4353 struct hfsc_class *hc, *next;
4354
4355 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4356
4357 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4358 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4359 free(hc);
4360 }
4361
4362 tc_destroy(tc);
4363 free(hfsc);
4364 }
4365
4366 static int
4367 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4368 {
4369 const struct hfsc *hfsc;
4370 hfsc = hfsc_get__(netdev);
4371 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
4372 return 0;
4373 }
4374
4375 static int
4376 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4377 {
4378 int error;
4379 struct hfsc_class class;
4380
4381 hfsc_parse_qdisc_details__(netdev, details, &class);
4382 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4383 tc_make_handle(1, 0), &class);
4384
4385 if (!error) {
4386 hfsc_get__(netdev)->max_rate = class.max_rate;
4387 }
4388
4389 return error;
4390 }
4391
4392 static int
4393 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4394 const struct tc_queue *queue, struct smap *details)
4395 {
4396 const struct hfsc_class *hc;
4397
4398 hc = hfsc_class_cast__(queue);
4399 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4400 if (hc->min_rate != hc->max_rate) {
4401 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4402 }
4403 return 0;
4404 }
4405
4406 static int
4407 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4408 const struct smap *details)
4409 {
4410 int error;
4411 struct hfsc_class class;
4412
4413 error = hfsc_parse_class_details__(netdev, details, &class);
4414 if (error) {
4415 return error;
4416 }
4417
4418 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4419 tc_make_handle(1, 0xfffe), &class);
4420 if (error) {
4421 return error;
4422 }
4423
4424 hfsc_update_queue__(netdev, queue_id, &class);
4425 return 0;
4426 }
4427
4428 static int
4429 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4430 {
4431 int error;
4432 struct hfsc *hfsc;
4433 struct hfsc_class *hc;
4434
4435 hc = hfsc_class_cast__(queue);
4436 hfsc = hfsc_get__(netdev);
4437
4438 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4439 if (!error) {
4440 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4441 free(hc);
4442 }
4443 return error;
4444 }
4445
4446 static int
4447 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4448 struct netdev_queue_stats *stats)
4449 {
4450 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4451 tc_make_handle(1, 0xfffe), NULL, stats);
4452 }
4453
4454 static int
4455 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4456 const struct ofpbuf *nlmsg,
4457 netdev_dump_queue_stats_cb *cb, void *aux)
4458 {
4459 struct netdev_queue_stats stats;
4460 unsigned int handle, major, minor;
4461 int error;
4462
4463 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4464 if (error) {
4465 return error;
4466 }
4467
4468 major = tc_get_major(handle);
4469 minor = tc_get_minor(handle);
4470 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4471 (*cb)(minor - 1, &stats, aux);
4472 }
4473 return 0;
4474 }
4475
/* tc_ops implementation for the "linux-hfsc" OVS QoS type, backed by the
 * kernel "hfsc" qdisc. */
static const struct tc_ops tc_ops_hfsc = {
    "hfsc",                     /* linux_name */
    "linux-hfsc",               /* ovs_name */
    HFSC_N_QUEUES,              /* n_queues */
    hfsc_tc_install,            /* tc_install */
    hfsc_tc_load,               /* tc_load */
    hfsc_tc_destroy,            /* tc_destroy */
    hfsc_qdisc_get,             /* qdisc_get */
    hfsc_qdisc_set,             /* qdisc_set */
    hfsc_class_get,             /* class_get */
    hfsc_class_set,             /* class_set */
    hfsc_class_delete,          /* class_delete */
    hfsc_class_get_stats,       /* class_get_stats */
    hfsc_class_dump_stats       /* class_dump_stats */
};
4491 \f
4492 /* "linux-default" traffic control class.
4493 *
4494 * This class represents the default, unnamed Linux qdisc. It corresponds to
4495 * the "" (empty string) QoS type in the OVS database. */
4496
/* Points 'netdev_' at the single, shared, immutable tc object representing
 * the default (unnamed) qdisc. */
static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
4507
4508 static int
4509 default_tc_install(struct netdev *netdev,
4510 const struct smap *details OVS_UNUSED)
4511 {
4512 default_install__(netdev);
4513 return 0;
4514 }
4515
4516 static int
4517 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4518 {
4519 default_install__(netdev);
4520 return 0;
4521 }
4522
/* tc_ops implementation for the "" (empty string) OVS QoS type: the kernel's
 * default, unnamed qdisc.  No classes, so most operations are unsupported. */
static const struct tc_ops tc_ops_default = {
    NULL,                       /* linux_name */
    "",                         /* ovs_name */
    0,                          /* n_queues */
    default_tc_install,
    default_tc_load,
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4538 \f
4539 /* "linux-other" traffic control class.
4540 *
4541 * */
4542
4543 static int
4544 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4545 {
4546 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4547 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4548
4549 /* Nothing but a tc class implementation is allowed to write to a tc. This
4550 * class never does that, so we can legitimately use a const tc object. */
4551 netdev->tc = CONST_CAST(struct tc *, &tc);
4552 return 0;
4553 }
4554
/* tc_ops implementation for the "linux-other" OVS QoS type: a qdisc that OVS
 * recognizes as present but does not manage.  Load-only; everything else is
 * unsupported. */
static const struct tc_ops tc_ops_other = {
    NULL,                       /* linux_name */
    "linux-other",              /* ovs_name */
    0,                          /* n_queues */
    NULL,                       /* tc_install */
    other_tc_load,
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4570 \f
4571 /* Traffic control. */
4572
/* Number of kernel "tc" ticks per second.  Initialized, once per process,
 * by read_psched(); defaults to 1.0 if /proc/net/psched is unreadable. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 *
 * Also initialized by read_psched(); defaults to 100. */
static unsigned int buffer_hz;
4594
4595 /* Returns tc handle 'major':'minor'. */
4596 static unsigned int
4597 tc_make_handle(unsigned int major, unsigned int minor)
4598 {
4599 return TC_H_MAKE(major << 16, minor);
4600 }
4601
4602 /* Returns the major number from 'handle'. */
4603 static unsigned int
4604 tc_get_major(unsigned int handle)
4605 {
4606 return TC_H_MAJ(handle) >> 16;
4607 }
4608
4609 /* Returns the minor number from 'handle'. */
4610 static unsigned int
4611 tc_get_minor(unsigned int handle)
4612 {
4613 return TC_H_MIN(handle);
4614 }
4615
/* Composes an rtnetlink request of the given 'type' (RTM_*QDISC,
 * RTM_*TCLASS, or RTM_*TFILTER) with 'flags' for 'netdev' into 'request',
 * which the caller must eventually uninitialize (tc_transact() does so).
 *
 * Returns the embedded tcmsg, zero-filled except for family and ifindex, or
 * NULL if 'netdev''s ifindex cannot be obtained (callers treat this as
 * ENODEV). */
static struct tcmsg *
tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
                struct ofpbuf *request)
{
    struct tcmsg *tcmsg;
    int ifindex;
    int error;

    error = get_ifindex(netdev, &ifindex);
    if (error) {
        return NULL;
    }

    /* 512 bytes is enough initial room for the header plus the attributes
     * our callers append; ofpbuf grows on demand anyway. */
    ofpbuf_init(request, 512);
    nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
    tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
    tcmsg->tcm_family = AF_UNSPEC;
    tcmsg->tcm_ifindex = ifindex;
    /* Caller should fill in tcmsg->tcm_handle. */
    /* Caller should fill in tcmsg->tcm_parent. */

    return tcmsg;
}
4639
4640 static int
4641 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4642 {
4643 int error = nl_transact(NETLINK_ROUTE, request, replyp);
4644 ofpbuf_uninit(request);
4645 return error;
4646 }
4647
4648 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4649 * policing configuration.
4650 *
4651 * This function is equivalent to running the following when 'add' is true:
4652 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4653 *
4654 * This function is equivalent to running the following when 'add' is false:
4655 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4656 *
4657 * The configuration and stats may be seen with the following command:
4658 * /sbin/tc -s qdisc show dev <devname>
4659 *
4660 * Returns 0 if successful, otherwise a positive errno value.
4661 */
static int
tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
    int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;

    tcmsg = tc_make_request(netdev, type, flags, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* The ingress qdisc always uses the fixed handle ffff: and the special
     * TC_H_INGRESS parent. */
    tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
    tcmsg->tcm_parent = TC_H_INGRESS;
    nl_msg_put_string(&request, TCA_KIND, "ingress");
    nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);

    error = tc_transact(&request, NULL);
    if (error) {
        /* If we're deleting the qdisc, don't worry about some of the
         * error conditions.  (Deleting a qdisc that is not installed is
         * success for our purposes.) */
        if (!add && (error == ENOENT || error == EINVAL)) {
            return 0;
        }
        return error;
    }

    return 0;
}
4692
4693 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4694 * of 'kbits_burst'.
4695 *
4696 * This function is equivalent to running:
4697 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4698 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4699 * mtu 65535 drop
4700 *
4701 * The configuration and stats may be seen with the following command:
4702 * /sbin/tc -s filter show dev <devname> parent ffff:
4703 *
4704 * Returns 0 if successful, otherwise a positive errno value.
4705 */
static int
tc_add_policer(struct netdev *netdev,
               uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct tc_police tc_police;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    size_t basic_offset;
    size_t police_offset;
    int error;
    int mtu = 65535;

    memset(&tc_police, 0, sizeof tc_police);
    tc_police.action = TC_POLICE_SHOT;  /* Drop packets over the rate. */
    tc_police.mtu = mtu;
    /* kbits_rate is in kilobits/s; the rate table wants bytes/s. */
    tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);

    /* The following appears wrong in two ways:
     *
     * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
     *   arguments (or at least consistently "bytes" as both or "bits" as
     *   both), but this supplies bytes for the first argument and bits for the
     *   second.
     *
     * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
     *
     * However if you "fix" those problems then "tc filter show ..." shows
     * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
     * 1,000,000 bits, whereas this actually ends up doing the right thing from
     * tc's point of view.  Whatever. */
    tc_police.burst = tc_bytes_to_ticks(
        tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);

    tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* Attach the filter to the ingress qdisc (handle ffff:), priority 49,
     * matching every protocol (ETH_P_ALL). */
    tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
    tcmsg->tcm_info = tc_make_handle(49,
                                     (OVS_FORCE uint16_t) htons(ETH_P_ALL));

    /* Nested layout: TCA_OPTIONS > TCA_BASIC_POLICE > police parameters. */
    nl_msg_put_string(&request, TCA_KIND, "basic");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
    nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
    tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
    nl_msg_end_nested(&request, police_offset);
    nl_msg_end_nested(&request, basic_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        return error;
    }

    return 0;
}
4763
/* Parses /proc/net/psched to initialize the 'ticks_per_s' and 'buffer_hz'
 * globals.  Runs its body at most once per process; safe defaults
 * (ticks_per_s = 1.0, buffer_hz = 100) are used if the file is missing or
 * malformed. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *     (Before that, there are hints that it was 1000000000.)
     *
     *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *     above.
     *
     *                        /proc/net/psched
     *     -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d ticks_per_s     buffer_hz
     *     ------- --------- ---------- ------------- ----------- -------------
     * [1] 819,200 1,000,000  1,000,000           100     819,200           100
     * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [4]   1,000     1,024  1,000,000           100     976,562           100
     * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
     * [6]   1,000        64  1,000,000           249  15,625,000           249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    if (!ovsthread_once_start(&once)) {
        return;
    }

    /* Fallback values, kept unless the file parses cleanly. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    if (!a || !c) {
        /* 'a' and 'c' are divisors/multiplicands below; zero would be
         * nonsense (and a division by zero). */
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
4846
4847 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4848 * rate of 'rate' bytes per second. */
4849 static unsigned int
4850 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4851 {
4852 read_psched();
4853 return (rate * ticks) / ticks_per_s;
4854 }
4855
/* Returns the number of ticks that it would take to transmit 'size' bytes at a
 * rate of 'rate' bytes per second. */
static unsigned int
tc_bytes_to_ticks(unsigned int rate, unsigned int size)
{
    read_psched();
    /* Note: the cast truncates the double 'ticks_per_s' to an integer
     * before the multiply, so the product stays in 64-bit integer
     * arithmetic.  A 'rate' of 0 would divide by zero, hence the guard. */
    return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
}
4864
4865 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4866 * a transmission rate of 'rate' bytes per second. */
4867 static unsigned int
4868 tc_buffer_per_jiffy(unsigned int rate)
4869 {
4870 read_psched();
4871 return rate / buffer_hz;
4872 }
4873
4874 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4875 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4876 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4877 * stores NULL into it if it is absent.
4878 *
4879 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4880 * 'msg'.
4881 *
4882 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
               struct nlattr **options)
{
    /* TCA_KIND is mandatory; TCA_OPTIONS may legitimately be absent, in
     * which case '*options' is set to NULL. */
    static const struct nl_policy tca_policy[] = {
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");
        goto error;
    }

    if (kind) {
        *kind = nl_attr_get_string(ta[TCA_KIND]);
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    return 0;

error:
    /* On failure, null out the outputs so callers cannot use stale
     * pointers. */
    if (kind) {
        *kind = NULL;
    }
    if (options) {
        *options = NULL;
    }
    return EPROTO;
}
4918
4919 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4920 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4921 * into '*options', and its queue statistics into '*stats'. Any of the output
4922 * arguments may be null.
4923 *
4924 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        /* The class handle lives in the fixed tcmsg header, not in an
         * attribute. */
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        /* Queued-packet drops are the closest thing we have to tx errors. */
        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    /* On failure, clear all outputs so callers see no stale data. */
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
4993
4994 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4995 * on 'netdev'. */
4996 static int
4997 tc_query_class(const struct netdev *netdev,
4998 unsigned int handle, unsigned int parent,
4999 struct ofpbuf **replyp)
5000 {
5001 struct ofpbuf request;
5002 struct tcmsg *tcmsg;
5003 int error;
5004
5005 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
5006 if (!tcmsg) {
5007 return ENODEV;
5008 }
5009 tcmsg->tcm_handle = handle;
5010 tcmsg->tcm_parent = parent;
5011
5012 error = tc_transact(&request, replyp);
5013 if (error) {
5014 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5015 netdev_get_name(netdev),
5016 tc_get_major(handle), tc_get_minor(handle),
5017 tc_get_major(parent), tc_get_minor(parent),
5018 ovs_strerror(error));
5019 }
5020 return error;
5021 }
5022
5023 /* Equivalent to "tc class del dev <name> handle <handle>". */
5024 static int
5025 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5026 {
5027 struct ofpbuf request;
5028 struct tcmsg *tcmsg;
5029 int error;
5030
5031 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5032 if (!tcmsg) {
5033 return ENODEV;
5034 }
5035 tcmsg->tcm_handle = handle;
5036 tcmsg->tcm_parent = 0;
5037
5038 error = tc_transact(&request, NULL);
5039 if (error) {
5040 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5041 netdev_get_name(netdev),
5042 tc_get_major(handle), tc_get_minor(handle),
5043 ovs_strerror(error));
5044 }
5045 return error;
5046 }
5047
5048 /* Equivalent to "tc qdisc del dev <name> root". */
static int
tc_del_qdisc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* OVS-installed root qdiscs always use handle 1:0. */
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    error = tc_transact(&request, NULL);
    if (error == EINVAL) {
        /* EINVAL probably means that the default qdisc was in use, in which
         * case we've accomplished our purpose. */
        error = 0;
    }
    if (!error && netdev->tc) {
        /* Drop the in-memory tc state to match the kernel; the tc_destroy
         * hook is optional (e.g. tc_ops_default has none). */
        if (netdev->tc->ops->tc_destroy) {
            netdev->tc->ops->tc_destroy(netdev->tc);
        }
        netdev->tc = NULL;
    }
    return error;
}
5078
/* Returns true if it is safe to issue a straightforward RTM_GETQDISC on this
 * kernel (2.6.35 or later), false for older kernels where that request can
 * OOPS; see the comment in tc_query_qdisc().  The answer is computed once
 * and cached. */
static bool
getqdisc_is_safe(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static bool safe = false;

    if (ovsthread_once_start(&once)) {
        struct utsname utsname;
        int major, minor;

        /* Default to "unsafe" unless the release string parses and is at
         * least 2.35. */
        if (uname(&utsname) == -1) {
            VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
        } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
            VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
        } else if (major < 2 || (major == 2 && minor < 35)) {
            VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
                      utsname.release);
        } else {
            safe = true;
        }
        ovsthread_once_done(&once);
    }
    return safe;
}
5103
5104 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5105 * kernel to determine what they are. Returns 0 if successful, otherwise a
5106 * positive errno value. */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    if (netdev->tc) {
        /* Already known; nothing to do. */
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            ops = &tc_ops_other;
        } else {
            /* Map the kernel qdisc name to our tc implementation; fall
             * back to "linux-other" for qdiscs we do not manage. */
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
5182
5183 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5184 approximate the time to transmit packets of various lengths. For an MTU of
5185 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5186 represents two possible packet lengths; for a MTU of 513 through 1024, four
5187 possible lengths; and so on.
5188
5189 Returns, for the specified 'mtu', the number of bits that packet lengths
5190 need to be shifted right to fit within such a 256-entry table. */
5191 static int
5192 tc_calc_cell_log(unsigned int mtu)
5193 {
5194 int cell_log;
5195
5196 if (!mtu) {
5197 mtu = ETH_PAYLOAD_MAX;
5198 }
5199 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5200
5201 for (cell_log = 0; mtu >= 256; cell_log++) {
5202 mtu >>= 1;
5203 }
5204
5205 return cell_log;
5206 }
5207
5208 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5209 * of 'mtu'. */
5210 static void
5211 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5212 {
5213 memset(rate, 0, sizeof *rate);
5214 rate->cell_log = tc_calc_cell_log(mtu);
5215 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5216 /* rate->cell_align = 0; */ /* distro headers. */
5217 rate->mpu = ETH_TOTAL_MIN;
5218 rate->rate = Bps;
5219 }
5220
5221 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5222 * attribute of the specified "type".
5223 *
5224 * See tc_calc_cell_log() above for a description of "rtab"s. */
5225 static void
5226 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5227 {
5228 uint32_t *rtab;
5229 unsigned int i;
5230
5231 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5232 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5233 unsigned packet_size = (i + 1) << rate->cell_log;
5234 if (packet_size < rate->mpu) {
5235 packet_size = rate->mpu;
5236 }
5237 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5238 }
5239 }
5240
5241 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
5242 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
5243 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
5244 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* The qdisc needs at least one jiffy's worth of data plus one MTU of
     * headroom; honor a larger user-requested burst when one is given. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > min_burst ? burst_bytes : min_burst;

    return tc_bytes_to_ticks(Bps, burst);
}
5251 \f
5252 /* Linux-only functions declared in netdev-linux.h */
5253
5254 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5255 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5256 int
5257 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5258 const char *flag_name, bool enable)
5259 {
5260 const char *netdev_name = netdev_get_name(netdev);
5261 struct ethtool_value evalue;
5262 uint32_t new_flags;
5263 int error;
5264
5265 COVERAGE_INC(netdev_get_ethtool);
5266 memset(&evalue, 0, sizeof evalue);
5267 error = netdev_linux_do_ethtool(netdev_name,
5268 (struct ethtool_cmd *)&evalue,
5269 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5270 if (error) {
5271 return error;
5272 }
5273
5274 COVERAGE_INC(netdev_set_ethtool);
5275 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5276 if (new_flags == evalue.data) {
5277 return 0;
5278 }
5279 evalue.data = new_flags;
5280 error = netdev_linux_do_ethtool(netdev_name,
5281 (struct ethtool_cmd *)&evalue,
5282 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5283 if (error) {
5284 return error;
5285 }
5286
5287 COVERAGE_INC(netdev_get_ethtool);
5288 memset(&evalue, 0, sizeof evalue);
5289 error = netdev_linux_do_ethtool(netdev_name,
5290 (struct ethtool_cmd *)&evalue,
5291 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5292 if (error) {
5293 return error;
5294 }
5295
5296 if (new_flags != evalue.data) {
5297 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5298 "device %s failed", enable ? "enable" : "disable",
5299 flag_name, netdev_name);
5300 return EOPNOTSUPP;
5301 }
5302
5303 return 0;
5304 }
5305 \f
5306 /* Utility functions. */
5307
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * Converts the kernel's 32-bit rtnl_link_stats counters (from an IFLA_STATS
 * attribute) into OVS's netdev_stats representation, field by field. */
static void
netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
                                  const struct rtnl_link_stats *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
5335
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * Same as netdev_stats_from_rtnl_link_stats() but for the kernel's 64-bit
 * rtnl_link_stats64 counters (from an IFLA_STATS64 attribute), which do not
 * wrap as quickly on busy interfaces. */
static void
netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
                                    const struct rtnl_link_stats64 *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
5363
5364 static int
5365 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5366 {
5367 struct ofpbuf request;
5368 struct ofpbuf *reply;
5369 int error;
5370
5371 ofpbuf_init(&request, 0);
5372 nl_msg_put_nlmsghdr(&request,
5373 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5374 RTM_GETLINK, NLM_F_REQUEST);
5375 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5376 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5377 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5378 ofpbuf_uninit(&request);
5379 if (error) {
5380 return error;
5381 }
5382
5383 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5384 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5385 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5386 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5387 error = 0;
5388 } else {
5389 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5390 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5391 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5392 error = 0;
5393 } else {
5394 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5395 error = EPROTO;
5396 }
5397 }
5398 } else {
5399 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5400 error = EPROTO;
5401 }
5402
5403
5404 ofpbuf_delete(reply);
5405 return error;
5406 }
5407
5408 static int
5409 get_flags(const struct netdev *dev, unsigned int *flags)
5410 {
5411 struct ifreq ifr;
5412 int error;
5413
5414 *flags = 0;
5415 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5416 if (!error) {
5417 *flags = ifr.ifr_flags;
5418 }
5419 return error;
5420 }
5421
5422 static int
5423 set_flags(const char *name, unsigned int flags)
5424 {
5425 struct ifreq ifr;
5426
5427 ifr.ifr_flags = flags;
5428 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5429 }
5430
5431 static int
5432 do_get_ifindex(const char *netdev_name)
5433 {
5434 struct ifreq ifr;
5435 int error;
5436
5437 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5438 COVERAGE_INC(netdev_get_ifindex);
5439
5440 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5441 if (error) {
5442 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5443 netdev_name, ovs_strerror(error));
5444 return -error;
5445 }
5446 return ifr.ifr_ifindex;
5447 }
5448
5449 static int
5450 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5451 {
5452 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5453
5454 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5455 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
5456
5457 if (ifindex < 0) {
5458 netdev->get_ifindex_error = -ifindex;
5459 netdev->ifindex = 0;
5460 } else {
5461 netdev->get_ifindex_error = 0;
5462 netdev->ifindex = ifindex;
5463 }
5464 netdev->cache_valid |= VALID_IFINDEX;
5465 }
5466
5467 *ifindexp = netdev->ifindex;
5468 return netdev->get_ifindex_error;
5469 }
5470
5471 static int
5472 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5473 {
5474 struct ifreq ifr;
5475 int hwaddr_family;
5476 int error;
5477
5478 memset(&ifr, 0, sizeof ifr);
5479 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5480 COVERAGE_INC(netdev_get_hwaddr);
5481 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5482 if (error) {
5483 /* ENODEV probably means that a vif disappeared asynchronously and
5484 * hasn't been removed from the database yet, so reduce the log level
5485 * to INFO for that case. */
5486 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5487 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5488 netdev_name, ovs_strerror(error));
5489 return error;
5490 }
5491 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5492 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5493 VLOG_INFO("%s device has unknown hardware address family %d",
5494 netdev_name, hwaddr_family);
5495 return EINVAL;
5496 }
5497 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5498 return 0;
5499 }
5500
5501 static int
5502 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5503 {
5504 struct ifreq ifr;
5505 int error;
5506
5507 memset(&ifr, 0, sizeof ifr);
5508 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5509 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5510 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5511 COVERAGE_INC(netdev_set_hwaddr);
5512 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5513 if (error) {
5514 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5515 netdev_name, ovs_strerror(error));
5516 }
5517 return error;
5518 }
5519
5520 static int
5521 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5522 int cmd, const char *cmd_name)
5523 {
5524 struct ifreq ifr;
5525 int error;
5526
5527 memset(&ifr, 0, sizeof ifr);
5528 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5529 ifr.ifr_data = (caddr_t) ecmd;
5530
5531 ecmd->cmd = cmd;
5532 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5533 if (error) {
5534 if (error != EOPNOTSUPP) {
5535 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5536 "failed: %s", cmd_name, name, ovs_strerror(error));
5537 } else {
5538 /* The device doesn't support this operation. That's pretty
5539 * common, so there's no point in logging anything. */
5540 }
5541 }
5542 return error;
5543 }
5544
5545 /* Returns an AF_PACKET raw socket or a negative errno value. */
5546 static int
5547 af_packet_sock(void)
5548 {
5549 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5550 static int sock;
5551
5552 if (ovsthread_once_start(&once)) {
5553 sock = socket(AF_PACKET, SOCK_RAW, 0);
5554 if (sock >= 0) {
5555 int error = set_nonblocking(sock);
5556 if (error) {
5557 close(sock);
5558 sock = -error;
5559 }
5560 } else {
5561 sock = -errno;
5562 VLOG_ERR("failed to create packet socket: %s",
5563 ovs_strerror(errno));
5564 }
5565 ovsthread_once_done(&once);
5566 }
5567
5568 return sock;
5569 }