]> git.proxmox.com Git - mirror_ovs.git/blob - lib/netdev-linux.c
json: Move from lib to include/openvswitch.
[mirror_ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <arpa/inet.h>
24 #include <inttypes.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
41 #include <net/if.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
46 #include <poll.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <unistd.h>
50
51 #include "coverage.h"
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "openvswitch/dynamic-string.h"
56 #include "fatal-signal.h"
57 #include "hash.h"
58 #include "openvswitch/hmap.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
63 #include "netlink.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
67 #include "packets.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
70 #include "openvswitch/shash.h"
71 #include "socket-util.h"
72 #include "sset.h"
73 #include "timer.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
76 #include "util.h"
77
78 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79
80 COVERAGE_DEFINE(netdev_set_policing);
81 COVERAGE_DEFINE(netdev_arp_lookup);
82 COVERAGE_DEFINE(netdev_get_ifindex);
83 COVERAGE_DEFINE(netdev_get_hwaddr);
84 COVERAGE_DEFINE(netdev_set_hwaddr);
85 COVERAGE_DEFINE(netdev_get_ethtool);
86 COVERAGE_DEFINE(netdev_set_ethtool);
87
88 \f
89 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 * old headers. */
91 #ifndef ADVERTISED_Pause
92 #define ADVERTISED_Pause (1 << 13)
93 #endif
94 #ifndef ADVERTISED_Asym_Pause
95 #define ADVERTISED_Asym_Pause (1 << 14)
96 #endif
97
98 /* These were introduced in Linux 2.6.24, so they might be missing if we
99 * have old headers. */
100 #ifndef ETHTOOL_GFLAGS
101 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #endif
103 #ifndef ETHTOOL_SFLAGS
104 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
105 #endif
106
107 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
108 * headers. */
109 #ifndef TC_RTAB_SIZE
110 #define TC_RTAB_SIZE 1024
111 #endif
112
113 /* Linux 2.6.21 introduced struct tpacket_auxdata.
114 * Linux 2.6.27 added the tp_vlan_tci member.
115 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
116 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
117 * TP_STATUS_VLAN_TPID_VALID.
118 *
119 * With all this churn it's easiest to unconditionally define a replacement
120 * structure that has everything we want.
121 */
122 #ifndef PACKET_AUXDATA
123 #define PACKET_AUXDATA 8
124 #endif
125 #ifndef TP_STATUS_VLAN_VALID
126 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #endif
128 #ifndef TP_STATUS_VLAN_TPID_VALID
129 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #endif
131 #undef tpacket_auxdata
132 #define tpacket_auxdata rpl_tpacket_auxdata
133 struct tpacket_auxdata {
134 uint32_t tp_status;
135 uint32_t tp_len;
136 uint32_t tp_snaplen;
137 uint16_t tp_mac;
138 uint16_t tp_net;
139 uint16_t tp_vlan_tci;
140 uint16_t tp_vlan_tpid;
141 };
142
143 /* Linux 2.6.27 introduced ethtool_cmd_speed
144 *
145 * To avoid revisiting problems reported with using configure to detect
146 * compatibility (see report at
147 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
148 * unconditionally replace ethtool_cmd_speed. */
149 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
150 static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
151 {
152 return ep->speed | (ep->speed_hi << 16);
153 }
154
155 /* Linux 2.6.30 introduced supported and advertised flags for
156 * 1G base KX, and 10G base KX4, KR and R. */
157 #ifndef SUPPORTED_1000baseKX_Full
158 #define SUPPORTED_1000baseKX_Full (1 << 17)
159 #define SUPPORTED_10000baseKX4_Full (1 << 18)
160 #define SUPPORTED_10000baseKR_Full (1 << 19)
161 #define SUPPORTED_10000baseR_FEC (1 << 20)
162 #define ADVERTISED_1000baseKX_Full (1 << 17)
163 #define ADVERTISED_10000baseKX4_Full (1 << 18)
164 #define ADVERTISED_10000baseKR_Full (1 << 19)
165 #define ADVERTISED_10000baseR_FEC (1 << 20)
166 #endif
167
168 /* Linux 3.5 introduced supported and advertised flags for
169 * 40G base KR4, CR4, SR4 and LR4. */
170 #ifndef SUPPORTED_40000baseKR4_Full
171 #define SUPPORTED_40000baseKR4_Full (1 << 23)
172 #define SUPPORTED_40000baseCR4_Full (1 << 24)
173 #define SUPPORTED_40000baseSR4_Full (1 << 25)
174 #define SUPPORTED_40000baseLR4_Full (1 << 26)
175 #define ADVERTISED_40000baseKR4_Full (1 << 23)
176 #define ADVERTISED_40000baseCR4_Full (1 << 24)
177 #define ADVERTISED_40000baseSR4_Full (1 << 25)
178 #define ADVERTISED_40000baseLR4_Full (1 << 26)
179 #endif
180
181 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
182 *
183 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
184 * 2.6.32-431.29.2.el6.x86_64 (see report at
185 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
186 * if_link.h is not self-contained on those kernels. It is easiest to
187 * unconditionally define a replacement. */
188 #ifndef IFLA_STATS64
189 #define IFLA_STATS64 23
190 #endif
191 #define rtnl_link_stats64 rpl_rtnl_link_stats64
192 struct rtnl_link_stats64 {
193 uint64_t rx_packets;
194 uint64_t tx_packets;
195 uint64_t rx_bytes;
196 uint64_t tx_bytes;
197 uint64_t rx_errors;
198 uint64_t tx_errors;
199 uint64_t rx_dropped;
200 uint64_t tx_dropped;
201 uint64_t multicast;
202 uint64_t collisions;
203
204 uint64_t rx_length_errors;
205 uint64_t rx_over_errors;
206 uint64_t rx_crc_errors;
207 uint64_t rx_frame_errors;
208 uint64_t rx_fifo_errors;
209 uint64_t rx_missed_errors;
210
211 uint64_t tx_aborted_errors;
212 uint64_t tx_carrier_errors;
213 uint64_t tx_fifo_errors;
214 uint64_t tx_heartbeat_errors;
215 uint64_t tx_window_errors;
216
217 uint64_t rx_compressed;
218 uint64_t tx_compressed;
219 };
220
221 enum {
222 VALID_IFINDEX = 1 << 0,
223 VALID_ETHERADDR = 1 << 1,
224 VALID_IN = 1 << 2,
225 VALID_MTU = 1 << 3,
226 VALID_POLICING = 1 << 4,
227 VALID_VPORT_STAT_ERROR = 1 << 5,
228 VALID_DRVINFO = 1 << 6,
229 VALID_FEATURES = 1 << 7,
230 };
231 \f
232 /* Traffic control. */
233
234 /* An instance of a traffic control class. Always associated with a particular
235 * network device.
236 *
237 * Each TC implementation subclasses this with whatever additional data it
238 * needs. */
239 struct tc {
240 const struct tc_ops *ops;
241 struct hmap queues; /* Contains "struct tc_queue"s.
242 * Read by generic TC layer.
243 * Written only by TC implementation. */
244 };
245
246 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
247
248 /* One traffic control queue.
249 *
250 * Each TC implementation subclasses this with whatever additional data it
251 * needs. */
252 struct tc_queue {
253 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
254 unsigned int queue_id; /* OpenFlow queue ID. */
255 long long int created; /* Time queue was created, in msecs. */
256 };
257
258 /* A particular kind of traffic control. Each implementation generally maps to
259 * one particular Linux qdisc class.
260 *
261 * The functions below return 0 if successful or a positive errno value on
262 * failure, except where otherwise noted. All of them must be provided, except
263 * where otherwise noted. */
264 struct tc_ops {
265 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
266 * This is null for tc_ops_default and tc_ops_other, for which there are no
267 * appropriate values. */
268 const char *linux_name;
269
270 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
271 const char *ovs_name;
272
273 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
274 * queues. The queues are numbered 0 through n_queues - 1. */
275 unsigned int n_queues;
276
277 /* Called to install this TC class on 'netdev'. The implementation should
278 * make the Netlink calls required to set up 'netdev' with the right qdisc
279 * and configure it according to 'details'. The implementation may assume
280 * that the current qdisc is the default; that is, there is no need for it
281 * to delete the current qdisc before installing itself.
282 *
283 * The contents of 'details' should be documented as valid for 'ovs_name'
284 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
285 * (which is built as ovs-vswitchd.conf.db(8)).
286 *
287 * This function must return 0 if and only if it sets 'netdev->tc' to an
288 * initialized 'struct tc'.
289 *
290 * (This function is null for tc_ops_other, which cannot be installed. For
291 * other TC classes it should always be nonnull.) */
292 int (*tc_install)(struct netdev *netdev, const struct smap *details);
293
294 /* Called when the netdev code determines (through a Netlink query) that
295 * this TC class's qdisc is installed on 'netdev', but we didn't install
296 * it ourselves and so don't know any of the details.
297 *
298 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
299 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
300 * implementation should parse the other attributes of 'nlmsg' as
301 * necessary to determine its configuration. If necessary it should also
302 * use Netlink queries to determine the configuration of queues on
303 * 'netdev'.
304 *
305 * This function must return 0 if and only if it sets 'netdev->tc' to an
306 * initialized 'struct tc'. */
307 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
308
309 /* Destroys the data structures allocated by the implementation as part of
310 * 'tc'. (This includes destroying 'tc->queues' by calling
311 * tc_destroy(tc).
312 *
313 * The implementation should not need to perform any Netlink calls. If
314 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
315 * (But it may not be desirable.)
316 *
317 * This function may be null if 'tc' is trivial. */
318 void (*tc_destroy)(struct tc *tc);
319
320 /* Retrieves details of 'netdev->tc' configuration into 'details'.
321 *
322 * The implementation should not need to perform any Netlink calls, because
323 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
324 * cached the configuration.
325 *
326 * The contents of 'details' should be documented as valid for 'ovs_name'
327 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
328 * (which is built as ovs-vswitchd.conf.db(8)).
329 *
330 * This function may be null if 'tc' is not configurable.
331 */
332 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
333
334 /* Reconfigures 'netdev->tc' according to 'details', performing any
335 * required Netlink calls to complete the reconfiguration.
336 *
337 * The contents of 'details' should be documented as valid for 'ovs_name'
338 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
339 * (which is built as ovs-vswitchd.conf.db(8)).
340 *
341 * This function may be null if 'tc' is not configurable.
342 */
343 int (*qdisc_set)(struct netdev *, const struct smap *details);
344
345 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
346 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
347 *
348 * The contents of 'details' should be documented as valid for 'ovs_name'
349 * in the "other_config" column in the "Queue" table in
350 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
351 *
352 * The implementation should not need to perform any Netlink calls, because
353 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
354 * cached the queue configuration.
355 *
356 * This function may be null if 'tc' does not have queues ('n_queues' is
357 * 0). */
358 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
359 struct smap *details);
360
361 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
362 * 'details', perfoming any required Netlink calls to complete the
363 * reconfiguration. The caller ensures that 'queue_id' is less than
364 * 'n_queues'.
365 *
366 * The contents of 'details' should be documented as valid for 'ovs_name'
367 * in the "other_config" column in the "Queue" table in
368 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
369 *
370 * This function may be null if 'tc' does not have queues or its queues are
371 * not configurable. */
372 int (*class_set)(struct netdev *, unsigned int queue_id,
373 const struct smap *details);
374
375 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
376 * tc_queue's within 'netdev->tc->queues'.
377 *
378 * This function may be null if 'tc' does not have queues or its queues
379 * cannot be deleted. */
380 int (*class_delete)(struct netdev *, struct tc_queue *queue);
381
382 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
383 * 'struct tc_queue's within 'netdev->tc->queues'.
384 *
385 * On success, initializes '*stats'.
386 *
387 * This function may be null if 'tc' does not have queues or if it cannot
388 * report queue statistics. */
389 int (*class_get_stats)(const struct netdev *netdev,
390 const struct tc_queue *queue,
391 struct netdev_queue_stats *stats);
392
393 /* Extracts queue stats from 'nlmsg', which is a response to a
394 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
395 *
396 * This function may be null if 'tc' does not have queues or if it cannot
397 * report queue statistics. */
398 int (*class_dump_stats)(const struct netdev *netdev,
399 const struct ofpbuf *nlmsg,
400 netdev_dump_queue_stats_cb *cb, void *aux);
401 };
402
403 static void
404 tc_init(struct tc *tc, const struct tc_ops *ops)
405 {
406 tc->ops = ops;
407 hmap_init(&tc->queues);
408 }
409
410 static void
411 tc_destroy(struct tc *tc)
412 {
413 hmap_destroy(&tc->queues);
414 }
415
416 static const struct tc_ops tc_ops_htb;
417 static const struct tc_ops tc_ops_hfsc;
418 static const struct tc_ops tc_ops_codel;
419 static const struct tc_ops tc_ops_fqcodel;
420 static const struct tc_ops tc_ops_sfq;
421 static const struct tc_ops tc_ops_default;
422 static const struct tc_ops tc_ops_noop;
423 static const struct tc_ops tc_ops_other;
424
425 static const struct tc_ops *const tcs[] = {
426 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
427 &tc_ops_hfsc, /* Hierarchical fair service curve. */
428 &tc_ops_codel, /* Controlled delay */
429 &tc_ops_fqcodel, /* Fair queue controlled delay */
430 &tc_ops_sfq, /* Stochastic fair queueing */
431 &tc_ops_noop, /* Non operating qos type. */
432 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
433 &tc_ops_other, /* Some other qdisc. */
434 NULL
435 };
436
437 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
438 static unsigned int tc_get_major(unsigned int handle);
439 static unsigned int tc_get_minor(unsigned int handle);
440
441 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
442 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
443 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
444
445 static struct tcmsg *tc_make_request(const struct netdev *, int type,
446 unsigned int flags, struct ofpbuf *);
447 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
448 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
449 static int tc_add_policer(struct netdev *,
450 uint32_t kbits_rate, uint32_t kbits_burst);
451
452 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
453 struct nlattr **options);
454 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
455 struct nlattr **options,
456 struct netdev_queue_stats *);
457 static int tc_query_class(const struct netdev *,
458 unsigned int handle, unsigned int parent,
459 struct ofpbuf **replyp);
460 static int tc_delete_class(const struct netdev *, unsigned int handle);
461
462 static int tc_del_qdisc(struct netdev *netdev);
463 static int tc_query_qdisc(const struct netdev *netdev);
464
465 static int tc_calc_cell_log(unsigned int mtu);
466 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
467 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
468 const struct tc_ratespec *rate);
469 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
470 \f
471 struct netdev_linux {
472 struct netdev up;
473
474 /* Protects all members below. */
475 struct ovs_mutex mutex;
476
477 unsigned int cache_valid;
478
479 bool miimon; /* Link status of last poll. */
480 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
481 struct timer miimon_timer;
482
483 /* The following are figured out "on demand" only. They are only valid
484 * when the corresponding VALID_* bit in 'cache_valid' is set. */
485 int ifindex;
486 struct eth_addr etheraddr;
487 int mtu;
488 unsigned int ifi_flags;
489 long long int carrier_resets;
490 uint32_t kbits_rate; /* Policing data. */
491 uint32_t kbits_burst;
492 int vport_stats_error; /* Cached error code from vport_get_stats().
493 0 or an errno value. */
494 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
495 int ether_addr_error; /* Cached error code from set/get etheraddr. */
496 int netdev_policing_error; /* Cached error code from set policing. */
497 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
498 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
499
500 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
501 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
502 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
503
504 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
505 struct tc *tc;
506
507 /* For devices of class netdev_tap_class only. */
508 int tap_fd;
509 };
510
511 struct netdev_rxq_linux {
512 struct netdev_rxq up;
513 bool is_tap;
514 int fd;
515 };
516
517 /* This is set pretty low because we probably won't learn anything from the
518 * additional log messages. */
519 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
520
521 /* Polling miimon status for all ports causes performance degradation when
522 * handling a large number of ports. If there are no devices using miimon, then
523 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
524 *
525 * Readers do not depend on this variable synchronizing with the related
526 * changes in the device miimon status, so we can use atomic_count. */
527 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
528
529 static void netdev_linux_run(void);
530
531 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
532 int cmd, const char *cmd_name);
533 static int get_flags(const struct netdev *, unsigned int *flags);
534 static int set_flags(const char *, unsigned int flags);
535 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
536 enum netdev_flags on, enum netdev_flags *old_flagsp)
537 OVS_REQUIRES(netdev->mutex);
538 static int do_get_ifindex(const char *netdev_name);
539 static int get_ifindex(const struct netdev *, int *ifindexp);
540 static int do_set_addr(struct netdev *netdev,
541 int ioctl_nr, const char *ioctl_name,
542 struct in_addr addr);
543 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
544 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
545 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
546 static int af_packet_sock(void);
547 static bool netdev_linux_miimon_enabled(void);
548 static void netdev_linux_miimon_run(void);
549 static void netdev_linux_miimon_wait(void);
550 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
551
552 static bool
553 is_netdev_linux_class(const struct netdev_class *netdev_class)
554 {
555 return netdev_class->run == netdev_linux_run;
556 }
557
558 static bool
559 is_tap_netdev(const struct netdev *netdev)
560 {
561 return netdev_get_class(netdev) == &netdev_tap_class;
562 }
563
564 static struct netdev_linux *
565 netdev_linux_cast(const struct netdev *netdev)
566 {
567 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
568
569 return CONTAINER_OF(netdev, struct netdev_linux, up);
570 }
571
572 static struct netdev_rxq_linux *
573 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
574 {
575 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
576 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
577 }
578 \f
579 static void netdev_linux_update(struct netdev_linux *netdev,
580 const struct rtnetlink_change *)
581 OVS_REQUIRES(netdev->mutex);
582 static void netdev_linux_changed(struct netdev_linux *netdev,
583 unsigned int ifi_flags, unsigned int mask)
584 OVS_REQUIRES(netdev->mutex);
585
586 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
587 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
588 * if no such socket could be created. */
589 static struct nl_sock *
590 netdev_linux_notify_sock(void)
591 {
592 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
593 static struct nl_sock *sock;
594 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
595 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
596
597 if (ovsthread_once_start(&once)) {
598 int error;
599
600 error = nl_sock_create(NETLINK_ROUTE, &sock);
601 if (!error) {
602 size_t i;
603
604 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
605 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
606 if (error) {
607 nl_sock_destroy(sock);
608 sock = NULL;
609 break;
610 }
611 }
612 }
613 ovsthread_once_done(&once);
614 }
615
616 return sock;
617 }
618
619 static bool
620 netdev_linux_miimon_enabled(void)
621 {
622 return atomic_count_get(&miimon_cnt) > 0;
623 }
624
625 static void
626 netdev_linux_run(void)
627 {
628 struct nl_sock *sock;
629 int error;
630
631 if (netdev_linux_miimon_enabled()) {
632 netdev_linux_miimon_run();
633 }
634
635 sock = netdev_linux_notify_sock();
636 if (!sock) {
637 return;
638 }
639
640 do {
641 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
642 uint64_t buf_stub[4096 / 8];
643 struct ofpbuf buf;
644
645 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
646 error = nl_sock_recv(sock, &buf, false);
647 if (!error) {
648 struct rtnetlink_change change;
649
650 if (rtnetlink_parse(&buf, &change)) {
651 struct netdev *netdev_ = NULL;
652 char dev_name[IFNAMSIZ];
653
654 if (!change.ifname) {
655 change.ifname = if_indextoname(change.if_index, dev_name);
656 }
657
658 if (change.ifname) {
659 netdev_ = netdev_from_name(change.ifname);
660 }
661 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
662 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
663
664 ovs_mutex_lock(&netdev->mutex);
665 netdev_linux_update(netdev, &change);
666 ovs_mutex_unlock(&netdev->mutex);
667 }
668 netdev_close(netdev_);
669 }
670 } else if (error == ENOBUFS) {
671 struct shash device_shash;
672 struct shash_node *node;
673
674 nl_sock_drain(sock);
675
676 shash_init(&device_shash);
677 netdev_get_devices(&netdev_linux_class, &device_shash);
678 SHASH_FOR_EACH (node, &device_shash) {
679 struct netdev *netdev_ = node->data;
680 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
681 unsigned int flags;
682
683 ovs_mutex_lock(&netdev->mutex);
684 get_flags(netdev_, &flags);
685 netdev_linux_changed(netdev, flags, 0);
686 ovs_mutex_unlock(&netdev->mutex);
687
688 netdev_close(netdev_);
689 }
690 shash_destroy(&device_shash);
691 } else if (error != EAGAIN) {
692 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
693 ovs_strerror(error));
694 }
695 ofpbuf_uninit(&buf);
696 } while (!error);
697 }
698
699 static void
700 netdev_linux_wait(void)
701 {
702 struct nl_sock *sock;
703
704 if (netdev_linux_miimon_enabled()) {
705 netdev_linux_miimon_wait();
706 }
707 sock = netdev_linux_notify_sock();
708 if (sock) {
709 nl_sock_wait(sock, POLLIN);
710 }
711 }
712
713 static void
714 netdev_linux_changed(struct netdev_linux *dev,
715 unsigned int ifi_flags, unsigned int mask)
716 OVS_REQUIRES(dev->mutex)
717 {
718 netdev_change_seq_changed(&dev->up);
719
720 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
721 dev->carrier_resets++;
722 }
723 dev->ifi_flags = ifi_flags;
724
725 dev->cache_valid &= mask;
726 if (!(mask & VALID_IN)) {
727 netdev_get_addrs_list_flush();
728 }
729 }
730
731 static void
732 netdev_linux_update(struct netdev_linux *dev,
733 const struct rtnetlink_change *change)
734 OVS_REQUIRES(dev->mutex)
735 {
736 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
737 if (change->nlmsg_type == RTM_NEWLINK) {
738 /* Keep drv-info, and ip addresses. */
739 netdev_linux_changed(dev, change->ifi_flags,
740 VALID_DRVINFO | VALID_IN);
741
742 /* Update netdev from rtnl-change msg. */
743 if (change->mtu) {
744 dev->mtu = change->mtu;
745 dev->cache_valid |= VALID_MTU;
746 dev->netdev_mtu_error = 0;
747 }
748
749 if (!eth_addr_is_zero(change->mac)) {
750 dev->etheraddr = change->mac;
751 dev->cache_valid |= VALID_ETHERADDR;
752 dev->ether_addr_error = 0;
753 }
754
755 dev->ifindex = change->if_index;
756 dev->cache_valid |= VALID_IFINDEX;
757 dev->get_ifindex_error = 0;
758 } else {
759 netdev_linux_changed(dev, change->ifi_flags, 0);
760 }
761 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
762 /* Invalidates in4, in6. */
763 netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
764 } else {
765 OVS_NOT_REACHED();
766 }
767 }
768
769 static struct netdev *
770 netdev_linux_alloc(void)
771 {
772 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
773 return &netdev->up;
774 }
775
776 static void
777 netdev_linux_common_construct(struct netdev_linux *netdev)
778 {
779 ovs_mutex_init(&netdev->mutex);
780 }
781
782 /* Creates system and internal devices. */
783 static int
784 netdev_linux_construct(struct netdev *netdev_)
785 {
786 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
787 int error;
788
789 netdev_linux_common_construct(netdev);
790
791 error = get_flags(&netdev->up, &netdev->ifi_flags);
792 if (error == ENODEV) {
793 if (netdev->up.netdev_class != &netdev_internal_class) {
794 /* The device does not exist, so don't allow it to be opened. */
795 return ENODEV;
796 } else {
797 /* "Internal" netdevs have to be created as netdev objects before
798 * they exist in the kernel, because creating them in the kernel
799 * happens by passing a netdev object to dpif_port_add().
800 * Therefore, ignore the error. */
801 }
802 }
803
804 return 0;
805 }
806
807 /* For most types of netdevs we open the device for each call of
808 * netdev_open(). However, this is not the case with tap devices,
809 * since it is only possible to open the device once. In this
810 * situation we share a single file descriptor, and consequently
811 * buffers, across all readers. Therefore once data is read it will
812 * be unavailable to other reads for tap devices. */
813 static int
814 netdev_linux_construct_tap(struct netdev *netdev_)
815 {
816 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
817 static const char tap_dev[] = "/dev/net/tun";
818 const char *name = netdev_->name;
819 struct ifreq ifr;
820 int error;
821
822 netdev_linux_common_construct(netdev);
823
824 /* Open tap device. */
825 netdev->tap_fd = open(tap_dev, O_RDWR);
826 if (netdev->tap_fd < 0) {
827 error = errno;
828 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
829 return error;
830 }
831
832 /* Create tap device. */
833 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
834 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
835 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
836 VLOG_WARN("%s: creating tap device failed: %s", name,
837 ovs_strerror(errno));
838 error = errno;
839 goto error_close;
840 }
841
842 /* Make non-blocking. */
843 error = set_nonblocking(netdev->tap_fd);
844 if (error) {
845 goto error_close;
846 }
847
848 return 0;
849
850 error_close:
851 close(netdev->tap_fd);
852 return error;
853 }
854
855 static void
856 netdev_linux_destruct(struct netdev *netdev_)
857 {
858 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
859
860 if (netdev->tc && netdev->tc->ops->tc_destroy) {
861 netdev->tc->ops->tc_destroy(netdev->tc);
862 }
863
864 if (netdev_get_class(netdev_) == &netdev_tap_class
865 && netdev->tap_fd >= 0)
866 {
867 close(netdev->tap_fd);
868 }
869
870 if (netdev->miimon_interval > 0) {
871 atomic_count_dec(&miimon_cnt);
872 }
873
874 ovs_mutex_destroy(&netdev->mutex);
875 }
876
877 static void
878 netdev_linux_dealloc(struct netdev *netdev_)
879 {
880 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
881 free(netdev);
882 }
883
884 static struct netdev_rxq *
885 netdev_linux_rxq_alloc(void)
886 {
887 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
888 return &rx->up;
889 }
890
891 static int
892 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
893 {
894 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
895 struct netdev *netdev_ = rx->up.netdev;
896 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
897 int error;
898
899 ovs_mutex_lock(&netdev->mutex);
900 rx->is_tap = is_tap_netdev(netdev_);
901 if (rx->is_tap) {
902 rx->fd = netdev->tap_fd;
903 } else {
904 struct sockaddr_ll sll;
905 int ifindex, val;
906 /* Result of tcpdump -dd inbound */
907 static const struct sock_filter filt[] = {
908 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
909 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
910 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
911 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
912 };
913 static const struct sock_fprog fprog = {
914 ARRAY_SIZE(filt), (struct sock_filter *) filt
915 };
916
917 /* Create file descriptor. */
918 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
919 if (rx->fd < 0) {
920 error = errno;
921 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
922 goto error;
923 }
924
925 val = 1;
926 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
927 error = errno;
928 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
929 netdev_get_name(netdev_), ovs_strerror(error));
930 goto error;
931 }
932
933 /* Set non-blocking mode. */
934 error = set_nonblocking(rx->fd);
935 if (error) {
936 goto error;
937 }
938
939 /* Get ethernet device index. */
940 error = get_ifindex(&netdev->up, &ifindex);
941 if (error) {
942 goto error;
943 }
944
945 /* Bind to specific ethernet device. */
946 memset(&sll, 0, sizeof sll);
947 sll.sll_family = AF_PACKET;
948 sll.sll_ifindex = ifindex;
949 sll.sll_protocol = htons(ETH_P_ALL);
950 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
951 error = errno;
952 VLOG_ERR("%s: failed to bind raw socket (%s)",
953 netdev_get_name(netdev_), ovs_strerror(error));
954 goto error;
955 }
956
957 /* Filter for only inbound packets. */
958 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
959 sizeof fprog);
960 if (error) {
961 error = errno;
962 VLOG_ERR("%s: failed to attach filter (%s)",
963 netdev_get_name(netdev_), ovs_strerror(error));
964 goto error;
965 }
966 }
967 ovs_mutex_unlock(&netdev->mutex);
968
969 return 0;
970
971 error:
972 if (rx->fd >= 0) {
973 close(rx->fd);
974 }
975 ovs_mutex_unlock(&netdev->mutex);
976 return error;
977 }
978
979 static void
980 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
981 {
982 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
983
984 if (!rx->is_tap) {
985 close(rx->fd);
986 }
987 }
988
989 static void
990 netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
991 {
992 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
993
994 free(rx);
995 }
996
997 static ovs_be16
998 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
999 {
1000 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1001 return htons(aux->tp_vlan_tpid);
1002 } else {
1003 return htons(ETH_TYPE_VLAN);
1004 }
1005 }
1006
1007 static bool
1008 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
1009 {
1010 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
1011 }
1012
1013 static int
1014 netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
1015 {
1016 size_t size;
1017 ssize_t retval;
1018 struct iovec iov;
1019 struct cmsghdr *cmsg;
1020 union {
1021 struct cmsghdr cmsg;
1022 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1023 } cmsg_buffer;
1024 struct msghdr msgh;
1025
1026 /* Reserve headroom for a single VLAN tag */
1027 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1028 size = dp_packet_tailroom(buffer);
1029
1030 iov.iov_base = dp_packet_data(buffer);
1031 iov.iov_len = size;
1032 msgh.msg_name = NULL;
1033 msgh.msg_namelen = 0;
1034 msgh.msg_iov = &iov;
1035 msgh.msg_iovlen = 1;
1036 msgh.msg_control = &cmsg_buffer;
1037 msgh.msg_controllen = sizeof cmsg_buffer;
1038 msgh.msg_flags = 0;
1039
1040 do {
1041 retval = recvmsg(fd, &msgh, MSG_TRUNC);
1042 } while (retval < 0 && errno == EINTR);
1043
1044 if (retval < 0) {
1045 return errno;
1046 } else if (retval > size) {
1047 return EMSGSIZE;
1048 }
1049
1050 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1051
1052 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1053 const struct tpacket_auxdata *aux;
1054
1055 if (cmsg->cmsg_level != SOL_PACKET
1056 || cmsg->cmsg_type != PACKET_AUXDATA
1057 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1058 continue;
1059 }
1060
1061 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1062 if (auxdata_has_vlan_tci(aux)) {
1063 if (retval < ETH_HEADER_LEN) {
1064 return EINVAL;
1065 }
1066
1067 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1068 htons(aux->tp_vlan_tci));
1069 break;
1070 }
1071 }
1072
1073 return 0;
1074 }
1075
1076 static int
1077 netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
1078 {
1079 ssize_t retval;
1080 size_t size = dp_packet_tailroom(buffer);
1081
1082 do {
1083 retval = read(fd, dp_packet_data(buffer), size);
1084 } while (retval < 0 && errno == EINTR);
1085
1086 if (retval < 0) {
1087 return errno;
1088 }
1089
1090 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1091 return 0;
1092 }
1093
1094 static int
1095 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch)
1096 {
1097 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1098 struct netdev *netdev = rx->up.netdev;
1099 struct dp_packet *buffer;
1100 ssize_t retval;
1101 int mtu;
1102
1103 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1104 mtu = ETH_PAYLOAD_MAX;
1105 }
1106
1107 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1108 DP_NETDEV_HEADROOM);
1109 retval = (rx->is_tap
1110 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1111 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1112
1113 if (retval) {
1114 if (retval != EAGAIN && retval != EMSGSIZE) {
1115 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1116 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1117 }
1118 dp_packet_delete(buffer);
1119 } else {
1120 dp_packet_pad(buffer);
1121 batch->packets[0] = buffer;
1122 batch->count = 1;
1123 }
1124
1125 return retval;
1126 }
1127
1128 static void
1129 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1130 {
1131 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1132 poll_fd_wait(rx->fd, POLLIN);
1133 }
1134
1135 static int
1136 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1137 {
1138 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1139 if (rx->is_tap) {
1140 struct ifreq ifr;
1141 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1142 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1143 if (error) {
1144 return error;
1145 }
1146 drain_fd(rx->fd, ifr.ifr_qlen);
1147 return 0;
1148 } else {
1149 return drain_rcvbuf(rx->fd);
1150 }
1151 }
1152
1153 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1154 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1155 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1156 * the packet is too big or too small to transmit on the device.
1157 *
1158 * The caller retains ownership of 'buffer' in all cases.
1159 *
1160 * The kernel maintains a packet transmission queue, so the caller is not
1161 * expected to do additional queuing of packets. */
1162 static int
1163 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1164 struct dp_packet_batch *batch, bool may_steal)
1165 {
1166 int i;
1167 int error = 0;
1168
1169 /* 'i' is incremented only if there's no error */
1170 for (i = 0; i < batch->count;) {
1171 const void *data = dp_packet_data(batch->packets[i]);
1172 size_t size = dp_packet_size(batch->packets[i]);
1173 ssize_t retval;
1174
1175 /* Truncate the packet if it is configured. */
1176 size -= dp_packet_get_cutlen(batch->packets[i]);
1177
1178 if (!is_tap_netdev(netdev_)) {
1179 /* Use our AF_PACKET socket to send to this device. */
1180 struct sockaddr_ll sll;
1181 struct msghdr msg;
1182 struct iovec iov;
1183 int ifindex;
1184 int sock;
1185
1186 sock = af_packet_sock();
1187 if (sock < 0) {
1188 return -sock;
1189 }
1190
1191 ifindex = netdev_get_ifindex(netdev_);
1192 if (ifindex < 0) {
1193 return -ifindex;
1194 }
1195
1196 /* We don't bother setting most fields in sockaddr_ll because the
1197 * kernel ignores them for SOCK_RAW. */
1198 memset(&sll, 0, sizeof sll);
1199 sll.sll_family = AF_PACKET;
1200 sll.sll_ifindex = ifindex;
1201
1202 iov.iov_base = CONST_CAST(void *, data);
1203 iov.iov_len = size;
1204
1205 msg.msg_name = &sll;
1206 msg.msg_namelen = sizeof sll;
1207 msg.msg_iov = &iov;
1208 msg.msg_iovlen = 1;
1209 msg.msg_control = NULL;
1210 msg.msg_controllen = 0;
1211 msg.msg_flags = 0;
1212
1213 retval = sendmsg(sock, &msg, 0);
1214 } else {
1215 /* Use the tap fd to send to this device. This is essential for
1216 * tap devices, because packets sent to a tap device with an
1217 * AF_PACKET socket will loop back to be *received* again on the
1218 * tap device. This doesn't occur on other interface types
1219 * because we attach a socket filter to the rx socket. */
1220 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1221
1222 retval = write(netdev->tap_fd, data, size);
1223 }
1224
1225 if (retval < 0) {
1226 if (errno == EINTR) {
1227 /* The send was interrupted by a signal. Retry the packet by
1228 * continuing without incrementing 'i'.*/
1229 continue;
1230 } else if (errno == EIO && is_tap_netdev(netdev_)) {
1231 /* The Linux tap driver returns EIO if the device is not up.
1232 * From the OVS side this is not an error, so ignore it. */
1233 } else {
1234 /* The Linux AF_PACKET implementation never blocks waiting for
1235 * room for packets, instead returning ENOBUFS. Translate this
1236 * into EAGAIN for the caller. */
1237 error = errno == ENOBUFS ? EAGAIN : errno;
1238 break;
1239 }
1240 } else if (retval != size) {
1241 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1242 " of %"PRIuSIZE") on %s", retval, size,
1243 netdev_get_name(netdev_));
1244 error = EMSGSIZE;
1245 break;
1246 }
1247
1248 /* Process the next packet in the batch */
1249 i++;
1250 }
1251
1252 dp_packet_delete_batch(batch, may_steal);
1253
1254 if (error && error != EAGAIN) {
1255 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1256 netdev_get_name(netdev_), ovs_strerror(error));
1257 }
1258
1259 return error;
1260
1261 }
1262
1263 /* Registers with the poll loop to wake up from the next call to poll_block()
1264 * when the packet transmission queue has sufficient room to transmit a packet
1265 * with netdev_send().
1266 *
1267 * The kernel maintains a packet transmission queue, so the client is not
1268 * expected to do additional queuing of packets. Thus, this function is
1269 * unlikely to ever be used. It is included for completeness. */
1270 static void
1271 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1272 {
1273 if (is_tap_netdev(netdev)) {
1274 /* TAP device always accepts packets.*/
1275 poll_immediate_wake();
1276 }
1277 }
1278
1279 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1280 * otherwise a positive errno value. */
1281 static int
1282 netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
1283 {
1284 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1285 enum netdev_flags old_flags = 0;
1286 int error;
1287
1288 ovs_mutex_lock(&netdev->mutex);
1289
1290 if (netdev->cache_valid & VALID_ETHERADDR) {
1291 error = netdev->ether_addr_error;
1292 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1293 goto exit;
1294 }
1295 netdev->cache_valid &= ~VALID_ETHERADDR;
1296 }
1297
1298 /* Tap devices must be brought down before setting the address. */
1299 if (is_tap_netdev(netdev_)) {
1300 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1301 }
1302 error = set_etheraddr(netdev_get_name(netdev_), mac);
1303 if (!error || error == ENODEV) {
1304 netdev->ether_addr_error = error;
1305 netdev->cache_valid |= VALID_ETHERADDR;
1306 if (!error) {
1307 netdev->etheraddr = mac;
1308 }
1309 }
1310
1311 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1312 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1313 }
1314
1315 exit:
1316 ovs_mutex_unlock(&netdev->mutex);
1317 return error;
1318 }
1319
1320 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1321 static int
1322 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1323 {
1324 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1325 int error;
1326
1327 ovs_mutex_lock(&netdev->mutex);
1328 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1329 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1330 &netdev->etheraddr);
1331 netdev->cache_valid |= VALID_ETHERADDR;
1332 }
1333
1334 error = netdev->ether_addr_error;
1335 if (!error) {
1336 *mac = netdev->etheraddr;
1337 }
1338 ovs_mutex_unlock(&netdev->mutex);
1339
1340 return error;
1341 }
1342
1343 static int
1344 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1345 {
1346 int error;
1347
1348 if (!(netdev->cache_valid & VALID_MTU)) {
1349 struct ifreq ifr;
1350
1351 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1352 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1353 netdev->mtu = ifr.ifr_mtu;
1354 netdev->cache_valid |= VALID_MTU;
1355 }
1356
1357 error = netdev->netdev_mtu_error;
1358 if (!error) {
1359 *mtup = netdev->mtu;
1360 }
1361
1362 return error;
1363 }
1364
1365 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1366 * in bytes, not including the hardware header; thus, this is typically 1500
1367 * bytes for Ethernet devices. */
1368 static int
1369 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1370 {
1371 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1372 int error;
1373
1374 ovs_mutex_lock(&netdev->mutex);
1375 error = netdev_linux_get_mtu__(netdev, mtup);
1376 ovs_mutex_unlock(&netdev->mutex);
1377
1378 return error;
1379 }
1380
1381 /* Sets the maximum size of transmitted (MTU) for given device using linux
1382 * networking ioctl interface.
1383 */
1384 static int
1385 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1386 {
1387 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1388 struct ifreq ifr;
1389 int error;
1390
1391 ovs_mutex_lock(&netdev->mutex);
1392 if (netdev->cache_valid & VALID_MTU) {
1393 error = netdev->netdev_mtu_error;
1394 if (error || netdev->mtu == mtu) {
1395 goto exit;
1396 }
1397 netdev->cache_valid &= ~VALID_MTU;
1398 }
1399 ifr.ifr_mtu = mtu;
1400 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1401 SIOCSIFMTU, "SIOCSIFMTU");
1402 if (!error || error == ENODEV) {
1403 netdev->netdev_mtu_error = error;
1404 netdev->mtu = ifr.ifr_mtu;
1405 netdev->cache_valid |= VALID_MTU;
1406 }
1407 exit:
1408 ovs_mutex_unlock(&netdev->mutex);
1409 return error;
1410 }
1411
1412 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1413 * On failure, returns a negative errno value. */
1414 static int
1415 netdev_linux_get_ifindex(const struct netdev *netdev_)
1416 {
1417 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1418 int ifindex, error;
1419
1420 ovs_mutex_lock(&netdev->mutex);
1421 error = get_ifindex(netdev_, &ifindex);
1422 ovs_mutex_unlock(&netdev->mutex);
1423
1424 return error ? -error : ifindex;
1425 }
1426
1427 static int
1428 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1429 {
1430 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1431
1432 ovs_mutex_lock(&netdev->mutex);
1433 if (netdev->miimon_interval > 0) {
1434 *carrier = netdev->miimon;
1435 } else {
1436 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1437 }
1438 ovs_mutex_unlock(&netdev->mutex);
1439
1440 return 0;
1441 }
1442
1443 static long long int
1444 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1445 {
1446 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1447 long long int carrier_resets;
1448
1449 ovs_mutex_lock(&netdev->mutex);
1450 carrier_resets = netdev->carrier_resets;
1451 ovs_mutex_unlock(&netdev->mutex);
1452
1453 return carrier_resets;
1454 }
1455
1456 static int
1457 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1458 struct mii_ioctl_data *data)
1459 {
1460 struct ifreq ifr;
1461 int error;
1462
1463 memset(&ifr, 0, sizeof ifr);
1464 memcpy(&ifr.ifr_data, data, sizeof *data);
1465 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1466 memcpy(data, &ifr.ifr_data, sizeof *data);
1467
1468 return error;
1469 }
1470
1471 static int
1472 netdev_linux_get_miimon(const char *name, bool *miimon)
1473 {
1474 struct mii_ioctl_data data;
1475 int error;
1476
1477 *miimon = false;
1478
1479 memset(&data, 0, sizeof data);
1480 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1481 if (!error) {
1482 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1483 data.reg_num = MII_BMSR;
1484 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1485 &data);
1486
1487 if (!error) {
1488 *miimon = !!(data.val_out & BMSR_LSTATUS);
1489 } else {
1490 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1491 }
1492 } else {
1493 struct ethtool_cmd ecmd;
1494
1495 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1496 name);
1497
1498 COVERAGE_INC(netdev_get_ethtool);
1499 memset(&ecmd, 0, sizeof ecmd);
1500 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1501 "ETHTOOL_GLINK");
1502 if (!error) {
1503 struct ethtool_value eval;
1504
1505 memcpy(&eval, &ecmd, sizeof eval);
1506 *miimon = !!eval.data;
1507 } else {
1508 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1509 }
1510 }
1511
1512 return error;
1513 }
1514
1515 static int
1516 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1517 long long int interval)
1518 {
1519 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1520
1521 ovs_mutex_lock(&netdev->mutex);
1522 interval = interval > 0 ? MAX(interval, 100) : 0;
1523 if (netdev->miimon_interval != interval) {
1524 if (interval && !netdev->miimon_interval) {
1525 atomic_count_inc(&miimon_cnt);
1526 } else if (!interval && netdev->miimon_interval) {
1527 atomic_count_dec(&miimon_cnt);
1528 }
1529
1530 netdev->miimon_interval = interval;
1531 timer_set_expired(&netdev->miimon_timer);
1532 }
1533 ovs_mutex_unlock(&netdev->mutex);
1534
1535 return 0;
1536 }
1537
1538 static void
1539 netdev_linux_miimon_run(void)
1540 {
1541 struct shash device_shash;
1542 struct shash_node *node;
1543
1544 shash_init(&device_shash);
1545 netdev_get_devices(&netdev_linux_class, &device_shash);
1546 SHASH_FOR_EACH (node, &device_shash) {
1547 struct netdev *netdev = node->data;
1548 struct netdev_linux *dev = netdev_linux_cast(netdev);
1549 bool miimon;
1550
1551 ovs_mutex_lock(&dev->mutex);
1552 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1553 netdev_linux_get_miimon(dev->up.name, &miimon);
1554 if (miimon != dev->miimon) {
1555 dev->miimon = miimon;
1556 netdev_linux_changed(dev, dev->ifi_flags, 0);
1557 }
1558
1559 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1560 }
1561 ovs_mutex_unlock(&dev->mutex);
1562 netdev_close(netdev);
1563 }
1564
1565 shash_destroy(&device_shash);
1566 }
1567
1568 static void
1569 netdev_linux_miimon_wait(void)
1570 {
1571 struct shash device_shash;
1572 struct shash_node *node;
1573
1574 shash_init(&device_shash);
1575 netdev_get_devices(&netdev_linux_class, &device_shash);
1576 SHASH_FOR_EACH (node, &device_shash) {
1577 struct netdev *netdev = node->data;
1578 struct netdev_linux *dev = netdev_linux_cast(netdev);
1579
1580 ovs_mutex_lock(&dev->mutex);
1581 if (dev->miimon_interval > 0) {
1582 timer_wait(&dev->miimon_timer);
1583 }
1584 ovs_mutex_unlock(&dev->mutex);
1585 netdev_close(netdev);
1586 }
1587 shash_destroy(&device_shash);
1588 }
1589
1590 static void
1591 swap_uint64(uint64_t *a, uint64_t *b)
1592 {
1593 uint64_t tmp = *a;
1594 *a = *b;
1595 *b = tmp;
1596 }
1597
1598 /* Copies 'src' into 'dst', performing format conversion in the process.
1599 *
1600 * 'src' is allowed to be misaligned. */
1601 static void
1602 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1603 const struct ovs_vport_stats *src)
1604 {
1605 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1606 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1607 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1608 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1609 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1610 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1611 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1612 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
1613 dst->multicast = 0;
1614 dst->collisions = 0;
1615 dst->rx_length_errors = 0;
1616 dst->rx_over_errors = 0;
1617 dst->rx_crc_errors = 0;
1618 dst->rx_frame_errors = 0;
1619 dst->rx_fifo_errors = 0;
1620 dst->rx_missed_errors = 0;
1621 dst->tx_aborted_errors = 0;
1622 dst->tx_carrier_errors = 0;
1623 dst->tx_fifo_errors = 0;
1624 dst->tx_heartbeat_errors = 0;
1625 dst->tx_window_errors = 0;
1626 }
1627
1628 static int
1629 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1630 {
1631 struct dpif_netlink_vport reply;
1632 struct ofpbuf *buf;
1633 int error;
1634
1635 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1636 if (error) {
1637 return error;
1638 } else if (!reply.stats) {
1639 ofpbuf_delete(buf);
1640 return EOPNOTSUPP;
1641 }
1642
1643 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1644
1645 ofpbuf_delete(buf);
1646
1647 return 0;
1648 }
1649
1650 static void
1651 get_stats_via_vport(const struct netdev *netdev_,
1652 struct netdev_stats *stats)
1653 {
1654 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1655
1656 if (!netdev->vport_stats_error ||
1657 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1658 int error;
1659
1660 error = get_stats_via_vport__(netdev_, stats);
1661 if (error && error != ENOENT && error != ENODEV) {
1662 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1663 "(%s)",
1664 netdev_get_name(netdev_), ovs_strerror(error));
1665 }
1666 netdev->vport_stats_error = error;
1667 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1668 }
1669 }
1670
1671 /* Retrieves current device stats for 'netdev-linux'. */
1672 static int
1673 netdev_linux_get_stats(const struct netdev *netdev_,
1674 struct netdev_stats *stats)
1675 {
1676 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1677 struct netdev_stats dev_stats;
1678 int error;
1679
1680 ovs_mutex_lock(&netdev->mutex);
1681 get_stats_via_vport(netdev_, stats);
1682 error = get_stats_via_netlink(netdev_, &dev_stats);
1683 if (error) {
1684 if (!netdev->vport_stats_error) {
1685 error = 0;
1686 }
1687 } else if (netdev->vport_stats_error) {
1688 /* stats not available from OVS then use netdev stats. */
1689 *stats = dev_stats;
1690 } else {
1691 /* Use kernel netdev's packet and byte counts since vport's counters
1692 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1693 * enabled. */
1694 stats->rx_packets = dev_stats.rx_packets;
1695 stats->rx_bytes = dev_stats.rx_bytes;
1696 stats->tx_packets = dev_stats.tx_packets;
1697 stats->tx_bytes = dev_stats.tx_bytes;
1698
1699 stats->rx_errors += dev_stats.rx_errors;
1700 stats->tx_errors += dev_stats.tx_errors;
1701 stats->rx_dropped += dev_stats.rx_dropped;
1702 stats->tx_dropped += dev_stats.tx_dropped;
1703 stats->multicast += dev_stats.multicast;
1704 stats->collisions += dev_stats.collisions;
1705 stats->rx_length_errors += dev_stats.rx_length_errors;
1706 stats->rx_over_errors += dev_stats.rx_over_errors;
1707 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1708 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1709 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1710 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1711 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1712 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1713 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1714 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1715 stats->tx_window_errors += dev_stats.tx_window_errors;
1716 }
1717 ovs_mutex_unlock(&netdev->mutex);
1718
1719 return error;
1720 }
1721
1722 /* Retrieves current device stats for 'netdev-tap' netdev or
1723 * netdev-internal. */
1724 static int
1725 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1726 {
1727 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1728 struct netdev_stats dev_stats;
1729 int error;
1730
1731 ovs_mutex_lock(&netdev->mutex);
1732 get_stats_via_vport(netdev_, stats);
1733 error = get_stats_via_netlink(netdev_, &dev_stats);
1734 if (error) {
1735 if (!netdev->vport_stats_error) {
1736 error = 0;
1737 }
1738 } else if (netdev->vport_stats_error) {
1739 /* Transmit and receive stats will appear to be swapped relative to the
1740 * other ports since we are the one sending the data, not a remote
1741 * computer. For consistency, we swap them back here. This does not
1742 * apply if we are getting stats from the vport layer because it always
1743 * tracks stats from the perspective of the switch. */
1744
1745 *stats = dev_stats;
1746 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1747 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1748 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1749 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1750 stats->rx_length_errors = 0;
1751 stats->rx_over_errors = 0;
1752 stats->rx_crc_errors = 0;
1753 stats->rx_frame_errors = 0;
1754 stats->rx_fifo_errors = 0;
1755 stats->rx_missed_errors = 0;
1756 stats->tx_aborted_errors = 0;
1757 stats->tx_carrier_errors = 0;
1758 stats->tx_fifo_errors = 0;
1759 stats->tx_heartbeat_errors = 0;
1760 stats->tx_window_errors = 0;
1761 } else {
1762 /* Use kernel netdev's packet and byte counts since vport counters
1763 * do not reflect packet counts on the wire when GSO, TSO or GRO
1764 * are enabled. */
1765 stats->rx_packets = dev_stats.tx_packets;
1766 stats->rx_bytes = dev_stats.tx_bytes;
1767 stats->tx_packets = dev_stats.rx_packets;
1768 stats->tx_bytes = dev_stats.rx_bytes;
1769
1770 stats->rx_dropped += dev_stats.tx_dropped;
1771 stats->tx_dropped += dev_stats.rx_dropped;
1772
1773 stats->rx_errors += dev_stats.tx_errors;
1774 stats->tx_errors += dev_stats.rx_errors;
1775
1776 stats->multicast += dev_stats.multicast;
1777 stats->collisions += dev_stats.collisions;
1778 }
1779 ovs_mutex_unlock(&netdev->mutex);
1780
1781 return error;
1782 }
1783
1784 static int
1785 netdev_internal_get_stats(const struct netdev *netdev_,
1786 struct netdev_stats *stats)
1787 {
1788 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1789 int error;
1790
1791 ovs_mutex_lock(&netdev->mutex);
1792 get_stats_via_vport(netdev_, stats);
1793 error = netdev->vport_stats_error;
1794 ovs_mutex_unlock(&netdev->mutex);
1795
1796 return error;
1797 }
1798
1799 static void
1800 netdev_linux_read_features(struct netdev_linux *netdev)
1801 {
1802 struct ethtool_cmd ecmd;
1803 uint32_t speed;
1804 int error;
1805
1806 if (netdev->cache_valid & VALID_FEATURES) {
1807 return;
1808 }
1809
1810 COVERAGE_INC(netdev_get_ethtool);
1811 memset(&ecmd, 0, sizeof ecmd);
1812 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1813 ETHTOOL_GSET, "ETHTOOL_GSET");
1814 if (error) {
1815 goto out;
1816 }
1817
1818 /* Supported features. */
1819 netdev->supported = 0;
1820 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1821 netdev->supported |= NETDEV_F_10MB_HD;
1822 }
1823 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1824 netdev->supported |= NETDEV_F_10MB_FD;
1825 }
1826 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1827 netdev->supported |= NETDEV_F_100MB_HD;
1828 }
1829 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1830 netdev->supported |= NETDEV_F_100MB_FD;
1831 }
1832 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1833 netdev->supported |= NETDEV_F_1GB_HD;
1834 }
1835 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
1836 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
1837 netdev->supported |= NETDEV_F_1GB_FD;
1838 }
1839 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
1840 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
1841 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
1842 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
1843 netdev->supported |= NETDEV_F_10GB_FD;
1844 }
1845 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
1846 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
1847 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
1848 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
1849 netdev->supported |= NETDEV_F_40GB_FD;
1850 }
1851 if (ecmd.supported & SUPPORTED_TP) {
1852 netdev->supported |= NETDEV_F_COPPER;
1853 }
1854 if (ecmd.supported & SUPPORTED_FIBRE) {
1855 netdev->supported |= NETDEV_F_FIBER;
1856 }
1857 if (ecmd.supported & SUPPORTED_Autoneg) {
1858 netdev->supported |= NETDEV_F_AUTONEG;
1859 }
1860 if (ecmd.supported & SUPPORTED_Pause) {
1861 netdev->supported |= NETDEV_F_PAUSE;
1862 }
1863 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1864 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1865 }
1866
1867 /* Advertised features. */
1868 netdev->advertised = 0;
1869 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1870 netdev->advertised |= NETDEV_F_10MB_HD;
1871 }
1872 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1873 netdev->advertised |= NETDEV_F_10MB_FD;
1874 }
1875 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1876 netdev->advertised |= NETDEV_F_100MB_HD;
1877 }
1878 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1879 netdev->advertised |= NETDEV_F_100MB_FD;
1880 }
1881 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1882 netdev->advertised |= NETDEV_F_1GB_HD;
1883 }
1884 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
1885 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
1886 netdev->advertised |= NETDEV_F_1GB_FD;
1887 }
1888 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
1889 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
1890 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
1891 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
1892 netdev->advertised |= NETDEV_F_10GB_FD;
1893 }
1894 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
1895 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
1896 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
1897 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
1898 netdev->advertised |= NETDEV_F_40GB_FD;
1899 }
1900 if (ecmd.advertising & ADVERTISED_TP) {
1901 netdev->advertised |= NETDEV_F_COPPER;
1902 }
1903 if (ecmd.advertising & ADVERTISED_FIBRE) {
1904 netdev->advertised |= NETDEV_F_FIBER;
1905 }
1906 if (ecmd.advertising & ADVERTISED_Autoneg) {
1907 netdev->advertised |= NETDEV_F_AUTONEG;
1908 }
1909 if (ecmd.advertising & ADVERTISED_Pause) {
1910 netdev->advertised |= NETDEV_F_PAUSE;
1911 }
1912 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1913 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1914 }
1915
1916 /* Current settings. */
1917 speed = ethtool_cmd_speed(&ecmd);
1918 if (speed == SPEED_10) {
1919 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1920 } else if (speed == SPEED_100) {
1921 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1922 } else if (speed == SPEED_1000) {
1923 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1924 } else if (speed == SPEED_10000) {
1925 netdev->current = NETDEV_F_10GB_FD;
1926 } else if (speed == 40000) {
1927 netdev->current = NETDEV_F_40GB_FD;
1928 } else if (speed == 100000) {
1929 netdev->current = NETDEV_F_100GB_FD;
1930 } else if (speed == 1000000) {
1931 netdev->current = NETDEV_F_1TB_FD;
1932 } else {
1933 netdev->current = 0;
1934 }
1935
1936 if (ecmd.port == PORT_TP) {
1937 netdev->current |= NETDEV_F_COPPER;
1938 } else if (ecmd.port == PORT_FIBRE) {
1939 netdev->current |= NETDEV_F_FIBER;
1940 }
1941
1942 if (ecmd.autoneg) {
1943 netdev->current |= NETDEV_F_AUTONEG;
1944 }
1945
1946 out:
1947 netdev->cache_valid |= VALID_FEATURES;
1948 netdev->get_features_error = error;
1949 }
1950
1951 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1952 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1953 * Returns 0 if successful, otherwise a positive errno value. */
1954 static int
1955 netdev_linux_get_features(const struct netdev *netdev_,
1956 enum netdev_features *current,
1957 enum netdev_features *advertised,
1958 enum netdev_features *supported,
1959 enum netdev_features *peer)
1960 {
1961 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1962 int error;
1963
1964 ovs_mutex_lock(&netdev->mutex);
1965 netdev_linux_read_features(netdev);
1966 if (!netdev->get_features_error) {
1967 *current = netdev->current;
1968 *advertised = netdev->advertised;
1969 *supported = netdev->supported;
1970 *peer = 0; /* XXX */
1971 }
1972 error = netdev->get_features_error;
1973 ovs_mutex_unlock(&netdev->mutex);
1974
1975 return error;
1976 }
1977
1978 /* Set the features advertised by 'netdev' to 'advertise'. */
1979 static int
1980 netdev_linux_set_advertisements(struct netdev *netdev_,
1981 enum netdev_features advertise)
1982 {
1983 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1984 struct ethtool_cmd ecmd;
1985 int error;
1986
1987 ovs_mutex_lock(&netdev->mutex);
1988
1989 COVERAGE_INC(netdev_get_ethtool);
1990 memset(&ecmd, 0, sizeof ecmd);
1991 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1992 ETHTOOL_GSET, "ETHTOOL_GSET");
1993 if (error) {
1994 goto exit;
1995 }
1996
1997 ecmd.advertising = 0;
1998 if (advertise & NETDEV_F_10MB_HD) {
1999 ecmd.advertising |= ADVERTISED_10baseT_Half;
2000 }
2001 if (advertise & NETDEV_F_10MB_FD) {
2002 ecmd.advertising |= ADVERTISED_10baseT_Full;
2003 }
2004 if (advertise & NETDEV_F_100MB_HD) {
2005 ecmd.advertising |= ADVERTISED_100baseT_Half;
2006 }
2007 if (advertise & NETDEV_F_100MB_FD) {
2008 ecmd.advertising |= ADVERTISED_100baseT_Full;
2009 }
2010 if (advertise & NETDEV_F_1GB_HD) {
2011 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2012 }
2013 if (advertise & NETDEV_F_1GB_FD) {
2014 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2015 }
2016 if (advertise & NETDEV_F_10GB_FD) {
2017 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2018 }
2019 if (advertise & NETDEV_F_COPPER) {
2020 ecmd.advertising |= ADVERTISED_TP;
2021 }
2022 if (advertise & NETDEV_F_FIBER) {
2023 ecmd.advertising |= ADVERTISED_FIBRE;
2024 }
2025 if (advertise & NETDEV_F_AUTONEG) {
2026 ecmd.advertising |= ADVERTISED_Autoneg;
2027 }
2028 if (advertise & NETDEV_F_PAUSE) {
2029 ecmd.advertising |= ADVERTISED_Pause;
2030 }
2031 if (advertise & NETDEV_F_PAUSE_ASYM) {
2032 ecmd.advertising |= ADVERTISED_Asym_Pause;
2033 }
2034 COVERAGE_INC(netdev_set_ethtool);
2035 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2036 ETHTOOL_SSET, "ETHTOOL_SSET");
2037
2038 exit:
2039 ovs_mutex_unlock(&netdev->mutex);
2040 return error;
2041 }
2042
2043 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2044 * successful, otherwise a positive errno value. */
2045 static int
2046 netdev_linux_set_policing(struct netdev *netdev_,
2047 uint32_t kbits_rate, uint32_t kbits_burst)
2048 {
2049 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2050 const char *netdev_name = netdev_get_name(netdev_);
2051 int error;
2052
2053 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
2054 : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
2055 : kbits_burst); /* Stick with user-specified value. */
2056
2057 ovs_mutex_lock(&netdev->mutex);
2058 if (netdev->cache_valid & VALID_POLICING) {
2059 error = netdev->netdev_policing_error;
2060 if (error || (netdev->kbits_rate == kbits_rate &&
2061 netdev->kbits_burst == kbits_burst)) {
2062 /* Assume that settings haven't changed since we last set them. */
2063 goto out;
2064 }
2065 netdev->cache_valid &= ~VALID_POLICING;
2066 }
2067
2068 COVERAGE_INC(netdev_set_policing);
2069 /* Remove any existing ingress qdisc. */
2070 error = tc_add_del_ingress_qdisc(netdev_, false);
2071 if (error) {
2072 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
2073 netdev_name, ovs_strerror(error));
2074 goto out;
2075 }
2076
2077 if (kbits_rate) {
2078 error = tc_add_del_ingress_qdisc(netdev_, true);
2079 if (error) {
2080 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
2081 netdev_name, ovs_strerror(error));
2082 goto out;
2083 }
2084
2085 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2086 if (error){
2087 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2088 netdev_name, ovs_strerror(error));
2089 goto out;
2090 }
2091 }
2092
2093 netdev->kbits_rate = kbits_rate;
2094 netdev->kbits_burst = kbits_burst;
2095
2096 out:
2097 if (!error || error == ENODEV) {
2098 netdev->netdev_policing_error = error;
2099 netdev->cache_valid |= VALID_POLICING;
2100 }
2101 ovs_mutex_unlock(&netdev->mutex);
2102 return error;
2103 }
2104
2105 static int
2106 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2107 struct sset *types)
2108 {
2109 const struct tc_ops *const *opsp;
2110 for (opsp = tcs; *opsp != NULL; opsp++) {
2111 const struct tc_ops *ops = *opsp;
2112 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2113 sset_add(types, ops->ovs_name);
2114 }
2115 }
2116 return 0;
2117 }
2118
2119 static const struct tc_ops *
2120 tc_lookup_ovs_name(const char *name)
2121 {
2122 const struct tc_ops *const *opsp;
2123
2124 for (opsp = tcs; *opsp != NULL; opsp++) {
2125 const struct tc_ops *ops = *opsp;
2126 if (!strcmp(name, ops->ovs_name)) {
2127 return ops;
2128 }
2129 }
2130 return NULL;
2131 }
2132
2133 static const struct tc_ops *
2134 tc_lookup_linux_name(const char *name)
2135 {
2136 const struct tc_ops *const *opsp;
2137
2138 for (opsp = tcs; *opsp != NULL; opsp++) {
2139 const struct tc_ops *ops = *opsp;
2140 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2141 return ops;
2142 }
2143 }
2144 return NULL;
2145 }
2146
2147 static struct tc_queue *
2148 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2149 size_t hash)
2150 {
2151 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2152 struct tc_queue *queue;
2153
2154 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2155 if (queue->queue_id == queue_id) {
2156 return queue;
2157 }
2158 }
2159 return NULL;
2160 }
2161
2162 static struct tc_queue *
2163 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2164 {
2165 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
2166 }
2167
2168 static int
2169 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2170 const char *type,
2171 struct netdev_qos_capabilities *caps)
2172 {
2173 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2174 if (!ops) {
2175 return EOPNOTSUPP;
2176 }
2177 caps->n_queues = ops->n_queues;
2178 return 0;
2179 }
2180
2181 static int
2182 netdev_linux_get_qos(const struct netdev *netdev_,
2183 const char **typep, struct smap *details)
2184 {
2185 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2186 int error;
2187
2188 ovs_mutex_lock(&netdev->mutex);
2189 error = tc_query_qdisc(netdev_);
2190 if (!error) {
2191 *typep = netdev->tc->ops->ovs_name;
2192 error = (netdev->tc->ops->qdisc_get
2193 ? netdev->tc->ops->qdisc_get(netdev_, details)
2194 : 0);
2195 }
2196 ovs_mutex_unlock(&netdev->mutex);
2197
2198 return error;
2199 }
2200
2201 static int
2202 netdev_linux_set_qos(struct netdev *netdev_,
2203 const char *type, const struct smap *details)
2204 {
2205 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2206 const struct tc_ops *new_ops;
2207 int error;
2208
2209 new_ops = tc_lookup_ovs_name(type);
2210 if (!new_ops || !new_ops->tc_install) {
2211 return EOPNOTSUPP;
2212 }
2213
2214 if (new_ops == &tc_ops_noop) {
2215 return new_ops->tc_install(netdev_, details);
2216 }
2217
2218 ovs_mutex_lock(&netdev->mutex);
2219 error = tc_query_qdisc(netdev_);
2220 if (error) {
2221 goto exit;
2222 }
2223
2224 if (new_ops == netdev->tc->ops) {
2225 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2226 } else {
2227 /* Delete existing qdisc. */
2228 error = tc_del_qdisc(netdev_);
2229 if (error) {
2230 goto exit;
2231 }
2232 ovs_assert(netdev->tc == NULL);
2233
2234 /* Install new qdisc. */
2235 error = new_ops->tc_install(netdev_, details);
2236 ovs_assert((error == 0) == (netdev->tc != NULL));
2237 }
2238
2239 exit:
2240 ovs_mutex_unlock(&netdev->mutex);
2241 return error;
2242 }
2243
2244 static int
2245 netdev_linux_get_queue(const struct netdev *netdev_,
2246 unsigned int queue_id, struct smap *details)
2247 {
2248 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2249 int error;
2250
2251 ovs_mutex_lock(&netdev->mutex);
2252 error = tc_query_qdisc(netdev_);
2253 if (!error) {
2254 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2255 error = (queue
2256 ? netdev->tc->ops->class_get(netdev_, queue, details)
2257 : ENOENT);
2258 }
2259 ovs_mutex_unlock(&netdev->mutex);
2260
2261 return error;
2262 }
2263
2264 static int
2265 netdev_linux_set_queue(struct netdev *netdev_,
2266 unsigned int queue_id, const struct smap *details)
2267 {
2268 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2269 int error;
2270
2271 ovs_mutex_lock(&netdev->mutex);
2272 error = tc_query_qdisc(netdev_);
2273 if (!error) {
2274 error = (queue_id < netdev->tc->ops->n_queues
2275 && netdev->tc->ops->class_set
2276 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2277 : EINVAL);
2278 }
2279 ovs_mutex_unlock(&netdev->mutex);
2280
2281 return error;
2282 }
2283
2284 static int
2285 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2286 {
2287 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2288 int error;
2289
2290 ovs_mutex_lock(&netdev->mutex);
2291 error = tc_query_qdisc(netdev_);
2292 if (!error) {
2293 if (netdev->tc->ops->class_delete) {
2294 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2295 error = (queue
2296 ? netdev->tc->ops->class_delete(netdev_, queue)
2297 : ENOENT);
2298 } else {
2299 error = EINVAL;
2300 }
2301 }
2302 ovs_mutex_unlock(&netdev->mutex);
2303
2304 return error;
2305 }
2306
2307 static int
2308 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2309 unsigned int queue_id,
2310 struct netdev_queue_stats *stats)
2311 {
2312 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2313 int error;
2314
2315 ovs_mutex_lock(&netdev->mutex);
2316 error = tc_query_qdisc(netdev_);
2317 if (!error) {
2318 if (netdev->tc->ops->class_get_stats) {
2319 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2320 if (queue) {
2321 stats->created = queue->created;
2322 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2323 stats);
2324 } else {
2325 error = ENOENT;
2326 }
2327 } else {
2328 error = EOPNOTSUPP;
2329 }
2330 }
2331 ovs_mutex_unlock(&netdev->mutex);
2332
2333 return error;
2334 }
2335
2336 struct queue_dump_state {
2337 struct nl_dump dump;
2338 struct ofpbuf buf;
2339 };
2340
2341 static bool
2342 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2343 {
2344 struct ofpbuf request;
2345 struct tcmsg *tcmsg;
2346
2347 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2348 if (!tcmsg) {
2349 return false;
2350 }
2351 tcmsg->tcm_parent = 0;
2352 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2353 ofpbuf_uninit(&request);
2354
2355 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
2356 return true;
2357 }
2358
2359 static int
2360 finish_queue_dump(struct queue_dump_state *state)
2361 {
2362 ofpbuf_uninit(&state->buf);
2363 return nl_dump_done(&state->dump);
2364 }
2365
2366 struct netdev_linux_queue_state {
2367 unsigned int *queues;
2368 size_t cur_queue;
2369 size_t n_queues;
2370 };
2371
2372 static int
2373 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2374 {
2375 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2376 int error;
2377
2378 ovs_mutex_lock(&netdev->mutex);
2379 error = tc_query_qdisc(netdev_);
2380 if (!error) {
2381 if (netdev->tc->ops->class_get) {
2382 struct netdev_linux_queue_state *state;
2383 struct tc_queue *queue;
2384 size_t i;
2385
2386 *statep = state = xmalloc(sizeof *state);
2387 state->n_queues = hmap_count(&netdev->tc->queues);
2388 state->cur_queue = 0;
2389 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2390
2391 i = 0;
2392 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2393 state->queues[i++] = queue->queue_id;
2394 }
2395 } else {
2396 error = EOPNOTSUPP;
2397 }
2398 }
2399 ovs_mutex_unlock(&netdev->mutex);
2400
2401 return error;
2402 }
2403
2404 static int
2405 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2406 unsigned int *queue_idp, struct smap *details)
2407 {
2408 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2409 struct netdev_linux_queue_state *state = state_;
2410 int error = EOF;
2411
2412 ovs_mutex_lock(&netdev->mutex);
2413 while (state->cur_queue < state->n_queues) {
2414 unsigned int queue_id = state->queues[state->cur_queue++];
2415 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2416
2417 if (queue) {
2418 *queue_idp = queue_id;
2419 error = netdev->tc->ops->class_get(netdev_, queue, details);
2420 break;
2421 }
2422 }
2423 ovs_mutex_unlock(&netdev->mutex);
2424
2425 return error;
2426 }
2427
2428 static int
2429 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2430 void *state_)
2431 {
2432 struct netdev_linux_queue_state *state = state_;
2433
2434 free(state->queues);
2435 free(state);
2436 return 0;
2437 }
2438
2439 static int
2440 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2441 netdev_dump_queue_stats_cb *cb, void *aux)
2442 {
2443 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2444 int error;
2445
2446 ovs_mutex_lock(&netdev->mutex);
2447 error = tc_query_qdisc(netdev_);
2448 if (!error) {
2449 struct queue_dump_state state;
2450
2451 if (!netdev->tc->ops->class_dump_stats) {
2452 error = EOPNOTSUPP;
2453 } else if (!start_queue_dump(netdev_, &state)) {
2454 error = ENODEV;
2455 } else {
2456 struct ofpbuf msg;
2457 int retval;
2458
2459 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2460 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2461 cb, aux);
2462 if (retval) {
2463 error = retval;
2464 }
2465 }
2466
2467 retval = finish_queue_dump(&state);
2468 if (retval) {
2469 error = retval;
2470 }
2471 }
2472 }
2473 ovs_mutex_unlock(&netdev->mutex);
2474
2475 return error;
2476 }
2477
2478 static int
2479 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2480 struct in_addr netmask)
2481 {
2482 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2483 int error;
2484
2485 ovs_mutex_lock(&netdev->mutex);
2486 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2487 if (!error) {
2488 if (address.s_addr != INADDR_ANY) {
2489 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2490 "SIOCSIFNETMASK", netmask);
2491 }
2492 }
2493
2494 ovs_mutex_unlock(&netdev->mutex);
2495
2496 return error;
2497 }
2498
2499 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2500 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2501 * error. */
2502 static int
2503 netdev_linux_get_addr_list(const struct netdev *netdev_,
2504 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
2505 {
2506 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2507 int error;
2508
2509 ovs_mutex_lock(&netdev->mutex);
2510 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
2511 ovs_mutex_unlock(&netdev->mutex);
2512
2513 return error;
2514 }
2515
2516 static void
2517 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2518 {
2519 struct sockaddr_in sin;
2520 memset(&sin, 0, sizeof sin);
2521 sin.sin_family = AF_INET;
2522 sin.sin_addr = addr;
2523 sin.sin_port = 0;
2524
2525 memset(sa, 0, sizeof *sa);
2526 memcpy(sa, &sin, sizeof sin);
2527 }
2528
2529 static int
2530 do_set_addr(struct netdev *netdev,
2531 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2532 {
2533 struct ifreq ifr;
2534
2535 make_in4_sockaddr(&ifr.ifr_addr, addr);
2536 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2537 ioctl_name);
2538 }
2539
2540 /* Adds 'router' as a default IP gateway. */
2541 static int
2542 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2543 {
2544 struct in_addr any = { INADDR_ANY };
2545 struct rtentry rt;
2546 int error;
2547
2548 memset(&rt, 0, sizeof rt);
2549 make_in4_sockaddr(&rt.rt_dst, any);
2550 make_in4_sockaddr(&rt.rt_gateway, router);
2551 make_in4_sockaddr(&rt.rt_genmask, any);
2552 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2553 error = af_inet_ioctl(SIOCADDRT, &rt);
2554 if (error) {
2555 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2556 }
2557 return error;
2558 }
2559
2560 static int
2561 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2562 char **netdev_name)
2563 {
2564 static const char fn[] = "/proc/net/route";
2565 FILE *stream;
2566 char line[256];
2567 int ln;
2568
2569 *netdev_name = NULL;
2570 stream = fopen(fn, "r");
2571 if (stream == NULL) {
2572 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2573 return errno;
2574 }
2575
2576 ln = 0;
2577 while (fgets(line, sizeof line, stream)) {
2578 if (++ln >= 2) {
2579 char iface[17];
2580 ovs_be32 dest, gateway, mask;
2581 int refcnt, metric, mtu;
2582 unsigned int flags, use, window, irtt;
2583
2584 if (!ovs_scan(line,
2585 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2586 " %d %u %u\n",
2587 iface, &dest, &gateway, &flags, &refcnt,
2588 &use, &metric, &mask, &mtu, &window, &irtt)) {
2589 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2590 fn, ln, line);
2591 continue;
2592 }
2593 if (!(flags & RTF_UP)) {
2594 /* Skip routes that aren't up. */
2595 continue;
2596 }
2597
2598 /* The output of 'dest', 'mask', and 'gateway' were given in
2599 * network byte order, so we don't need need any endian
2600 * conversions here. */
2601 if ((dest & mask) == (host->s_addr & mask)) {
2602 if (!gateway) {
2603 /* The host is directly reachable. */
2604 next_hop->s_addr = 0;
2605 } else {
2606 /* To reach the host, we must go through a gateway. */
2607 next_hop->s_addr = gateway;
2608 }
2609 *netdev_name = xstrdup(iface);
2610 fclose(stream);
2611 return 0;
2612 }
2613 }
2614 }
2615
2616 fclose(stream);
2617 return ENXIO;
2618 }
2619
2620 static int
2621 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2622 {
2623 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2624 int error = 0;
2625
2626 ovs_mutex_lock(&netdev->mutex);
2627 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2628 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2629
2630 COVERAGE_INC(netdev_get_ethtool);
2631 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2632 error = netdev_linux_do_ethtool(netdev->up.name,
2633 cmd,
2634 ETHTOOL_GDRVINFO,
2635 "ETHTOOL_GDRVINFO");
2636 if (!error) {
2637 netdev->cache_valid |= VALID_DRVINFO;
2638 }
2639 }
2640
2641 if (!error) {
2642 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2643 smap_add(smap, "driver_version", netdev->drvinfo.version);
2644 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2645 }
2646 ovs_mutex_unlock(&netdev->mutex);
2647
2648 return error;
2649 }
2650
2651 static int
2652 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2653 struct smap *smap)
2654 {
2655 smap_add(smap, "driver_name", "openvswitch");
2656 return 0;
2657 }
2658
2659 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2660 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2661 * returns 0. Otherwise, it returns a positive errno value; in particular,
2662 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2663 static int
2664 netdev_linux_arp_lookup(const struct netdev *netdev,
2665 ovs_be32 ip, struct eth_addr *mac)
2666 {
2667 struct arpreq r;
2668 struct sockaddr_in sin;
2669 int retval;
2670
2671 memset(&r, 0, sizeof r);
2672 memset(&sin, 0, sizeof sin);
2673 sin.sin_family = AF_INET;
2674 sin.sin_addr.s_addr = ip;
2675 sin.sin_port = 0;
2676 memcpy(&r.arp_pa, &sin, sizeof sin);
2677 r.arp_ha.sa_family = ARPHRD_ETHER;
2678 r.arp_flags = 0;
2679 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2680 COVERAGE_INC(netdev_arp_lookup);
2681 retval = af_inet_ioctl(SIOCGARP, &r);
2682 if (!retval) {
2683 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2684 } else if (retval != ENXIO) {
2685 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2686 netdev_get_name(netdev), IP_ARGS(ip),
2687 ovs_strerror(retval));
2688 }
2689 return retval;
2690 }
2691
2692 static int
2693 nd_to_iff_flags(enum netdev_flags nd)
2694 {
2695 int iff = 0;
2696 if (nd & NETDEV_UP) {
2697 iff |= IFF_UP;
2698 }
2699 if (nd & NETDEV_PROMISC) {
2700 iff |= IFF_PROMISC;
2701 }
2702 if (nd & NETDEV_LOOPBACK) {
2703 iff |= IFF_LOOPBACK;
2704 }
2705 return iff;
2706 }
2707
2708 static int
2709 iff_to_nd_flags(int iff)
2710 {
2711 enum netdev_flags nd = 0;
2712 if (iff & IFF_UP) {
2713 nd |= NETDEV_UP;
2714 }
2715 if (iff & IFF_PROMISC) {
2716 nd |= NETDEV_PROMISC;
2717 }
2718 if (iff & IFF_LOOPBACK) {
2719 nd |= NETDEV_LOOPBACK;
2720 }
2721 return nd;
2722 }
2723
2724 static int
2725 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2726 enum netdev_flags on, enum netdev_flags *old_flagsp)
2727 OVS_REQUIRES(netdev->mutex)
2728 {
2729 int old_flags, new_flags;
2730 int error = 0;
2731
2732 old_flags = netdev->ifi_flags;
2733 *old_flagsp = iff_to_nd_flags(old_flags);
2734 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2735 if (new_flags != old_flags) {
2736 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2737 get_flags(&netdev->up, &netdev->ifi_flags);
2738 }
2739
2740 return error;
2741 }
2742
2743 static int
2744 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2745 enum netdev_flags on, enum netdev_flags *old_flagsp)
2746 {
2747 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2748 int error;
2749
2750 ovs_mutex_lock(&netdev->mutex);
2751 error = update_flags(netdev, off, on, old_flagsp);
2752 ovs_mutex_unlock(&netdev->mutex);
2753
2754 return error;
2755 }
2756
2757 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2758 GET_FEATURES, GET_STATUS) \
2759 { \
2760 NAME, \
2761 false, /* is_pmd */ \
2762 \
2763 NULL, \
2764 netdev_linux_run, \
2765 netdev_linux_wait, \
2766 \
2767 netdev_linux_alloc, \
2768 CONSTRUCT, \
2769 netdev_linux_destruct, \
2770 netdev_linux_dealloc, \
2771 NULL, /* get_config */ \
2772 NULL, /* set_config */ \
2773 NULL, /* get_tunnel_config */ \
2774 NULL, /* build header */ \
2775 NULL, /* push header */ \
2776 NULL, /* pop header */ \
2777 NULL, /* get_numa_id */ \
2778 NULL, /* set_tx_multiq */ \
2779 \
2780 netdev_linux_send, \
2781 netdev_linux_send_wait, \
2782 \
2783 netdev_linux_set_etheraddr, \
2784 netdev_linux_get_etheraddr, \
2785 netdev_linux_get_mtu, \
2786 netdev_linux_set_mtu, \
2787 netdev_linux_get_ifindex, \
2788 netdev_linux_get_carrier, \
2789 netdev_linux_get_carrier_resets, \
2790 netdev_linux_set_miimon_interval, \
2791 GET_STATS, \
2792 \
2793 GET_FEATURES, \
2794 netdev_linux_set_advertisements, \
2795 \
2796 netdev_linux_set_policing, \
2797 netdev_linux_get_qos_types, \
2798 netdev_linux_get_qos_capabilities, \
2799 netdev_linux_get_qos, \
2800 netdev_linux_set_qos, \
2801 netdev_linux_get_queue, \
2802 netdev_linux_set_queue, \
2803 netdev_linux_delete_queue, \
2804 netdev_linux_get_queue_stats, \
2805 netdev_linux_queue_dump_start, \
2806 netdev_linux_queue_dump_next, \
2807 netdev_linux_queue_dump_done, \
2808 netdev_linux_dump_queue_stats, \
2809 \
2810 netdev_linux_set_in4, \
2811 netdev_linux_get_addr_list, \
2812 netdev_linux_add_router, \
2813 netdev_linux_get_next_hop, \
2814 GET_STATUS, \
2815 netdev_linux_arp_lookup, \
2816 \
2817 netdev_linux_update_flags, \
2818 NULL, /* reconfigure */ \
2819 \
2820 netdev_linux_rxq_alloc, \
2821 netdev_linux_rxq_construct, \
2822 netdev_linux_rxq_destruct, \
2823 netdev_linux_rxq_dealloc, \
2824 netdev_linux_rxq_recv, \
2825 netdev_linux_rxq_wait, \
2826 netdev_linux_rxq_drain, \
2827 }
2828
2829 const struct netdev_class netdev_linux_class =
2830 NETDEV_LINUX_CLASS(
2831 "system",
2832 netdev_linux_construct,
2833 netdev_linux_get_stats,
2834 netdev_linux_get_features,
2835 netdev_linux_get_status);
2836
2837 const struct netdev_class netdev_tap_class =
2838 NETDEV_LINUX_CLASS(
2839 "tap",
2840 netdev_linux_construct_tap,
2841 netdev_tap_get_stats,
2842 netdev_linux_get_features,
2843 netdev_linux_get_status);
2844
2845 const struct netdev_class netdev_internal_class =
2846 NETDEV_LINUX_CLASS(
2847 "internal",
2848 netdev_linux_construct,
2849 netdev_internal_get_stats,
2850 NULL, /* get_features */
2851 netdev_internal_get_status);
2852 \f
2853
2854 #define CODEL_N_QUEUES 0x0000
2855
2856 /* In sufficiently new kernel headers these are defined as enums in
2857 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2858 * kernels. (This overrides any enum definition in the header file but that's
2859 * harmless.) */
2860 #define TCA_CODEL_TARGET 1
2861 #define TCA_CODEL_LIMIT 2
2862 #define TCA_CODEL_INTERVAL 3
2863
2864 struct codel {
2865 struct tc tc;
2866 uint32_t target;
2867 uint32_t limit;
2868 uint32_t interval;
2869 };
2870
2871 static struct codel *
2872 codel_get__(const struct netdev *netdev_)
2873 {
2874 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2875 return CONTAINER_OF(netdev->tc, struct codel, tc);
2876 }
2877
2878 static void
2879 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2880 uint32_t interval)
2881 {
2882 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2883 struct codel *codel;
2884
2885 codel = xmalloc(sizeof *codel);
2886 tc_init(&codel->tc, &tc_ops_codel);
2887 codel->target = target;
2888 codel->limit = limit;
2889 codel->interval = interval;
2890
2891 netdev->tc = &codel->tc;
2892 }
2893
2894 static int
2895 codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2896 uint32_t interval)
2897 {
2898 size_t opt_offset;
2899 struct ofpbuf request;
2900 struct tcmsg *tcmsg;
2901 uint32_t otarget, olimit, ointerval;
2902 int error;
2903
2904 tc_del_qdisc(netdev);
2905
2906 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2907 NLM_F_EXCL | NLM_F_CREATE, &request);
2908 if (!tcmsg) {
2909 return ENODEV;
2910 }
2911 tcmsg->tcm_handle = tc_make_handle(1, 0);
2912 tcmsg->tcm_parent = TC_H_ROOT;
2913
2914 otarget = target ? target : 5000;
2915 olimit = limit ? limit : 10240;
2916 ointerval = interval ? interval : 100000;
2917
2918 nl_msg_put_string(&request, TCA_KIND, "codel");
2919 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2920 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2921 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2922 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2923 nl_msg_end_nested(&request, opt_offset);
2924
2925 error = tc_transact(&request, NULL);
2926 if (error) {
2927 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2928 "target %u, limit %u, interval %u error %d(%s)",
2929 netdev_get_name(netdev),
2930 otarget, olimit, ointerval,
2931 error, ovs_strerror(error));
2932 }
2933 return error;
2934 }
2935
2936 static void
2937 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2938 const struct smap *details, struct codel *codel)
2939 {
2940 const char *target_s;
2941 const char *limit_s;
2942 const char *interval_s;
2943
2944 target_s = smap_get(details, "target");
2945 limit_s = smap_get(details, "limit");
2946 interval_s = smap_get(details, "interval");
2947
2948 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2949 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2950 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2951
2952 if (!codel->target) {
2953 codel->target = 5000;
2954 }
2955 if (!codel->limit) {
2956 codel->limit = 10240;
2957 }
2958 if (!codel->interval) {
2959 codel->interval = 100000;
2960 }
2961 }
2962
2963 static int
2964 codel_tc_install(struct netdev *netdev, const struct smap *details)
2965 {
2966 int error;
2967 struct codel codel;
2968
2969 codel_parse_qdisc_details__(netdev, details, &codel);
2970 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2971 codel.interval);
2972 if (!error) {
2973 codel_install__(netdev, codel.target, codel.limit, codel.interval);
2974 }
2975 return error;
2976 }
2977
2978 static int
2979 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
2980 {
2981 static const struct nl_policy tca_codel_policy[] = {
2982 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
2983 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
2984 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
2985 };
2986
2987 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
2988
2989 if (!nl_parse_nested(nl_options, tca_codel_policy,
2990 attrs, ARRAY_SIZE(tca_codel_policy))) {
2991 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
2992 return EPROTO;
2993 }
2994
2995 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
2996 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
2997 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
2998 return 0;
2999 }
3000
3001 static int
3002 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3003 {
3004 struct nlattr *nlattr;
3005 const char * kind;
3006 int error;
3007 struct codel codel;
3008
3009 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3010 if (error != 0) {
3011 return error;
3012 }
3013
3014 error = codel_parse_tca_options__(nlattr, &codel);
3015 if (error != 0) {
3016 return error;
3017 }
3018
3019 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3020 return 0;
3021 }
3022
3023
3024 static void
3025 codel_tc_destroy(struct tc *tc)
3026 {
3027 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3028 tc_destroy(tc);
3029 free(codel);
3030 }
3031
3032 static int
3033 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3034 {
3035 const struct codel *codel = codel_get__(netdev);
3036 smap_add_format(details, "target", "%u", codel->target);
3037 smap_add_format(details, "limit", "%u", codel->limit);
3038 smap_add_format(details, "interval", "%u", codel->interval);
3039 return 0;
3040 }
3041
3042 static int
3043 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3044 {
3045 struct codel codel;
3046
3047 codel_parse_qdisc_details__(netdev, details, &codel);
3048 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3049 codel_get__(netdev)->target = codel.target;
3050 codel_get__(netdev)->limit = codel.limit;
3051 codel_get__(netdev)->interval = codel.interval;
3052 return 0;
3053 }
3054
3055 static const struct tc_ops tc_ops_codel = {
3056 "codel", /* linux_name */
3057 "linux-codel", /* ovs_name */
3058 CODEL_N_QUEUES, /* n_queues */
3059 codel_tc_install,
3060 codel_tc_load,
3061 codel_tc_destroy,
3062 codel_qdisc_get,
3063 codel_qdisc_set,
3064 NULL,
3065 NULL,
3066 NULL,
3067 NULL,
3068 NULL
3069 };
3070 \f
3071 /* FQ-CoDel traffic control class. */
3072
3073 #define FQCODEL_N_QUEUES 0x0000
3074
3075 /* In sufficiently new kernel headers these are defined as enums in
3076 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3077 * kernels. (This overrides any enum definition in the header file but that's
3078 * harmless.) */
3079 #define TCA_FQ_CODEL_TARGET 1
3080 #define TCA_FQ_CODEL_LIMIT 2
3081 #define TCA_FQ_CODEL_INTERVAL 3
3082 #define TCA_FQ_CODEL_ECN 4
3083 #define TCA_FQ_CODEL_FLOWS 5
3084 #define TCA_FQ_CODEL_QUANTUM 6
3085
3086 struct fqcodel {
3087 struct tc tc;
3088 uint32_t target;
3089 uint32_t limit;
3090 uint32_t interval;
3091 uint32_t flows;
3092 uint32_t quantum;
3093 };
3094
3095 static struct fqcodel *
3096 fqcodel_get__(const struct netdev *netdev_)
3097 {
3098 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3099 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3100 }
3101
3102 static void
3103 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3104 uint32_t interval, uint32_t flows, uint32_t quantum)
3105 {
3106 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3107 struct fqcodel *fqcodel;
3108
3109 fqcodel = xmalloc(sizeof *fqcodel);
3110 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3111 fqcodel->target = target;
3112 fqcodel->limit = limit;
3113 fqcodel->interval = interval;
3114 fqcodel->flows = flows;
3115 fqcodel->quantum = quantum;
3116
3117 netdev->tc = &fqcodel->tc;
3118 }
3119
3120 static int
3121 fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3122 uint32_t interval, uint32_t flows, uint32_t quantum)
3123 {
3124 size_t opt_offset;
3125 struct ofpbuf request;
3126 struct tcmsg *tcmsg;
3127 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3128 int error;
3129
3130 tc_del_qdisc(netdev);
3131
3132 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3133 NLM_F_EXCL | NLM_F_CREATE, &request);
3134 if (!tcmsg) {
3135 return ENODEV;
3136 }
3137 tcmsg->tcm_handle = tc_make_handle(1, 0);
3138 tcmsg->tcm_parent = TC_H_ROOT;
3139
3140 otarget = target ? target : 5000;
3141 olimit = limit ? limit : 10240;
3142 ointerval = interval ? interval : 100000;
3143 oflows = flows ? flows : 1024;
3144 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3145 not mtu */
3146
3147 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3148 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3149 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3150 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3151 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3152 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3153 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3154 nl_msg_end_nested(&request, opt_offset);
3155
3156 error = tc_transact(&request, NULL);
3157 if (error) {
3158 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3159 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3160 netdev_get_name(netdev),
3161 otarget, olimit, ointerval, oflows, oquantum,
3162 error, ovs_strerror(error));
3163 }
3164 return error;
3165 }
3166
3167 static void
3168 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3169 const struct smap *details, struct fqcodel *fqcodel)
3170 {
3171 const char *target_s;
3172 const char *limit_s;
3173 const char *interval_s;
3174 const char *flows_s;
3175 const char *quantum_s;
3176
3177 target_s = smap_get(details, "target");
3178 limit_s = smap_get(details, "limit");
3179 interval_s = smap_get(details, "interval");
3180 flows_s = smap_get(details, "flows");
3181 quantum_s = smap_get(details, "quantum");
3182 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3183 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3184 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3185 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3186 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3187 if (!fqcodel->target) {
3188 fqcodel->target = 5000;
3189 }
3190 if (!fqcodel->limit) {
3191 fqcodel->limit = 10240;
3192 }
3193 if (!fqcodel->interval) {
3194 fqcodel->interval = 1000000;
3195 }
3196 if (!fqcodel->flows) {
3197 fqcodel->flows = 1024;
3198 }
3199 if (!fqcodel->quantum) {
3200 fqcodel->quantum = 1514;
3201 }
3202 }
3203
3204 static int
3205 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3206 {
3207 int error;
3208 struct fqcodel fqcodel;
3209
3210 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3211 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3212 fqcodel.interval, fqcodel.flows,
3213 fqcodel.quantum);
3214 if (!error) {
3215 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3216 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3217 }
3218 return error;
3219 }
3220
3221 static int
3222 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3223 {
3224 static const struct nl_policy tca_fqcodel_policy[] = {
3225 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3226 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3227 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3228 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3229 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3230 };
3231
3232 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3233
3234 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3235 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3236 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3237 return EPROTO;
3238 }
3239
3240 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3241 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3242 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3243 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3244 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3245 return 0;
3246 }
3247
3248 static int
3249 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3250 {
3251 struct nlattr *nlattr;
3252 const char * kind;
3253 int error;
3254 struct fqcodel fqcodel;
3255
3256 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3257 if (error != 0) {
3258 return error;
3259 }
3260
3261 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3262 if (error != 0) {
3263 return error;
3264 }
3265
3266 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3267 fqcodel.flows, fqcodel.quantum);
3268 return 0;
3269 }
3270
3271 static void
3272 fqcodel_tc_destroy(struct tc *tc)
3273 {
3274 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3275 tc_destroy(tc);
3276 free(fqcodel);
3277 }
3278
3279 static int
3280 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3281 {
3282 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3283 smap_add_format(details, "target", "%u", fqcodel->target);
3284 smap_add_format(details, "limit", "%u", fqcodel->limit);
3285 smap_add_format(details, "interval", "%u", fqcodel->interval);
3286 smap_add_format(details, "flows", "%u", fqcodel->flows);
3287 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3288 return 0;
3289 }
3290
3291 static int
3292 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3293 {
3294 struct fqcodel fqcodel;
3295
3296 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3297 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3298 fqcodel.flows, fqcodel.quantum);
3299 fqcodel_get__(netdev)->target = fqcodel.target;
3300 fqcodel_get__(netdev)->limit = fqcodel.limit;
3301 fqcodel_get__(netdev)->interval = fqcodel.interval;
3302 fqcodel_get__(netdev)->flows = fqcodel.flows;
3303 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3304 return 0;
3305 }
3306
3307 static const struct tc_ops tc_ops_fqcodel = {
3308 "fq_codel", /* linux_name */
3309 "linux-fq_codel", /* ovs_name */
3310 FQCODEL_N_QUEUES, /* n_queues */
3311 fqcodel_tc_install,
3312 fqcodel_tc_load,
3313 fqcodel_tc_destroy,
3314 fqcodel_qdisc_get,
3315 fqcodel_qdisc_set,
3316 NULL,
3317 NULL,
3318 NULL,
3319 NULL,
3320 NULL
3321 };
3322 \f
3323 /* SFQ traffic control class. */
3324
3325 #define SFQ_N_QUEUES 0x0000
3326
3327 struct sfq {
3328 struct tc tc;
3329 uint32_t quantum;
3330 uint32_t perturb;
3331 };
3332
3333 static struct sfq *
3334 sfq_get__(const struct netdev *netdev_)
3335 {
3336 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3337 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3338 }
3339
3340 static void
3341 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3342 {
3343 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3344 struct sfq *sfq;
3345
3346 sfq = xmalloc(sizeof *sfq);
3347 tc_init(&sfq->tc, &tc_ops_sfq);
3348 sfq->perturb = perturb;
3349 sfq->quantum = quantum;
3350
3351 netdev->tc = &sfq->tc;
3352 }
3353
3354 static int
3355 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3356 {
3357 struct tc_sfq_qopt opt;
3358 struct ofpbuf request;
3359 struct tcmsg *tcmsg;
3360 int mtu;
3361 int mtu_error, error;
3362 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3363
3364 tc_del_qdisc(netdev);
3365
3366 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3367 NLM_F_EXCL | NLM_F_CREATE, &request);
3368 if (!tcmsg) {
3369 return ENODEV;
3370 }
3371 tcmsg->tcm_handle = tc_make_handle(1, 0);
3372 tcmsg->tcm_parent = TC_H_ROOT;
3373
3374 memset(&opt, 0, sizeof opt);
3375 if (!quantum) {
3376 if (!mtu_error) {
3377 opt.quantum = mtu; /* if we cannot find mtu, use default */
3378 }
3379 } else {
3380 opt.quantum = quantum;
3381 }
3382
3383 if (!perturb) {
3384 opt.perturb_period = 10;
3385 } else {
3386 opt.perturb_period = perturb;
3387 }
3388
3389 nl_msg_put_string(&request, TCA_KIND, "sfq");
3390 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3391
3392 error = tc_transact(&request, NULL);
3393 if (error) {
3394 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3395 "quantum %u, perturb %u error %d(%s)",
3396 netdev_get_name(netdev),
3397 opt.quantum, opt.perturb_period,
3398 error, ovs_strerror(error));
3399 }
3400 return error;
3401 }
3402
3403 static void
3404 sfq_parse_qdisc_details__(struct netdev *netdev,
3405 const struct smap *details, struct sfq *sfq)
3406 {
3407 const char *perturb_s;
3408 const char *quantum_s;
3409 int mtu;
3410 int mtu_error;
3411
3412 perturb_s = smap_get(details, "perturb");
3413 quantum_s = smap_get(details, "quantum");
3414 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3415 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3416 if (!sfq->perturb) {
3417 sfq->perturb = 10;
3418 }
3419
3420 if (!sfq->quantum) {
3421 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3422 if (!mtu_error) {
3423 sfq->quantum = mtu;
3424 } else {
3425 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3426 "device without mtu");
3427 return;
3428 }
3429 }
3430 }
3431
3432 static int
3433 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3434 {
3435 int error;
3436 struct sfq sfq;
3437
3438 sfq_parse_qdisc_details__(netdev, details, &sfq);
3439 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3440 if (!error) {
3441 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3442 }
3443 return error;
3444 }
3445
3446 static int
3447 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3448 {
3449 const struct tc_sfq_qopt *sfq;
3450 struct nlattr *nlattr;
3451 const char * kind;
3452 int error;
3453
3454 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3455 if (error == 0) {
3456 sfq = nl_attr_get(nlattr);
3457 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
3458 return 0;
3459 }
3460
3461 return error;
3462 }
3463
3464 static void
3465 sfq_tc_destroy(struct tc *tc)
3466 {
3467 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3468 tc_destroy(tc);
3469 free(sfq);
3470 }
3471
3472 static int
3473 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3474 {
3475 const struct sfq *sfq = sfq_get__(netdev);
3476 smap_add_format(details, "quantum", "%u", sfq->quantum);
3477 smap_add_format(details, "perturb", "%u", sfq->perturb);
3478 return 0;
3479 }
3480
3481 static int
3482 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3483 {
3484 struct sfq sfq;
3485
3486 sfq_parse_qdisc_details__(netdev, details, &sfq);
3487 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3488 sfq_get__(netdev)->quantum = sfq.quantum;
3489 sfq_get__(netdev)->perturb = sfq.perturb;
3490 return 0;
3491 }
3492
3493 static const struct tc_ops tc_ops_sfq = {
3494 "sfq", /* linux_name */
3495 "linux-sfq", /* ovs_name */
3496 SFQ_N_QUEUES, /* n_queues */
3497 sfq_tc_install,
3498 sfq_tc_load,
3499 sfq_tc_destroy,
3500 sfq_qdisc_get,
3501 sfq_qdisc_set,
3502 NULL,
3503 NULL,
3504 NULL,
3505 NULL,
3506 NULL
3507 };
3508 \f
3509 /* HTB traffic control class. */
3510
3511 #define HTB_N_QUEUES 0xf000
3512 #define HTB_RATE2QUANTUM 10
3513
3514 struct htb {
3515 struct tc tc;
3516 unsigned int max_rate; /* In bytes/s. */
3517 };
3518
3519 struct htb_class {
3520 struct tc_queue tc_queue;
3521 unsigned int min_rate; /* In bytes/s. */
3522 unsigned int max_rate; /* In bytes/s. */
3523 unsigned int burst; /* In bytes. */
3524 unsigned int priority; /* Lower values are higher priorities. */
3525 };
3526
3527 static struct htb *
3528 htb_get__(const struct netdev *netdev_)
3529 {
3530 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3531 return CONTAINER_OF(netdev->tc, struct htb, tc);
3532 }
3533
3534 static void
3535 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3536 {
3537 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3538 struct htb *htb;
3539
3540 htb = xmalloc(sizeof *htb);
3541 tc_init(&htb->tc, &tc_ops_htb);
3542 htb->max_rate = max_rate;
3543
3544 netdev->tc = &htb->tc;
3545 }
3546
3547 /* Create an HTB qdisc.
3548 *
3549 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3550 static int
3551 htb_setup_qdisc__(struct netdev *netdev)
3552 {
3553 size_t opt_offset;
3554 struct tc_htb_glob opt;
3555 struct ofpbuf request;
3556 struct tcmsg *tcmsg;
3557
3558 tc_del_qdisc(netdev);
3559
3560 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3561 NLM_F_EXCL | NLM_F_CREATE, &request);
3562 if (!tcmsg) {
3563 return ENODEV;
3564 }
3565 tcmsg->tcm_handle = tc_make_handle(1, 0);
3566 tcmsg->tcm_parent = TC_H_ROOT;
3567
3568 nl_msg_put_string(&request, TCA_KIND, "htb");
3569
3570 memset(&opt, 0, sizeof opt);
3571 opt.rate2quantum = HTB_RATE2QUANTUM;
3572 opt.version = 3;
3573 opt.defcls = 1;
3574
3575 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3576 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3577 nl_msg_end_nested(&request, opt_offset);
3578
3579 return tc_transact(&request, NULL);
3580 }
3581
3582 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3583 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3584 static int
3585 htb_setup_class__(struct netdev *netdev, unsigned int handle,
3586 unsigned int parent, struct htb_class *class)
3587 {
3588 size_t opt_offset;
3589 struct tc_htb_opt opt;
3590 struct ofpbuf request;
3591 struct tcmsg *tcmsg;
3592 int error;
3593 int mtu;
3594
3595 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3596 if (error) {
3597 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3598 netdev_get_name(netdev));
3599 return error;
3600 }
3601
3602 memset(&opt, 0, sizeof opt);
3603 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3604 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
3605 /* Makes sure the quantum is at least MTU. Setting quantum will
3606 * make htb ignore the r2q for this class. */
3607 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3608 opt.quantum = mtu;
3609 }
3610 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3611 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3612 opt.prio = class->priority;
3613
3614 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3615 if (!tcmsg) {
3616 return ENODEV;
3617 }
3618 tcmsg->tcm_handle = handle;
3619 tcmsg->tcm_parent = parent;
3620
3621 nl_msg_put_string(&request, TCA_KIND, "htb");
3622 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3623 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3624 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3625 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3626 nl_msg_end_nested(&request, opt_offset);
3627
3628 error = tc_transact(&request, NULL);
3629 if (error) {
3630 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3631 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3632 netdev_get_name(netdev),
3633 tc_get_major(handle), tc_get_minor(handle),
3634 tc_get_major(parent), tc_get_minor(parent),
3635 class->min_rate, class->max_rate,
3636 class->burst, class->priority, ovs_strerror(error));
3637 }
3638 return error;
3639 }
3640
3641 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3642 * description of them into 'details'. The description complies with the
3643 * specification given in the vswitch database documentation for linux-htb
3644 * queue details. */
3645 static int
3646 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3647 {
3648 static const struct nl_policy tca_htb_policy[] = {
3649 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3650 .min_len = sizeof(struct tc_htb_opt) },
3651 };
3652
3653 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3654 const struct tc_htb_opt *htb;
3655
3656 if (!nl_parse_nested(nl_options, tca_htb_policy,
3657 attrs, ARRAY_SIZE(tca_htb_policy))) {
3658 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3659 return EPROTO;
3660 }
3661
3662 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3663 class->min_rate = htb->rate.rate;
3664 class->max_rate = htb->ceil.rate;
3665 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3666 class->priority = htb->prio;
3667 return 0;
3668 }
3669
3670 static int
3671 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3672 struct htb_class *options,
3673 struct netdev_queue_stats *stats)
3674 {
3675 struct nlattr *nl_options;
3676 unsigned int handle;
3677 int error;
3678
3679 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3680 if (!error && queue_id) {
3681 unsigned int major = tc_get_major(handle);
3682 unsigned int minor = tc_get_minor(handle);
3683 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3684 *queue_id = minor - 1;
3685 } else {
3686 error = EPROTO;
3687 }
3688 }
3689 if (!error && options) {
3690 error = htb_parse_tca_options__(nl_options, options);
3691 }
3692 return error;
3693 }
3694
3695 static void
3696 htb_parse_qdisc_details__(struct netdev *netdev_,
3697 const struct smap *details, struct htb_class *hc)
3698 {
3699 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3700 const char *max_rate_s;
3701
3702 max_rate_s = smap_get(details, "max-rate");
3703 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3704 if (!hc->max_rate) {
3705 enum netdev_features current;
3706
3707 netdev_linux_read_features(netdev);
3708 current = !netdev->get_features_error ? netdev->current : 0;
3709 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3710 }
3711 hc->min_rate = hc->max_rate;
3712 hc->burst = 0;
3713 hc->priority = 0;
3714 }
3715
3716 static int
3717 htb_parse_class_details__(struct netdev *netdev,
3718 const struct smap *details, struct htb_class *hc)
3719 {
3720 const struct htb *htb = htb_get__(netdev);
3721 const char *min_rate_s = smap_get(details, "min-rate");
3722 const char *max_rate_s = smap_get(details, "max-rate");
3723 const char *burst_s = smap_get(details, "burst");
3724 const char *priority_s = smap_get(details, "priority");
3725 int mtu, error;
3726
3727 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3728 if (error) {
3729 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3730 netdev_get_name(netdev));
3731 return error;
3732 }
3733
3734 /* HTB requires at least an mtu sized min-rate to send any traffic even
3735 * on uncongested links. */
3736 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3737 hc->min_rate = MAX(hc->min_rate, mtu);
3738 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3739
3740 /* max-rate */
3741 hc->max_rate = (max_rate_s
3742 ? strtoull(max_rate_s, NULL, 10) / 8
3743 : htb->max_rate);
3744 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3745 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3746
3747 /* burst
3748 *
3749 * According to hints in the documentation that I've read, it is important
3750 * that 'burst' be at least as big as the largest frame that might be
3751 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3752 * but having it a bit too small is a problem. Since netdev_get_mtu()
3753 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3754 * the MTU. We actually add 64, instead of 14, as a guard against
3755 * additional headers get tacked on somewhere that we're not aware of. */
3756 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3757 hc->burst = MAX(hc->burst, mtu + 64);
3758
3759 /* priority */
3760 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
3761
3762 return 0;
3763 }
3764
3765 static int
3766 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3767 unsigned int parent, struct htb_class *options,
3768 struct netdev_queue_stats *stats)
3769 {
3770 struct ofpbuf *reply;
3771 int error;
3772
3773 error = tc_query_class(netdev, handle, parent, &reply);
3774 if (!error) {
3775 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3776 ofpbuf_delete(reply);
3777 }
3778 return error;
3779 }
3780
3781 static int
3782 htb_tc_install(struct netdev *netdev, const struct smap *details)
3783 {
3784 int error;
3785
3786 error = htb_setup_qdisc__(netdev);
3787 if (!error) {
3788 struct htb_class hc;
3789
3790 htb_parse_qdisc_details__(netdev, details, &hc);
3791 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3792 tc_make_handle(1, 0), &hc);
3793 if (!error) {
3794 htb_install__(netdev, hc.max_rate);
3795 }
3796 }
3797 return error;
3798 }
3799
3800 static struct htb_class *
3801 htb_class_cast__(const struct tc_queue *queue)
3802 {
3803 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3804 }
3805
3806 static void
3807 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3808 const struct htb_class *hc)
3809 {
3810 struct htb *htb = htb_get__(netdev);
3811 size_t hash = hash_int(queue_id, 0);
3812 struct tc_queue *queue;
3813 struct htb_class *hcp;
3814
3815 queue = tc_find_queue__(netdev, queue_id, hash);
3816 if (queue) {
3817 hcp = htb_class_cast__(queue);
3818 } else {
3819 hcp = xmalloc(sizeof *hcp);
3820 queue = &hcp->tc_queue;
3821 queue->queue_id = queue_id;
3822 queue->created = time_msec();
3823 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3824 }
3825
3826 hcp->min_rate = hc->min_rate;
3827 hcp->max_rate = hc->max_rate;
3828 hcp->burst = hc->burst;
3829 hcp->priority = hc->priority;
3830 }
3831
3832 static int
3833 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3834 {
3835 struct ofpbuf msg;
3836 struct queue_dump_state state;
3837 struct htb_class hc;
3838
3839 /* Get qdisc options. */
3840 hc.max_rate = 0;
3841 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3842 htb_install__(netdev, hc.max_rate);
3843
3844 /* Get queues. */
3845 if (!start_queue_dump(netdev, &state)) {
3846 return ENODEV;
3847 }
3848 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3849 unsigned int queue_id;
3850
3851 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3852 htb_update_queue__(netdev, queue_id, &hc);
3853 }
3854 }
3855 finish_queue_dump(&state);
3856
3857 return 0;
3858 }
3859
3860 static void
3861 htb_tc_destroy(struct tc *tc)
3862 {
3863 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3864 struct htb_class *hc;
3865
3866 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
3867 free(hc);
3868 }
3869 tc_destroy(tc);
3870 free(htb);
3871 }
3872
3873 static int
3874 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3875 {
3876 const struct htb *htb = htb_get__(netdev);
3877 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
3878 return 0;
3879 }
3880
3881 static int
3882 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3883 {
3884 struct htb_class hc;
3885 int error;
3886
3887 htb_parse_qdisc_details__(netdev, details, &hc);
3888 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3889 tc_make_handle(1, 0), &hc);
3890 if (!error) {
3891 htb_get__(netdev)->max_rate = hc.max_rate;
3892 }
3893 return error;
3894 }
3895
3896 static int
3897 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3898 const struct tc_queue *queue, struct smap *details)
3899 {
3900 const struct htb_class *hc = htb_class_cast__(queue);
3901
3902 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3903 if (hc->min_rate != hc->max_rate) {
3904 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3905 }
3906 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3907 if (hc->priority) {
3908 smap_add_format(details, "priority", "%u", hc->priority);
3909 }
3910 return 0;
3911 }
3912
3913 static int
3914 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3915 const struct smap *details)
3916 {
3917 struct htb_class hc;
3918 int error;
3919
3920 error = htb_parse_class_details__(netdev, details, &hc);
3921 if (error) {
3922 return error;
3923 }
3924
3925 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3926 tc_make_handle(1, 0xfffe), &hc);
3927 if (error) {
3928 return error;
3929 }
3930
3931 htb_update_queue__(netdev, queue_id, &hc);
3932 return 0;
3933 }
3934
3935 static int
3936 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3937 {
3938 struct htb_class *hc = htb_class_cast__(queue);
3939 struct htb *htb = htb_get__(netdev);
3940 int error;
3941
3942 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3943 if (!error) {
3944 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3945 free(hc);
3946 }
3947 return error;
3948 }
3949
3950 static int
3951 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3952 struct netdev_queue_stats *stats)
3953 {
3954 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3955 tc_make_handle(1, 0xfffe), NULL, stats);
3956 }
3957
3958 static int
3959 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3960 const struct ofpbuf *nlmsg,
3961 netdev_dump_queue_stats_cb *cb, void *aux)
3962 {
3963 struct netdev_queue_stats stats;
3964 unsigned int handle, major, minor;
3965 int error;
3966
3967 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3968 if (error) {
3969 return error;
3970 }
3971
3972 major = tc_get_major(handle);
3973 minor = tc_get_minor(handle);
3974 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3975 (*cb)(minor - 1, &stats, aux);
3976 }
3977 return 0;
3978 }
3979
3980 static const struct tc_ops tc_ops_htb = {
3981 "htb", /* linux_name */
3982 "linux-htb", /* ovs_name */
3983 HTB_N_QUEUES, /* n_queues */
3984 htb_tc_install,
3985 htb_tc_load,
3986 htb_tc_destroy,
3987 htb_qdisc_get,
3988 htb_qdisc_set,
3989 htb_class_get,
3990 htb_class_set,
3991 htb_class_delete,
3992 htb_class_get_stats,
3993 htb_class_dump_stats
3994 };
3995 \f
3996 /* "linux-hfsc" traffic control class. */
3997
3998 #define HFSC_N_QUEUES 0xf000
3999
4000 struct hfsc {
4001 struct tc tc;
4002 uint32_t max_rate;
4003 };
4004
4005 struct hfsc_class {
4006 struct tc_queue tc_queue;
4007 uint32_t min_rate;
4008 uint32_t max_rate;
4009 };
4010
4011 static struct hfsc *
4012 hfsc_get__(const struct netdev *netdev_)
4013 {
4014 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4015 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
4016 }
4017
4018 static struct hfsc_class *
4019 hfsc_class_cast__(const struct tc_queue *queue)
4020 {
4021 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4022 }
4023
4024 static void
4025 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4026 {
4027 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4028 struct hfsc *hfsc;
4029
4030 hfsc = xmalloc(sizeof *hfsc);
4031 tc_init(&hfsc->tc, &tc_ops_hfsc);
4032 hfsc->max_rate = max_rate;
4033 netdev->tc = &hfsc->tc;
4034 }
4035
4036 static void
4037 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4038 const struct hfsc_class *hc)
4039 {
4040 size_t hash;
4041 struct hfsc *hfsc;
4042 struct hfsc_class *hcp;
4043 struct tc_queue *queue;
4044
4045 hfsc = hfsc_get__(netdev);
4046 hash = hash_int(queue_id, 0);
4047
4048 queue = tc_find_queue__(netdev, queue_id, hash);
4049 if (queue) {
4050 hcp = hfsc_class_cast__(queue);
4051 } else {
4052 hcp = xmalloc(sizeof *hcp);
4053 queue = &hcp->tc_queue;
4054 queue->queue_id = queue_id;
4055 queue->created = time_msec();
4056 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4057 }
4058
4059 hcp->min_rate = hc->min_rate;
4060 hcp->max_rate = hc->max_rate;
4061 }
4062
4063 static int
4064 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4065 {
4066 const struct tc_service_curve *rsc, *fsc, *usc;
4067 static const struct nl_policy tca_hfsc_policy[] = {
4068 [TCA_HFSC_RSC] = {
4069 .type = NL_A_UNSPEC,
4070 .optional = false,
4071 .min_len = sizeof(struct tc_service_curve),
4072 },
4073 [TCA_HFSC_FSC] = {
4074 .type = NL_A_UNSPEC,
4075 .optional = false,
4076 .min_len = sizeof(struct tc_service_curve),
4077 },
4078 [TCA_HFSC_USC] = {
4079 .type = NL_A_UNSPEC,
4080 .optional = false,
4081 .min_len = sizeof(struct tc_service_curve),
4082 },
4083 };
4084 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4085
4086 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4087 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4088 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4089 return EPROTO;
4090 }
4091
4092 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4093 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4094 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4095
4096 if (rsc->m1 != 0 || rsc->d != 0 ||
4097 fsc->m1 != 0 || fsc->d != 0 ||
4098 usc->m1 != 0 || usc->d != 0) {
4099 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4100 "Non-linear service curves are not supported.");
4101 return EPROTO;
4102 }
4103
4104 if (rsc->m2 != fsc->m2) {
4105 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4106 "Real-time service curves are not supported ");
4107 return EPROTO;
4108 }
4109
4110 if (rsc->m2 > usc->m2) {
4111 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4112 "Min-rate service curve is greater than "
4113 "the max-rate service curve.");
4114 return EPROTO;
4115 }
4116
4117 class->min_rate = fsc->m2;
4118 class->max_rate = usc->m2;
4119 return 0;
4120 }
4121
4122 static int
4123 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4124 struct hfsc_class *options,
4125 struct netdev_queue_stats *stats)
4126 {
4127 int error;
4128 unsigned int handle;
4129 struct nlattr *nl_options;
4130
4131 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4132 if (error) {
4133 return error;
4134 }
4135
4136 if (queue_id) {
4137 unsigned int major, minor;
4138
4139 major = tc_get_major(handle);
4140 minor = tc_get_minor(handle);
4141 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4142 *queue_id = minor - 1;
4143 } else {
4144 return EPROTO;
4145 }
4146 }
4147
4148 if (options) {
4149 error = hfsc_parse_tca_options__(nl_options, options);
4150 }
4151
4152 return error;
4153 }
4154
4155 static int
4156 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4157 unsigned int parent, struct hfsc_class *options,
4158 struct netdev_queue_stats *stats)
4159 {
4160 int error;
4161 struct ofpbuf *reply;
4162
4163 error = tc_query_class(netdev, handle, parent, &reply);
4164 if (error) {
4165 return error;
4166 }
4167
4168 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4169 ofpbuf_delete(reply);
4170 return error;
4171 }
4172
4173 static void
4174 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4175 struct hfsc_class *class)
4176 {
4177 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4178 uint32_t max_rate;
4179 const char *max_rate_s;
4180
4181 max_rate_s = smap_get(details, "max-rate");
4182 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4183
4184 if (!max_rate) {
4185 enum netdev_features current;
4186
4187 netdev_linux_read_features(netdev);
4188 current = !netdev->get_features_error ? netdev->current : 0;
4189 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4190 }
4191
4192 class->min_rate = max_rate;
4193 class->max_rate = max_rate;
4194 }
4195
4196 static int
4197 hfsc_parse_class_details__(struct netdev *netdev,
4198 const struct smap *details,
4199 struct hfsc_class * class)
4200 {
4201 const struct hfsc *hfsc;
4202 uint32_t min_rate, max_rate;
4203 const char *min_rate_s, *max_rate_s;
4204
4205 hfsc = hfsc_get__(netdev);
4206 min_rate_s = smap_get(details, "min-rate");
4207 max_rate_s = smap_get(details, "max-rate");
4208
4209 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4210 min_rate = MAX(min_rate, 1);
4211 min_rate = MIN(min_rate, hfsc->max_rate);
4212
4213 max_rate = (max_rate_s
4214 ? strtoull(max_rate_s, NULL, 10) / 8
4215 : hfsc->max_rate);
4216 max_rate = MAX(max_rate, min_rate);
4217 max_rate = MIN(max_rate, hfsc->max_rate);
4218
4219 class->min_rate = min_rate;
4220 class->max_rate = max_rate;
4221
4222 return 0;
4223 }
4224
4225 /* Create an HFSC qdisc.
4226 *
4227 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4228 static int
4229 hfsc_setup_qdisc__(struct netdev * netdev)
4230 {
4231 struct tcmsg *tcmsg;
4232 struct ofpbuf request;
4233 struct tc_hfsc_qopt opt;
4234
4235 tc_del_qdisc(netdev);
4236
4237 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4238 NLM_F_EXCL | NLM_F_CREATE, &request);
4239
4240 if (!tcmsg) {
4241 return ENODEV;
4242 }
4243
4244 tcmsg->tcm_handle = tc_make_handle(1, 0);
4245 tcmsg->tcm_parent = TC_H_ROOT;
4246
4247 memset(&opt, 0, sizeof opt);
4248 opt.defcls = 1;
4249
4250 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4251 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4252
4253 return tc_transact(&request, NULL);
4254 }
4255
4256 /* Create an HFSC class.
4257 *
4258 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4259 * sc rate <min_rate> ul rate <max_rate>" */
4260 static int
4261 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4262 unsigned int parent, struct hfsc_class *class)
4263 {
4264 int error;
4265 size_t opt_offset;
4266 struct tcmsg *tcmsg;
4267 struct ofpbuf request;
4268 struct tc_service_curve min, max;
4269
4270 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4271
4272 if (!tcmsg) {
4273 return ENODEV;
4274 }
4275
4276 tcmsg->tcm_handle = handle;
4277 tcmsg->tcm_parent = parent;
4278
4279 min.m1 = 0;
4280 min.d = 0;
4281 min.m2 = class->min_rate;
4282
4283 max.m1 = 0;
4284 max.d = 0;
4285 max.m2 = class->max_rate;
4286
4287 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4288 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4289 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4290 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4291 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4292 nl_msg_end_nested(&request, opt_offset);
4293
4294 error = tc_transact(&request, NULL);
4295 if (error) {
4296 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4297 "min-rate %ubps, max-rate %ubps (%s)",
4298 netdev_get_name(netdev),
4299 tc_get_major(handle), tc_get_minor(handle),
4300 tc_get_major(parent), tc_get_minor(parent),
4301 class->min_rate, class->max_rate, ovs_strerror(error));
4302 }
4303
4304 return error;
4305 }
4306
4307 static int
4308 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4309 {
4310 int error;
4311 struct hfsc_class class;
4312
4313 error = hfsc_setup_qdisc__(netdev);
4314
4315 if (error) {
4316 return error;
4317 }
4318
4319 hfsc_parse_qdisc_details__(netdev, details, &class);
4320 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4321 tc_make_handle(1, 0), &class);
4322
4323 if (error) {
4324 return error;
4325 }
4326
4327 hfsc_install__(netdev, class.max_rate);
4328 return 0;
4329 }
4330
4331 static int
4332 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4333 {
4334 struct ofpbuf msg;
4335 struct queue_dump_state state;
4336 struct hfsc_class hc;
4337
4338 hc.max_rate = 0;
4339 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4340 hfsc_install__(netdev, hc.max_rate);
4341
4342 if (!start_queue_dump(netdev, &state)) {
4343 return ENODEV;
4344 }
4345
4346 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4347 unsigned int queue_id;
4348
4349 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4350 hfsc_update_queue__(netdev, queue_id, &hc);
4351 }
4352 }
4353
4354 finish_queue_dump(&state);
4355 return 0;
4356 }
4357
4358 static void
4359 hfsc_tc_destroy(struct tc *tc)
4360 {
4361 struct hfsc *hfsc;
4362 struct hfsc_class *hc, *next;
4363
4364 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4365
4366 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4367 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4368 free(hc);
4369 }
4370
4371 tc_destroy(tc);
4372 free(hfsc);
4373 }
4374
4375 static int
4376 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4377 {
4378 const struct hfsc *hfsc;
4379 hfsc = hfsc_get__(netdev);
4380 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
4381 return 0;
4382 }
4383
4384 static int
4385 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4386 {
4387 int error;
4388 struct hfsc_class class;
4389
4390 hfsc_parse_qdisc_details__(netdev, details, &class);
4391 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4392 tc_make_handle(1, 0), &class);
4393
4394 if (!error) {
4395 hfsc_get__(netdev)->max_rate = class.max_rate;
4396 }
4397
4398 return error;
4399 }
4400
4401 static int
4402 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4403 const struct tc_queue *queue, struct smap *details)
4404 {
4405 const struct hfsc_class *hc;
4406
4407 hc = hfsc_class_cast__(queue);
4408 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4409 if (hc->min_rate != hc->max_rate) {
4410 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4411 }
4412 return 0;
4413 }
4414
4415 static int
4416 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4417 const struct smap *details)
4418 {
4419 int error;
4420 struct hfsc_class class;
4421
4422 error = hfsc_parse_class_details__(netdev, details, &class);
4423 if (error) {
4424 return error;
4425 }
4426
4427 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4428 tc_make_handle(1, 0xfffe), &class);
4429 if (error) {
4430 return error;
4431 }
4432
4433 hfsc_update_queue__(netdev, queue_id, &class);
4434 return 0;
4435 }
4436
4437 static int
4438 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4439 {
4440 int error;
4441 struct hfsc *hfsc;
4442 struct hfsc_class *hc;
4443
4444 hc = hfsc_class_cast__(queue);
4445 hfsc = hfsc_get__(netdev);
4446
4447 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4448 if (!error) {
4449 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4450 free(hc);
4451 }
4452 return error;
4453 }
4454
4455 static int
4456 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4457 struct netdev_queue_stats *stats)
4458 {
4459 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4460 tc_make_handle(1, 0xfffe), NULL, stats);
4461 }
4462
4463 static int
4464 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4465 const struct ofpbuf *nlmsg,
4466 netdev_dump_queue_stats_cb *cb, void *aux)
4467 {
4468 struct netdev_queue_stats stats;
4469 unsigned int handle, major, minor;
4470 int error;
4471
4472 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4473 if (error) {
4474 return error;
4475 }
4476
4477 major = tc_get_major(handle);
4478 minor = tc_get_minor(handle);
4479 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4480 (*cb)(minor - 1, &stats, aux);
4481 }
4482 return 0;
4483 }
4484
4485 static const struct tc_ops tc_ops_hfsc = {
4486 "hfsc", /* linux_name */
4487 "linux-hfsc", /* ovs_name */
4488 HFSC_N_QUEUES, /* n_queues */
4489 hfsc_tc_install, /* tc_install */
4490 hfsc_tc_load, /* tc_load */
4491 hfsc_tc_destroy, /* tc_destroy */
4492 hfsc_qdisc_get, /* qdisc_get */
4493 hfsc_qdisc_set, /* qdisc_set */
4494 hfsc_class_get, /* class_get */
4495 hfsc_class_set, /* class_set */
4496 hfsc_class_delete, /* class_delete */
4497 hfsc_class_get_stats, /* class_get_stats */
4498 hfsc_class_dump_stats /* class_dump_stats */
4499 };
4500 \f
4501 /* "linux-noop" traffic control class. */
4502
4503 static void
4504 noop_install__(struct netdev *netdev_)
4505 {
4506 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4507 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4508
4509 netdev->tc = CONST_CAST(struct tc *, &tc);
4510 }
4511
4512 static int
4513 noop_tc_install(struct netdev *netdev,
4514 const struct smap *details OVS_UNUSED)
4515 {
4516 noop_install__(netdev);
4517 return 0;
4518 }
4519
4520 static int
4521 noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4522 {
4523 noop_install__(netdev);
4524 return 0;
4525 }
4526
4527 static const struct tc_ops tc_ops_noop = {
4528 NULL, /* linux_name */
4529 "linux-noop", /* ovs_name */
4530 0, /* n_queues */
4531 noop_tc_install,
4532 noop_tc_load,
4533 NULL, /* tc_destroy */
4534 NULL, /* qdisc_get */
4535 NULL, /* qdisc_set */
4536 NULL, /* class_get */
4537 NULL, /* class_set */
4538 NULL, /* class_delete */
4539 NULL, /* class_get_stats */
4540 NULL /* class_dump_stats */
4541 };
4542 \f
4543 /* "linux-default" traffic control class.
4544 *
4545 * This class represents the default, unnamed Linux qdisc. It corresponds to
4546 * the "" (empty string) QoS type in the OVS database. */
4547
4548 static void
4549 default_install__(struct netdev *netdev_)
4550 {
4551 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4552 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4553
4554 /* Nothing but a tc class implementation is allowed to write to a tc. This
4555 * class never does that, so we can legitimately use a const tc object. */
4556 netdev->tc = CONST_CAST(struct tc *, &tc);
4557 }
4558
4559 static int
4560 default_tc_install(struct netdev *netdev,
4561 const struct smap *details OVS_UNUSED)
4562 {
4563 default_install__(netdev);
4564 return 0;
4565 }
4566
4567 static int
4568 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4569 {
4570 default_install__(netdev);
4571 return 0;
4572 }
4573
4574 static const struct tc_ops tc_ops_default = {
4575 NULL, /* linux_name */
4576 "", /* ovs_name */
4577 0, /* n_queues */
4578 default_tc_install,
4579 default_tc_load,
4580 NULL, /* tc_destroy */
4581 NULL, /* qdisc_get */
4582 NULL, /* qdisc_set */
4583 NULL, /* class_get */
4584 NULL, /* class_set */
4585 NULL, /* class_delete */
4586 NULL, /* class_get_stats */
4587 NULL /* class_dump_stats */
4588 };
4589 \f
4590 /* "linux-other" traffic control class.
4591 *
4592 * */
4593
4594 static int
4595 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4596 {
4597 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4598 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4599
4600 /* Nothing but a tc class implementation is allowed to write to a tc. This
4601 * class never does that, so we can legitimately use a const tc object. */
4602 netdev->tc = CONST_CAST(struct tc *, &tc);
4603 return 0;
4604 }
4605
4606 static const struct tc_ops tc_ops_other = {
4607 NULL, /* linux_name */
4608 "linux-other", /* ovs_name */
4609 0, /* n_queues */
4610 NULL, /* tc_install */
4611 other_tc_load,
4612 NULL, /* tc_destroy */
4613 NULL, /* qdisc_get */
4614 NULL, /* qdisc_set */
4615 NULL, /* class_get */
4616 NULL, /* class_set */
4617 NULL, /* class_delete */
4618 NULL, /* class_get_stats */
4619 NULL /* class_dump_stats */
4620 };
4621 \f
4622 /* Traffic control. */
4623
4624 /* Number of kernel "tc" ticks per second. */
4625 static double ticks_per_s;
4626
4627 /* Number of kernel "jiffies" per second. This is used for the purpose of
4628 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4629 * one jiffy's worth of data.
4630 *
4631 * There are two possibilities here:
4632 *
4633 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4634 * approximate range of 100 to 1024. That means that we really need to
4635 * make sure that the qdisc can buffer that much data.
4636 *
4637 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4638 * has finely granular timers and there's no need to fudge additional room
4639 * for buffers. (There's no extra effort needed to implement that: the
4640 * large 'buffer_hz' is used as a divisor, so practically any number will
4641 * come out as 0 in the division. Small integer results in the case of
4642 * really high dividends won't have any real effect anyhow.)
4643 */
4644 static unsigned int buffer_hz;
4645
4646 /* Returns tc handle 'major':'minor'. */
4647 static unsigned int
4648 tc_make_handle(unsigned int major, unsigned int minor)
4649 {
4650 return TC_H_MAKE(major << 16, minor);
4651 }
4652
4653 /* Returns the major number from 'handle'. */
4654 static unsigned int
4655 tc_get_major(unsigned int handle)
4656 {
4657 return TC_H_MAJ(handle) >> 16;
4658 }
4659
4660 /* Returns the minor number from 'handle'. */
4661 static unsigned int
4662 tc_get_minor(unsigned int handle)
4663 {
4664 return TC_H_MIN(handle);
4665 }
4666
4667 static struct tcmsg *
4668 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4669 struct ofpbuf *request)
4670 {
4671 struct tcmsg *tcmsg;
4672 int ifindex;
4673 int error;
4674
4675 error = get_ifindex(netdev, &ifindex);
4676 if (error) {
4677 return NULL;
4678 }
4679
4680 ofpbuf_init(request, 512);
4681 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4682 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4683 tcmsg->tcm_family = AF_UNSPEC;
4684 tcmsg->tcm_ifindex = ifindex;
4685 /* Caller should fill in tcmsg->tcm_handle. */
4686 /* Caller should fill in tcmsg->tcm_parent. */
4687
4688 return tcmsg;
4689 }
4690
4691 static int
4692 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4693 {
4694 int error = nl_transact(NETLINK_ROUTE, request, replyp);
4695 ofpbuf_uninit(request);
4696 return error;
4697 }
4698
4699 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4700 * policing configuration.
4701 *
4702 * This function is equivalent to running the following when 'add' is true:
4703 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4704 *
4705 * This function is equivalent to running the following when 'add' is false:
4706 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4707 *
4708 * The configuration and stats may be seen with the following command:
4709 * /sbin/tc -s qdisc show dev <devname>
4710 *
4711 * Returns 0 if successful, otherwise a positive errno value.
4712 */
4713 static int
4714 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4715 {
4716 struct ofpbuf request;
4717 struct tcmsg *tcmsg;
4718 int error;
4719 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4720 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4721
4722 tcmsg = tc_make_request(netdev, type, flags, &request);
4723 if (!tcmsg) {
4724 return ENODEV;
4725 }
4726 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4727 tcmsg->tcm_parent = TC_H_INGRESS;
4728 nl_msg_put_string(&request, TCA_KIND, "ingress");
4729 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4730
4731 error = tc_transact(&request, NULL);
4732 if (error) {
4733 /* If we're deleting the qdisc, don't worry about some of the
4734 * error conditions. */
4735 if (!add && (error == ENOENT || error == EINVAL)) {
4736 return 0;
4737 }
4738 return error;
4739 }
4740
4741 return 0;
4742 }
4743
4744 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4745 * of 'kbits_burst'.
4746 *
4747 * This function is equivalent to running:
4748 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4749 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4750 * mtu 65535 drop
4751 *
4752 * The configuration and stats may be seen with the following command:
4753 * /sbin/tc -s filter show dev <devname> parent ffff:
4754 *
4755 * Returns 0 if successful, otherwise a positive errno value.
4756 */
4757 static int
4758 tc_add_policer(struct netdev *netdev,
4759 uint32_t kbits_rate, uint32_t kbits_burst)
4760 {
4761 struct tc_police tc_police;
4762 struct ofpbuf request;
4763 struct tcmsg *tcmsg;
4764 size_t basic_offset;
4765 size_t police_offset;
4766 int error;
4767 int mtu = 65535;
4768
4769 memset(&tc_police, 0, sizeof tc_police);
4770 tc_police.action = TC_POLICE_SHOT;
4771 tc_police.mtu = mtu;
4772 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4773
4774 /* The following appears wrong in one way: In networking a kilobit is
4775 * usually 1000 bits but this uses 1024 bits.
4776 *
4777 * However if you "fix" those problems then "tc filter show ..." shows
4778 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4779 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4780 * tc's point of view. Whatever. */
4781 tc_police.burst = tc_bytes_to_ticks(
4782 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
4783
4784 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4785 NLM_F_EXCL | NLM_F_CREATE, &request);
4786 if (!tcmsg) {
4787 return ENODEV;
4788 }
4789 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4790 tcmsg->tcm_info = tc_make_handle(49,
4791 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4792
4793 nl_msg_put_string(&request, TCA_KIND, "basic");
4794 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4795 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4796 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4797 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4798 nl_msg_end_nested(&request, police_offset);
4799 nl_msg_end_nested(&request, basic_offset);
4800
4801 error = tc_transact(&request, NULL);
4802 if (error) {
4803 return error;
4804 }
4805
4806 return 0;
4807 }
4808
4809 static void
4810 read_psched(void)
4811 {
4812 /* The values in psched are not individually very meaningful, but they are
4813 * important. The tables below show some values seen in the wild.
4814 *
4815 * Some notes:
4816 *
4817 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4818 * (Before that, there are hints that it was 1000000000.)
4819 *
4820 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4821 * above.
4822 *
4823 * /proc/net/psched
4824 * -----------------------------------
4825 * [1] 000c8000 000f4240 000f4240 00000064
4826 * [2] 000003e8 00000400 000f4240 3b9aca00
4827 * [3] 000003e8 00000400 000f4240 3b9aca00
4828 * [4] 000003e8 00000400 000f4240 00000064
4829 * [5] 000003e8 00000040 000f4240 3b9aca00
4830 * [6] 000003e8 00000040 000f4240 000000f9
4831 *
4832 * a b c d ticks_per_s buffer_hz
4833 * ------- --------- ---------- ------------- ----------- -------------
4834 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4835 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4836 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4837 * [4] 1,000 1,024 1,000,000 100 976,562 100
4838 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4839 * [6] 1,000 64 1,000,000 249 15,625,000 249
4840 *
4841 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4842 * [2] 2.6.26-1-686-bigmem from Debian lenny
4843 * [3] 2.6.26-2-sparc64 from Debian lenny
4844 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4845 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4846 * [6] 2.6.34 from kernel.org on KVM
4847 */
4848 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4849 static const char fn[] = "/proc/net/psched";
4850 unsigned int a, b, c, d;
4851 FILE *stream;
4852
4853 if (!ovsthread_once_start(&once)) {
4854 return;
4855 }
4856
4857 ticks_per_s = 1.0;
4858 buffer_hz = 100;
4859
4860 stream = fopen(fn, "r");
4861 if (!stream) {
4862 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4863 goto exit;
4864 }
4865
4866 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4867 VLOG_WARN("%s: read failed", fn);
4868 fclose(stream);
4869 goto exit;
4870 }
4871 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4872 fclose(stream);
4873
4874 if (!a || !c) {
4875 VLOG_WARN("%s: invalid scheduler parameters", fn);
4876 goto exit;
4877 }
4878
4879 ticks_per_s = (double) a * c / b;
4880 if (c == 1000000) {
4881 buffer_hz = d;
4882 } else {
4883 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4884 fn, a, b, c, d);
4885 }
4886 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4887
4888 exit:
4889 ovsthread_once_done(&once);
4890 }
4891
4892 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4893 * rate of 'rate' bytes per second. */
4894 static unsigned int
4895 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4896 {
4897 read_psched();
4898 return (rate * ticks) / ticks_per_s;
4899 }
4900
4901 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4902 * rate of 'rate' bytes per second. */
4903 static unsigned int
4904 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4905 {
4906 read_psched();
4907 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4908 }
4909
4910 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4911 * a transmission rate of 'rate' bytes per second. */
4912 static unsigned int
4913 tc_buffer_per_jiffy(unsigned int rate)
4914 {
4915 read_psched();
4916 return rate / buffer_hz;
4917 }
4918
4919 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4920 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4921 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4922 * stores NULL into it if it is absent.
4923 *
4924 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4925 * 'msg'.
4926 *
4927 * Returns 0 if successful, otherwise a positive errno value. */
4928 static int
4929 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4930 struct nlattr **options)
4931 {
4932 static const struct nl_policy tca_policy[] = {
4933 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4934 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4935 };
4936 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4937
4938 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4939 tca_policy, ta, ARRAY_SIZE(ta))) {
4940 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4941 goto error;
4942 }
4943
4944 if (kind) {
4945 *kind = nl_attr_get_string(ta[TCA_KIND]);
4946 }
4947
4948 if (options) {
4949 *options = ta[TCA_OPTIONS];
4950 }
4951
4952 return 0;
4953
4954 error:
4955 if (kind) {
4956 *kind = NULL;
4957 }
4958 if (options) {
4959 *options = NULL;
4960 }
4961 return EPROTO;
4962 }
4963
4964 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4965 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4966 * into '*options', and its queue statistics into '*stats'. Any of the output
4967 * arguments may be null.
4968 *
4969 * Returns 0 if successful, otherwise a positive errno value. */
4970 static int
4971 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4972 struct nlattr **options, struct netdev_queue_stats *stats)
4973 {
4974 static const struct nl_policy tca_policy[] = {
4975 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4976 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4977 };
4978 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4979
4980 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4981 tca_policy, ta, ARRAY_SIZE(ta))) {
4982 VLOG_WARN_RL(&rl, "failed to parse class message");
4983 goto error;
4984 }
4985
4986 if (handlep) {
4987 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4988 *handlep = tc->tcm_handle;
4989 }
4990
4991 if (options) {
4992 *options = ta[TCA_OPTIONS];
4993 }
4994
4995 if (stats) {
4996 const struct gnet_stats_queue *gsq;
4997 struct gnet_stats_basic gsb;
4998
4999 static const struct nl_policy stats_policy[] = {
5000 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
5001 .min_len = sizeof gsb },
5002 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
5003 .min_len = sizeof *gsq },
5004 };
5005 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
5006
5007 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
5008 sa, ARRAY_SIZE(sa))) {
5009 VLOG_WARN_RL(&rl, "failed to parse class stats");
5010 goto error;
5011 }
5012
5013 /* Alignment issues screw up the length of struct gnet_stats_basic on
5014 * some arch/bitsize combinations. Newer versions of Linux have a
5015 * struct gnet_stats_basic_packed, but we can't depend on that. The
5016 * easiest thing to do is just to make a copy. */
5017 memset(&gsb, 0, sizeof gsb);
5018 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5019 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5020 stats->tx_bytes = gsb.bytes;
5021 stats->tx_packets = gsb.packets;
5022
5023 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5024 stats->tx_errors = gsq->drops;
5025 }
5026
5027 return 0;
5028
5029 error:
5030 if (options) {
5031 *options = NULL;
5032 }
5033 if (stats) {
5034 memset(stats, 0, sizeof *stats);
5035 }
5036 return EPROTO;
5037 }
5038
5039 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5040 * on 'netdev'. */
5041 static int
5042 tc_query_class(const struct netdev *netdev,
5043 unsigned int handle, unsigned int parent,
5044 struct ofpbuf **replyp)
5045 {
5046 struct ofpbuf request;
5047 struct tcmsg *tcmsg;
5048 int error;
5049
5050 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
5051 if (!tcmsg) {
5052 return ENODEV;
5053 }
5054 tcmsg->tcm_handle = handle;
5055 tcmsg->tcm_parent = parent;
5056
5057 error = tc_transact(&request, replyp);
5058 if (error) {
5059 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5060 netdev_get_name(netdev),
5061 tc_get_major(handle), tc_get_minor(handle),
5062 tc_get_major(parent), tc_get_minor(parent),
5063 ovs_strerror(error));
5064 }
5065 return error;
5066 }
5067
5068 /* Equivalent to "tc class del dev <name> handle <handle>". */
5069 static int
5070 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5071 {
5072 struct ofpbuf request;
5073 struct tcmsg *tcmsg;
5074 int error;
5075
5076 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5077 if (!tcmsg) {
5078 return ENODEV;
5079 }
5080 tcmsg->tcm_handle = handle;
5081 tcmsg->tcm_parent = 0;
5082
5083 error = tc_transact(&request, NULL);
5084 if (error) {
5085 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5086 netdev_get_name(netdev),
5087 tc_get_major(handle), tc_get_minor(handle),
5088 ovs_strerror(error));
5089 }
5090 return error;
5091 }
5092
5093 /* Equivalent to "tc qdisc del dev <name> root". */
5094 static int
5095 tc_del_qdisc(struct netdev *netdev_)
5096 {
5097 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5098 struct ofpbuf request;
5099 struct tcmsg *tcmsg;
5100 int error;
5101
5102 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5103 if (!tcmsg) {
5104 return ENODEV;
5105 }
5106 tcmsg->tcm_handle = tc_make_handle(1, 0);
5107 tcmsg->tcm_parent = TC_H_ROOT;
5108
5109 error = tc_transact(&request, NULL);
5110 if (error == EINVAL) {
5111 /* EINVAL probably means that the default qdisc was in use, in which
5112 * case we've accomplished our purpose. */
5113 error = 0;
5114 }
5115 if (!error && netdev->tc) {
5116 if (netdev->tc->ops->tc_destroy) {
5117 netdev->tc->ops->tc_destroy(netdev->tc);
5118 }
5119 netdev->tc = NULL;
5120 }
5121 return error;
5122 }
5123
5124 static bool
5125 getqdisc_is_safe(void)
5126 {
5127 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5128 static bool safe = false;
5129
5130 if (ovsthread_once_start(&once)) {
5131 struct utsname utsname;
5132 int major, minor;
5133
5134 if (uname(&utsname) == -1) {
5135 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5136 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5137 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5138 } else if (major < 2 || (major == 2 && minor < 35)) {
5139 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5140 utsname.release);
5141 } else {
5142 safe = true;
5143 }
5144 ovsthread_once_done(&once);
5145 }
5146 return safe;
5147 }
5148
5149 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5150 * kernel to determine what they are. Returns 0 if successful, otherwise a
5151 * positive errno value. */
5152 static int
5153 tc_query_qdisc(const struct netdev *netdev_)
5154 {
5155 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5156 struct ofpbuf request, *qdisc;
5157 const struct tc_ops *ops;
5158 struct tcmsg *tcmsg;
5159 int load_error;
5160 int error;
5161
5162 if (netdev->tc) {
5163 return 0;
5164 }
5165
5166 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5167 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5168 * 2.6.35 without that fix backported to it.
5169 *
5170 * To avoid the OOPS, we must not make a request that would attempt to dump
5171 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5172 * few others. There are a few ways that I can see to do this, but most of
5173 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5174 * technique chosen here is to assume that any non-default qdisc that we
5175 * create will have a class with handle 1:0. The built-in qdiscs only have
5176 * a class with handle 0:0.
5177 *
5178 * On Linux 2.6.35+ we use the straightforward method because it allows us
5179 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5180 * in such a case we get no response at all from the kernel (!) if a
5181 * builtin qdisc is in use (which is later caught by "!error &&
5182 * !qdisc->size"). */
5183 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
5184 if (!tcmsg) {
5185 return ENODEV;
5186 }
5187 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5188 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
5189
5190 /* Figure out what tc class to instantiate. */
5191 error = tc_transact(&request, &qdisc);
5192 if (!error && qdisc->size) {
5193 const char *kind;
5194
5195 error = tc_parse_qdisc(qdisc, &kind, NULL);
5196 if (error) {
5197 ops = &tc_ops_other;
5198 } else {
5199 ops = tc_lookup_linux_name(kind);
5200 if (!ops) {
5201 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
5202 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
5203
5204 ops = &tc_ops_other;
5205 }
5206 }
5207 } else if ((!error && !qdisc->size) || error == ENOENT) {
5208 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5209 * set up by some other entity that doesn't have a handle 1:0. We will
5210 * assume that it's the system default qdisc. */
5211 ops = &tc_ops_default;
5212 error = 0;
5213 } else {
5214 /* Who knows? Maybe the device got deleted. */
5215 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
5216 netdev_get_name(netdev_), ovs_strerror(error));
5217 ops = &tc_ops_other;
5218 }
5219
5220 /* Instantiate it. */
5221 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5222 ovs_assert((load_error == 0) == (netdev->tc != NULL));
5223 ofpbuf_delete(qdisc);
5224
5225 return error ? error : load_error;
5226 }
5227
5228 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5229 approximate the time to transmit packets of various lengths. For an MTU of
5230 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5231 represents two possible packet lengths; for a MTU of 513 through 1024, four
5232 possible lengths; and so on.
5233
5234 Returns, for the specified 'mtu', the number of bits that packet lengths
5235 need to be shifted right to fit within such a 256-entry table. */
5236 static int
5237 tc_calc_cell_log(unsigned int mtu)
5238 {
5239 int cell_log;
5240
5241 if (!mtu) {
5242 mtu = ETH_PAYLOAD_MAX;
5243 }
5244 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5245
5246 for (cell_log = 0; mtu >= 256; cell_log++) {
5247 mtu >>= 1;
5248 }
5249
5250 return cell_log;
5251 }
5252
5253 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5254 * of 'mtu'. */
5255 static void
5256 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5257 {
5258 memset(rate, 0, sizeof *rate);
5259 rate->cell_log = tc_calc_cell_log(mtu);
5260 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5261 /* rate->cell_align = 0; */ /* distro headers. */
5262 rate->mpu = ETH_TOTAL_MIN;
5263 rate->rate = Bps;
5264 }
5265
5266 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5267 * attribute of the specified "type".
5268 *
5269 * See tc_calc_cell_log() above for a description of "rtab"s. */
5270 static void
5271 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5272 {
5273 uint32_t *rtab;
5274 unsigned int i;
5275
5276 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5277 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5278 unsigned packet_size = (i + 1) << rate->cell_log;
5279 if (packet_size < rate->mpu) {
5280 packet_size = rate->mpu;
5281 }
5282 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5283 }
5284 }
5285
5286 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
5287 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
5288 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
5289 * 0 is fine.) */
5290 static int
5291 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
5292 {
5293 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
5294 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
5295 }
5296 \f
5297 /* Linux-only functions declared in netdev-linux.h */
5298
5299 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5300 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5301 int
5302 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5303 const char *flag_name, bool enable)
5304 {
5305 const char *netdev_name = netdev_get_name(netdev);
5306 struct ethtool_value evalue;
5307 uint32_t new_flags;
5308 int error;
5309
5310 COVERAGE_INC(netdev_get_ethtool);
5311 memset(&evalue, 0, sizeof evalue);
5312 error = netdev_linux_do_ethtool(netdev_name,
5313 (struct ethtool_cmd *)&evalue,
5314 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5315 if (error) {
5316 return error;
5317 }
5318
5319 COVERAGE_INC(netdev_set_ethtool);
5320 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5321 if (new_flags == evalue.data) {
5322 return 0;
5323 }
5324 evalue.data = new_flags;
5325 error = netdev_linux_do_ethtool(netdev_name,
5326 (struct ethtool_cmd *)&evalue,
5327 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5328 if (error) {
5329 return error;
5330 }
5331
5332 COVERAGE_INC(netdev_get_ethtool);
5333 memset(&evalue, 0, sizeof evalue);
5334 error = netdev_linux_do_ethtool(netdev_name,
5335 (struct ethtool_cmd *)&evalue,
5336 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5337 if (error) {
5338 return error;
5339 }
5340
5341 if (new_flags != evalue.data) {
5342 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5343 "device %s failed", enable ? "enable" : "disable",
5344 flag_name, netdev_name);
5345 return EOPNOTSUPP;
5346 }
5347
5348 return 0;
5349 }
5350 \f
5351 /* Utility functions. */
5352
5353 /* Copies 'src' into 'dst', performing format conversion in the process. */
5354 static void
5355 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5356 const struct rtnl_link_stats *src)
5357 {
5358 dst->rx_packets = src->rx_packets;
5359 dst->tx_packets = src->tx_packets;
5360 dst->rx_bytes = src->rx_bytes;
5361 dst->tx_bytes = src->tx_bytes;
5362 dst->rx_errors = src->rx_errors;
5363 dst->tx_errors = src->tx_errors;
5364 dst->rx_dropped = src->rx_dropped;
5365 dst->tx_dropped = src->tx_dropped;
5366 dst->multicast = src->multicast;
5367 dst->collisions = src->collisions;
5368 dst->rx_length_errors = src->rx_length_errors;
5369 dst->rx_over_errors = src->rx_over_errors;
5370 dst->rx_crc_errors = src->rx_crc_errors;
5371 dst->rx_frame_errors = src->rx_frame_errors;
5372 dst->rx_fifo_errors = src->rx_fifo_errors;
5373 dst->rx_missed_errors = src->rx_missed_errors;
5374 dst->tx_aborted_errors = src->tx_aborted_errors;
5375 dst->tx_carrier_errors = src->tx_carrier_errors;
5376 dst->tx_fifo_errors = src->tx_fifo_errors;
5377 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5378 dst->tx_window_errors = src->tx_window_errors;
5379 }
5380
5381 /* Copies 'src' into 'dst', performing format conversion in the process. */
5382 static void
5383 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5384 const struct rtnl_link_stats64 *src)
5385 {
5386 dst->rx_packets = src->rx_packets;
5387 dst->tx_packets = src->tx_packets;
5388 dst->rx_bytes = src->rx_bytes;
5389 dst->tx_bytes = src->tx_bytes;
5390 dst->rx_errors = src->rx_errors;
5391 dst->tx_errors = src->tx_errors;
5392 dst->rx_dropped = src->rx_dropped;
5393 dst->tx_dropped = src->tx_dropped;
5394 dst->multicast = src->multicast;
5395 dst->collisions = src->collisions;
5396 dst->rx_length_errors = src->rx_length_errors;
5397 dst->rx_over_errors = src->rx_over_errors;
5398 dst->rx_crc_errors = src->rx_crc_errors;
5399 dst->rx_frame_errors = src->rx_frame_errors;
5400 dst->rx_fifo_errors = src->rx_fifo_errors;
5401 dst->rx_missed_errors = src->rx_missed_errors;
5402 dst->tx_aborted_errors = src->tx_aborted_errors;
5403 dst->tx_carrier_errors = src->tx_carrier_errors;
5404 dst->tx_fifo_errors = src->tx_fifo_errors;
5405 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5406 dst->tx_window_errors = src->tx_window_errors;
5407 }
5408
5409 static int
5410 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5411 {
5412 struct ofpbuf request;
5413 struct ofpbuf *reply;
5414 int error;
5415
5416 /* Filtering all counters by default */
5417 memset(stats, 0xFF, sizeof(struct netdev_stats));
5418
5419 ofpbuf_init(&request, 0);
5420 nl_msg_put_nlmsghdr(&request,
5421 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5422 RTM_GETLINK, NLM_F_REQUEST);
5423 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5424 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5425 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5426 ofpbuf_uninit(&request);
5427 if (error) {
5428 return error;
5429 }
5430
5431 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5432 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5433 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5434 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5435 error = 0;
5436 } else {
5437 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5438 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5439 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5440 error = 0;
5441 } else {
5442 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5443 error = EPROTO;
5444 }
5445 }
5446 } else {
5447 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5448 error = EPROTO;
5449 }
5450
5451
5452 ofpbuf_delete(reply);
5453 return error;
5454 }
5455
5456 static int
5457 get_flags(const struct netdev *dev, unsigned int *flags)
5458 {
5459 struct ifreq ifr;
5460 int error;
5461
5462 *flags = 0;
5463 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5464 if (!error) {
5465 *flags = ifr.ifr_flags;
5466 }
5467 return error;
5468 }
5469
5470 static int
5471 set_flags(const char *name, unsigned int flags)
5472 {
5473 struct ifreq ifr;
5474
5475 ifr.ifr_flags = flags;
5476 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5477 }
5478
5479 static int
5480 do_get_ifindex(const char *netdev_name)
5481 {
5482 struct ifreq ifr;
5483 int error;
5484
5485 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5486 COVERAGE_INC(netdev_get_ifindex);
5487
5488 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5489 if (error) {
5490 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5491 netdev_name, ovs_strerror(error));
5492 return -error;
5493 }
5494 return ifr.ifr_ifindex;
5495 }
5496
5497 static int
5498 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5499 {
5500 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5501
5502 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5503 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
5504
5505 if (ifindex < 0) {
5506 netdev->get_ifindex_error = -ifindex;
5507 netdev->ifindex = 0;
5508 } else {
5509 netdev->get_ifindex_error = 0;
5510 netdev->ifindex = ifindex;
5511 }
5512 netdev->cache_valid |= VALID_IFINDEX;
5513 }
5514
5515 *ifindexp = netdev->ifindex;
5516 return netdev->get_ifindex_error;
5517 }
5518
5519 static int
5520 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5521 {
5522 struct ifreq ifr;
5523 int hwaddr_family;
5524 int error;
5525
5526 memset(&ifr, 0, sizeof ifr);
5527 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5528 COVERAGE_INC(netdev_get_hwaddr);
5529 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5530 if (error) {
5531 /* ENODEV probably means that a vif disappeared asynchronously and
5532 * hasn't been removed from the database yet, so reduce the log level
5533 * to INFO for that case. */
5534 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5535 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5536 netdev_name, ovs_strerror(error));
5537 return error;
5538 }
5539 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5540 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5541 VLOG_INFO("%s device has unknown hardware address family %d",
5542 netdev_name, hwaddr_family);
5543 return EINVAL;
5544 }
5545 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5546 return 0;
5547 }
5548
5549 static int
5550 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5551 {
5552 struct ifreq ifr;
5553 int error;
5554
5555 memset(&ifr, 0, sizeof ifr);
5556 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5557 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5558 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5559 COVERAGE_INC(netdev_set_hwaddr);
5560 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5561 if (error) {
5562 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5563 netdev_name, ovs_strerror(error));
5564 }
5565 return error;
5566 }
5567
5568 static int
5569 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5570 int cmd, const char *cmd_name)
5571 {
5572 struct ifreq ifr;
5573 int error;
5574
5575 memset(&ifr, 0, sizeof ifr);
5576 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5577 ifr.ifr_data = (caddr_t) ecmd;
5578
5579 ecmd->cmd = cmd;
5580 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5581 if (error) {
5582 if (error != EOPNOTSUPP) {
5583 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5584 "failed: %s", cmd_name, name, ovs_strerror(error));
5585 } else {
5586 /* The device doesn't support this operation. That's pretty
5587 * common, so there's no point in logging anything. */
5588 }
5589 }
5590 return error;
5591 }
5592
5593 /* Returns an AF_PACKET raw socket or a negative errno value. */
5594 static int
5595 af_packet_sock(void)
5596 {
5597 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5598 static int sock;
5599
5600 if (ovsthread_once_start(&once)) {
5601 sock = socket(AF_PACKET, SOCK_RAW, 0);
5602 if (sock >= 0) {
5603 int error = set_nonblocking(sock);
5604 if (error) {
5605 close(sock);
5606 sock = -error;
5607 }
5608 } else {
5609 sock = -errno;
5610 VLOG_ERR("failed to create packet socket: %s",
5611 ovs_strerror(errno));
5612 }
5613 ovsthread_once_done(&once);
5614 }
5615
5616 return sock;
5617 }