]> git.proxmox.com Git - mirror_ovs.git/blob - lib/netdev-linux.c
Update mailing list archive pointers to the current server.
[mirror_ovs.git] / lib / netdev-linux.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <arpa/inet.h>
24 #include <inttypes.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/rtnetlink.h>
33 #include <linux/sockios.h>
34 #include <sys/types.h>
35 #include <sys/ioctl.h>
36 #include <sys/socket.h>
37 #include <sys/utsname.h>
38 #include <netpacket/packet.h>
39 #include <net/if.h>
40 #include <net/if_arp.h>
41 #include <net/if_packet.h>
42 #include <net/route.h>
43 #include <netinet/in.h>
44 #include <poll.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <unistd.h>
48
49 #include "coverage.h"
50 #include "dp-packet.h"
51 #include "dpif-netlink.h"
52 #include "dpif-netdev.h"
53 #include "openvswitch/dynamic-string.h"
54 #include "fatal-signal.h"
55 #include "hash.h"
56 #include "openvswitch/hmap.h"
57 #include "netdev-provider.h"
58 #include "netdev-tc-offloads.h"
59 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
62 #include "netlink.h"
63 #include "openvswitch/ofpbuf.h"
64 #include "openflow/openflow.h"
65 #include "ovs-atomic.h"
66 #include "packets.h"
67 #include "openvswitch/poll-loop.h"
68 #include "rtnetlink.h"
69 #include "openvswitch/shash.h"
70 #include "socket-util.h"
71 #include "sset.h"
72 #include "tc.h"
73 #include "timer.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
76 #include "util.h"
77
78 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79
80 COVERAGE_DEFINE(netdev_set_policing);
81 COVERAGE_DEFINE(netdev_arp_lookup);
82 COVERAGE_DEFINE(netdev_get_ifindex);
83 COVERAGE_DEFINE(netdev_get_hwaddr);
84 COVERAGE_DEFINE(netdev_set_hwaddr);
85 COVERAGE_DEFINE(netdev_get_ethtool);
86 COVERAGE_DEFINE(netdev_set_ethtool);
87
88 \f
89 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 * old headers. */
91 #ifndef ADVERTISED_Pause
92 #define ADVERTISED_Pause (1 << 13)
93 #endif
94 #ifndef ADVERTISED_Asym_Pause
95 #define ADVERTISED_Asym_Pause (1 << 14)
96 #endif
97
98 /* These were introduced in Linux 2.6.24, so they might be missing if we
99 * have old headers. */
100 #ifndef ETHTOOL_GFLAGS
101 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #endif
103 #ifndef ETHTOOL_SFLAGS
104 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
105 #endif
106
107 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
108 * headers. */
109 #ifndef TC_RTAB_SIZE
110 #define TC_RTAB_SIZE 1024
111 #endif
112
113 /* Linux 2.6.21 introduced struct tpacket_auxdata.
114 * Linux 2.6.27 added the tp_vlan_tci member.
115 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
116 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
117 * TP_STATUS_VLAN_TPID_VALID.
118 *
119 * With all this churn it's easiest to unconditionally define a replacement
120 * structure that has everything we want.
121 */
122 #ifndef PACKET_AUXDATA
123 #define PACKET_AUXDATA 8
124 #endif
125 #ifndef TP_STATUS_VLAN_VALID
126 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #endif
128 #ifndef TP_STATUS_VLAN_TPID_VALID
129 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #endif
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
/* Replacement for the kernel's struct tpacket_auxdata (see churn notes
 * above): per-packet metadata delivered as a PACKET_AUXDATA cmsg. */
struct tpacket_auxdata {
    uint32_t tp_status;     /* TP_STATUS_* flags. */
    uint32_t tp_len;        /* Length of the packet on the wire. */
    uint32_t tp_snaplen;    /* Number of bytes actually captured. */
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;   /* Valid only if TP_STATUS_VLAN_VALID is set. */
    uint16_t tp_vlan_tpid;  /* Valid only if TP_STATUS_VLAN_TPID_VALID is set. */
};
142
/* Linux 2.6.27 introduced ethtool_cmd_speed
 *
 * To avoid revisiting problems reported with using configure to detect
 * compatibility (see report at
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    /* Reassemble the full 32-bit speed from its split 16-bit halves. */
    uint32_t hi = ep->speed_hi;

    return (hi << 16) | ep->speed;
}
154
155 /* Linux 2.6.30 introduced supported and advertised flags for
156 * 1G base KX, and 10G base KX4, KR and R. */
157 #ifndef SUPPORTED_1000baseKX_Full
158 #define SUPPORTED_1000baseKX_Full (1 << 17)
159 #define SUPPORTED_10000baseKX4_Full (1 << 18)
160 #define SUPPORTED_10000baseKR_Full (1 << 19)
161 #define SUPPORTED_10000baseR_FEC (1 << 20)
162 #define ADVERTISED_1000baseKX_Full (1 << 17)
163 #define ADVERTISED_10000baseKX4_Full (1 << 18)
164 #define ADVERTISED_10000baseKR_Full (1 << 19)
165 #define ADVERTISED_10000baseR_FEC (1 << 20)
166 #endif
167
168 /* Linux 3.5 introduced supported and advertised flags for
169 * 40G base KR4, CR4, SR4 and LR4. */
170 #ifndef SUPPORTED_40000baseKR4_Full
171 #define SUPPORTED_40000baseKR4_Full (1 << 23)
172 #define SUPPORTED_40000baseCR4_Full (1 << 24)
173 #define SUPPORTED_40000baseSR4_Full (1 << 25)
174 #define SUPPORTED_40000baseLR4_Full (1 << 26)
175 #define ADVERTISED_40000baseKR4_Full (1 << 23)
176 #define ADVERTISED_40000baseCR4_Full (1 << 24)
177 #define ADVERTISED_40000baseSR4_Full (1 << 25)
178 #define ADVERTISED_40000baseLR4_Full (1 << 26)
179 #endif
180
181 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
182 *
183 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
184 * 2.6.32-431.29.2.el6.x86_64 (see report at
185 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
186 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
187 * unconditionally define a replacement. */
188 #ifndef IFLA_STATS64
189 #define IFLA_STATS64 23
190 #endif
#define rtnl_link_stats64 rpl_rtnl_link_stats64
/* Replacement for the kernel's 64-bit link statistics structure; this is the
 * payload layout of the IFLA_STATS64 attribute (see comment above). */
struct rtnl_link_stats64 {
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Detailed receive errors. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Detailed transmit errors. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    /* For cslip, etc. */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
220
/* Bits for 'struct netdev_linux''s 'cache_valid' member, indicating which of
 * its on-demand cached fields currently hold trustworthy values. */
enum {
    VALID_IFINDEX           = 1 << 0,   /* 'ifindex' is cached. */
    VALID_ETHERADDR         = 1 << 1,   /* 'etheraddr' is cached. */
    VALID_IN                = 1 << 2,   /* IP addresses are cached. */
    VALID_MTU               = 1 << 3,   /* 'mtu' is cached. */
    VALID_POLICING          = 1 << 4,   /* Policing parameters are cached. */
    VALID_VPORT_STAT_ERROR  = 1 << 5,   /* 'vport_stats_error' is cached. */
    VALID_DRVINFO           = 1 << 6,   /* 'drvinfo' is cached. */
    VALID_FEATURES          = 1 << 7,   /* Feature sets are cached. */
};
231 \f
232 /* Traffic control. */
233
/* An instance of a traffic control class.  Always associated with a particular
 * network device.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc {
    const struct tc_ops *ops;   /* Implementation vtable (see struct tc_ops). */
    struct hmap queues;         /* Contains "struct tc_queue"s.
                                 * Read by generic TC layer.
                                 * Written only by TC implementation. */
};

/* Static initializer for a 'struct tc' named *TC using implementation OPS. */
#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
247
/* One traffic control queue.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs.  Instances live in the owning 'struct tc''s 'queues' hmap. */
struct tc_queue {
    struct hmap_node hmap_node;  /* In struct tc's "queues" hmap. */
    unsigned int queue_id;       /* OpenFlow queue ID. */
    long long int created;       /* Time queue was created, in msecs. */
};
257
/* A particular kind of traffic control.  Each implementation generally maps to
 * one particular Linux qdisc class.
 *
 * The functions below return 0 if successful or a positive errno value on
 * failure, except where otherwise noted.  All of them must be provided, except
 * where otherwise noted. */
struct tc_ops {
    /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
     * This is null for tc_ops_default and tc_ops_other, for which there are no
     * appropriate values. */
    const char *linux_name;

    /* Name used in OVS database, e.g. "linux-htb".  Must be nonnull. */
    const char *ovs_name;

    /* Number of supported OpenFlow queues, 0 for qdiscs that have no
     * queues.  The queues are numbered 0 through n_queues - 1. */
    unsigned int n_queues;

    /* Called to install this TC class on 'netdev'.  The implementation should
     * make the Netlink calls required to set up 'netdev' with the right qdisc
     * and configure it according to 'details'.  The implementation may assume
     * that the current qdisc is the default; that is, there is no need for it
     * to delete the current qdisc before installing itself.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'.
     *
     * (This function is null for tc_ops_other, which cannot be installed.  For
     * other TC classes it should always be nonnull.) */
    int (*tc_install)(struct netdev *netdev, const struct smap *details);

    /* Called when the netdev code determines (through a Netlink query) that
     * this TC class's qdisc is installed on 'netdev', but we didn't install
     * it ourselves and so don't know any of the details.
     *
     * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
     * 'netdev'.  The TCA_KIND attribute of 'nlmsg' is 'linux_name'.  The
     * implementation should parse the other attributes of 'nlmsg' as
     * necessary to determine its configuration.  If necessary it should also
     * use Netlink queries to determine the configuration of queues on
     * 'netdev'.
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'. */
    int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);

    /* Destroys the data structures allocated by the implementation as part of
     * 'tc'.  (This includes destroying 'tc->queues' by calling
     * tc_destroy(tc).)
     *
     * The implementation should not need to perform any Netlink calls.  If
     * desirable, the caller is responsible for deconfiguring the kernel qdisc.
     * (But it may not be desirable.)
     *
     * This function may be null if 'tc' is trivial. */
    void (*tc_destroy)(struct tc *tc);

    /* Retrieves details of 'netdev->tc' configuration into 'details'.
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the configuration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_get)(const struct netdev *netdev, struct smap *details);

    /* Reconfigures 'netdev->tc' according to 'details', performing any
     * required Netlink calls to complete the reconfiguration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_set)(struct netdev *, const struct smap *details);

    /* Retrieves details of 'queue' on 'netdev->tc' into 'details'.  'queue' is
     * one of the 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the queue configuration.
     *
     * This function may be null if 'tc' does not have queues ('n_queues' is
     * 0). */
    int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
                     struct smap *details);

    /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
     * 'details', performing any required Netlink calls to complete the
     * reconfiguration.  The caller ensures that 'queue_id' is less than
     * 'n_queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' does not have queues or its queues are
     * not configurable. */
    int (*class_set)(struct netdev *, unsigned int queue_id,
                     const struct smap *details);

    /* Deletes 'queue' from 'netdev->tc'.  'queue' is one of the 'struct
     * tc_queue's within 'netdev->tc->queues'.
     *
     * This function may be null if 'tc' does not have queues or its queues
     * cannot be deleted. */
    int (*class_delete)(struct netdev *, struct tc_queue *queue);

    /* Obtains stats for 'queue' from 'netdev->tc'.  'queue' is one of the
     * 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * On success, initializes '*stats'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_get_stats)(const struct netdev *netdev,
                           const struct tc_queue *queue,
                           struct netdev_queue_stats *stats);

    /* Extracts queue stats from 'nlmsg', which is a response to a
     * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_dump_stats)(const struct netdev *netdev,
                            const struct ofpbuf *nlmsg,
                            netdev_dump_queue_stats_cb *cb, void *aux);
};
402
403 static void
404 tc_init(struct tc *tc, const struct tc_ops *ops)
405 {
406 tc->ops = ops;
407 hmap_init(&tc->queues);
408 }
409
/* Frees the generic resources of 'tc' (its 'queues' hmap).  TC
 * implementations' tc_destroy callbacks call this after freeing their own
 * data; it performs no Netlink calls and leaves the kernel qdisc alone. */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
415
416 static const struct tc_ops tc_ops_htb;
417 static const struct tc_ops tc_ops_hfsc;
418 static const struct tc_ops tc_ops_codel;
419 static const struct tc_ops tc_ops_fqcodel;
420 static const struct tc_ops tc_ops_sfq;
421 static const struct tc_ops tc_ops_default;
422 static const struct tc_ops tc_ops_noop;
423 static const struct tc_ops tc_ops_other;
424
425 static const struct tc_ops *const tcs[] = {
426 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
427 &tc_ops_hfsc, /* Hierarchical fair service curve. */
428 &tc_ops_codel, /* Controlled delay */
429 &tc_ops_fqcodel, /* Fair queue controlled delay */
430 &tc_ops_sfq, /* Stochastic fair queueing */
431 &tc_ops_noop, /* Non operating qos type. */
432 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
433 &tc_ops_other, /* Some other qdisc. */
434 NULL
435 };
436
437 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
438 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
439 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
440
441 static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
442 int type,
443 unsigned int flags,
444 struct ofpbuf *);
445 static int tc_add_policer(struct netdev *,
446 uint32_t kbits_rate, uint32_t kbits_burst);
447
448 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
449 struct nlattr **options);
450 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
451 struct nlattr **options,
452 struct netdev_queue_stats *);
453 static int tc_query_class(const struct netdev *,
454 unsigned int handle, unsigned int parent,
455 struct ofpbuf **replyp);
456 static int tc_delete_class(const struct netdev *, unsigned int handle);
457
458 static int tc_del_qdisc(struct netdev *netdev);
459 static int tc_query_qdisc(const struct netdev *netdev);
460
461 static int tc_calc_cell_log(unsigned int mtu);
462 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
463 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
464 const struct tc_ratespec *rate);
465 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
466 \f
/* State for one Linux network device. */
struct netdev_linux {
    struct netdev up;               /* Generic netdev; must come first for
                                     * netdev_linux_cast(). */

    /* Protects all members below. */
    struct ovs_mutex mutex;

    unsigned int cache_valid;       /* Bitmask of VALID_* flags above. */

    bool miimon;                    /* Link status of last poll. */
    long long int miimon_interval;  /* Miimon Poll rate. Disabled if <= 0. */
    struct timer miimon_timer;      /* Schedules the next miimon poll. */

    /* The following are figured out "on demand" only.  They are only valid
     * when the corresponding VALID_* bit in 'cache_valid' is set. */
    int ifindex;                    /* Kernel interface index (VALID_IFINDEX). */
    struct eth_addr etheraddr;      /* MAC address (VALID_ETHERADDR). */
    int mtu;                        /* MTU in bytes (VALID_MTU). */
    unsigned int ifi_flags;         /* Kernel IFF_* interface flags. */
    long long int carrier_resets;   /* Count of IFF_RUNNING transitions. */
    uint32_t kbits_rate;            /* Policing data. */
    uint32_t kbits_burst;           /* Policing burst (paired with kbits_rate). */
    int vport_stats_error;          /* Cached error code from vport_get_stats().
                                       0 or an errno value. */
    int netdev_mtu_error;           /* Cached error code from SIOCGIFMTU
                                     * or SIOCSIFMTU. */
    int ether_addr_error;           /* Cached error code from set/get etheraddr. */
    int netdev_policing_error;      /* Cached error code from set policing. */
    int get_features_error;         /* Cached error code from ETHTOOL_GSET. */
    int get_ifindex_error;          /* Cached error code from SIOCGIFINDEX. */

    enum netdev_features current;    /* Cached from ETHTOOL_GSET. */
    enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
    enum netdev_features supported;  /* Cached from ETHTOOL_GSET. */

    struct ethtool_drvinfo drvinfo;  /* Cached from ETHTOOL_GDRVINFO. */
    struct tc *tc;                   /* Traffic control state, if any. */

    /* For devices of class netdev_tap_class only. */
    int tap_fd;                      /* Shared fd on /dev/net/tun. */
};
506
/* A receive queue on a netdev-linux device: either the device's shared tap
 * fd or a dedicated AF_PACKET socket (see netdev_linux_rxq_construct()). */
struct netdev_rxq_linux {
    struct netdev_rxq up;   /* Generic rxq base. */
    bool is_tap;            /* True if 'fd' borrows the netdev's tap_fd. */
    int fd;                 /* Tap fd or raw packet socket. */
};
512
513 /* This is set pretty low because we probably won't learn anything from the
514 * additional log messages. */
515 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
516
517 /* Polling miimon status for all ports causes performance degradation when
518 * handling a large number of ports. If there are no devices using miimon, then
519 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
520 *
521 * Readers do not depend on this variable synchronizing with the related
522 * changes in the device miimon status, so we can use atomic_count. */
523 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
524
525 static void netdev_linux_run(const struct netdev_class *);
526
527 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
528 int cmd, const char *cmd_name);
529 static int get_flags(const struct netdev *, unsigned int *flags);
530 static int set_flags(const char *, unsigned int flags);
531 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
532 enum netdev_flags on, enum netdev_flags *old_flagsp)
533 OVS_REQUIRES(netdev->mutex);
534 static int get_ifindex(const struct netdev *, int *ifindexp);
535 static int do_set_addr(struct netdev *netdev,
536 int ioctl_nr, const char *ioctl_name,
537 struct in_addr addr);
538 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
539 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
540 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
541 static int af_packet_sock(void);
542 static bool netdev_linux_miimon_enabled(void);
543 static void netdev_linux_miimon_run(void);
544 static void netdev_linux_miimon_wait(void);
545 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
546
/* Returns true if 'netdev_class' is one of the netdev-linux classes, all of
 * which share netdev_linux_run() as their 'run' callback. */
static bool
is_netdev_linux_class(const struct netdev_class *netdev_class)
{
    return netdev_class->run == netdev_linux_run;
}
552
/* Returns true if 'netdev' is a tap device (class netdev_tap_class). */
static bool
is_tap_netdev(const struct netdev *netdev)
{
    return netdev_get_class(netdev) == &netdev_tap_class;
}
558
/* Downcasts 'netdev' to its containing netdev_linux.  Asserts that 'netdev'
 * really belongs to a netdev-linux class. */
static struct netdev_linux *
netdev_linux_cast(const struct netdev *netdev)
{
    ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));

    return CONTAINER_OF(netdev, struct netdev_linux, up);
}
566
/* Downcasts 'rx' to its containing netdev_rxq_linux.  Asserts that the
 * queue's netdev belongs to a netdev-linux class. */
static struct netdev_rxq_linux *
netdev_rxq_linux_cast(const struct netdev_rxq *rx)
{
    ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
    return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
}
573 \f
574 static void netdev_linux_update(struct netdev_linux *netdev,
575 const struct rtnetlink_change *)
576 OVS_REQUIRES(netdev->mutex);
577 static void netdev_linux_changed(struct netdev_linux *netdev,
578 unsigned int ifi_flags, unsigned int mask)
579 OVS_REQUIRES(netdev->mutex);
580
/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
 * RTNLGRP_IPV4_IFADDR, RTNLGRP_IPV6_IFADDR, and RTNLGRP_IPV6_IFINFO changes,
 * or NULL if no such socket could be created. */
static struct nl_sock *
netdev_linux_notify_sock(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static struct nl_sock *sock;    /* Created once, shared by all callers. */
    unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
                               RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};

    if (ovsthread_once_start(&once)) {
        int error;

        error = nl_sock_create(NETLINK_ROUTE, &sock);
        if (!error) {
            size_t i;

            /* Join every multicast group.  If any join fails, tear the
             * socket down so every caller consistently sees NULL. */
            for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
                error = nl_sock_join_mcgroup(sock, mcgroups[i]);
                if (error) {
                    nl_sock_destroy(sock);
                    sock = NULL;
                    break;
                }
            }
        }
        ovsthread_once_done(&once);
    }

    return sock;
}
613
614 static bool
615 netdev_linux_miimon_enabled(void)
616 {
617 return atomic_count_get(&miimon_cnt) > 0;
618 }
619
/* netdev_class 'run' callback: polls miimon status (when enabled) and drains
 * pending rtnetlink notifications, applying each one to the corresponding
 * netdev_linux's cached state. */
static void
netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
{
    struct nl_sock *sock;
    int error;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_run();
    }

    sock = netdev_linux_notify_sock();
    if (!sock) {
        return;
    }

    /* Non-blocking receive loop; exits when nl_sock_recv() reports EAGAIN
     * (no more queued notifications) or a real error. */
    do {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
        uint64_t buf_stub[4096 / 8];
        struct ofpbuf buf;

        ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
        error = nl_sock_recv(sock, &buf, false);
        if (!error) {
            struct rtnetlink_change change;

            if (rtnetlink_parse(&buf, &change)) {
                struct netdev *netdev_ = NULL;
                char dev_name[IFNAMSIZ];

                /* The notification may carry only an ifindex; resolve it to
                 * a device name so we can look the netdev up. */
                if (!change.ifname) {
                    change.ifname = if_indextoname(change.if_index, dev_name);
                }

                if (change.ifname) {
                    netdev_ = netdev_from_name(change.ifname);
                }
                if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
                    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

                    ovs_mutex_lock(&netdev->mutex);
                    netdev_linux_update(netdev, &change);
                    ovs_mutex_unlock(&netdev->mutex);
                }
                netdev_close(netdev_);
            }
        } else if (error == ENOBUFS) {
            /* The kernel dropped notifications because our receive buffer
             * overflowed, so we may have missed changes: drain the socket
             * and refresh the flags of every netdev_linux device to
             * resynchronize. */
            struct shash device_shash;
            struct shash_node *node;

            nl_sock_drain(sock);

            shash_init(&device_shash);
            netdev_get_devices(&netdev_linux_class, &device_shash);
            SHASH_FOR_EACH (node, &device_shash) {
                struct netdev *netdev_ = node->data;
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
                unsigned int flags;

                ovs_mutex_lock(&netdev->mutex);
                get_flags(netdev_, &flags);
                netdev_linux_changed(netdev, flags, 0);
                ovs_mutex_unlock(&netdev->mutex);

                netdev_close(netdev_);
            }
            shash_destroy(&device_shash);
        } else if (error != EAGAIN) {
            VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
                         ovs_strerror(error));
        }
        ofpbuf_uninit(&buf);
    } while (!error);
}
693
/* netdev_class 'wait' callback: arranges for the poll loop to wake up when
 * netdev_linux_run() has work to do (a miimon timer expiry or a readable
 * rtnetlink notification socket). */
static void
netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
{
    struct nl_sock *sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    }
    sock = netdev_linux_notify_sock();
    if (sock) {
        nl_sock_wait(sock, POLLIN);
    }
}
707
/* Records that 'dev' changed: bumps its change sequence number, updates the
 * cached interface flags, and invalidates every cached-state bit that is NOT
 * set in 'mask' (pass 0 to invalidate everything). */
static void
netdev_linux_changed(struct netdev_linux *dev,
                     unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(dev->mutex)
{
    netdev_change_seq_changed(&dev->up);

    /* Any transition of IFF_RUNNING, in either direction, counts as a
     * carrier reset. */
    if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
        dev->carrier_resets++;
    }
    dev->ifi_flags = ifi_flags;

    dev->cache_valid &= mask;
    /* If IP address caching was invalidated, also flush the global address
     * list cache. */
    if (!(mask & VALID_IN)) {
        netdev_get_addrs_list_flush();
    }
}
725
/* Applies the parsed rtnetlink notification 'change' to 'dev', refreshing
 * cached state directly where the message carries the new value and
 * invalidating it otherwise. */
static void
netdev_linux_update(struct netdev_linux *dev,
                    const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, and ip addresses. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            /* An all-zeros MAC in the message means "not provided". */
            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
        } else {
            /* A link message other than RTM_NEWLINK (e.g. a deletion):
             * invalidate all cached state. */
            netdev_linux_changed(dev, change->ifi_flags, 0);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        OVS_NOT_REACHED();
    }
}
763
764 static struct netdev *
765 netdev_linux_alloc(void)
766 {
767 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
768 return &netdev->up;
769 }
770
771 static int
772 netdev_linux_common_construct(struct netdev *netdev_)
773 {
774 /* Prevent any attempt to create (or open) a network device named "default"
775 * or "all". These device names are effectively reserved on Linux because
776 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
777 * itself this wouldn't call for any special treatment, but in practice if
778 * a program tries to create devices with these names, it causes the kernel
779 * to fire a "new device" notification event even though creation failed,
780 * and in turn that causes OVS to wake up and try to create them again,
781 * which ends up as a 100% CPU loop. */
782 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
783 const char *name = netdev_->name;
784 if (!strcmp(name, "default") || !strcmp(name, "all")) {
785 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
786 VLOG_WARN_RL(&rl, "%s: Linux forbids network device with this name",
787 name);
788 return EINVAL;
789 }
790
791 ovs_mutex_init(&netdev->mutex);
792 return 0;
793 }
794
/* Creates system and internal devices.  Returns 0 on success, a positive
 * errno value on failure. */
static int
netdev_linux_construct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = netdev_linux_common_construct(netdev_);
    if (error) {
        return error;
    }

    /* Prime the cached interface flags. */
    error = get_flags(&netdev->up, &netdev->ifi_flags);
    if (error == ENODEV) {
        if (netdev->up.netdev_class != &netdev_internal_class) {
            /* The device does not exist, so don't allow it to be opened. */
            return ENODEV;
        } else {
            /* "Internal" netdevs have to be created as netdev objects before
             * they exist in the kernel, because creating them in the kernel
             * happens by passing a netdev object to dpif_port_add().
             * Therefore, ignore the error. */
        }
    }

    return 0;
}
820
821 /* For most types of netdevs we open the device for each call of
822 * netdev_open(). However, this is not the case with tap devices,
823 * since it is only possible to open the device once. In this
824 * situation we share a single file descriptor, and consequently
825 * buffers, across all readers. Therefore once data is read it will
826 * be unavailable to other reads for tap devices. */
827 static int
828 netdev_linux_construct_tap(struct netdev *netdev_)
829 {
830 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
831 static const char tap_dev[] = "/dev/net/tun";
832 const char *name = netdev_->name;
833 struct ifreq ifr;
834
835 int error = netdev_linux_common_construct(netdev_);
836 if (error) {
837 return error;
838 }
839
840 /* Open tap device. */
841 netdev->tap_fd = open(tap_dev, O_RDWR);
842 if (netdev->tap_fd < 0) {
843 error = errno;
844 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
845 return error;
846 }
847
848 /* Create tap device. */
849 get_flags(&netdev->up, &netdev->ifi_flags);
850 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
851 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
852 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
853 VLOG_WARN("%s: creating tap device failed: %s", name,
854 ovs_strerror(errno));
855 error = errno;
856 goto error_close;
857 }
858
859 /* Make non-blocking. */
860 error = set_nonblocking(netdev->tap_fd);
861 if (error) {
862 goto error_close;
863 }
864
865 if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
866 VLOG_WARN("%s: creating tap device failed (persist): %s", name,
867 ovs_strerror(errno));
868 error = errno;
869 goto error_close;
870 }
871
872 return 0;
873
874 error_close:
875 close(netdev->tap_fd);
876 return error;
877 }
878
/* netdev_class 'destruct' callback: releases everything 'netdev_' owns
 * except the structure itself (freed by netdev_linux_dealloc()). */
static void
netdev_linux_destruct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    /* Let the TC implementation free its private qdisc state. */
    if (netdev->tc && netdev->tc->ops->tc_destroy) {
        netdev->tc->ops->tc_destroy(netdev->tc);
    }

    /* For tap devices, drop kernel persistence and close the shared fd. */
    if (netdev_get_class(netdev_) == &netdev_tap_class
        && netdev->tap_fd >= 0)
    {
        ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
        close(netdev->tap_fd);
    }

    /* Keep the global miimon user count consistent with construction. */
    if (netdev->miimon_interval > 0) {
        atomic_count_dec(&miimon_cnt);
    }

    ovs_mutex_destroy(&netdev->mutex);
}
901
/* netdev_class 'dealloc' callback: frees the storage obtained by
 * netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
908
909 static struct netdev_rxq *
910 netdev_linux_rxq_alloc(void)
911 {
912 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
913 return &rx->up;
914 }
915
/* netdev_rxq 'construct' callback: sets up the fd this queue reads from.
 * A tap queue borrows the netdev's shared tap_fd; any other device gets a
 * dedicated AF_PACKET raw socket bound to the device, with a BPF filter
 * attached so only inbound packets are delivered.
 *
 * Returns 0 on success, a positive errno value on failure. */
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4     jt 2  jf 3 */
            { 0x6, 0, 0, 0x00000000 },  /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff }   /* ret #65535 */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        /* Request per-packet auxiliary data (struct tpacket_auxdata, e.g.
         * VLAN tag info) via recvmsg() control messages. */
        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets.
         * NOTE(review): packets that arrive between bind() and attaching the
         * filter are not filtered -- confirm this window is acceptable. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1003
1004 static void
1005 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
1006 {
1007 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1008
1009 if (!rx->is_tap) {
1010 close(rx->fd);
1011 }
1012 }
1013
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    /* Releases the storage obtained in netdev_linux_rxq_alloc(). */
    free(netdev_rxq_linux_cast(rxq_));
}
1021
1022 static ovs_be16
1023 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
1024 {
1025 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1026 return htons(aux->tp_vlan_tpid);
1027 } else if (double_tagged) {
1028 return htons(ETH_TYPE_VLAN_8021AD);
1029 } else {
1030 return htons(ETH_TYPE_VLAN_8021Q);
1031 }
1032 }
1033
/* Returns true if 'aux' carries a VLAN tag: either the TCI itself is nonzero,
 * or the kernel's explicit "VLAN present" status bit is set (needed because a
 * TCI of zero is a valid tag on its own). */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    if (aux->tp_vlan_tci != 0) {
        return true;
    }
    return (aux->tp_status & TP_STATUS_VLAN_VALID) != 0;
}
1039
/* Receives one packet from AF_PACKET socket 'fd' into 'buffer'.  If the
 * kernel stripped a VLAN tag and reported it out-of-band via PACKET_AUXDATA,
 * the tag is pushed back into the packet data so the rest of OVS sees it
 * in-line.
 *
 * Returns 0 on success; EMSGSIZE if the packet exceeded the buffer's
 * tailroom; otherwise a positive errno value. */
static int
netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
{
    size_t size;
    ssize_t retval;
    struct iovec iov;
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffer;
    struct msghdr msgh;

    /* Reserve headroom for a single VLAN tag */
    dp_packet_reserve(buffer, VLAN_HEADER_LEN);
    size = dp_packet_tailroom(buffer);

    iov.iov_base = dp_packet_data(buffer);
    iov.iov_len = size;
    msgh.msg_name = NULL;
    msgh.msg_namelen = 0;
    msgh.msg_iov = &iov;
    msgh.msg_iovlen = 1;
    msgh.msg_control = &cmsg_buffer;
    msgh.msg_controllen = sizeof cmsg_buffer;
    msgh.msg_flags = 0;

    /* Retry if interrupted by a signal.  MSG_TRUNC makes recvmsg() report the
     * full packet length even when it did not fit, so truncation is
     * detectable below. */
    do {
        retval = recvmsg(fd, &msgh, MSG_TRUNC);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    } else if (retval > size) {
        /* Packet was truncated to 'size' bytes. */
        return EMSGSIZE;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);

    /* Scan control messages for a kernel-stripped VLAN tag. */
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
        const struct tpacket_auxdata *aux;

        if (cmsg->cmsg_level != SOL_PACKET
            || cmsg->cmsg_type != PACKET_AUXDATA
            || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
            continue;
        }

        aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
        if (auxdata_has_vlan_tci(aux)) {
            struct eth_header *eth;
            bool double_tagged;

            /* Too short to hold even an Ethernet header. */
            if (retval < ETH_HEADER_LEN) {
                return EINVAL;
            }

            /* If the frame already carries an 802.1Q tag, the stripped outer
             * tag must have been 802.1ad (QinQ). */
            eth = dp_packet_data(buffer);
            double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);

            eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
                          htons(aux->tp_vlan_tci));
            break;
        }
    }

    return 0;
}
1108
/* Receives one packet from tap fd 'fd' into 'buffer'.  Returns 0 on success,
 * otherwise a positive errno value. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t tailroom = dp_packet_tailroom(buffer);
    ssize_t n;

    /* Retry reads interrupted by signal delivery. */
    do {
        n = read(fd, dp_packet_data(buffer), tailroom);
    } while (n < 0 && errno == EINTR);

    if (n < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n);
    return 0;
}
1126
1127 static int
1128 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch)
1129 {
1130 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1131 struct netdev *netdev = rx->up.netdev;
1132 struct dp_packet *buffer;
1133 ssize_t retval;
1134 int mtu;
1135
1136 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1137 mtu = ETH_PAYLOAD_MAX;
1138 }
1139
1140 /* Assume Ethernet port. No need to set packet_type. */
1141 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1142 DP_NETDEV_HEADROOM);
1143 retval = (rx->is_tap
1144 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1145 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1146
1147 if (retval) {
1148 if (retval != EAGAIN && retval != EMSGSIZE) {
1149 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1150 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1151 }
1152 dp_packet_delete(buffer);
1153 } else {
1154 dp_packet_batch_init_packet(batch, buffer);
1155 }
1156
1157 return retval;
1158 }
1159
1160 static void
1161 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1162 {
1163 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1164 poll_fd_wait(rx->fd, POLLIN);
1165 }
1166
1167 static int
1168 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1169 {
1170 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1171 if (rx->is_tap) {
1172 struct ifreq ifr;
1173 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1174 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1175 if (error) {
1176 return error;
1177 }
1178 drain_fd(rx->fd, ifr.ifr_qlen);
1179 return 0;
1180 } else {
1181 return drain_rcvbuf(rx->fd);
1182 }
1183 }
1184
/* Sends 'batch' on AF_PACKET socket 'sock', addressed to interface 'ifindex',
 * using sendmmsg() so the whole batch needs at most a few system calls.
 * Returns 0 if every packet was handed to the kernel, otherwise the errno of
 * the first failure (packets after the failure are not sent). */
static int
netdev_linux_sock_batch_send(int sock, int ifindex,
                             struct dp_packet_batch *batch)
{
    const size_t size = dp_packet_batch_size(batch);
    /* We don't bother setting most fields in sockaddr_ll because the
     * kernel ignores them for SOCK_RAW. */
    struct sockaddr_ll sll = { .sll_family = AF_PACKET,
                               .sll_ifindex = ifindex };

    struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
    struct iovec *iov = xmalloc(sizeof(*iov) * size);

    struct dp_packet *packet;
    /* NOTE(review): the loop index 'i' used below is presumably declared by
     * the DP_PACKET_BATCH_FOR_EACH macro itself -- confirm against
     * dp-packet.h. */
    DP_PACKET_BATCH_FOR_EACH (packet, batch) {
        iov[i].iov_base = dp_packet_data(packet);
        iov[i].iov_len = dp_packet_get_send_len(packet);
        mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
                                            .msg_namelen = sizeof sll,
                                            .msg_iov = &iov[i],
                                            .msg_iovlen = 1 };
    }

    int error = 0;
    for (uint32_t ofs = 0; ofs < size; ) {
        ssize_t retval;
        /* Retry if interrupted by a signal. */
        do {
            retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);
        if (error) {
            break;
        }
        /* sendmmsg() may send only a prefix of the batch; advance past the
         * messages it consumed and retry the rest. */
        ofs += retval;
    }

    free(mmsg);
    free(iov);
    return error;
}
1225
1226 /* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1227 * essential, because packets sent to a tap device with an AF_PACKET socket
1228 * will loop back to be *received* again on the tap device. This doesn't occur
1229 * on other interface types because we attach a socket filter to the rx
1230 * socket. */
1231 static int
1232 netdev_linux_tap_batch_send(struct netdev *netdev_,
1233 struct dp_packet_batch *batch)
1234 {
1235 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1236 struct dp_packet *packet;
1237 DP_PACKET_BATCH_FOR_EACH (packet, batch) {
1238 size_t size = dp_packet_get_send_len(packet);
1239 ssize_t retval;
1240 int error;
1241
1242 do {
1243 retval = write(netdev->tap_fd, dp_packet_data(packet), size);
1244 error = retval < 0 ? errno : 0;
1245 } while (error == EINTR);
1246
1247 if (error) {
1248 /* The Linux tap driver returns EIO if the device is not up. From
1249 * the OVS side this is not an error, so we ignore it; otherwise,
1250 * return the erro. */
1251 if (error != EIO) {
1252 return error;
1253 }
1254 } else if (retval != size) {
1255 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
1256 "bytes of %"PRIuSIZE") on %s",
1257 retval, size, netdev_get_name(netdev_));
1258 return EMSGSIZE;
1259 }
1260 }
1261 return 0;
1262 }
1263
/* Sends 'batch' on 'netdev'.  Returns 0 if successful, otherwise a positive
 * errno value.  Returns EAGAIN without blocking if the packet cannot be queued
 * immediately.  Returns EMSGSIZE if a partial packet was transmitted or if
 * the packet is too big or too small to transmit on the device.
 *
 * The kernel maintains a packet transmission queue, so the caller is not
 * expected to do additional queuing of packets. */
static int
netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
                  struct dp_packet_batch *batch, bool may_steal,
                  bool concurrent_txq OVS_UNUSED)
{
    int error = 0;
    int sock = 0;

    if (!is_tap_netdev(netdev_)) {
        /* Ordinary device: transmit through the shared AF_PACKET socket,
         * addressed by ifindex.  Both helpers below return negated errno
         * values on failure. */
        sock = af_packet_sock();
        if (sock < 0) {
            error = -sock;
            goto free_batch;
        }

        int ifindex = netdev_get_ifindex(netdev_);
        if (ifindex < 0) {
            error = -ifindex;
            goto free_batch;
        }

        error = netdev_linux_sock_batch_send(sock, ifindex, batch);
    } else {
        /* Tap device: must write via the tap fd; see the comment on
         * netdev_linux_tap_batch_send(). */
        error = netdev_linux_tap_batch_send(netdev_, batch);
    }
    if (error) {
        if (error == ENOBUFS) {
            /* The Linux AF_PACKET implementation never blocks waiting
             * for room for packets, instead returning ENOBUFS.
             * Translate this into EAGAIN for the caller. */
            error = EAGAIN;
        } else {
            VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
    }

free_batch:
    /* Consume the batch whether or not the send succeeded. */
    dp_packet_delete_batch(batch, may_steal);
    return error;
}
1312
1313 /* Registers with the poll loop to wake up from the next call to poll_block()
1314 * when the packet transmission queue has sufficient room to transmit a packet
1315 * with netdev_send().
1316 *
1317 * The kernel maintains a packet transmission queue, so the client is not
1318 * expected to do additional queuing of packets. Thus, this function is
1319 * unlikely to ever be used. It is included for completeness. */
1320 static void
1321 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1322 {
1323 if (is_tap_netdev(netdev)) {
1324 /* TAP device always accepts packets.*/
1325 poll_immediate_wake();
1326 }
1327 }
1328
/* Attempts to set 'netdev''s MAC address to 'mac'.  Returns 0 if successful,
 * otherwise a positive errno value. */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        /* Short-circuit on a cached failure or when the address is already
         * as requested; otherwise invalidate the cache and retry below. */
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    if (!error || error == ENODEV) {
        /* Cache the outcome (ENODEV included) for subsequent calls. */
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            netdev->etheraddr = mac;
        }
    }

    /* Bring a tap device back up if it was up before we started. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1369
1370 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1371 static int
1372 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1373 {
1374 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1375 int error;
1376
1377 ovs_mutex_lock(&netdev->mutex);
1378 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1379 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1380 &netdev->etheraddr);
1381 netdev->cache_valid |= VALID_ETHERADDR;
1382 }
1383
1384 error = netdev->ether_addr_error;
1385 if (!error) {
1386 *mac = netdev->etheraddr;
1387 }
1388 ovs_mutex_unlock(&netdev->mutex);
1389
1390 return error;
1391 }
1392
1393 static int
1394 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1395 {
1396 int error;
1397
1398 if (!(netdev->cache_valid & VALID_MTU)) {
1399 struct ifreq ifr;
1400
1401 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1402 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1403 netdev->mtu = ifr.ifr_mtu;
1404 netdev->cache_valid |= VALID_MTU;
1405 }
1406
1407 error = netdev->netdev_mtu_error;
1408 if (!error) {
1409 *mtup = netdev->mtu;
1410 }
1411
1412 return error;
1413 }
1414
1415 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1416 * in bytes, not including the hardware header; thus, this is typically 1500
1417 * bytes for Ethernet devices. */
1418 static int
1419 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1420 {
1421 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1422 int error;
1423
1424 ovs_mutex_lock(&netdev->mutex);
1425 error = netdev_linux_get_mtu__(netdev, mtup);
1426 ovs_mutex_unlock(&netdev->mutex);
1427
1428 return error;
1429 }
1430
1431 /* Sets the maximum size of transmitted (MTU) for given device using linux
1432 * networking ioctl interface.
1433 */
1434 static int
1435 netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
1436 {
1437 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1438 struct ifreq ifr;
1439 int error;
1440
1441 ovs_mutex_lock(&netdev->mutex);
1442 if (netdev->cache_valid & VALID_MTU) {
1443 error = netdev->netdev_mtu_error;
1444 if (error || netdev->mtu == mtu) {
1445 goto exit;
1446 }
1447 netdev->cache_valid &= ~VALID_MTU;
1448 }
1449 ifr.ifr_mtu = mtu;
1450 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1451 SIOCSIFMTU, "SIOCSIFMTU");
1452 if (!error || error == ENODEV) {
1453 netdev->netdev_mtu_error = error;
1454 netdev->mtu = ifr.ifr_mtu;
1455 netdev->cache_valid |= VALID_MTU;
1456 }
1457 exit:
1458 ovs_mutex_unlock(&netdev->mutex);
1459 return error;
1460 }
1461
1462 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1463 * On failure, returns a negative errno value. */
1464 static int
1465 netdev_linux_get_ifindex(const struct netdev *netdev_)
1466 {
1467 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1468 int ifindex, error;
1469
1470 ovs_mutex_lock(&netdev->mutex);
1471 error = get_ifindex(netdev_, &ifindex);
1472 ovs_mutex_unlock(&netdev->mutex);
1473
1474 return error ? -error : ifindex;
1475 }
1476
1477 static int
1478 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1479 {
1480 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1481
1482 ovs_mutex_lock(&netdev->mutex);
1483 if (netdev->miimon_interval > 0) {
1484 *carrier = netdev->miimon;
1485 } else {
1486 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1487 }
1488 ovs_mutex_unlock(&netdev->mutex);
1489
1490 return 0;
1491 }
1492
1493 static long long int
1494 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1495 {
1496 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1497 long long int carrier_resets;
1498
1499 ovs_mutex_lock(&netdev->mutex);
1500 carrier_resets = netdev->carrier_resets;
1501 ovs_mutex_unlock(&netdev->mutex);
1502
1503 return carrier_resets;
1504 }
1505
/* Issues MII ioctl 'cmd' (named 'cmd_name' for error reporting) on device
 * 'name', using 'data' as both input and output.  Returns 0 on success,
 * otherwise a positive errno value.
 *
 * NOTE(review): the memcpy()s lay the mii_ioctl_data struct over the memory
 * occupied by ifr.ifr_data (inside the ifreq union) rather than pointing
 * ifr_data at it -- this presumably matches the kernel ABI for the
 * SIOCGMIIPHY/SIOCGMIIREG ioctls, which read and write the data in-place in
 * the ifreq; confirm before changing. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
1520
/* Determines link status for device 'name', storing it in '*miimon'.  Tries
 * the MII registers first; if that fails, falls back to the ETHTOOL_GLINK
 * ioctl.  Returns 0 if either method succeeded, otherwise a positive errno
 * value (with '*miimon' left false). */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            /* Basic-mode status register: extract the link-up bit. */
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        }
    }
    if (error) {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* The GLINK reply is a struct ethtool_value written over the
             * start of the ethtool_cmd buffer; copy it out to read it. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
1563
1564 static int
1565 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1566 long long int interval)
1567 {
1568 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1569
1570 ovs_mutex_lock(&netdev->mutex);
1571 interval = interval > 0 ? MAX(interval, 100) : 0;
1572 if (netdev->miimon_interval != interval) {
1573 if (interval && !netdev->miimon_interval) {
1574 atomic_count_inc(&miimon_cnt);
1575 } else if (!interval && netdev->miimon_interval) {
1576 atomic_count_dec(&miimon_cnt);
1577 }
1578
1579 netdev->miimon_interval = interval;
1580 timer_set_expired(&netdev->miimon_timer);
1581 }
1582 ovs_mutex_unlock(&netdev->mutex);
1583
1584 return 0;
1585 }
1586
/* Periodic MII poll: for every netdev-linux device with monitoring enabled
 * whose timer has expired, re-reads link state and, on a change, records it
 * and notifies via netdev_linux_changed(). */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                dev->miimon = miimon;
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            /* Re-arm the timer for the next poll. */
            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* Drop the reference taken by netdev_get_devices(). */
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
1616
/* Registers timer wakeups for every device with MII monitoring enabled, so
 * that poll_block() returns in time for the next miimon poll. */
static void
netdev_linux_miimon_wait(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0) {
            timer_wait(&dev->miimon_timer);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* Drop the reference taken by netdev_get_devices(). */
        netdev_close(netdev);
    }
    shash_destroy(&device_shash);
}
1638
/* Exchanges the values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved_a = *a;

    *a = *b;
    *b = saved_a;
}
1646
/* Copies 'src' into 'dst', performing format conversion in the process.
 * Fields that struct ovs_vport_stats does not carry are explicitly zeroed
 * (other fields of 'dst', if any, are left untouched).
 *
 * 'src' is allowed to be misaligned. */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct ovs_vport_stats *src)
{
    dst->rx_packets = get_32aligned_u64(&src->rx_packets);
    dst->tx_packets = get_32aligned_u64(&src->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&src->rx_errors);
    dst->tx_errors = get_32aligned_u64(&src->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
    dst->multicast = 0;
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
}
1676
1677 static int
1678 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1679 {
1680 struct dpif_netlink_vport reply;
1681 struct ofpbuf *buf;
1682 int error;
1683
1684 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1685 if (error) {
1686 return error;
1687 } else if (!reply.stats) {
1688 ofpbuf_delete(buf);
1689 return EOPNOTSUPP;
1690 }
1691
1692 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1693
1694 ofpbuf_delete(buf);
1695
1696 return 0;
1697 }
1698
/* Attempts to fill 'stats' from the vport layer, recording the outcome in
 * netdev->vport_stats_error.  Note the retry policy implied by the condition
 * below: the query is re-issued whenever the last attempt succeeded (or none
 * was made yet); only a cached *failure* short-circuits it. */
static void
get_stats_via_vport(const struct netdev *netdev_,
                    struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (!netdev->vport_stats_error ||
        !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
        int error;

        error = get_stats_via_vport__(netdev_, stats);
        if (error && error != ENOENT && error != ENODEV) {
            /* ENOENT/ENODEV just mean there is no such vport; anything else
             * is worth reporting. */
            VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
                         "(%s)",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
        netdev->vport_stats_error = error;
        netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
    }
}
1719
/* Retrieves current device stats for 'netdev-linux'.  Merges the vport-layer
 * view (written into 'stats' by get_stats_via_vport()) with the kernel
 * netdev's own counters, preferring the kernel's packet/byte counts and
 * accumulating its error counters. */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink stats unavailable; if the vport stats succeeded, 'stats'
         * already holds them, so report success. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Use kernel netdev's packet and byte counts since vport's counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO are
         * enabled. */
        stats->rx_packets = dev_stats.rx_packets;
        stats->rx_bytes = dev_stats.rx_bytes;
        stats->tx_packets = dev_stats.tx_packets;
        stats->tx_bytes = dev_stats.tx_bytes;

        stats->rx_errors += dev_stats.rx_errors;
        stats->tx_errors += dev_stats.tx_errors;
        stats->rx_dropped += dev_stats.rx_dropped;
        stats->tx_dropped += dev_stats.tx_dropped;
        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
        stats->rx_length_errors += dev_stats.rx_length_errors;
        stats->rx_over_errors += dev_stats.rx_over_errors;
        stats->rx_crc_errors += dev_stats.rx_crc_errors;
        stats->rx_frame_errors += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1770
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink stats unavailable; if the vport stats succeeded, 'stats'
         * already holds them, so report success. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer.  For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        /* Detailed per-direction error counters are meaningless once swapped;
         * zero them rather than report them in the wrong direction. */
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled.  Note the rx/tx swap, as above. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped += dev_stats.tx_dropped;
        stats->tx_dropped += dev_stats.rx_dropped;

        stats->rx_errors += dev_stats.tx_errors;
        stats->tx_errors += dev_stats.rx_errors;

        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1832
1833 static int
1834 netdev_internal_get_stats(const struct netdev *netdev_,
1835 struct netdev_stats *stats)
1836 {
1837 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1838 int error;
1839
1840 ovs_mutex_lock(&netdev->mutex);
1841 get_stats_via_vport(netdev_, stats);
1842 error = netdev->vport_stats_error;
1843 ovs_mutex_unlock(&netdev->mutex);
1844
1845 return error;
1846 }
1847
/* Queries 'netdev''s ethtool settings (ETHTOOL_GSET) and translates them into
 * the cached NETDEV_F_* bitmaps: supported features, advertised features, and
 * the current link mode.  The result -- including any error, stored in
 * netdev->get_features_error -- is cached under VALID_FEATURES. */
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
    struct ethtool_cmd ecmd;
    uint32_t speed;
    int error;

    if (netdev->cache_valid & VALID_FEATURES) {
        /* Cached result, success or failure alike. */
        return;
    }

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto out;
    }

    /* Supported features. */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    }
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Half) {
        netdev->supported |= NETDEV_F_100MB_HD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
        netdev->supported |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
        netdev->supported |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
        netdev->supported |= NETDEV_F_40GB_FD;
    }
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    }
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    }
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    }
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    }
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    }

    /* Advertised features. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
        netdev->advertised |= NETDEV_F_40GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    }
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    }
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    }
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    }
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    }

    /* Current settings. */
    speed = ethtool_cmd_speed(&ecmd);
    if (speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    } else if (speed == 40000) {
        /* NOTE(review): numeric literals here and below are presumably used
         * because older ethtool headers lack SPEED_40000/SPEED_100000 --
         * confirm before replacing with macros. */
        netdev->current = NETDEV_F_40GB_FD;
    } else if (speed == 100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (speed == 1000000) {
        netdev->current = NETDEV_F_1TB_FD;
    } else {
        netdev->current = 0;
    }

    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
    }

    if (ecmd.autoneg) {
        netdev->current |= NETDEV_F_AUTONEG;
    }

out:
    /* Cache the outcome, even on failure, so we do not re-query. */
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
}
1999
2000 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
2001 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2002 * Returns 0 if successful, otherwise a positive errno value. */
2003 static int
2004 netdev_linux_get_features(const struct netdev *netdev_,
2005 enum netdev_features *current,
2006 enum netdev_features *advertised,
2007 enum netdev_features *supported,
2008 enum netdev_features *peer)
2009 {
2010 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2011 int error;
2012
2013 ovs_mutex_lock(&netdev->mutex);
2014 netdev_linux_read_features(netdev);
2015 if (!netdev->get_features_error) {
2016 *current = netdev->current;
2017 *advertised = netdev->advertised;
2018 *supported = netdev->supported;
2019 *peer = 0; /* XXX */
2020 }
2021 error = netdev->get_features_error;
2022 ovs_mutex_unlock(&netdev->mutex);
2023
2024 return error;
2025 }
2026
2027 /* Set the features advertised by 'netdev' to 'advertise'. */
2028 static int
2029 netdev_linux_set_advertisements(struct netdev *netdev_,
2030 enum netdev_features advertise)
2031 {
2032 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2033 struct ethtool_cmd ecmd;
2034 int error;
2035
2036 ovs_mutex_lock(&netdev->mutex);
2037
2038 COVERAGE_INC(netdev_get_ethtool);
2039 memset(&ecmd, 0, sizeof ecmd);
2040 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2041 ETHTOOL_GSET, "ETHTOOL_GSET");
2042 if (error) {
2043 goto exit;
2044 }
2045
2046 ecmd.advertising = 0;
2047 if (advertise & NETDEV_F_10MB_HD) {
2048 ecmd.advertising |= ADVERTISED_10baseT_Half;
2049 }
2050 if (advertise & NETDEV_F_10MB_FD) {
2051 ecmd.advertising |= ADVERTISED_10baseT_Full;
2052 }
2053 if (advertise & NETDEV_F_100MB_HD) {
2054 ecmd.advertising |= ADVERTISED_100baseT_Half;
2055 }
2056 if (advertise & NETDEV_F_100MB_FD) {
2057 ecmd.advertising |= ADVERTISED_100baseT_Full;
2058 }
2059 if (advertise & NETDEV_F_1GB_HD) {
2060 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2061 }
2062 if (advertise & NETDEV_F_1GB_FD) {
2063 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2064 }
2065 if (advertise & NETDEV_F_10GB_FD) {
2066 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2067 }
2068 if (advertise & NETDEV_F_COPPER) {
2069 ecmd.advertising |= ADVERTISED_TP;
2070 }
2071 if (advertise & NETDEV_F_FIBER) {
2072 ecmd.advertising |= ADVERTISED_FIBRE;
2073 }
2074 if (advertise & NETDEV_F_AUTONEG) {
2075 ecmd.advertising |= ADVERTISED_Autoneg;
2076 }
2077 if (advertise & NETDEV_F_PAUSE) {
2078 ecmd.advertising |= ADVERTISED_Pause;
2079 }
2080 if (advertise & NETDEV_F_PAUSE_ASYM) {
2081 ecmd.advertising |= ADVERTISED_Asym_Pause;
2082 }
2083 COVERAGE_INC(netdev_set_ethtool);
2084 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2085 ETHTOOL_SSET, "ETHTOOL_SSET");
2086
2087 exit:
2088 ovs_mutex_unlock(&netdev->mutex);
2089 return error;
2090 }
2091
/* Attempts to set input rate limiting (policing) policy: 'kbits_rate' and
 * 'kbits_burst' configure a tc ingress policer on the device.  A rate of 0
 * removes policing entirely.  Returns 0 if successful, otherwise a positive
 * errno value. */
static int
netdev_linux_set_policing(struct netdev *netdev_,
                          uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int ifindex;
    int error;

    /* When the TC flow-offload API is enabled, this module must not touch
     * the ingress qdisc, so policing cannot be configured here. */
    if (netdev_is_flow_api_enabled()) {
        if (kbits_rate) {
            VLOG_WARN_RL(&rl, "%s: policing with offload isn't supported",
                         netdev_name);
        }
        return EOPNOTSUPP;
    }

    kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
                   : kbits_burst); /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev->cache_valid & VALID_POLICING) {
        /* Previous outcome is cached; skip kernel round-trips when neither
         * the result nor the requested parameters have changed. */
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kbits_burst == kbits_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        netdev->cache_valid &= ~VALID_POLICING;
    }

    error = get_ifindex(netdev_, &ifindex);
    if (error) {
        goto out;
    }

    COVERAGE_INC(netdev_set_policing);
    /* Remove any existing ingress qdisc. */
    error = tc_add_del_ingress_qdisc(ifindex, false);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate) {
        /* Re-create the ingress qdisc, then attach the policer to it. */
        error = tc_add_del_ingress_qdisc(ifindex, true);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }
    }

    netdev->kbits_rate = kbits_rate;
    netdev->kbits_burst = kbits_burst;

out:
    /* Cache the outcome — ENODEV included, since a missing device will keep
     * failing the same way — so identical repeat requests return early. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2167
2168 static int
2169 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2170 struct sset *types)
2171 {
2172 const struct tc_ops *const *opsp;
2173 for (opsp = tcs; *opsp != NULL; opsp++) {
2174 const struct tc_ops *ops = *opsp;
2175 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2176 sset_add(types, ops->ovs_name);
2177 }
2178 }
2179 return 0;
2180 }
2181
2182 static const struct tc_ops *
2183 tc_lookup_ovs_name(const char *name)
2184 {
2185 const struct tc_ops *const *opsp;
2186
2187 for (opsp = tcs; *opsp != NULL; opsp++) {
2188 const struct tc_ops *ops = *opsp;
2189 if (!strcmp(name, ops->ovs_name)) {
2190 return ops;
2191 }
2192 }
2193 return NULL;
2194 }
2195
2196 static const struct tc_ops *
2197 tc_lookup_linux_name(const char *name)
2198 {
2199 const struct tc_ops *const *opsp;
2200
2201 for (opsp = tcs; *opsp != NULL; opsp++) {
2202 const struct tc_ops *ops = *opsp;
2203 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2204 return ops;
2205 }
2206 }
2207 return NULL;
2208 }
2209
2210 static struct tc_queue *
2211 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2212 size_t hash)
2213 {
2214 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2215 struct tc_queue *queue;
2216
2217 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2218 if (queue->queue_id == queue_id) {
2219 return queue;
2220 }
2221 }
2222 return NULL;
2223 }
2224
/* Convenience wrapper for tc_find_queue__() that derives the hash from
 * 'queue_id' itself. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2230
2231 static int
2232 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2233 const char *type,
2234 struct netdev_qos_capabilities *caps)
2235 {
2236 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2237 if (!ops) {
2238 return EOPNOTSUPP;
2239 }
2240 caps->n_queues = ops->n_queues;
2241 return 0;
2242 }
2243
2244 static int
2245 netdev_linux_get_qos(const struct netdev *netdev_,
2246 const char **typep, struct smap *details)
2247 {
2248 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2249 int error;
2250
2251 ovs_mutex_lock(&netdev->mutex);
2252 error = tc_query_qdisc(netdev_);
2253 if (!error) {
2254 *typep = netdev->tc->ops->ovs_name;
2255 error = (netdev->tc->ops->qdisc_get
2256 ? netdev->tc->ops->qdisc_get(netdev_, details)
2257 : 0);
2258 }
2259 ovs_mutex_unlock(&netdev->mutex);
2260
2261 return error;
2262 }
2263
/* Replaces the qdisc on 'netdev_' with QoS type 'type' configured per
 * 'details'.  If the same type is already installed its parameters are
 * updated in place; otherwise the old qdisc is deleted and a new one
 * installed.  Returns 0 if successful, otherwise a positive errno value. */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    if (new_ops == &tc_ops_noop) {
        /* The noop type is installed directly, without querying or touching
         * the kernel qdisc state. */
        return new_ops->tc_install(netdev_, details);
    }

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same type already installed: update parameters in place when the
         * implementation supports it, otherwise treat as a no-op success. */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            goto exit;
        }
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc. */
        error = new_ops->tc_install(netdev_, details);
        /* tc_install must leave 'netdev->tc' set exactly on success. */
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2306
2307 static int
2308 netdev_linux_get_queue(const struct netdev *netdev_,
2309 unsigned int queue_id, struct smap *details)
2310 {
2311 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2312 int error;
2313
2314 ovs_mutex_lock(&netdev->mutex);
2315 error = tc_query_qdisc(netdev_);
2316 if (!error) {
2317 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2318 error = (queue
2319 ? netdev->tc->ops->class_get(netdev_, queue, details)
2320 : ENOENT);
2321 }
2322 ovs_mutex_unlock(&netdev->mutex);
2323
2324 return error;
2325 }
2326
2327 static int
2328 netdev_linux_set_queue(struct netdev *netdev_,
2329 unsigned int queue_id, const struct smap *details)
2330 {
2331 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2332 int error;
2333
2334 ovs_mutex_lock(&netdev->mutex);
2335 error = tc_query_qdisc(netdev_);
2336 if (!error) {
2337 error = (queue_id < netdev->tc->ops->n_queues
2338 && netdev->tc->ops->class_set
2339 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2340 : EINVAL);
2341 }
2342 ovs_mutex_unlock(&netdev->mutex);
2343
2344 return error;
2345 }
2346
2347 static int
2348 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2349 {
2350 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2351 int error;
2352
2353 ovs_mutex_lock(&netdev->mutex);
2354 error = tc_query_qdisc(netdev_);
2355 if (!error) {
2356 if (netdev->tc->ops->class_delete) {
2357 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2358 error = (queue
2359 ? netdev->tc->ops->class_delete(netdev_, queue)
2360 : ENOENT);
2361 } else {
2362 error = EINVAL;
2363 }
2364 }
2365 ovs_mutex_unlock(&netdev->mutex);
2366
2367 return error;
2368 }
2369
2370 static int
2371 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2372 unsigned int queue_id,
2373 struct netdev_queue_stats *stats)
2374 {
2375 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2376 int error;
2377
2378 ovs_mutex_lock(&netdev->mutex);
2379 error = tc_query_qdisc(netdev_);
2380 if (!error) {
2381 if (netdev->tc->ops->class_get_stats) {
2382 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2383 if (queue) {
2384 stats->created = queue->created;
2385 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2386 stats);
2387 } else {
2388 error = ENOENT;
2389 }
2390 } else {
2391 error = EOPNOTSUPP;
2392 }
2393 }
2394 ovs_mutex_unlock(&netdev->mutex);
2395
2396 return error;
2397 }
2398
/* State for dumping all traffic classes of a device's qdisc over rtnetlink;
 * see start_queue_dump() and finish_queue_dump(). */
struct queue_dump_state {
    struct nl_dump dump;    /* In-progress RTM_GETTCLASS dump. */
    struct ofpbuf buf;      /* Receive buffer reused across replies. */
};
2403
2404 static bool
2405 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2406 {
2407 struct ofpbuf request;
2408 struct tcmsg *tcmsg;
2409
2410 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2411 if (!tcmsg) {
2412 return false;
2413 }
2414 tcmsg->tcm_parent = 0;
2415 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2416 ofpbuf_uninit(&request);
2417
2418 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
2419 return true;
2420 }
2421
/* Releases the resources in '*state' and completes the netlink dump.
 * Returns 0 if the dump finished cleanly, otherwise a positive errno. */
static int
finish_queue_dump(struct queue_dump_state *state)
{
    ofpbuf_uninit(&state->buf);
    return nl_dump_done(&state->dump);
}
2428
/* Iterator state for queue dumps: a snapshot of queue IDs captured at
 * netdev_linux_queue_dump_start(), walked one at a time by _dump_next(). */
struct netdev_linux_queue_state {
    unsigned int *queues;   /* Array of queue IDs (heap-allocated). */
    size_t cur_queue;       /* Index of the next ID to visit. */
    size_t n_queues;        /* Number of elements in 'queues'. */
};
2434
2435 static int
2436 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2437 {
2438 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2439 int error;
2440
2441 ovs_mutex_lock(&netdev->mutex);
2442 error = tc_query_qdisc(netdev_);
2443 if (!error) {
2444 if (netdev->tc->ops->class_get) {
2445 struct netdev_linux_queue_state *state;
2446 struct tc_queue *queue;
2447 size_t i;
2448
2449 *statep = state = xmalloc(sizeof *state);
2450 state->n_queues = hmap_count(&netdev->tc->queues);
2451 state->cur_queue = 0;
2452 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2453
2454 i = 0;
2455 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2456 state->queues[i++] = queue->queue_id;
2457 }
2458 } else {
2459 error = EOPNOTSUPP;
2460 }
2461 }
2462 ovs_mutex_unlock(&netdev->mutex);
2463
2464 return error;
2465 }
2466
2467 static int
2468 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2469 unsigned int *queue_idp, struct smap *details)
2470 {
2471 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2472 struct netdev_linux_queue_state *state = state_;
2473 int error = EOF;
2474
2475 ovs_mutex_lock(&netdev->mutex);
2476 while (state->cur_queue < state->n_queues) {
2477 unsigned int queue_id = state->queues[state->cur_queue++];
2478 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2479
2480 if (queue) {
2481 *queue_idp = queue_id;
2482 error = netdev->tc->ops->class_get(netdev_, queue, details);
2483 break;
2484 }
2485 }
2486 ovs_mutex_unlock(&netdev->mutex);
2487
2488 return error;
2489 }
2490
/* Frees the iterator state allocated by netdev_linux_queue_dump_start().
 * Always returns 0. */
static int
netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
                             void *state_)
{
    struct netdev_linux_queue_state *state = state_;

    free(state->queues);
    free(state);
    return 0;
}
2501
/* Invokes 'cb' with 'aux' for the statistics of each of 'netdev_''s queues,
 * using a kernel traffic-class dump.  Returns 0 if successful, otherwise a
 * positive errno value (the last failure seen, if several occur). */
static int
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
                              netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct queue_dump_state state;

        if (!netdev->tc->ops->class_dump_stats) {
            error = EOPNOTSUPP;
        } else if (!start_queue_dump(netdev_, &state)) {
            error = ENODEV;
        } else {
            struct ofpbuf msg;
            int retval;

            /* Keep dumping even after a per-class failure, remembering the
             * most recent error to report at the end. */
            while (nl_dump_next(&state.dump, &msg, &state.buf)) {
                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
                                                           cb, aux);
                if (retval) {
                    error = retval;
                }
            }

            retval = finish_queue_dump(&state);
            if (retval) {
                error = retval;
            }
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2540
2541 static int
2542 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2543 struct in_addr netmask)
2544 {
2545 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2546 int error;
2547
2548 ovs_mutex_lock(&netdev->mutex);
2549 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2550 if (!error) {
2551 if (address.s_addr != INADDR_ANY) {
2552 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2553 "SIOCSIFNETMASK", netmask);
2554 }
2555 }
2556
2557 ovs_mutex_unlock(&netdev->mutex);
2558
2559 return error;
2560 }
2561
/* Retrieves all addresses assigned to 'netdev_': on success stores newly
 * allocated arrays of addresses and matching masks in '*addr' and '*mask'
 * and their count in '*n_cnt'.  Returns 0 if successful, otherwise a
 * positive errno value.  (Presumably the arrays are heap-allocated by
 * netdev_get_addrs() and owned by the caller — confirm against its
 * definition.) */
static int
netdev_linux_get_addr_list(const struct netdev *netdev_,
                          struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2578
/* Fills '*sa' with an AF_INET socket address carrying 'addr' and port 0. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2591
2592 static int
2593 do_set_addr(struct netdev *netdev,
2594 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2595 {
2596 struct ifreq ifr;
2597
2598 make_in4_sockaddr(&ifr.ifr_addr, addr);
2599 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2600 ioctl_name);
2601 }
2602
2603 /* Adds 'router' as a default IP gateway. */
2604 static int
2605 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2606 {
2607 struct in_addr any = { INADDR_ANY };
2608 struct rtentry rt;
2609 int error;
2610
2611 memset(&rt, 0, sizeof rt);
2612 make_in4_sockaddr(&rt.rt_dst, any);
2613 make_in4_sockaddr(&rt.rt_gateway, router);
2614 make_in4_sockaddr(&rt.rt_genmask, any);
2615 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2616 error = af_inet_ioctl(SIOCADDRT, &rt);
2617 if (error) {
2618 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2619 }
2620 return error;
2621 }
2622
/* Looks up the route to 'host' in the kernel's IPv4 routing table (parsed
 * from /proc/net/route).  On success stores the gateway in '*next_hop'
 * (0 if the host is directly reachable), stores a malloc'd copy of the
 * output device name in '*netdev_name', and returns 0.  Otherwise returns a
 * positive errno value (ENXIO if no route matches). */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        if (++ln >= 2) {
            /* Skip the header line (ln == 1); each later line is one route. */
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                             fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* The output of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need any endian
             * conversions here. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    fclose(stream);
    return ENXIO;
}
2682
/* Fills 'smap' with driver name/version and firmware version obtained via
 * ETHTOOL_GDRVINFO, caching the result in 'netdev->drvinfo'.  Returns 0 if
 * successful, otherwise a positive errno value. */
static int
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* Type pun: the generic ethtool helper takes a struct ethtool_cmd *,
         * but for ETHTOOL_GDRVINFO the buffer actually holds a struct
         * ethtool_drvinfo (netdev->drvinfo), so the pointer is cast over. */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        cmd,
                                        ETHTOOL_GDRVINFO,
                                        "ETHTOOL_GDRVINFO");
        if (!error) {
            netdev->cache_valid |= VALID_DRVINFO;
        }
    }

    if (!error) {
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
        smap_add(smap, "driver_version", netdev->drvinfo.version);
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2713
/* Status for internal devices: only the (fixed) driver name is reported.
 * Always returns 0. */
static int
netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
                           struct smap *smap)
{
    smap_add(smap, "driver_name", "openvswitch");
    return 0;
}
2721
/* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can
 * be successfully retrieved, it stores the corresponding MAC address in 'mac'
 * and returns 0.  Otherwise, it returns a positive errno value; in
 * particular, ENXIO indicates that there is no ARP table entry for 'ip' on
 * 'netdev'. */
static int
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, struct eth_addr *mac)
{
    struct arpreq r;
    struct sockaddr_in sin;
    int retval;

    /* SIOCGARP keys on the protocol address plus the device name. */
    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    sin.sin_port = 0;
    memcpy(&r.arp_pa, &sin, sizeof sin);
    r.arp_ha.sa_family = ARPHRD_ETHER;
    r.arp_flags = 0;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
    if (!retval) {
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO ("no entry") is an expected outcome; anything else is
         * logged. */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
    }
    return retval;
}
2754
2755 static int
2756 nd_to_iff_flags(enum netdev_flags nd)
2757 {
2758 int iff = 0;
2759 if (nd & NETDEV_UP) {
2760 iff |= IFF_UP;
2761 }
2762 if (nd & NETDEV_PROMISC) {
2763 iff |= IFF_PROMISC;
2764 }
2765 if (nd & NETDEV_LOOPBACK) {
2766 iff |= IFF_LOOPBACK;
2767 }
2768 return iff;
2769 }
2770
2771 static int
2772 iff_to_nd_flags(int iff)
2773 {
2774 enum netdev_flags nd = 0;
2775 if (iff & IFF_UP) {
2776 nd |= NETDEV_UP;
2777 }
2778 if (iff & IFF_PROMISC) {
2779 nd |= NETDEV_PROMISC;
2780 }
2781 if (iff & IFF_LOOPBACK) {
2782 nd |= NETDEV_LOOPBACK;
2783 }
2784 return nd;
2785 }
2786
2787 static int
2788 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2789 enum netdev_flags on, enum netdev_flags *old_flagsp)
2790 OVS_REQUIRES(netdev->mutex)
2791 {
2792 int old_flags, new_flags;
2793 int error = 0;
2794
2795 old_flags = netdev->ifi_flags;
2796 *old_flagsp = iff_to_nd_flags(old_flags);
2797 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2798 if (new_flags != old_flags) {
2799 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2800 get_flags(&netdev->up, &netdev->ifi_flags);
2801 }
2802
2803 return error;
2804 }
2805
2806 static int
2807 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2808 enum netdev_flags on, enum netdev_flags *old_flagsp)
2809 {
2810 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2811 int error;
2812
2813 ovs_mutex_lock(&netdev->mutex);
2814 error = update_flags(netdev, off, on, old_flagsp);
2815 ovs_mutex_unlock(&netdev->mutex);
2816
2817 return error;
2818 }
2819
/* Template for the vtables of the Linux-backed netdev classes ("system",
 * "tap", "internal").  The initializers are positional, so the order below
 * must match the member order of struct netdev_class; only the operations
 * that differ per class (construct, stats, features, status, offload API)
 * are parameters. */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
                           GET_FEATURES, GET_STATUS,            \
                           FLOW_OFFLOAD_API)                    \
{                                                               \
    NAME,                                                       \
    false,                      /* is_pmd */                    \
                                                                \
    NULL,                                                       \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    netdev_linux_alloc,                                         \
    CONSTRUCT,                                                  \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
    NULL,                       /* build header */              \
    NULL,                       /* push header */               \
    NULL,                       /* pop header */                \
    NULL,                       /* get_numa_id */               \
    NULL,                       /* set_tx_multiq */             \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
    NULL,                       /* get_pt_mode */               \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_addr_list,                                 \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
    NULL,                       /* reconfigure */               \
                                                                \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \
                                                                \
    FLOW_OFFLOAD_API                                            \
}
2895
/* Ordinary kernel network devices ("system"): full stats/features/status and
 * the Linux TC flow offload API. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_construct,
        netdev_linux_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status,
        LINUX_FLOW_OFFLOAD_API);

/* TAP devices: same as "system" except for construction, tap-specific stats,
 * and no flow offload. */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_construct_tap,
        netdev_tap_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status,
        NO_OFFLOAD_API);

/* Internal devices: internal stats/status, no feature reporting, and no flow
 * offload. */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_construct,
        netdev_internal_get_stats,
        NULL,                  /* get_features */
        netdev_internal_get_status,
        NO_OFFLOAD_API);
2922 \f
2923
#define CODEL_N_QUEUES 0x0000   /* CoDel exposes no per-queue classes. */

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET 1
#define TCA_CODEL_LIMIT 2
#define TCA_CODEL_INTERVAL 3

/* State of a "linux-codel" qdisc.  The parameter fields mirror the
 * TCA_CODEL_* netlink attributes of the same names. */
struct codel {
    struct tc tc;           /* Generic qdisc state; see codel_get__(). */
    uint32_t target;        /* TCA_CODEL_TARGET. */
    uint32_t limit;         /* TCA_CODEL_LIMIT. */
    uint32_t interval;      /* TCA_CODEL_INTERVAL. */
};
2940
/* Returns the struct codel that embeds 'netdev_''s installed tc state.
 * Valid only while a CoDel qdisc is installed on the device. */
static struct codel *
codel_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct codel, tc);
}
2947
2948 static void
2949 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2950 uint32_t interval)
2951 {
2952 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2953 struct codel *codel;
2954
2955 codel = xmalloc(sizeof *codel);
2956 tc_init(&codel->tc, &tc_ops_codel);
2957 codel->target = target;
2958 codel->limit = limit;
2959 codel->interval = interval;
2960
2961 netdev->tc = &codel->tc;
2962 }
2963
/* Replaces 'netdev''s root qdisc with a kernel "codel" qdisc configured with
 * 'target', 'limit', and 'interval' (0 selects a default for each).  Returns
 * 0 if successful, otherwise a positive errno value. */
static int
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                    uint32_t interval)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval;
    int error;

    /* Delete any existing root qdisc first: the request below uses
     * NLM_F_EXCL, so it would fail if one were still installed. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Same defaults as codel_parse_qdisc_details__(). */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;

    nl_msg_put_string(&request, TCA_KIND, "codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval,
                     error, ovs_strerror(error));
    }
    return error;
}
3005
3006 static void
3007 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3008 const struct smap *details, struct codel *codel)
3009 {
3010 codel->target = smap_get_ullong(details, "target", 0);
3011 codel->limit = smap_get_ullong(details, "limit", 0);
3012 codel->interval = smap_get_ullong(details, "interval", 0);
3013
3014 if (!codel->target) {
3015 codel->target = 5000;
3016 }
3017 if (!codel->limit) {
3018 codel->limit = 10240;
3019 }
3020 if (!codel->interval) {
3021 codel->interval = 100000;
3022 }
3023 }
3024
3025 static int
3026 codel_tc_install(struct netdev *netdev, const struct smap *details)
3027 {
3028 int error;
3029 struct codel codel;
3030
3031 codel_parse_qdisc_details__(netdev, details, &codel);
3032 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3033 codel.interval);
3034 if (!error) {
3035 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3036 }
3037 return error;
3038 }
3039
/* Extracts the CoDel parameters from nested netlink attributes 'nl_options'
 * into '*codel' (only the parameter fields; 'codel->tc' is untouched).
 * Returns 0 on success, EPROTO if the attributes cannot be parsed. */
static int
codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
{
    static const struct nl_policy tca_codel_policy[] = {
        [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
        [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
        [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];

    if (!nl_parse_nested(nl_options, tca_codel_policy,
                         attrs, ARRAY_SIZE(tca_codel_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
        return EPROTO;
    }

    codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
    codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
    codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
    return 0;
}
3062
3063 static int
3064 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3065 {
3066 struct nlattr *nlattr;
3067 const char * kind;
3068 int error;
3069 struct codel codel;
3070
3071 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3072 if (error != 0) {
3073 return error;
3074 }
3075
3076 error = codel_parse_tca_options__(nlattr, &codel);
3077 if (error != 0) {
3078 return error;
3079 }
3080
3081 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3082 return 0;
3083 }
3084
3085
3086 static void
3087 codel_tc_destroy(struct tc *tc)
3088 {
3089 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3090 tc_destroy(tc);
3091 free(codel);
3092 }
3093
3094 static int
3095 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3096 {
3097 const struct codel *codel = codel_get__(netdev);
3098 smap_add_format(details, "target", "%u", codel->target);
3099 smap_add_format(details, "limit", "%u", codel->limit);
3100 smap_add_format(details, "interval", "%u", codel->interval);
3101 return 0;
3102 }
3103
3104 static int
3105 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3106 {
3107 struct codel codel;
3108
3109 codel_parse_qdisc_details__(netdev, details, &codel);
3110 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3111 codel_get__(netdev)->target = codel.target;
3112 codel_get__(netdev)->limit = codel.limit;
3113 codel_get__(netdev)->interval = codel.interval;
3114 return 0;
3115 }
3116
/* Operations table binding the kernel "codel" qdisc to the OVS QoS type
 * "linux-codel". */
static const struct tc_ops tc_ops_codel = {
    "codel",                      /* linux_name */
    "linux-codel",                /* ovs_name */
    CODEL_N_QUEUES,               /* n_queues */
    codel_tc_install,
    codel_tc_load,
    codel_tc_destroy,
    codel_qdisc_get,
    codel_qdisc_set,
    /* CoDel exposes no classes (CODEL_N_QUEUES is 0), so the remaining
     * class-related operations are not implemented. */
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3132 \f
3133 /* FQ-CoDel traffic control class. */
3134
3135 #define FQCODEL_N_QUEUES 0x0000
3136
3137 /* In sufficiently new kernel headers these are defined as enums in
3138 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3139 * kernels. (This overrides any enum definition in the header file but that's
3140 * harmless.) */
3141 #define TCA_FQ_CODEL_TARGET 1
3142 #define TCA_FQ_CODEL_LIMIT 2
3143 #define TCA_FQ_CODEL_INTERVAL 3
3144 #define TCA_FQ_CODEL_ECN 4
3145 #define TCA_FQ_CODEL_FLOWS 5
3146 #define TCA_FQ_CODEL_QUANTUM 6
3147
struct fqcodel {
    struct tc tc;
    uint32_t target;     /* Acceptable minimum queue delay. */
    uint32_t limit;      /* Hard limit on queue size, in packets. */
    uint32_t interval;   /* Measurement window for minimum delay. */
    uint32_t flows;      /* Number of flow queues. */
    uint32_t quantum;    /* Bytes dequeued from a flow per round. */
};
3156
3157 static struct fqcodel *
3158 fqcodel_get__(const struct netdev *netdev_)
3159 {
3160 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3161 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3162 }
3163
3164 static void
3165 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3166 uint32_t interval, uint32_t flows, uint32_t quantum)
3167 {
3168 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3169 struct fqcodel *fqcodel;
3170
3171 fqcodel = xmalloc(sizeof *fqcodel);
3172 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3173 fqcodel->target = target;
3174 fqcodel->limit = limit;
3175 fqcodel->interval = interval;
3176 fqcodel->flows = flows;
3177 fqcodel->quantum = quantum;
3178
3179 netdev->tc = &fqcodel->tc;
3180 }
3181
/* Creates an fq_codel root qdisc on 'netdev', roughly equivalent to
 * "tc qdisc add dev <dev> root handle 1: fq_codel ...".  Any zero-valued
 * parameter falls back to the default encoded below.  Returns 0 on success,
 * otherwise a positive errno value. */
static int
fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                      uint32_t interval, uint32_t flows, uint32_t quantum)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval, oflows,  oquantum;
    int error;

    /* Remove any existing root qdisc first; errors are ignored there. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Substitute defaults for unspecified (zero) parameters. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;
    oflows = flows ? flows : 1024;
    oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
                                            not mtu */

    nl_msg_put_string(&request, TCA_KIND, "fq_codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
        "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
        netdev_get_name(netdev),
        otarget, olimit, ointerval, oflows, oquantum,
        error, ovs_strerror(error));
    }
    return error;
}
3228
3229 static void
3230 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3231 const struct smap *details, struct fqcodel *fqcodel)
3232 {
3233 fqcodel->target = smap_get_ullong(details, "target", 0);
3234 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3235 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3236 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3237 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3238
3239 if (!fqcodel->target) {
3240 fqcodel->target = 5000;
3241 }
3242 if (!fqcodel->limit) {
3243 fqcodel->limit = 10240;
3244 }
3245 if (!fqcodel->interval) {
3246 fqcodel->interval = 1000000;
3247 }
3248 if (!fqcodel->flows) {
3249 fqcodel->flows = 1024;
3250 }
3251 if (!fqcodel->quantum) {
3252 fqcodel->quantum = 1514;
3253 }
3254 }
3255
3256 static int
3257 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3258 {
3259 int error;
3260 struct fqcodel fqcodel;
3261
3262 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3263 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3264 fqcodel.interval, fqcodel.flows,
3265 fqcodel.quantum);
3266 if (!error) {
3267 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3268 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3269 }
3270 return error;
3271 }
3272
3273 static int
3274 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3275 {
3276 static const struct nl_policy tca_fqcodel_policy[] = {
3277 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3278 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3279 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3280 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3281 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3282 };
3283
3284 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3285
3286 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3287 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3288 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3289 return EPROTO;
3290 }
3291
3292 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3293 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3294 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3295 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3296 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3297 return 0;
3298 }
3299
3300 static int
3301 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3302 {
3303 struct nlattr *nlattr;
3304 const char * kind;
3305 int error;
3306 struct fqcodel fqcodel;
3307
3308 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3309 if (error != 0) {
3310 return error;
3311 }
3312
3313 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3314 if (error != 0) {
3315 return error;
3316 }
3317
3318 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3319 fqcodel.flows, fqcodel.quantum);
3320 return 0;
3321 }
3322
3323 static void
3324 fqcodel_tc_destroy(struct tc *tc)
3325 {
3326 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3327 tc_destroy(tc);
3328 free(fqcodel);
3329 }
3330
3331 static int
3332 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3333 {
3334 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3335 smap_add_format(details, "target", "%u", fqcodel->target);
3336 smap_add_format(details, "limit", "%u", fqcodel->limit);
3337 smap_add_format(details, "interval", "%u", fqcodel->interval);
3338 smap_add_format(details, "flows", "%u", fqcodel->flows);
3339 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3340 return 0;
3341 }
3342
3343 static int
3344 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3345 {
3346 struct fqcodel fqcodel;
3347
3348 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3349 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3350 fqcodel.flows, fqcodel.quantum);
3351 fqcodel_get__(netdev)->target = fqcodel.target;
3352 fqcodel_get__(netdev)->limit = fqcodel.limit;
3353 fqcodel_get__(netdev)->interval = fqcodel.interval;
3354 fqcodel_get__(netdev)->flows = fqcodel.flows;
3355 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3356 return 0;
3357 }
3358
static const struct tc_ops tc_ops_fqcodel = {
    "fq_codel",                   /* linux_name */
    "linux-fq_codel",             /* ovs_name */
    FQCODEL_N_QUEUES,             /* n_queues */
    fqcodel_tc_install,
    fqcodel_tc_load,
    fqcodel_tc_destroy,
    fqcodel_qdisc_get,
    fqcodel_qdisc_set,
    /* fq_codel is a classless qdisc (n_queues is 0), so the per-class
     * operations that HTB/HFSC provide are left unset. */
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3374 \f
3375 /* SFQ traffic control class. */
3376
3377 #define SFQ_N_QUEUES 0x0000
3378
struct sfq {
    struct tc tc;
    uint32_t quantum;    /* Bytes a flow may dequeue per round. */
    uint32_t perturb;    /* Hash perturbation period, in seconds. */
};
3384
3385 static struct sfq *
3386 sfq_get__(const struct netdev *netdev_)
3387 {
3388 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3389 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3390 }
3391
3392 static void
3393 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3394 {
3395 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3396 struct sfq *sfq;
3397
3398 sfq = xmalloc(sizeof *sfq);
3399 tc_init(&sfq->tc, &tc_ops_sfq);
3400 sfq->perturb = perturb;
3401 sfq->quantum = quantum;
3402
3403 netdev->tc = &sfq->tc;
3404 }
3405
/* Creates an SFQ root qdisc on 'netdev', roughly equivalent to
 * "tc qdisc add dev <dev> root handle 1: sfq ...".  Returns 0 on success,
 * otherwise a positive errno value. */
static int
sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
{
    struct tc_sfq_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int mtu;
    int mtu_error, error;
    /* MTU lookup may fail; the error is only consulted when no explicit
     * quantum was requested (see below). */
    mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    if (!quantum) {
        if (!mtu_error) {
            opt.quantum = mtu; /* if we cannot find mtu, use default */
        }
    } else {
        opt.quantum = quantum;
    }

    if (!perturb) {
        opt.perturb_period = 10;
    } else {
        opt.perturb_period = perturb;
    }

    nl_msg_put_string(&request, TCA_KIND, "sfq");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "quantum %u, perturb %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.quantum, opt.perturb_period,
                     error, ovs_strerror(error));
    }
    return error;
}
3454
3455 static void
3456 sfq_parse_qdisc_details__(struct netdev *netdev,
3457 const struct smap *details, struct sfq *sfq)
3458 {
3459 sfq->perturb = smap_get_ullong(details, "perturb", 0);
3460 sfq->quantum = smap_get_ullong(details, "quantum", 0);
3461
3462 if (!sfq->perturb) {
3463 sfq->perturb = 10;
3464 }
3465
3466 if (!sfq->quantum) {
3467 int mtu;
3468 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
3469 sfq->quantum = mtu;
3470 } else {
3471 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3472 "device without mtu");
3473 }
3474 }
3475 }
3476
3477 static int
3478 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3479 {
3480 int error;
3481 struct sfq sfq;
3482
3483 sfq_parse_qdisc_details__(netdev, details, &sfq);
3484 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3485 if (!error) {
3486 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3487 }
3488 return error;
3489 }
3490
3491 static int
3492 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3493 {
3494 const struct tc_sfq_qopt *sfq;
3495 struct nlattr *nlattr;
3496 const char * kind;
3497 int error;
3498
3499 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3500 if (error == 0) {
3501 sfq = nl_attr_get(nlattr);
3502 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
3503 return 0;
3504 }
3505
3506 return error;
3507 }
3508
3509 static void
3510 sfq_tc_destroy(struct tc *tc)
3511 {
3512 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3513 tc_destroy(tc);
3514 free(sfq);
3515 }
3516
3517 static int
3518 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3519 {
3520 const struct sfq *sfq = sfq_get__(netdev);
3521 smap_add_format(details, "quantum", "%u", sfq->quantum);
3522 smap_add_format(details, "perturb", "%u", sfq->perturb);
3523 return 0;
3524 }
3525
3526 static int
3527 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3528 {
3529 struct sfq sfq;
3530
3531 sfq_parse_qdisc_details__(netdev, details, &sfq);
3532 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3533 sfq_get__(netdev)->quantum = sfq.quantum;
3534 sfq_get__(netdev)->perturb = sfq.perturb;
3535 return 0;
3536 }
3537
static const struct tc_ops tc_ops_sfq = {
    "sfq",                        /* linux_name */
    "linux-sfq",                  /* ovs_name */
    SFQ_N_QUEUES,                 /* n_queues */
    sfq_tc_install,
    sfq_tc_load,
    sfq_tc_destroy,
    sfq_qdisc_get,
    sfq_qdisc_set,
    /* SFQ is a classless qdisc (n_queues is 0), so the per-class
     * operations that HTB/HFSC provide are left unset. */
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3553 \f
3554 /* HTB traffic control class. */
3555
3556 #define HTB_N_QUEUES 0xf000
3557 #define HTB_RATE2QUANTUM 10
3558
/* Per-netdev HTB qdisc state. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};

/* One HTB class, i.e. one OVS QoS queue. */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
3571
3572 static struct htb *
3573 htb_get__(const struct netdev *netdev_)
3574 {
3575 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3576 return CONTAINER_OF(netdev->tc, struct htb, tc);
3577 }
3578
3579 static void
3580 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3581 {
3582 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3583 struct htb *htb;
3584
3585 htb = xmalloc(sizeof *htb);
3586 tc_init(&htb->tc, &tc_ops_htb);
3587 htb->max_rate = max_rate;
3588
3589 netdev->tc = &htb->tc;
3590 }
3591
/* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 * Returns 0 on success, otherwise a positive errno value. */
static int
htb_setup_qdisc__(struct netdev *netdev)
{
    size_t opt_offset;
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    /* Remove any existing root qdisc first; errors are ignored there. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = HTB_RATE2QUANTUM;
    opt.version = 3;
    opt.defcls = 1;      /* Unclassified traffic goes to class 1:1. */

    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    return tc_transact(&request, NULL);
}
3626
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * Returns 0 on success, otherwise a positive errno value (e.g. when the
 * device MTU cannot be determined or the netlink transaction fails). */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
        opt.quantum = mtu;
    }
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    /* Rate tables for the guaranteed (rate) and ceiling curves. */
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
3686
/* Parses Netlink attributes in 'nl_options' for HTB parameters and stores
 * them into '*class'.  The values correspond to the specification given in
 * the vswitch database documentation for linux-htb queue details.
 * Returns 0 on success, EPROTO on a malformed message. */
static int
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
{
    static const struct nl_policy tca_htb_policy[] = {
        [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
                            .min_len = sizeof(struct tc_htb_opt) },
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
    const struct tc_htb_opt *htb;

    if (!nl_parse_nested(nl_options, tca_htb_policy,
                         attrs, ARRAY_SIZE(tca_htb_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HTB class options");
        return EPROTO;
    }

    htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
    class->min_rate = htb->rate.rate;
    class->max_rate = htb->ceil.rate;
    /* The kernel reports the burst in time units; convert back to bytes. */
    class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
    class->priority = htb->prio;
    return 0;
}
3715
3716 static int
3717 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3718 struct htb_class *options,
3719 struct netdev_queue_stats *stats)
3720 {
3721 struct nlattr *nl_options;
3722 unsigned int handle;
3723 int error;
3724
3725 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3726 if (!error && queue_id) {
3727 unsigned int major = tc_get_major(handle);
3728 unsigned int minor = tc_get_minor(handle);
3729 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3730 *queue_id = minor - 1;
3731 } else {
3732 error = EPROTO;
3733 }
3734 }
3735 if (!error && options) {
3736 error = htb_parse_tca_options__(nl_options, options);
3737 }
3738 return error;
3739 }
3740
3741 static void
3742 htb_parse_qdisc_details__(struct netdev *netdev_,
3743 const struct smap *details, struct htb_class *hc)
3744 {
3745 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3746
3747 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
3748 if (!hc->max_rate) {
3749 enum netdev_features current;
3750
3751 netdev_linux_read_features(netdev);
3752 current = !netdev->get_features_error ? netdev->current : 0;
3753 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3754 }
3755 hc->min_rate = hc->max_rate;
3756 hc->burst = 0;
3757 hc->priority = 0;
3758 }
3759
3760 static int
3761 htb_parse_class_details__(struct netdev *netdev,
3762 const struct smap *details, struct htb_class *hc)
3763 {
3764 const struct htb *htb = htb_get__(netdev);
3765 int mtu, error;
3766 unsigned long long int max_rate_bit;
3767
3768 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3769 if (error) {
3770 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3771 netdev_get_name(netdev));
3772 return error;
3773 }
3774
3775 /* HTB requires at least an mtu sized min-rate to send any traffic even
3776 * on uncongested links. */
3777 hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
3778 hc->min_rate = MAX(hc->min_rate, mtu);
3779 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3780
3781 /* max-rate */
3782 max_rate_bit = smap_get_ullong(details, "max-rate", 0);
3783 hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
3784 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3785 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3786
3787 /* burst
3788 *
3789 * According to hints in the documentation that I've read, it is important
3790 * that 'burst' be at least as big as the largest frame that might be
3791 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3792 * but having it a bit too small is a problem. Since netdev_get_mtu()
3793 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3794 * the MTU. We actually add 64, instead of 14, as a guard against
3795 * additional headers get tacked on somewhere that we're not aware of. */
3796 hc->burst = smap_get_ullong(details, "burst", 0) / 8;
3797 hc->burst = MAX(hc->burst, mtu + 64);
3798
3799 /* priority */
3800 hc->priority = smap_get_ullong(details, "priority", 0);
3801
3802 return 0;
3803 }
3804
3805 static int
3806 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3807 unsigned int parent, struct htb_class *options,
3808 struct netdev_queue_stats *stats)
3809 {
3810 struct ofpbuf *reply;
3811 int error;
3812
3813 error = tc_query_class(netdev, handle, parent, &reply);
3814 if (!error) {
3815 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3816 ofpbuf_delete(reply);
3817 }
3818 return error;
3819 }
3820
3821 static int
3822 htb_tc_install(struct netdev *netdev, const struct smap *details)
3823 {
3824 int error;
3825
3826 error = htb_setup_qdisc__(netdev);
3827 if (!error) {
3828 struct htb_class hc;
3829
3830 htb_parse_qdisc_details__(netdev, details, &hc);
3831 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3832 tc_make_handle(1, 0), &hc);
3833 if (!error) {
3834 htb_install__(netdev, hc.max_rate);
3835 }
3836 }
3837 return error;
3838 }
3839
3840 static struct htb_class *
3841 htb_class_cast__(const struct tc_queue *queue)
3842 {
3843 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3844 }
3845
3846 static void
3847 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3848 const struct htb_class *hc)
3849 {
3850 struct htb *htb = htb_get__(netdev);
3851 size_t hash = hash_int(queue_id, 0);
3852 struct tc_queue *queue;
3853 struct htb_class *hcp;
3854
3855 queue = tc_find_queue__(netdev, queue_id, hash);
3856 if (queue) {
3857 hcp = htb_class_cast__(queue);
3858 } else {
3859 hcp = xmalloc(sizeof *hcp);
3860 queue = &hcp->tc_queue;
3861 queue->queue_id = queue_id;
3862 queue->created = time_msec();
3863 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3864 }
3865
3866 hcp->min_rate = hc->min_rate;
3867 hcp->max_rate = hc->max_rate;
3868 hcp->burst = hc->burst;
3869 hcp->priority = hc->priority;
3870 }
3871
/* Loads existing HTB state from the kernel: first the root class's rate,
 * then every per-queue class, into in-memory state on 'netdev'. */
static int
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct htb_class hc;

    /* Get qdisc options.  The query's return value is ignored: on failure
     * 'hc.max_rate' keeps its 0 preset, so the load degrades gracefully
     * rather than failing outright. */
    hc.max_rate = 0;
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    htb_install__(netdev, hc.max_rate);

    /* Get queues.  Classes that fail to parse are simply skipped. */
    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            htb_update_queue__(netdev, queue_id, &hc);
        }
    }
    finish_queue_dump(&state);

    return 0;
}
3899
3900 static void
3901 htb_tc_destroy(struct tc *tc)
3902 {
3903 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3904 struct htb_class *hc;
3905
3906 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
3907 free(hc);
3908 }
3909 tc_destroy(tc);
3910 free(htb);
3911 }
3912
3913 static int
3914 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3915 {
3916 const struct htb *htb = htb_get__(netdev);
3917 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
3918 return 0;
3919 }
3920
3921 static int
3922 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3923 {
3924 struct htb_class hc;
3925 int error;
3926
3927 htb_parse_qdisc_details__(netdev, details, &hc);
3928 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3929 tc_make_handle(1, 0), &hc);
3930 if (!error) {
3931 htb_get__(netdev)->max_rate = hc.max_rate;
3932 }
3933 return error;
3934 }
3935
3936 static int
3937 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3938 const struct tc_queue *queue, struct smap *details)
3939 {
3940 const struct htb_class *hc = htb_class_cast__(queue);
3941
3942 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3943 if (hc->min_rate != hc->max_rate) {
3944 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3945 }
3946 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3947 if (hc->priority) {
3948 smap_add_format(details, "priority", "%u", hc->priority);
3949 }
3950 return 0;
3951 }
3952
3953 static int
3954 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3955 const struct smap *details)
3956 {
3957 struct htb_class hc;
3958 int error;
3959
3960 error = htb_parse_class_details__(netdev, details, &hc);
3961 if (error) {
3962 return error;
3963 }
3964
3965 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3966 tc_make_handle(1, 0xfffe), &hc);
3967 if (error) {
3968 return error;
3969 }
3970
3971 htb_update_queue__(netdev, queue_id, &hc);
3972 return 0;
3973 }
3974
3975 static int
3976 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3977 {
3978 struct htb_class *hc = htb_class_cast__(queue);
3979 struct htb *htb = htb_get__(netdev);
3980 int error;
3981
3982 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3983 if (!error) {
3984 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3985 free(hc);
3986 }
3987 return error;
3988 }
3989
3990 static int
3991 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3992 struct netdev_queue_stats *stats)
3993 {
3994 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3995 tc_make_handle(1, 0xfffe), NULL, stats);
3996 }
3997
3998 static int
3999 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4000 const struct ofpbuf *nlmsg,
4001 netdev_dump_queue_stats_cb *cb, void *aux)
4002 {
4003 struct netdev_queue_stats stats;
4004 unsigned int handle, major, minor;
4005 int error;
4006
4007 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4008 if (error) {
4009 return error;
4010 }
4011
4012 major = tc_get_major(handle);
4013 minor = tc_get_minor(handle);
4014 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4015 (*cb)(minor - 1, &stats, aux);
4016 }
4017 return 0;
4018 }
4019
static const struct tc_ops tc_ops_htb = {
    "htb",                      /* linux_name */
    "linux-htb",                /* ovs_name */
    HTB_N_QUEUES,               /* n_queues */
    htb_tc_install,
    htb_tc_load,
    htb_tc_destroy,
    htb_qdisc_get,
    htb_qdisc_set,
    /* HTB is classful, so the per-class queue operations are provided. */
    htb_class_get,
    htb_class_set,
    htb_class_delete,
    htb_class_get_stats,
    htb_class_dump_stats
};
4035 \f
4036 /* "linux-hfsc" traffic control class. */
4037
4038 #define HFSC_N_QUEUES 0xf000
4039
/* Per-netdev HFSC qdisc state. */
struct hfsc {
    struct tc tc;
    uint32_t max_rate;          /* In bytes/s. */
};

/* One HFSC class, i.e. one OVS QoS queue. */
struct hfsc_class {
    struct tc_queue tc_queue;
    uint32_t min_rate;          /* In bytes/s. */
    uint32_t max_rate;          /* In bytes/s. */
};
4050
4051 static struct hfsc *
4052 hfsc_get__(const struct netdev *netdev_)
4053 {
4054 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4055 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
4056 }
4057
4058 static struct hfsc_class *
4059 hfsc_class_cast__(const struct tc_queue *queue)
4060 {
4061 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4062 }
4063
4064 static void
4065 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4066 {
4067 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4068 struct hfsc *hfsc;
4069
4070 hfsc = xmalloc(sizeof *hfsc);
4071 tc_init(&hfsc->tc, &tc_ops_hfsc);
4072 hfsc->max_rate = max_rate;
4073 netdev->tc = &hfsc->tc;
4074 }
4075
4076 static void
4077 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4078 const struct hfsc_class *hc)
4079 {
4080 size_t hash;
4081 struct hfsc *hfsc;
4082 struct hfsc_class *hcp;
4083 struct tc_queue *queue;
4084
4085 hfsc = hfsc_get__(netdev);
4086 hash = hash_int(queue_id, 0);
4087
4088 queue = tc_find_queue__(netdev, queue_id, hash);
4089 if (queue) {
4090 hcp = hfsc_class_cast__(queue);
4091 } else {
4092 hcp = xmalloc(sizeof *hcp);
4093 queue = &hcp->tc_queue;
4094 queue->queue_id = queue_id;
4095 queue->created = time_msec();
4096 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4097 }
4098
4099 hcp->min_rate = hc->min_rate;
4100 hcp->max_rate = hc->max_rate;
4101 }
4102
/* Extracts HFSC service curves from the nested options in 'nl_options' into
 * '*class'.  Only the linear curve configurations that this module itself
 * installs are accepted; anything else yields EPROTO. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    /* m1/d describe the initial segment of a two-piece curve; this module
     * only installs single-slope (linear) curves, where both are zero. */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    /* The real-time and link-share slopes must agree, as installed. */
    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
4161
4162 static int
4163 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4164 struct hfsc_class *options,
4165 struct netdev_queue_stats *stats)
4166 {
4167 int error;
4168 unsigned int handle;
4169 struct nlattr *nl_options;
4170
4171 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4172 if (error) {
4173 return error;
4174 }
4175
4176 if (queue_id) {
4177 unsigned int major, minor;
4178
4179 major = tc_get_major(handle);
4180 minor = tc_get_minor(handle);
4181 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4182 *queue_id = minor - 1;
4183 } else {
4184 return EPROTO;
4185 }
4186 }
4187
4188 if (options) {
4189 error = hfsc_parse_tca_options__(nl_options, options);
4190 }
4191
4192 return error;
4193 }
4194
4195 static int
4196 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4197 unsigned int parent, struct hfsc_class *options,
4198 struct netdev_queue_stats *stats)
4199 {
4200 int error;
4201 struct ofpbuf *reply;
4202
4203 error = tc_query_class(netdev, handle, parent, &reply);
4204 if (error) {
4205 return error;
4206 }
4207
4208 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4209 ofpbuf_delete(reply);
4210 return error;
4211 }
4212
/* Fills 'class' with the qdisc-level rate taken from 'details'.  If
 * "max-rate" is absent or zero, falls back to the link's feature-negotiated
 * speed, defaulting to 100 Mbps when the features cannot be read.  Rates in
 * 'details' are bits per second; 'class' stores bytes per second. */
static void
hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
                           struct hfsc_class *class)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
    if (!max_rate) {
        enum netdev_features current;

        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        /* 100 * 1000 * 1000 bps is the fallback when features are unknown. */
        max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }

    /* The root class pins both curves to the link rate. */
    class->min_rate = max_rate;
    class->max_rate = max_rate;
}
4231
4232 static int
4233 hfsc_parse_class_details__(struct netdev *netdev,
4234 const struct smap *details,
4235 struct hfsc_class * class)
4236 {
4237 const struct hfsc *hfsc;
4238 uint32_t min_rate, max_rate;
4239
4240 hfsc = hfsc_get__(netdev);
4241
4242 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4243 min_rate = MAX(min_rate, 1);
4244 min_rate = MIN(min_rate, hfsc->max_rate);
4245
4246 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
4247 max_rate = MAX(max_rate, min_rate);
4248 max_rate = MIN(max_rate, hfsc->max_rate);
4249
4250 class->min_rate = min_rate;
4251 class->max_rate = max_rate;
4252
4253 return 0;
4254 }
4255
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
static int
hfsc_setup_qdisc__(struct netdev * netdev)
{
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    /* Remove any existing root qdisc first; the result is deliberately
     * ignored since there may be nothing to delete. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    /* Root qdisc gets handle 1:0. */
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    opt.defcls = 1;  /* Unclassified traffic goes to class 1:1 ("default 1"). */

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    return tc_transact(&request, NULL);
}
4286
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>" */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    /* NLM_F_CREATE without NLM_F_EXCL, so an existing class with the same
     * handle is replaced rather than producing an error. */
    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Linear service curves: zero initial slope (m1) and delay (d), steady
     * slope m2 in bytes per second. */
    min.m1 = 0;
    min.d = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d = 0;
    max.m2 = class->max_rate;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    /* The min-rate curve is used for both the real-time (RSC) and
     * link-sharing (FSC) curves; the upper-limit (USC) curve carries the
     * max rate. */
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
4338
4339 static int
4340 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4341 {
4342 int error;
4343 struct hfsc_class class;
4344
4345 error = hfsc_setup_qdisc__(netdev);
4346
4347 if (error) {
4348 return error;
4349 }
4350
4351 hfsc_parse_qdisc_details__(netdev, details, &class);
4352 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4353 tc_make_handle(1, 0), &class);
4354
4355 if (error) {
4356 return error;
4357 }
4358
4359 hfsc_install__(netdev, class.max_rate);
4360 return 0;
4361 }
4362
/* tc_load callback for "linux-hfsc": reads the existing HFSC configuration
 * back from the kernel -- the root class's max-rate and then, via a class
 * dump, every per-queue class.  Returns 0 on success or ENODEV if the dump
 * cannot be started. */
static int
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct hfsc_class hc;

    /* If the root-class query fails, fall back to max_rate of 0. */
    hc.max_rate = 0;
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    hfsc_install__(netdev, hc.max_rate);

    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }

    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Messages that do not parse as one of our queue classes are
         * silently skipped. */
        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            hfsc_update_queue__(netdev, queue_id, &hc);
        }
    }

    finish_queue_dump(&state);
    return 0;
}
4389
/* tc_destroy callback for "linux-hfsc": frees all in-memory queue records
 * and the hfsc structure itself.  Does not touch kernel qdisc state. */
static void
hfsc_tc_destroy(struct tc *tc)
{
    struct hfsc *hfsc;
    struct hfsc_class *hc, *next;

    hfsc = CONTAINER_OF(tc, struct hfsc, tc);

    /* _SAFE variant because 'hc' is freed inside the loop body. */
    HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
        hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
        free(hc);
    }

    tc_destroy(tc);
    free(hfsc);
}
4406
4407 static int
4408 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4409 {
4410 const struct hfsc *hfsc;
4411 hfsc = hfsc_get__(netdev);
4412 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
4413 return 0;
4414 }
4415
4416 static int
4417 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4418 {
4419 int error;
4420 struct hfsc_class class;
4421
4422 hfsc_parse_qdisc_details__(netdev, details, &class);
4423 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4424 tc_make_handle(1, 0), &class);
4425
4426 if (!error) {
4427 hfsc_get__(netdev)->max_rate = class.max_rate;
4428 }
4429
4430 return error;
4431 }
4432
4433 static int
4434 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4435 const struct tc_queue *queue, struct smap *details)
4436 {
4437 const struct hfsc_class *hc;
4438
4439 hc = hfsc_class_cast__(queue);
4440 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4441 if (hc->min_rate != hc->max_rate) {
4442 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4443 }
4444 return 0;
4445 }
4446
4447 static int
4448 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4449 const struct smap *details)
4450 {
4451 int error;
4452 struct hfsc_class class;
4453
4454 error = hfsc_parse_class_details__(netdev, details, &class);
4455 if (error) {
4456 return error;
4457 }
4458
4459 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4460 tc_make_handle(1, 0xfffe), &class);
4461 if (error) {
4462 return error;
4463 }
4464
4465 hfsc_update_queue__(netdev, queue_id, &class);
4466 return 0;
4467 }
4468
4469 static int
4470 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4471 {
4472 int error;
4473 struct hfsc *hfsc;
4474 struct hfsc_class *hc;
4475
4476 hc = hfsc_class_cast__(queue);
4477 hfsc = hfsc_get__(netdev);
4478
4479 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4480 if (!error) {
4481 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4482 free(hc);
4483 }
4484 return error;
4485 }
4486
4487 static int
4488 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4489 struct netdev_queue_stats *stats)
4490 {
4491 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4492 tc_make_handle(1, 0xfffe), NULL, stats);
4493 }
4494
4495 static int
4496 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4497 const struct ofpbuf *nlmsg,
4498 netdev_dump_queue_stats_cb *cb, void *aux)
4499 {
4500 struct netdev_queue_stats stats;
4501 unsigned int handle, major, minor;
4502 int error;
4503
4504 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4505 if (error) {
4506 return error;
4507 }
4508
4509 major = tc_get_major(handle);
4510 minor = tc_get_minor(handle);
4511 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4512 (*cb)(minor - 1, &stats, aux);
4513 }
4514 return 0;
4515 }
4516
4517 static const struct tc_ops tc_ops_hfsc = {
4518 "hfsc", /* linux_name */
4519 "linux-hfsc", /* ovs_name */
4520 HFSC_N_QUEUES, /* n_queues */
4521 hfsc_tc_install, /* tc_install */
4522 hfsc_tc_load, /* tc_load */
4523 hfsc_tc_destroy, /* tc_destroy */
4524 hfsc_qdisc_get, /* qdisc_get */
4525 hfsc_qdisc_set, /* qdisc_set */
4526 hfsc_class_get, /* class_get */
4527 hfsc_class_set, /* class_set */
4528 hfsc_class_delete, /* class_delete */
4529 hfsc_class_get_stats, /* class_get_stats */
4530 hfsc_class_dump_stats /* class_dump_stats */
4531 };
4532 \f
4533 /* "linux-noop" traffic control class. */
4534
4535 static void
4536 noop_install__(struct netdev *netdev_)
4537 {
4538 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4539 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4540
4541 netdev->tc = CONST_CAST(struct tc *, &tc);
4542 }
4543
4544 static int
4545 noop_tc_install(struct netdev *netdev,
4546 const struct smap *details OVS_UNUSED)
4547 {
4548 noop_install__(netdev);
4549 return 0;
4550 }
4551
4552 static int
4553 noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4554 {
4555 noop_install__(netdev);
4556 return 0;
4557 }
4558
4559 static const struct tc_ops tc_ops_noop = {
4560 NULL, /* linux_name */
4561 "linux-noop", /* ovs_name */
4562 0, /* n_queues */
4563 noop_tc_install,
4564 noop_tc_load,
4565 NULL, /* tc_destroy */
4566 NULL, /* qdisc_get */
4567 NULL, /* qdisc_set */
4568 NULL, /* class_get */
4569 NULL, /* class_set */
4570 NULL, /* class_delete */
4571 NULL, /* class_get_stats */
4572 NULL /* class_dump_stats */
4573 };
4574 \f
4575 /* "linux-default" traffic control class.
4576 *
4577 * This class represents the default, unnamed Linux qdisc. It corresponds to
4578 * the "" (empty string) QoS type in the OVS database. */
4579
4580 static void
4581 default_install__(struct netdev *netdev_)
4582 {
4583 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4584 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4585
4586 /* Nothing but a tc class implementation is allowed to write to a tc. This
4587 * class never does that, so we can legitimately use a const tc object. */
4588 netdev->tc = CONST_CAST(struct tc *, &tc);
4589 }
4590
4591 static int
4592 default_tc_install(struct netdev *netdev,
4593 const struct smap *details OVS_UNUSED)
4594 {
4595 default_install__(netdev);
4596 return 0;
4597 }
4598
4599 static int
4600 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4601 {
4602 default_install__(netdev);
4603 return 0;
4604 }
4605
4606 static const struct tc_ops tc_ops_default = {
4607 NULL, /* linux_name */
4608 "", /* ovs_name */
4609 0, /* n_queues */
4610 default_tc_install,
4611 default_tc_load,
4612 NULL, /* tc_destroy */
4613 NULL, /* qdisc_get */
4614 NULL, /* qdisc_set */
4615 NULL, /* class_get */
4616 NULL, /* class_set */
4617 NULL, /* class_delete */
4618 NULL, /* class_get_stats */
4619 NULL /* class_dump_stats */
4620 };
4621 \f
/* "linux-other" traffic control class.
 *
 * This type represents a qdisc that OVS does not recognize or manage.  It can
 * only be loaded from the kernel (as the fallback in tc_query_qdisc()); it
 * has no install or configuration callbacks. */
4625
4626 static int
4627 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4628 {
4629 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4630 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4631
4632 /* Nothing but a tc class implementation is allowed to write to a tc. This
4633 * class never does that, so we can legitimately use a const tc object. */
4634 netdev->tc = CONST_CAST(struct tc *, &tc);
4635 return 0;
4636 }
4637
4638 static const struct tc_ops tc_ops_other = {
4639 NULL, /* linux_name */
4640 "linux-other", /* ovs_name */
4641 0, /* n_queues */
4642 NULL, /* tc_install */
4643 other_tc_load,
4644 NULL, /* tc_destroy */
4645 NULL, /* qdisc_get */
4646 NULL, /* qdisc_set */
4647 NULL, /* class_get */
4648 NULL, /* class_set */
4649 NULL, /* class_delete */
4650 NULL, /* class_get_stats */
4651 NULL /* class_dump_stats */
4652 };
4653 \f
4654 /* Traffic control. */
4655
4656 /* Number of kernel "tc" ticks per second. */
4657 static double ticks_per_s;
4658
4659 /* Number of kernel "jiffies" per second. This is used for the purpose of
4660 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4661 * one jiffy's worth of data.
4662 *
4663 * There are two possibilities here:
4664 *
4665 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4666 * approximate range of 100 to 1024. That means that we really need to
4667 * make sure that the qdisc can buffer that much data.
4668 *
4669 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4670 * has finely granular timers and there's no need to fudge additional room
4671 * for buffers. (There's no extra effort needed to implement that: the
4672 * large 'buffer_hz' is used as a divisor, so practically any number will
4673 * come out as 0 in the division. Small integer results in the case of
4674 * really high dividends won't have any real effect anyhow.)
4675 */
4676 static unsigned int buffer_hz;
4677
4678 static struct tcmsg *
4679 netdev_linux_tc_make_request(const struct netdev *netdev, int type,
4680 unsigned int flags, struct ofpbuf *request)
4681 {
4682 int ifindex;
4683 int error;
4684
4685 error = get_ifindex(netdev, &ifindex);
4686 if (error) {
4687 return NULL;
4688 }
4689
4690 return tc_make_request(ifindex, type, flags, request);
4691 }
4692
4693 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4694 * of 'kbits_burst'.
4695 *
4696 * This function is equivalent to running:
4697 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4698 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4699 * mtu 65535 drop
4700 *
4701 * The configuration and stats may be seen with the following command:
4702 * /sbin/tc -s filter show dev <devname> parent ffff:
4703 *
4704 * Returns 0 if successful, otherwise a positive errno value.
4705 */
static int
tc_add_policer(struct netdev *netdev,
               uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct tc_police tc_police;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    size_t basic_offset;
    size_t police_offset;
    int error;
    int mtu = 65535;

    memset(&tc_police, 0, sizeof tc_police);
    tc_police.action = TC_POLICE_SHOT;  /* Drop packets exceeding the rate. */
    tc_police.mtu = mtu;
    /* 'kbits_rate' is kilobits per second; the kernel wants bytes/s. */
    tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);

    /* The following appears wrong in one way: In networking a kilobit is
     * usually 1000 bits but this uses 1024 bits.
     *
     * However if you "fix" those problems then "tc filter show ..." shows
     * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
     * 1,000,000 bits, whereas this actually ends up doing the right thing from
     * tc's point of view.  Whatever. */
    /* MIN() clamps the burst so the "* 1024" below cannot overflow. */
    tc_police.burst = tc_bytes_to_ticks(
        tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* Attach under parent ffff: at priority 49, matching all protocols
     * (see the command-line equivalent in the comment above). */
    tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
    tcmsg->tcm_info = tc_make_handle(49,
                                     (OVS_FORCE uint16_t) htons(ETH_P_ALL));

    /* Nested attributes: TCA_OPTIONS > TCA_BASIC_POLICE > policer + rtab. */
    nl_msg_put_string(&request, TCA_KIND, "basic");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
    nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
    tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
    nl_msg_end_nested(&request, police_offset);
    nl_msg_end_nested(&request, basic_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        return error;
    }

    return 0;
}
4757
/* Reads /proc/net/psched (at most once; thread-safe via ovsthread_once) and
 * initializes the globals 'ticks_per_s' and 'buffer_hz' used by the tc_*
 * conversion helpers.  On any failure the conservative defaults set below
 * (ticks_per_s = 1.0, buffer_hz = 100) remain in effect. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *     (Before that, there are hints that it was 1000000000.)
     *
     *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *     above.
     *
     *                         /proc/net/psched
     *     -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d ticks_per_s     buffer_hz
     *     ------- --------- ---------- ------------- ----------- -------------
     * [1] 819,200 1,000,000  1,000,000           100     819,200           100
     * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [4]   1,000     1,024  1,000,000           100     976,562           100
     * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
     * [6]   1,000        64  1,000,000           249  15,625,000           249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    if (!ovsthread_once_start(&once)) {
        return;
    }

    /* Defaults used when the file is missing or malformed. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    if (!a || !c) {
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
4840
/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
 * rate of 'rate' bytes per second. */
static unsigned int
tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
{
    read_psched();              /* Ensure ticks_per_s is initialized. */
    /* NOTE(review): 'rate * ticks' is 32-bit arithmetic and can wrap for
     * large products -- confirm callers keep rate * ticks below UINT_MAX. */
    return (rate * ticks) / ticks_per_s;
}
4849
/* Returns the number of ticks that it would take to transmit 'size' bytes at a
 * rate of 'rate' bytes per second.  Returns 0 for a zero 'rate' to avoid
 * dividing by zero. */
static unsigned int
tc_bytes_to_ticks(unsigned int rate, unsigned int size)
{
    read_psched();              /* Ensure ticks_per_s is initialized. */
    return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
}
4858
/* Returns the number of bytes that need to be reserved for qdisc buffering at
 * a transmission rate of 'rate' bytes per second.  (See the comment on
 * 'buffer_hz' above for why one jiffy's worth of data suffices.) */
static unsigned int
tc_buffer_per_jiffy(unsigned int rate)
{
    read_psched();              /* Ensure buffer_hz is initialized. */
    return rate / buffer_hz;
}
4867
/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
 * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
 * stores NULL into it if it is absent.
 *
 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
 * Returns 0 if successful, otherwise a positive errno value (in which case
 * '*kind' and '*options' are both set to NULL). */
static int
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
               struct nlattr **options)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");
        goto error;
    }

    if (kind) {
        *kind = nl_attr_get_string(ta[TCA_KIND]);
    }

    if (options) {
        /* TCA_OPTIONS is optional in the policy, so this may be NULL. */
        *options = ta[TCA_OPTIONS];
    }

    return 0;

error:
    /* Null the outputs so callers need not check the return value first. */
    if (kind) {
        *kind = NULL;
    }
    if (options) {
        *options = NULL;
    }
    return EPROTO;
}
4912
/* Given Netlink 'msg' that describes a class, extracts the class's full
 * handle (major and minor numbers together, suitable for tc_get_major() and
 * tc_get_minor()) into '*handlep', its TCA_OPTIONS attribute into '*options',
 * and its queue statistics into '*stats'.  Any of the output arguments may be
 * null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        /* The tcmsg header immediately follows the Netlink header. */
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        /* Report the queue's drop count as tx_errors. */
        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    /* Null/zero the outputs so callers need not check the return value
     * before looking at them. */
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
4987
/* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev'.  On success stores the kernel's reply in '*replyp'; the
 * caller owns it and must eventually free it (with ofpbuf_delete()).
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_query_class(const struct netdev *netdev,
               unsigned int handle, unsigned int parent,
               struct ofpbuf **replyp)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    /* NLM_F_ECHO asks the kernel to echo the matched class back to us. */
    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    error = tc_transact(&request, replyp);
    if (error) {
        VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     ovs_strerror(error));
    }
    return error;
}
5017
/* Equivalent to "tc class del dev <name> handle <handle>".
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_delete_class(const struct netdev *netdev, unsigned int handle)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = 0;      /* The class is identified by 'handle'. */

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     ovs_strerror(error));
    }
    return error;
}
5042
/* Equivalent to "tc qdisc del dev <name> root".
 *
 * On success also destroys 'netdev''s cached tc state so the next
 * tc_query_qdisc() re-reads the kernel. */
static int
tc_del_qdisc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* Handle 1:0 at the root: the handle OVS gives qdiscs it creates. */
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    error = tc_transact(&request, NULL);
    if (error == EINVAL) {
        /* EINVAL probably means that the default qdisc was in use, in which
         * case we've accomplished our purpose. */
        error = 0;
    }
    if (!error && netdev->tc) {
        /* Drop the cached tc state; it no longer reflects the kernel. */
        if (netdev->tc->ops->tc_destroy) {
            netdev->tc->ops->tc_destroy(netdev->tc);
        }
        netdev->tc = NULL;
    }
    return error;
}
5073
5074 static bool
5075 getqdisc_is_safe(void)
5076 {
5077 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5078 static bool safe = false;
5079
5080 if (ovsthread_once_start(&once)) {
5081 struct utsname utsname;
5082 int major, minor;
5083
5084 if (uname(&utsname) == -1) {
5085 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5086 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5087 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5088 } else if (major < 2 || (major == 2 && minor < 35)) {
5089 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5090 utsname.release);
5091 } else {
5092 safe = true;
5093 }
5094 ovsthread_once_done(&once);
5095 }
5096 return safe;
5097 }
5098
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value.  On success 'netdev->tc' is populated with the
 * matching tc implementation's state. */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    /* Already cached from a previous query. */
    if (netdev->tc) {
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it. */
    /* NOTE(review): assumes tc_transact leaves 'qdisc' NULL on failure so
     * that ofpbuf_delete() below is a safe no-op -- confirm. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
5178
5179 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5180 approximate the time to transmit packets of various lengths. For an MTU of
5181 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5182 represents two possible packet lengths; for a MTU of 513 through 1024, four
5183 possible lengths; and so on.
5184
5185 Returns, for the specified 'mtu', the number of bits that packet lengths
5186 need to be shifted right to fit within such a 256-entry table. */
5187 static int
5188 tc_calc_cell_log(unsigned int mtu)
5189 {
5190 int cell_log;
5191
5192 if (!mtu) {
5193 mtu = ETH_PAYLOAD_MAX;
5194 }
5195 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5196
5197 for (cell_log = 0; mtu >= 256; cell_log++) {
5198 mtu >>= 1;
5199 }
5200
5201 return cell_log;
5202 }
5203
5204 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5205 * of 'mtu'. */
5206 static void
5207 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5208 {
5209 memset(rate, 0, sizeof *rate);
5210 rate->cell_log = tc_calc_cell_log(mtu);
5211 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5212 /* rate->cell_align = 0; */ /* distro headers. */
5213 rate->mpu = ETH_TOTAL_MIN;
5214 rate->rate = Bps;
5215 }
5216
5217 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5218 * attribute of the specified "type".
5219 *
5220 * See tc_calc_cell_log() above for a description of "rtab"s. */
5221 static void
5222 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5223 {
5224 uint32_t *rtab;
5225 unsigned int i;
5226
5227 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5228 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5229 unsigned packet_size = (i + 1) << rate->cell_log;
5230 if (packet_size < rate->mpu) {
5231 packet_size = rate->mpu;
5232 }
5233 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5234 }
5235 }
5236
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The returned value is in ticks.  The burst is never smaller than one
 * jiffy's worth of traffic plus one MTU. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
5247 \f
5248 /* Linux-only functions declared in netdev-linux.h */
5249
5250 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5251 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5252 int
5253 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5254 const char *flag_name, bool enable)
5255 {
5256 const char *netdev_name = netdev_get_name(netdev);
5257 struct ethtool_value evalue;
5258 uint32_t new_flags;
5259 int error;
5260
5261 COVERAGE_INC(netdev_get_ethtool);
5262 memset(&evalue, 0, sizeof evalue);
5263 error = netdev_linux_do_ethtool(netdev_name,
5264 (struct ethtool_cmd *)&evalue,
5265 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5266 if (error) {
5267 return error;
5268 }
5269
5270 COVERAGE_INC(netdev_set_ethtool);
5271 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5272 if (new_flags == evalue.data) {
5273 return 0;
5274 }
5275 evalue.data = new_flags;
5276 error = netdev_linux_do_ethtool(netdev_name,
5277 (struct ethtool_cmd *)&evalue,
5278 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5279 if (error) {
5280 return error;
5281 }
5282
5283 COVERAGE_INC(netdev_get_ethtool);
5284 memset(&evalue, 0, sizeof evalue);
5285 error = netdev_linux_do_ethtool(netdev_name,
5286 (struct ethtool_cmd *)&evalue,
5287 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5288 if (error) {
5289 return error;
5290 }
5291
5292 if (new_flags != evalue.data) {
5293 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5294 "device %s failed", enable ? "enable" : "disable",
5295 flag_name, netdev_name);
5296 return EOPNOTSUPP;
5297 }
5298
5299 return 0;
5300 }
5301 \f
5302 /* Utility functions. */
5303
5304 /* Copies 'src' into 'dst', performing format conversion in the process. */
5305 static void
5306 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5307 const struct rtnl_link_stats *src)
5308 {
5309 dst->rx_packets = src->rx_packets;
5310 dst->tx_packets = src->tx_packets;
5311 dst->rx_bytes = src->rx_bytes;
5312 dst->tx_bytes = src->tx_bytes;
5313 dst->rx_errors = src->rx_errors;
5314 dst->tx_errors = src->tx_errors;
5315 dst->rx_dropped = src->rx_dropped;
5316 dst->tx_dropped = src->tx_dropped;
5317 dst->multicast = src->multicast;
5318 dst->collisions = src->collisions;
5319 dst->rx_length_errors = src->rx_length_errors;
5320 dst->rx_over_errors = src->rx_over_errors;
5321 dst->rx_crc_errors = src->rx_crc_errors;
5322 dst->rx_frame_errors = src->rx_frame_errors;
5323 dst->rx_fifo_errors = src->rx_fifo_errors;
5324 dst->rx_missed_errors = src->rx_missed_errors;
5325 dst->tx_aborted_errors = src->tx_aborted_errors;
5326 dst->tx_carrier_errors = src->tx_carrier_errors;
5327 dst->tx_fifo_errors = src->tx_fifo_errors;
5328 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5329 dst->tx_window_errors = src->tx_window_errors;
5330 }
5331
5332 /* Copies 'src' into 'dst', performing format conversion in the process. */
5333 static void
5334 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5335 const struct rtnl_link_stats64 *src)
5336 {
5337 dst->rx_packets = src->rx_packets;
5338 dst->tx_packets = src->tx_packets;
5339 dst->rx_bytes = src->rx_bytes;
5340 dst->tx_bytes = src->tx_bytes;
5341 dst->rx_errors = src->rx_errors;
5342 dst->tx_errors = src->tx_errors;
5343 dst->rx_dropped = src->rx_dropped;
5344 dst->tx_dropped = src->tx_dropped;
5345 dst->multicast = src->multicast;
5346 dst->collisions = src->collisions;
5347 dst->rx_length_errors = src->rx_length_errors;
5348 dst->rx_over_errors = src->rx_over_errors;
5349 dst->rx_crc_errors = src->rx_crc_errors;
5350 dst->rx_frame_errors = src->rx_frame_errors;
5351 dst->rx_fifo_errors = src->rx_fifo_errors;
5352 dst->rx_missed_errors = src->rx_missed_errors;
5353 dst->tx_aborted_errors = src->tx_aborted_errors;
5354 dst->tx_carrier_errors = src->tx_carrier_errors;
5355 dst->tx_fifo_errors = src->tx_fifo_errors;
5356 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5357 dst->tx_window_errors = src->tx_window_errors;
5358 }
5359
5360 static int
5361 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5362 {
5363 struct ofpbuf request;
5364 struct ofpbuf *reply;
5365 int error;
5366
5367 /* Filtering all counters by default */
5368 memset(stats, 0xFF, sizeof(struct netdev_stats));
5369
5370 ofpbuf_init(&request, 0);
5371 nl_msg_put_nlmsghdr(&request,
5372 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5373 RTM_GETLINK, NLM_F_REQUEST);
5374 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5375 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5376 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5377 ofpbuf_uninit(&request);
5378 if (error) {
5379 return error;
5380 }
5381
5382 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5383 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5384 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5385 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5386 error = 0;
5387 } else {
5388 a = nl_attr_find(reply, 0, IFLA_STATS);
5389 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5390 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5391 error = 0;
5392 } else {
5393 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5394 error = EPROTO;
5395 }
5396 }
5397 } else {
5398 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5399 error = EPROTO;
5400 }
5401
5402
5403 ofpbuf_delete(reply);
5404 return error;
5405 }
5406
5407 static int
5408 get_flags(const struct netdev *dev, unsigned int *flags)
5409 {
5410 struct ifreq ifr;
5411 int error;
5412
5413 *flags = 0;
5414 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5415 if (!error) {
5416 *flags = ifr.ifr_flags;
5417 }
5418 return error;
5419 }
5420
5421 static int
5422 set_flags(const char *name, unsigned int flags)
5423 {
5424 struct ifreq ifr;
5425
5426 ifr.ifr_flags = flags;
5427 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5428 }
5429
5430 int
5431 linux_get_ifindex(const char *netdev_name)
5432 {
5433 struct ifreq ifr;
5434 int error;
5435
5436 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5437 COVERAGE_INC(netdev_get_ifindex);
5438
5439 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5440 if (error) {
5441 /* ENODEV probably means that a vif disappeared asynchronously and
5442 * hasn't been removed from the database yet, so reduce the log level
5443 * to INFO for that case. */
5444 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
5445 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5446 netdev_name, ovs_strerror(error));
5447 return -error;
5448 }
5449 return ifr.ifr_ifindex;
5450 }
5451
5452 static int
5453 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5454 {
5455 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5456
5457 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5458 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
5459
5460 if (ifindex < 0) {
5461 netdev->get_ifindex_error = -ifindex;
5462 netdev->ifindex = 0;
5463 } else {
5464 netdev->get_ifindex_error = 0;
5465 netdev->ifindex = ifindex;
5466 }
5467 netdev->cache_valid |= VALID_IFINDEX;
5468 }
5469
5470 *ifindexp = netdev->ifindex;
5471 return netdev->get_ifindex_error;
5472 }
5473
5474 static int
5475 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5476 {
5477 struct ifreq ifr;
5478 int hwaddr_family;
5479 int error;
5480
5481 memset(&ifr, 0, sizeof ifr);
5482 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5483 COVERAGE_INC(netdev_get_hwaddr);
5484 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5485 if (error) {
5486 /* ENODEV probably means that a vif disappeared asynchronously and
5487 * hasn't been removed from the database yet, so reduce the log level
5488 * to INFO for that case. */
5489 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5490 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5491 netdev_name, ovs_strerror(error));
5492 return error;
5493 }
5494 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5495 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
5496 hwaddr_family != ARPHRD_NONE) {
5497 VLOG_INFO("%s device has unknown hardware address family %d",
5498 netdev_name, hwaddr_family);
5499 return EINVAL;
5500 }
5501 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5502 return 0;
5503 }
5504
5505 static int
5506 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5507 {
5508 struct ifreq ifr;
5509 int error;
5510
5511 memset(&ifr, 0, sizeof ifr);
5512 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5513 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5514 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5515 COVERAGE_INC(netdev_set_hwaddr);
5516 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5517 if (error) {
5518 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5519 netdev_name, ovs_strerror(error));
5520 }
5521 return error;
5522 }
5523
5524 static int
5525 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5526 int cmd, const char *cmd_name)
5527 {
5528 struct ifreq ifr;
5529 int error;
5530
5531 memset(&ifr, 0, sizeof ifr);
5532 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5533 ifr.ifr_data = (caddr_t) ecmd;
5534
5535 ecmd->cmd = cmd;
5536 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5537 if (error) {
5538 if (error != EOPNOTSUPP) {
5539 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5540 "failed: %s", cmd_name, name, ovs_strerror(error));
5541 } else {
5542 /* The device doesn't support this operation. That's pretty
5543 * common, so there's no point in logging anything. */
5544 }
5545 }
5546 return error;
5547 }
5548
5549 /* Returns an AF_PACKET raw socket or a negative errno value. */
5550 static int
5551 af_packet_sock(void)
5552 {
5553 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5554 static int sock;
5555
5556 if (ovsthread_once_start(&once)) {
5557 sock = socket(AF_PACKET, SOCK_RAW, 0);
5558 if (sock >= 0) {
5559 int error = set_nonblocking(sock);
5560 if (error) {
5561 close(sock);
5562 sock = -error;
5563 }
5564 } else {
5565 sock = -errno;
5566 VLOG_ERR("failed to create packet socket: %s",
5567 ovs_strerror(errno));
5568 }
5569 ovsthread_once_done(&once);
5570 }
5571
5572 return sock;
5573 }