/* Scraped from git.proxmox.com (mirror_ovs.git), lib/netdev-linux.c,
 * at commit "netlink linux: fix to append the netnsid netlink attr." */
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netdev-linux.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <sys/types.h>
24 #include <netinet/in.h>
25 #include <arpa/inet.h>
26 #include <inttypes.h>
27 #include <math.h>
28 #include <linux/filter.h>
29 #include <linux/gen_stats.h>
30 #include <linux/if_ether.h>
31 #include <linux/if_tun.h>
32 #include <linux/types.h>
33 #include <linux/ethtool.h>
34 #include <linux/mii.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/sockios.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
41 #include <net/if.h>
42 #include <net/if_arp.h>
43 #include <net/route.h>
44 #include <poll.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <unistd.h>
48
49 #include "coverage.h"
50 #include "dp-packet.h"
51 #include "dpif-netlink.h"
52 #include "dpif-netdev.h"
53 #include "openvswitch/dynamic-string.h"
54 #include "fatal-signal.h"
55 #include "hash.h"
56 #include "openvswitch/hmap.h"
57 #include "netdev-provider.h"
58 #include "netdev-tc-offloads.h"
59 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
62 #include "netlink.h"
63 #include "netnsid.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
67 #include "packets.h"
68 #include "openvswitch/poll-loop.h"
69 #include "rtnetlink.h"
70 #include "openvswitch/shash.h"
71 #include "socket-util.h"
72 #include "sset.h"
73 #include "tc.h"
74 #include "timer.h"
75 #include "unaligned.h"
76 #include "openvswitch/vlog.h"
77 #include "util.h"
78
79 VLOG_DEFINE_THIS_MODULE(netdev_linux);
80
81 COVERAGE_DEFINE(netdev_set_policing);
82 COVERAGE_DEFINE(netdev_arp_lookup);
83 COVERAGE_DEFINE(netdev_get_ifindex);
84 COVERAGE_DEFINE(netdev_get_hwaddr);
85 COVERAGE_DEFINE(netdev_set_hwaddr);
86 COVERAGE_DEFINE(netdev_get_ethtool);
87 COVERAGE_DEFINE(netdev_set_ethtool);
88
89 \f
90 #ifndef IFLA_IF_NETNSID
91 #define IFLA_IF_NETNSID 0x45
92 #endif
93 /* These were introduced in Linux 2.6.14, so they might be missing if we have
94 * old headers. */
95 #ifndef ADVERTISED_Pause
96 #define ADVERTISED_Pause (1 << 13)
97 #endif
98 #ifndef ADVERTISED_Asym_Pause
99 #define ADVERTISED_Asym_Pause (1 << 14)
100 #endif
101
102 /* These were introduced in Linux 2.6.24, so they might be missing if we
103 * have old headers. */
104 #ifndef ETHTOOL_GFLAGS
105 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
106 #endif
107 #ifndef ETHTOOL_SFLAGS
108 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
109 #endif
110
111 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
112 * headers. */
113 #ifndef TC_RTAB_SIZE
114 #define TC_RTAB_SIZE 1024
115 #endif
116
117 #ifndef TCM_IFINDEX_MAGIC_BLOCK
118 #define TCM_IFINDEX_MAGIC_BLOCK (0xFFFFFFFFU)
119 #endif
120
121 /* Linux 2.6.21 introduced struct tpacket_auxdata.
122 * Linux 2.6.27 added the tp_vlan_tci member.
123 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
124 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
125 * TP_STATUS_VLAN_TPID_VALID.
126 *
127 * With all this churn it's easiest to unconditionally define a replacement
128 * structure that has everything we want.
129 */
130 #ifndef PACKET_AUXDATA
131 #define PACKET_AUXDATA 8
132 #endif
133 #ifndef TP_STATUS_VLAN_VALID
134 #define TP_STATUS_VLAN_VALID (1 << 4)
135 #endif
136 #ifndef TP_STATUS_VLAN_TPID_VALID
137 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
138 #endif
139 #undef tpacket_auxdata
140 #define tpacket_auxdata rpl_tpacket_auxdata
/* Replacement for the kernel's struct tpacket_auxdata (the PACKET_AUXDATA
 * control-message payload), defined unconditionally so the VLAN members are
 * available regardless of kernel header version (see comment above). */
struct tpacket_auxdata {
    uint32_t tp_status;                 /* TP_STATUS_* flags. */
    uint32_t tp_len;                    /* Total packet length. */
    uint32_t tp_snaplen;                /* Captured length. */
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;               /* Valid iff TP_STATUS_VLAN_VALID. */
    uint16_t tp_vlan_tpid;              /* Valid iff TP_STATUS_VLAN_TPID_VALID. */
};
150
151 /* Linux 2.6.27 introduced ethtool_cmd_speed
152 *
153 * To avoid revisiting problems reported with using configure to detect
154 * compatibility (see report at
155 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
156 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Returns the link speed encoded in 'ep', reassembling the 32-bit value from
 * the 16-bit 'speed' (low half) and 'speed_hi' (high half) fields. */
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    uint32_t lo = ep->speed;
    uint32_t hi = ep->speed_hi;

    return (hi << 16) | lo;
}
162
163 /* Linux 2.6.30 introduced supported and advertised flags for
164 * 1G base KX, and 10G base KX4, KR and R. */
165 #ifndef SUPPORTED_1000baseKX_Full
166 #define SUPPORTED_1000baseKX_Full (1 << 17)
167 #define SUPPORTED_10000baseKX4_Full (1 << 18)
168 #define SUPPORTED_10000baseKR_Full (1 << 19)
169 #define SUPPORTED_10000baseR_FEC (1 << 20)
170 #define ADVERTISED_1000baseKX_Full (1 << 17)
171 #define ADVERTISED_10000baseKX4_Full (1 << 18)
172 #define ADVERTISED_10000baseKR_Full (1 << 19)
173 #define ADVERTISED_10000baseR_FEC (1 << 20)
174 #endif
175
176 /* Linux 3.5 introduced supported and advertised flags for
177 * 40G base KR4, CR4, SR4 and LR4. */
178 #ifndef SUPPORTED_40000baseKR4_Full
179 #define SUPPORTED_40000baseKR4_Full (1 << 23)
180 #define SUPPORTED_40000baseCR4_Full (1 << 24)
181 #define SUPPORTED_40000baseSR4_Full (1 << 25)
182 #define SUPPORTED_40000baseLR4_Full (1 << 26)
183 #define ADVERTISED_40000baseKR4_Full (1 << 23)
184 #define ADVERTISED_40000baseCR4_Full (1 << 24)
185 #define ADVERTISED_40000baseSR4_Full (1 << 25)
186 #define ADVERTISED_40000baseLR4_Full (1 << 26)
187 #endif
188
189 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
190 *
191 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
192 * 2.6.32-431.29.2.el6.x86_64 (see report at
193 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
194 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
195 * unconditionally define a replacement. */
196 #ifndef IFLA_STATS64
197 #define IFLA_STATS64 23
198 #endif
199 #define rtnl_link_stats64 rpl_rtnl_link_stats64
/* Replacement for the kernel's struct rtnl_link_stats64, the payload of the
 * IFLA_STATS64 attribute: 64-bit interface counters.  Defined
 * unconditionally; see the comment above for why. */
struct rtnl_link_stats64 {
    /* Basic packet/byte counters. */
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Detailed rx_errors breakdown. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Detailed tx_errors breakdown. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    /* For CSLIP-style compressed links. */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
228
/* Bits for netdev_linux's 'cache_valid' mask.  Each flag means the
 * corresponding cached field(s) and error code are up to date. */
enum {
    VALID_IFINDEX           = 1 << 0,   /* 'ifindex'. */
    VALID_ETHERADDR         = 1 << 1,   /* 'etheraddr'. */
    VALID_IN                = 1 << 2,   /* Cached IP addresses. */
    VALID_MTU               = 1 << 3,   /* 'mtu'. */
    VALID_POLICING          = 1 << 4,   /* 'kbits_rate', 'kbits_burst'. */
    VALID_VPORT_STAT_ERROR  = 1 << 5,   /* 'vport_stats_error'. */
    VALID_DRVINFO           = 1 << 6,   /* 'drvinfo'. */
    VALID_FEATURES          = 1 << 7,   /* current/advertised/supported. */
};
239 \f
/* A kernel device enslaved to a LAG (bond/team) master that is an OvS
 * netdev.  Instances live in 'lag_shash', keyed by the slave's ifname. */
struct linux_lag_slave {
    uint32_t block_id;          /* TC ingress block shared with the master. */
    struct shash_node *node;    /* This slave's node in 'lag_shash'. */
};
244
245 /* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
246 static struct ovs_mutex lag_mutex = OVS_MUTEX_INITIALIZER;
247
248 /* All slaves whose LAG masters are network devices in OvS. */
249 static struct shash lag_shash OVS_GUARDED_BY(lag_mutex)
250 = SHASH_INITIALIZER(&lag_shash);
251
252 /* Traffic control. */
253
254 /* An instance of a traffic control class. Always associated with a particular
255 * network device.
256 *
257 * Each TC implementation subclasses this with whatever additional data it
258 * needs. */
struct tc {
    const struct tc_ops *ops;   /* Operations for this qdisc type. */
    struct hmap queues;         /* Contains "struct tc_queue"s.
                                 * Read by generic TC layer.
                                 * Written only by TC implementation. */
};
265
266 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
267
/* One traffic control queue, always owned by a struct tc's 'queues' map.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc_queue {
    struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
    unsigned int queue_id;      /* OpenFlow queue ID. */
    long long int created;      /* Time queue was created, in msecs. */
};
277
278 /* A particular kind of traffic control. Each implementation generally maps to
279 * one particular Linux qdisc class.
280 *
281 * The functions below return 0 if successful or a positive errno value on
282 * failure, except where otherwise noted. All of them must be provided, except
283 * where otherwise noted. */
struct tc_ops {
    /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
     * This is null for tc_ops_default and tc_ops_other, for which there are no
     * appropriate values. */
    const char *linux_name;

    /* Name used in OVS database, e.g. "linux-htb".  Must be nonnull. */
    const char *ovs_name;

    /* Number of supported OpenFlow queues, 0 for qdiscs that have no
     * queues.  The queues are numbered 0 through n_queues - 1. */
    unsigned int n_queues;

    /* Called to install this TC class on 'netdev'.  The implementation should
     * make the Netlink calls required to set up 'netdev' with the right qdisc
     * and configure it according to 'details'.  The implementation may assume
     * that the current qdisc is the default; that is, there is no need for it
     * to delete the current qdisc before installing itself.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'.
     *
     * (This function is null for tc_ops_other, which cannot be installed.  For
     * other TC classes it should always be nonnull.) */
    int (*tc_install)(struct netdev *netdev, const struct smap *details);

    /* Called when the netdev code determines (through a Netlink query) that
     * this TC class's qdisc is installed on 'netdev', but we didn't install
     * it ourselves and so don't know any of the details.
     *
     * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
     * 'netdev'.  The TCA_KIND attribute of 'nlmsg' is 'linux_name'.  The
     * implementation should parse the other attributes of 'nlmsg' as
     * necessary to determine its configuration.  If necessary it should also
     * use Netlink queries to determine the configuration of queues on
     * 'netdev'.
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'. */
    int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);

    /* Destroys the data structures allocated by the implementation as part of
     * 'tc'.  (This includes destroying 'tc->queues' by calling
     * tc_destroy(tc).)
     *
     * The implementation should not need to perform any Netlink calls.  If
     * desirable, the caller is responsible for deconfiguring the kernel qdisc.
     * (But it may not be desirable.)
     *
     * This function may be null if 'tc' is trivial. */
    void (*tc_destroy)(struct tc *tc);

    /* Retrieves details of 'netdev->tc' configuration into 'details'.
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the configuration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_get)(const struct netdev *netdev, struct smap *details);

    /* Reconfigures 'netdev->tc' according to 'details', performing any
     * required Netlink calls to complete the reconfiguration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_set)(struct netdev *, const struct smap *details);

    /* Retrieves details of 'queue' on 'netdev->tc' into 'details'.  'queue' is
     * one of the 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the queue configuration.
     *
     * This function may be null if 'tc' does not have queues ('n_queues' is
     * 0). */
    int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
                     struct smap *details);

    /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
     * 'details', performing any required Netlink calls to complete the
     * reconfiguration.  The caller ensures that 'queue_id' is less than
     * 'n_queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' does not have queues or its queues are
     * not configurable. */
    int (*class_set)(struct netdev *, unsigned int queue_id,
                     const struct smap *details);

    /* Deletes 'queue' from 'netdev->tc'.  'queue' is one of the 'struct
     * tc_queue's within 'netdev->tc->queues'.
     *
     * This function may be null if 'tc' does not have queues or its queues
     * cannot be deleted. */
    int (*class_delete)(struct netdev *, struct tc_queue *queue);

    /* Obtains stats for 'queue' from 'netdev->tc'.  'queue' is one of the
     * 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * On success, initializes '*stats'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_get_stats)(const struct netdev *netdev,
                           const struct tc_queue *queue,
                           struct netdev_queue_stats *stats);

    /* Extracts queue stats from 'nlmsg', which is a response to a
     * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_dump_stats)(const struct netdev *netdev,
                            const struct ofpbuf *nlmsg,
                            netdev_dump_queue_stats_cb *cb, void *aux);
};
422
423 static void
424 tc_init(struct tc *tc, const struct tc_ops *ops)
425 {
426 tc->ops = ops;
427 hmap_init(&tc->queues);
428 }
429
/* Frees the generic part of 'tc' (its queue map).  Does not free 'tc'
 * itself; per the tc_ops contract, implementations call this from their
 * 'tc_destroy' callback. */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
435
436 static const struct tc_ops tc_ops_htb;
437 static const struct tc_ops tc_ops_hfsc;
438 static const struct tc_ops tc_ops_codel;
439 static const struct tc_ops tc_ops_fqcodel;
440 static const struct tc_ops tc_ops_sfq;
441 static const struct tc_ops tc_ops_netem;
442 static const struct tc_ops tc_ops_default;
443 static const struct tc_ops tc_ops_noop;
444 static const struct tc_ops tc_ops_other;
445
446 static const struct tc_ops *const tcs[] = {
447 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
448 &tc_ops_hfsc, /* Hierarchical fair service curve. */
449 &tc_ops_codel, /* Controlled delay */
450 &tc_ops_fqcodel, /* Fair queue controlled delay */
451 &tc_ops_sfq, /* Stochastic fair queueing */
452 &tc_ops_netem, /* Network Emulator */
453 &tc_ops_noop, /* Non operating qos type. */
454 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
455 &tc_ops_other, /* Some other qdisc. */
456 NULL
457 };
458
459 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
460 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
461 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
462 static uint32_t tc_time_to_ticks(uint32_t time);
463
464 static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
465 int type,
466 unsigned int flags,
467 struct ofpbuf *);
468 static int tc_add_policer(struct netdev *,
469 uint32_t kbits_rate, uint32_t kbits_burst);
470
471 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
472 struct nlattr **options);
473 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
474 struct nlattr **options,
475 struct netdev_queue_stats *);
476 static int tc_query_class(const struct netdev *,
477 unsigned int handle, unsigned int parent,
478 struct ofpbuf **replyp);
479 static int tc_delete_class(const struct netdev *, unsigned int handle);
480
481 static int tc_del_qdisc(struct netdev *netdev);
482 static int tc_query_qdisc(const struct netdev *netdev);
483
484 void
485 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate);
486 static int tc_calc_cell_log(unsigned int mtu);
487 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
488 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
489 \f
/* State for one Linux network device.  Everything below 'mutex' is
 * protected by it. */
struct netdev_linux {
    struct netdev up;               /* Generic netdev base; must be first. */

    /* Protects all members below. */
    struct ovs_mutex mutex;

    unsigned int cache_valid;       /* Bitmask of VALID_* flags above. */

    bool miimon;                    /* Link status of last poll. */
    long long int miimon_interval;  /* Miimon Poll rate. Disabled if <= 0. */
    struct timer miimon_timer;

    int netnsid;                    /* Network namespace ID. */
    /* The following are figured out "on demand" only.  They are only valid
     * when the corresponding VALID_* bit in 'cache_valid' is set. */
    int ifindex;
    struct eth_addr etheraddr;
    int mtu;
    unsigned int ifi_flags;
    long long int carrier_resets;
    uint32_t kbits_rate;            /* Policing data. */
    uint32_t kbits_burst;
    int vport_stats_error;          /* Cached error code from vport_get_stats().
                                       0 or an errno value. */
    int netdev_mtu_error;           /* Cached error code from SIOCGIFMTU
                                     * or SIOCSIFMTU. */
    int ether_addr_error;           /* Cached error code from set/get etheraddr. */
    int netdev_policing_error;      /* Cached error code from set policing. */
    int get_features_error;         /* Cached error code from ETHTOOL_GSET. */
    int get_ifindex_error;          /* Cached error code from SIOCGIFINDEX. */

    enum netdev_features current;    /* Cached from ETHTOOL_GSET. */
    enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
    enum netdev_features supported;  /* Cached from ETHTOOL_GSET. */

    struct ethtool_drvinfo drvinfo;  /* Cached from ETHTOOL_GDRVINFO. */
    struct tc *tc;                   /* Traffic control state, if any. */

    /* For devices of class netdev_tap_class only. */
    int tap_fd;                      /* File descriptor for the tap device. */
    bool present;                    /* If the device is present in the namespace */
    uint64_t tx_dropped;             /* tap device can drop if the iface is down */

    /* LAG information. */
    bool is_lag_master;              /* True if the netdev is a LAG master. */
};
535
/* A receive queue on a Linux netdev, backed by a file descriptor. */
struct netdev_rxq_linux {
    struct netdev_rxq up;       /* Generic rxq base. */
    bool is_tap;                /* True if 'fd' is a tap device fd rather than
                                 * a packet socket — presumably from
                                 * af_packet_sock(); confirm at rxq setup. */
    int fd;
};
541
542 /* This is set pretty low because we probably won't learn anything from the
543 * additional log messages. */
544 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
545
546 /* Polling miimon status for all ports causes performance degradation when
547 * handling a large number of ports. If there are no devices using miimon, then
548 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
549 *
550 * Readers do not depend on this variable synchronizing with the related
551 * changes in the device miimon status, so we can use atomic_count. */
552 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
553
554 static void netdev_linux_run(const struct netdev_class *);
555
556 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
557 int cmd, const char *cmd_name);
558 static int get_flags(const struct netdev *, unsigned int *flags);
559 static int set_flags(const char *, unsigned int flags);
560 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
561 enum netdev_flags on, enum netdev_flags *old_flagsp)
562 OVS_REQUIRES(netdev->mutex);
563 static int get_ifindex(const struct netdev *, int *ifindexp);
564 static int do_set_addr(struct netdev *netdev,
565 int ioctl_nr, const char *ioctl_name,
566 struct in_addr addr);
567 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
568 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
569 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
570 static int af_packet_sock(void);
571 static bool netdev_linux_miimon_enabled(void);
572 static void netdev_linux_miimon_run(void);
573 static void netdev_linux_miimon_wait(void);
574 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
575
/* Returns true if 'netdev_class' is one of the Linux-backed classes defined
 * in this file.  They are identified by sharing netdev_linux_run() as their
 * 'run' callback. */
static bool
is_netdev_linux_class(const struct netdev_class *netdev_class)
{
    return netdev_class->run == netdev_linux_run;
}
581
/* Returns true if 'netdev' is a tap device (class netdev_tap_class). */
static bool
is_tap_netdev(const struct netdev *netdev)
{
    return netdev_get_class(netdev) == &netdev_tap_class;
}
587
/* Downcasts 'netdev' to its containing netdev_linux.  Asserts that the
 * netdev really belongs to a Linux class first. */
static struct netdev_linux *
netdev_linux_cast(const struct netdev *netdev)
{
    ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));

    return CONTAINER_OF(netdev, struct netdev_linux, up);
}
595
/* Downcasts 'rx' to its containing netdev_rxq_linux.  Asserts that the
 * owning netdev belongs to a Linux class first. */
static struct netdev_rxq_linux *
netdev_rxq_linux_cast(const struct netdev_rxq *rx)
{
    ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
    return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
}
602 \f
/* Queries the datapath for 'netdev''s network namespace ID and caches it in
 * netdev->netnsid.
 *
 * Returns 0 on success.  On failure returns the positive errno from the
 * vport query, after recording the netnsid as local (for ENOENT: no
 * datapath API available, so assume our own namespace) or unset (any other
 * error). */
static int
netdev_linux_netnsid_update__(struct netdev_linux *netdev)
{
    struct dpif_netlink_vport reply;
    struct ofpbuf *buf;
    int error;

    error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
    if (error) {
        if (error == ENOENT) {
            /* Assume it is local if there is no API (e.g. if the openvswitch
             * kernel module is not loaded). */
            netnsid_set_local(&netdev->netnsid);
        } else {
            netnsid_unset(&netdev->netnsid);
        }
        return error;
    }

    netnsid_set(&netdev->netnsid, reply.netnsid);
    ofpbuf_delete(buf);       /* Reply buffer is owned by us once returned. */
    return 0;
}
626
627 static int
628 netdev_linux_netnsid_update(struct netdev_linux *netdev)
629 {
630 if (netnsid_is_unset(netdev->netnsid)) {
631 if (netdev_get_class(&netdev->up) == &netdev_tap_class) {
632 netnsid_set_local(&netdev->netnsid);
633 } else {
634 return netdev_linux_netnsid_update__(netdev);
635 }
636 }
637
638 return 0;
639 }
640
/* Returns true if 'netdev' lives in the network namespace identified by
 * 'nsid', refreshing the cached netnsid first if needed. */
static bool
netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_eq(netdev->netnsid, nsid);
}
647
/* Returns true if 'netdev' lives in a network namespace other than our own,
 * refreshing the cached netnsid first if needed. */
static bool
netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_is_remote(netdev->netnsid);
}
654
655 static int netdev_linux_update_via_netlink(struct netdev_linux *);
656 static void netdev_linux_update(struct netdev_linux *netdev, int,
657 const struct rtnetlink_change *)
658 OVS_REQUIRES(netdev->mutex);
659 static void netdev_linux_changed(struct netdev_linux *netdev,
660 unsigned int ifi_flags, unsigned int mask)
661 OVS_REQUIRES(netdev->mutex);
662
663 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
664 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
665 * if no such socket could be created. */
666 static struct nl_sock *
667 netdev_linux_notify_sock(void)
668 {
669 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
670 static struct nl_sock *sock;
671 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
672 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
673
674 if (ovsthread_once_start(&once)) {
675 int error;
676
677 error = nl_sock_create(NETLINK_ROUTE, &sock);
678 if (!error) {
679 size_t i;
680
681 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
682 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
683 if (error) {
684 nl_sock_destroy(sock);
685 sock = NULL;
686 break;
687 }
688 }
689 }
690 nl_sock_listen_all_nsid(sock, true);
691 ovsthread_once_done(&once);
692 }
693
694 return sock;
695 }
696
/* Returns true if at least one device currently uses miimon polling, in
 * which case the run/wait hooks must invoke the miimon functions. */
static bool
netdev_linux_miimon_enabled(void)
{
    return atomic_count_get(&miimon_cnt) > 0;
}
702
/* Returns true if 'kind' names a Linux link type that acts as a LAG master,
 * i.e. a bonding or teaming device. */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    static const char *const lag_kinds[] = {"bond", "team"};
    size_t i;

    for (i = 0; i < sizeof lag_kinds / sizeof lag_kinds[0]; i++) {
        if (!strcmp(kind, lag_kinds[i])) {
            return true;
        }
    }

    return false;
}
712
713 static void
714 netdev_linux_update_lag(struct rtnetlink_change *change)
715 OVS_REQUIRES(lag_mutex)
716 {
717 struct linux_lag_slave *lag;
718
719 if (!rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
720 return;
721 }
722
723 if (change->slave && netdev_linux_kind_is_lag(change->slave)) {
724 lag = shash_find_data(&lag_shash, change->ifname);
725
726 if (!lag) {
727 struct netdev *master_netdev;
728 char master_name[IFNAMSIZ];
729 uint32_t block_id;
730 int error = 0;
731
732 if_indextoname(change->master_ifindex, master_name);
733 master_netdev = netdev_from_name(master_name);
734 if (!master_netdev) {
735 return;
736 }
737
738 if (is_netdev_linux_class(master_netdev->netdev_class)) {
739 block_id = netdev_get_block_id(master_netdev);
740 if (!block_id) {
741 netdev_close(master_netdev);
742 return;
743 }
744
745 lag = xmalloc(sizeof *lag);
746 lag->block_id = block_id;
747 lag->node = shash_add(&lag_shash, change->ifname, lag);
748
749 /* delete ingress block in case it exists */
750 tc_add_del_qdisc(change->if_index, false, 0, TC_INGRESS);
751 /* LAG master is linux netdev so add slave to same block. */
752 error = tc_add_del_qdisc(change->if_index, true, block_id,
753 TC_INGRESS);
754 if (error) {
755 VLOG_WARN("failed to bind LAG slave %s to master's block",
756 change->ifname);
757 shash_delete(&lag_shash, lag->node);
758 free(lag);
759 }
760 }
761
762 netdev_close(master_netdev);
763 }
764 } else if (change->master_ifindex == 0) {
765 /* Check if this was a lag slave that has been freed. */
766 lag = shash_find_data(&lag_shash, change->ifname);
767
768 if (lag) {
769 tc_add_del_qdisc(change->if_index, false, lag->block_id,
770 TC_INGRESS);
771 shash_delete(&lag_shash, lag->node);
772 free(lag);
773 }
774 }
775 }
776
/* Periodic "run" callback shared by the Linux netdev classes: polls miimon
 * when enabled, then drains the shared rtnetlink notification socket,
 * applying each link/address change to the matching netdev (or to LAG-slave
 * bookkeeping when the device is not an OvS netdev).  If the kernel reports
 * lost notifications (ENOBUFS), falls back to refreshing every device. */
static void
netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
{
    struct nl_sock *sock;
    int error;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_run();
    }

    sock = netdev_linux_notify_sock();
    if (!sock) {
        return;
    }

    do {
        uint64_t buf_stub[4096 / 8];
        int nsid;
        struct ofpbuf buf;

        ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
        error = nl_sock_recv(sock, &buf, &nsid, false);
        if (!error) {
            struct rtnetlink_change change;

            if (rtnetlink_parse(&buf, &change)) {
                struct netdev *netdev_ = NULL;
                char dev_name[IFNAMSIZ];

                /* Some messages carry only an ifindex; map it to a name. */
                if (!change.ifname) {
                    change.ifname = if_indextoname(change.if_index, dev_name);
                }

                if (change.ifname) {
                    netdev_ = netdev_from_name(change.ifname);
                }
                if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
                    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

                    ovs_mutex_lock(&netdev->mutex);
                    netdev_linux_update(netdev, nsid, &change);
                    ovs_mutex_unlock(&netdev->mutex);
                }
                else if (!netdev_ && change.ifname) {
                    /* Netdev is not present in OvS but its master could be. */
                    ovs_mutex_lock(&lag_mutex);
                    netdev_linux_update_lag(&change);
                    ovs_mutex_unlock(&lag_mutex);
                }
                netdev_close(netdev_);
            }
        } else if (error == ENOBUFS) {
            /* Notifications were dropped, so we cannot know what changed:
             * drain the socket and refresh the flags of every Linux netdev,
             * invalidating all cached state (mask 0). */
            struct shash device_shash;
            struct shash_node *node;

            nl_sock_drain(sock);

            shash_init(&device_shash);
            netdev_get_devices(&netdev_linux_class, &device_shash);
            SHASH_FOR_EACH (node, &device_shash) {
                struct netdev *netdev_ = node->data;
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
                unsigned int flags;

                ovs_mutex_lock(&netdev->mutex);
                get_flags(netdev_, &flags);
                netdev_linux_changed(netdev, flags, 0);
                ovs_mutex_unlock(&netdev->mutex);

                netdev_close(netdev_);
            }
            shash_destroy(&device_shash);
        } else if (error != EAGAIN) {
            static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
            VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
                         ovs_strerror(error));
        }
        ofpbuf_uninit(&buf);
    } while (!error);      /* EAGAIN (socket drained) ends the loop. */
}
857
858 static void
859 netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
860 {
861 struct nl_sock *sock;
862
863 if (netdev_linux_miimon_enabled()) {
864 netdev_linux_miimon_wait();
865 }
866 sock = netdev_linux_notify_sock();
867 if (sock) {
868 nl_sock_wait(sock, POLLIN);
869 }
870 }
871
/* Records that 'dev' changed: bumps the netdev change sequence, counts
 * carrier flaps, stores the new interface flags, and invalidates every
 * cached field whose VALID_* bit is absent from 'mask'. */
static void
netdev_linux_changed(struct netdev_linux *dev,
                     unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(dev->mutex)
{
    netdev_change_seq_changed(&dev->up);

    /* An IFF_RUNNING transition in either direction counts as one reset. */
    if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
        dev->carrier_resets++;
    }
    dev->ifi_flags = ifi_flags;

    dev->cache_valid &= mask;
    if (!(mask & VALID_IN)) {
        /* IP addresses were invalidated; drop the global address cache. */
        netdev_get_addrs_list_flush();
    }
}
889
/* Applies a parsed rtnetlink 'change' to 'dev'.  RTM_NEWLINK refreshes the
 * cached flags, MTU, MAC address, ifindex and LAG-master status; link
 * deletion marks the device absent; address-group messages only invalidate
 * the cached IP addresses. */
static void
netdev_linux_update__(struct netdev_linux *dev,
                      const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, and ip addresses. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;

                /* The mac addr has been changed, report it now. */
                rtnetlink_report_link();
            }

            if (change->master && netdev_linux_kind_is_lag(change->master)) {
                dev->is_lag_master = true;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
            dev->present = true;
        } else {
            /* FIXME */
            /* Link deleted: invalidate the whole cache and forget which
             * namespace the device was in. */
            netdev_linux_changed(dev, change->ifi_flags, 0);
            dev->present = false;
            netnsid_unset(&dev->netnsid);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        OVS_NOT_REACHED();
    }
}
938
939 static void
940 netdev_linux_update(struct netdev_linux *dev, int nsid,
941 const struct rtnetlink_change *change)
942 OVS_REQUIRES(dev->mutex)
943 {
944 if (netdev_linux_netnsid_is_eq(dev, nsid)) {
945 netdev_linux_update__(dev, change);
946 }
947 }
948
/* Allocates a zeroed netdev_linux and returns a pointer to its embedded
 * generic netdev. */
static struct netdev *
netdev_linux_alloc(void)
{
    struct netdev_linux *netdev = xzalloc(sizeof *netdev);
    return &netdev->up;
}
955
956 static int
957 netdev_linux_common_construct(struct netdev *netdev_)
958 {
959 /* Prevent any attempt to create (or open) a network device named "default"
960 * or "all". These device names are effectively reserved on Linux because
961 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
962 * itself this wouldn't call for any special treatment, but in practice if
963 * a program tries to create devices with these names, it causes the kernel
964 * to fire a "new device" notification event even though creation failed,
965 * and in turn that causes OVS to wake up and try to create them again,
966 * which ends up as a 100% CPU loop. */
967 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
968 const char *name = netdev_->name;
969 if (!strcmp(name, "default") || !strcmp(name, "all")) {
970 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
971 VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
972 name);
973 return EINVAL;
974 }
975
976 /* The device could be in the same network namespace or in another one. */
977 netnsid_unset(&netdev->netnsid);
978 ovs_mutex_init(&netdev->mutex);
979 return 0;
980 }
981
982 /* Creates system and internal devices. */
983 static int
984 netdev_linux_construct(struct netdev *netdev_)
985 {
986 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
987 int error = netdev_linux_common_construct(netdev_);
988 if (error) {
989 return error;
990 }
991
992 error = get_flags(&netdev->up, &netdev->ifi_flags);
993 if (error == ENODEV) {
994 if (netdev->up.netdev_class != &netdev_internal_class) {
995 /* The device does not exist, so don't allow it to be opened. */
996 return ENODEV;
997 } else {
998 /* "Internal" netdevs have to be created as netdev objects before
999 * they exist in the kernel, because creating them in the kernel
1000 * happens by passing a netdev object to dpif_port_add().
1001 * Therefore, ignore the error. */
1002 }
1003 }
1004
1005 return 0;
1006 }
1007
1008 /* For most types of netdevs we open the device for each call of
1009 * netdev_open(). However, this is not the case with tap devices,
1010 * since it is only possible to open the device once. In this
1011 * situation we share a single file descriptor, and consequently
1012 * buffers, across all readers. Therefore once data is read it will
1013 * be unavailable to other reads for tap devices. */
1014 static int
1015 netdev_linux_construct_tap(struct netdev *netdev_)
1016 {
1017 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1018 static const char tap_dev[] = "/dev/net/tun";
1019 const char *name = netdev_->name;
1020 struct ifreq ifr;
1021
1022 int error = netdev_linux_common_construct(netdev_);
1023 if (error) {
1024 return error;
1025 }
1026
1027 /* Open tap device. */
1028 netdev->tap_fd = open(tap_dev, O_RDWR);
1029 if (netdev->tap_fd < 0) {
1030 error = errno;
1031 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
1032 return error;
1033 }
1034
1035 /* Create tap device. */
1036 get_flags(&netdev->up, &netdev->ifi_flags);
1037 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
1038 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
1039 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
1040 VLOG_WARN("%s: creating tap device failed: %s", name,
1041 ovs_strerror(errno));
1042 error = errno;
1043 goto error_close;
1044 }
1045
1046 /* Make non-blocking. */
1047 error = set_nonblocking(netdev->tap_fd);
1048 if (error) {
1049 goto error_close;
1050 }
1051
1052 if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
1053 VLOG_WARN("%s: creating tap device failed (persist): %s", name,
1054 ovs_strerror(errno));
1055 error = errno;
1056 goto error_close;
1057 }
1058
1059 netdev->present = true;
1060 return 0;
1061
1062 error_close:
1063 close(netdev->tap_fd);
1064 return error;
1065 }
1066
/* netdev_class 'destruct' callback: releases the resources acquired by
 * the construct functions (tc state, the tap fd and its kernel device,
 * the miimon refcount, and the mutex).  Must mirror the construct paths. */
static void
netdev_linux_destruct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (netdev->tc && netdev->tc->ops->tc_destroy) {
        netdev->tc->ops->tc_destroy(netdev->tc);
    }

    if (netdev_get_class(netdev_) == &netdev_tap_class
        && netdev->tap_fd >= 0)
    {
        /* Drop persistence so the kernel deletes the tap device, then
         * release our shared fd (see netdev_linux_construct_tap()). */
        ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
        close(netdev->tap_fd);
    }

    if (netdev->miimon_interval > 0) {
        /* Balance the atomic_count_inc() in set_miimon_interval(). */
        atomic_count_dec(&miimon_cnt);
    }

    ovs_mutex_destroy(&netdev->mutex);
}
1089
/* netdev_class 'dealloc' callback: frees the storage obtained in
 * netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
1096
1097 static struct netdev_rxq *
1098 netdev_linux_rxq_alloc(void)
1099 {
1100 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
1101 return &rx->up;
1102 }
1103
/* netdev_class 'rxq_construct' callback.  For tap devices the queue simply
 * shares the netdev's tap fd.  For everything else it opens an AF_PACKET
 * raw socket, enables auxdata (for VLAN reconstruction), makes it
 * non-blocking, binds it to the device, and attaches a BPF filter that
 * accepts only inbound packets.  Returns 0 or a positive errno value. */
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        /* Share the tap fd; see the comment above
         * netdev_linux_construct_tap(). */
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4     jt 2  jf 3 */
            { 0x6, 0, 0, 0x00000000 },  /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff }   /* ret #65535 */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor.  Protocol 0 so no packets are delivered
         * until we bind with sll_protocol below. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1191
1192 static void
1193 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
1194 {
1195 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1196
1197 if (!rx->is_tap) {
1198 close(rx->fd);
1199 }
1200 }
1201
/* netdev_class 'rxq_dealloc' callback: frees the storage obtained in
 * netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
1209
1210 static ovs_be16
1211 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
1212 {
1213 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1214 return htons(aux->tp_vlan_tpid);
1215 } else if (double_tagged) {
1216 return htons(ETH_TYPE_VLAN_8021AD);
1217 } else {
1218 return htons(ETH_TYPE_VLAN_8021Q);
1219 }
1220 }
1221
/* Returns true if 'aux' describes a stripped VLAN tag.  A nonzero TCI is
 * enough on its own; the explicit status bit additionally catches tags
 * whose TCI is all-zero (VID 0, priority 0). */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    if (aux->tp_vlan_tci != 0) {
        return true;
    }

    return (aux->tp_status & TP_STATUS_VLAN_VALID) != 0;
}
1227
/* Receives one packet from raw socket 'fd' into 'buffer'.  If the kernel
 * stripped a VLAN tag (reported via PACKET_AUXDATA), the tag is pushed
 * back into the packet data.  Returns 0 on success, EMSGSIZE if the
 * packet was truncated, or another positive errno value. */
static int
netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
{
    size_t size;
    ssize_t retval;
    struct iovec iov;
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffer;
    struct msghdr msgh;

    /* Reserve headroom for a single VLAN tag */
    dp_packet_reserve(buffer, VLAN_HEADER_LEN);
    size = dp_packet_tailroom(buffer);

    iov.iov_base = dp_packet_data(buffer);
    iov.iov_len = size;
    msgh.msg_name = NULL;
    msgh.msg_namelen = 0;
    msgh.msg_iov = &iov;
    msgh.msg_iovlen = 1;
    msgh.msg_control = &cmsg_buffer;
    msgh.msg_controllen = sizeof cmsg_buffer;
    msgh.msg_flags = 0;

    /* MSG_TRUNC makes recvmsg() return the full packet length even if it
     * did not fit, so oversize packets can be detected below. */
    do {
        retval = recvmsg(fd, &msgh, MSG_TRUNC);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    } else if (retval > size) {
        return EMSGSIZE;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);

    /* Walk the control messages looking for the packet auxdata, which
     * carries the VLAN tag the kernel stripped on receive. */
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
        const struct tpacket_auxdata *aux;

        if (cmsg->cmsg_level != SOL_PACKET
            || cmsg->cmsg_type != PACKET_AUXDATA
            || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
            continue;
        }

        aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
        if (auxdata_has_vlan_tci(aux)) {
            struct eth_header *eth;
            bool double_tagged;

            if (retval < ETH_HEADER_LEN) {
                return EINVAL;
            }

            /* If the frame still carries an 802.1q tag, the stripped
             * (outer) tag must be reinserted as 802.1ad. */
            eth = dp_packet_data(buffer);
            double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);

            eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
                          htons(aux->tp_vlan_tci));
            break;
        }
    }

    return 0;
}
1296
/* Receives one packet from tap fd 'fd' into 'buffer'.  Returns 0 on
 * success, otherwise a positive errno value. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t tailroom = dp_packet_tailroom(buffer);
    ssize_t n;

    /* Restart reads interrupted by signal delivery. */
    do {
        n = read(fd, dp_packet_data(buffer), tailroom);
    } while (n < 0 && errno == EINTR);

    if (n < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n);
    return 0;
}
1314
1315 static int
1316 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
1317 int *qfill)
1318 {
1319 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1320 struct netdev *netdev = rx->up.netdev;
1321 struct dp_packet *buffer;
1322 ssize_t retval;
1323 int mtu;
1324
1325 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1326 mtu = ETH_PAYLOAD_MAX;
1327 }
1328
1329 /* Assume Ethernet port. No need to set packet_type. */
1330 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1331 DP_NETDEV_HEADROOM);
1332 retval = (rx->is_tap
1333 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1334 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1335
1336 if (retval) {
1337 if (retval != EAGAIN && retval != EMSGSIZE) {
1338 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1339 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1340 }
1341 dp_packet_delete(buffer);
1342 } else {
1343 dp_packet_batch_init_packet(batch, buffer);
1344 }
1345
1346 if (qfill) {
1347 *qfill = -ENOTSUP;
1348 }
1349
1350 return retval;
1351 }
1352
/* netdev_class 'rxq_wait' callback: arranges for the next poll_block()
 * to wake up when 'rxq_' has a packet ready to receive. */
static void
netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    poll_fd_wait(rx->fd, POLLIN);
}
1359
1360 static int
1361 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1362 {
1363 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1364 if (rx->is_tap) {
1365 struct ifreq ifr;
1366 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1367 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1368 if (error) {
1369 return error;
1370 }
1371 drain_fd(rx->fd, ifr.ifr_qlen);
1372 return 0;
1373 } else {
1374 return drain_rcvbuf(rx->fd);
1375 }
1376 }
1377
/* Transmits all packets in 'batch' on AF_PACKET socket 'sock', bound to
 * the device with index 'ifindex', using a single sendmmsg() call per
 * kernel round trip.  Returns 0 on success, otherwise a positive errno
 * value.  Does not free 'batch'; the caller owns it. */
static int
netdev_linux_sock_batch_send(int sock, int ifindex,
                             struct dp_packet_batch *batch)
{
    const size_t size = dp_packet_batch_size(batch);
    /* We don't bother setting most fields in sockaddr_ll because the
     * kernel ignores them for SOCK_RAW. */
    struct sockaddr_ll sll = { .sll_family = AF_PACKET,
                               .sll_ifindex = ifindex };

    struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
    struct iovec *iov = xmalloc(sizeof(*iov) * size);

    /* One single-iovec message per packet. */
    struct dp_packet *packet;
    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        iov[i].iov_base = dp_packet_data(packet);
        iov[i].iov_len = dp_packet_size(packet);
        mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
                                            .msg_namelen = sizeof sll,
                                            .msg_iov = &iov[i],
                                            .msg_iovlen = 1 };
    }

    /* sendmmsg() may send fewer than requested; resume at the first
     * unsent message until all are out or an error occurs. */
    int error = 0;
    for (uint32_t ofs = 0; ofs < size; ) {
        ssize_t retval;
        do {
            retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);
        if (error) {
            break;
        }
        ofs += retval;
    }

    free(mmsg);
    free(iov);
    return error;
}
1418
/* Use the tap fd to send 'batch' to tap device 'netdev'.  Using the tap fd is
 * essential, because packets sent to a tap device with an AF_PACKET socket
 * will loop back to be *received* again on the tap device.  This doesn't occur
 * on other interface types because we attach a socket filter to the rx
 * socket.
 *
 * Returns 0 on success, otherwise a positive errno value.  Does not free
 * 'batch'; the caller owns it. */
static int
netdev_linux_tap_batch_send(struct netdev *netdev_,
                            struct dp_packet_batch *batch)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct dp_packet *packet;

    /* The Linux tap driver returns EIO if the device is not up,
     * so if the device is not up, don't waste time sending it.
     * However, if the device is in another network namespace
     * then OVS can't retrieve the state. In that case, send the
     * packets anyway. */
    if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
        netdev->tx_dropped += dp_packet_batch_size(batch);
        return 0;
    }

    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        size_t size = dp_packet_size(packet);
        ssize_t retval;
        int error;

        /* Restart writes interrupted by signal delivery. */
        do {
            retval = write(netdev->tap_fd, dp_packet_data(packet), size);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);

        if (error) {
            /* The Linux tap driver returns EIO if the device is not up.  From
             * the OVS side this is not an error, so we ignore it; otherwise,
             * return the error. */
            if (error != EIO) {
                return error;
            }
        } else if (retval != size) {
            /* Short write: report it and give up on the batch. */
            VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
                              "bytes of %"PRIuSIZE") on %s",
                         retval, size, netdev_get_name(netdev_));
            return EMSGSIZE;
        }
    }
    return 0;
}
1467
/* Sends 'batch' on 'netdev'.  Returns 0 if successful, otherwise a positive
 * errno value.  Returns EAGAIN without blocking if the packet cannot be queued
 * immediately.  Returns EMSGSIZE if a partial packet was transmitted or if
 * the packet is too big or too small to transmit on the device.
 *
 * The kernel maintains a packet transmission queue, so the caller is not
 * expected to do additional queuing of packets.
 *
 * Always consumes (frees) 'batch', whether or not an error occurs. */
static int
netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
                  struct dp_packet_batch *batch,
                  bool concurrent_txq OVS_UNUSED)
{
    int error = 0;
    int sock = 0;

    if (!is_tap_netdev(netdev_)) {
        /* Cannot open an AF_PACKET socket for a device living in another
         * network namespace. */
        if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_))) {
            error = EOPNOTSUPP;
            goto free_batch;
        }

        sock = af_packet_sock();
        if (sock < 0) {
            error = -sock;
            goto free_batch;
        }

        int ifindex = netdev_get_ifindex(netdev_);
        if (ifindex < 0) {
            error = -ifindex;
            goto free_batch;
        }

        error = netdev_linux_sock_batch_send(sock, ifindex, batch);
    } else {
        error = netdev_linux_tap_batch_send(netdev_, batch);
    }
    if (error) {
        if (error == ENOBUFS) {
            /* The Linux AF_PACKET implementation never blocks waiting
             * for room for packets, instead returning ENOBUFS.
             * Translate this into EAGAIN for the caller. */
            error = EAGAIN;
        } else {
            VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
    }

free_batch:
    dp_packet_delete_batch(batch, true);
    return error;
}
1521
1522 /* Registers with the poll loop to wake up from the next call to poll_block()
1523 * when the packet transmission queue has sufficient room to transmit a packet
1524 * with netdev_send().
1525 *
1526 * The kernel maintains a packet transmission queue, so the client is not
1527 * expected to do additional queuing of packets. Thus, this function is
1528 * unlikely to ever be used. It is included for completeness. */
1529 static void
1530 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1531 {
1532 if (is_tap_netdev(netdev)) {
1533 /* TAP device always accepts packets.*/
1534 poll_immediate_wake();
1535 }
1536 }
1537
/* Attempts to set 'netdev''s MAC address to 'mac'.  Returns 0 if successful,
 * otherwise a positive errno value.  Not supported for devices in a remote
 * network namespace (EOPNOTSUPP). */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Skip the syscall if the cached address already matches 'mac' (or a
     * cached failure would make it pointless). */
    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    if (!error || error == ENODEV) {
        /* Cache the outcome, including ENODEV, so repeated attempts on a
         * missing device do not keep issuing syscalls. */
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            netdev->etheraddr = mac;
        }
    }

    /* Restore the tap device's previous up state. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1582
1583 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1584 static int
1585 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1586 {
1587 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1588 int error;
1589
1590 ovs_mutex_lock(&netdev->mutex);
1591 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1592 netdev_linux_update_via_netlink(netdev);
1593 }
1594
1595 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1596 /* Fall back to ioctl if netlink fails */
1597 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1598 &netdev->etheraddr);
1599 netdev->cache_valid |= VALID_ETHERADDR;
1600 }
1601
1602 error = netdev->ether_addr_error;
1603 if (!error) {
1604 *mac = netdev->etheraddr;
1605 }
1606 ovs_mutex_unlock(&netdev->mutex);
1607
1608 return error;
1609 }
1610
/* Retrieves 'netdev''s MTU into '*mtup', using the cache when valid and
 * refreshing it via netlink (with an ioctl fallback) otherwise.  Returns
 * 0 on success, otherwise a positive errno value.  Caller must hold
 * 'netdev->mutex'. */
static int
netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
{
    int error;

    if (!(netdev->cache_valid & VALID_MTU)) {
        netdev_linux_update_via_netlink(netdev);
    }

    if (!(netdev->cache_valid & VALID_MTU)) {
        /* Fall back to ioctl if netlink fails */
        struct ifreq ifr;

        /* Note: the error (possibly nonzero) is cached along with the
         * value so repeated lookups do not re-issue the ioctl. */
        netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
            netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }

    error = netdev->netdev_mtu_error;
    if (!error) {
        *mtup = netdev->mtu;
    }

    return error;
}
1637
1638 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1639 * in bytes, not including the hardware header; thus, this is typically 1500
1640 * bytes for Ethernet devices. */
1641 static int
1642 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1643 {
1644 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1645 int error;
1646
1647 ovs_mutex_lock(&netdev->mutex);
1648 error = netdev_linux_get_mtu__(netdev, mtup);
1649 ovs_mutex_unlock(&netdev->mutex);
1650
1651 return error;
1652 }
1653
/* Sets the maximum size of transmitted (MTU) for given device using linux
 * networking ioctl interface.
 *
 * Returns 0 on success, otherwise a positive errno value.  Not supported
 * for devices in a remote network namespace (EOPNOTSUPP). */
static int
netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ifreq ifr;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Skip the ioctl if the cached MTU already matches (or a cached
     * failure would make it pointless). */
    if (netdev->cache_valid & VALID_MTU) {
        error = netdev->netdev_mtu_error;
        if (error || netdev->mtu == mtu) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_MTU;
    }
    ifr.ifr_mtu = mtu;
    error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
                                SIOCSIFMTU, "SIOCSIFMTU");
    if (!error || error == ENODEV) {
        /* Cache the outcome, including ENODEV, so repeated attempts on a
         * missing device do not keep issuing ioctls. */
        netdev->netdev_mtu_error = error;
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }
exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1689
1690 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1691 * On failure, returns a negative errno value. */
1692 static int
1693 netdev_linux_get_ifindex(const struct netdev *netdev_)
1694 {
1695 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1696 int ifindex, error;
1697
1698 ovs_mutex_lock(&netdev->mutex);
1699 if (netdev_linux_netnsid_is_remote(netdev)) {
1700 error = EOPNOTSUPP;
1701 goto exit;
1702 }
1703 error = get_ifindex(netdev_, &ifindex);
1704
1705 exit:
1706 ovs_mutex_unlock(&netdev->mutex);
1707 return error ? -error : ifindex;
1708 }
1709
1710 static int
1711 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1712 {
1713 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1714
1715 ovs_mutex_lock(&netdev->mutex);
1716 if (netdev->miimon_interval > 0) {
1717 *carrier = netdev->miimon;
1718 } else {
1719 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1720 }
1721 ovs_mutex_unlock(&netdev->mutex);
1722
1723 return 0;
1724 }
1725
1726 static long long int
1727 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1728 {
1729 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1730 long long int carrier_resets;
1731
1732 ovs_mutex_lock(&netdev->mutex);
1733 carrier_resets = netdev->carrier_resets;
1734 ovs_mutex_unlock(&netdev->mutex);
1735
1736 return carrier_resets;
1737 }
1738
/* Issues MII ioctl 'cmd' ("cmd_name" for logging) on device 'name',
 * copying 'data' in and back out through the ifreq.  Returns 0 on
 * success, otherwise a positive errno value. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    /* The MII ioctls pass 'struct mii_ioctl_data' inline in the ifreq
     * union (overlaying ifr_data), hence the memcpy in both directions
     * rather than storing a pointer. */
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
1753
/* Queries link status for device 'name' into '*miimon'.  Tries MII
 * registers first (SIOCGMIIPHY + SIOCGMIIREG reading BMSR link status),
 * then falls back to ethtool ETHTOOL_GLINK.  Returns 0 on success,
 * otherwise a positive errno value; '*miimon' is false on failure. */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        }
    }
    if (error) {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* ETHTOOL_GLINK answers in a 'struct ethtool_value' laid over
             * the same buffer. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
1796
1797 static int
1798 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1799 long long int interval)
1800 {
1801 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1802
1803 ovs_mutex_lock(&netdev->mutex);
1804 interval = interval > 0 ? MAX(interval, 100) : 0;
1805 if (netdev->miimon_interval != interval) {
1806 if (interval && !netdev->miimon_interval) {
1807 atomic_count_inc(&miimon_cnt);
1808 } else if (!interval && netdev->miimon_interval) {
1809 atomic_count_dec(&miimon_cnt);
1810 }
1811
1812 netdev->miimon_interval = interval;
1813 timer_set_expired(&netdev->miimon_timer);
1814 }
1815 ovs_mutex_unlock(&netdev->mutex);
1816
1817 return 0;
1818 }
1819
/* Polls link status via miimon for every netdev_linux device whose timer
 * has expired, recording state changes and rearming the timers.  Called
 * from the provider's 'run' callback. */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                dev->miimon = miimon;
                /* Bump the change sequence so waiters notice the link
                 * transition. */
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* netdev_get_devices() took a reference on each device. */
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
1849
1850 static void
1851 netdev_linux_miimon_wait(void)
1852 {
1853 struct shash device_shash;
1854 struct shash_node *node;
1855
1856 shash_init(&device_shash);
1857 netdev_get_devices(&netdev_linux_class, &device_shash);
1858 SHASH_FOR_EACH (node, &device_shash) {
1859 struct netdev *netdev = node->data;
1860 struct netdev_linux *dev = netdev_linux_cast(netdev);
1861
1862 ovs_mutex_lock(&dev->mutex);
1863 if (dev->miimon_interval > 0) {
1864 timer_wait(&dev->miimon_timer);
1865 }
1866 ovs_mutex_unlock(&dev->mutex);
1867 netdev_close(netdev);
1868 }
1869 shash_destroy(&device_shash);
1870 }
1871
/* Exchanges the values pointed to by 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved = *a;

    *a = *b;
    *b = saved;
}
1879
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is allowed to be misaligned.
 *
 * Fields that ovs_vport_stats does not carry are zeroed explicitly;
 * other netdev_stats members are left untouched. */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct ovs_vport_stats *src)
{
    dst->rx_packets = get_32aligned_u64(&src->rx_packets);
    dst->tx_packets = get_32aligned_u64(&src->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&src->rx_errors);
    dst->tx_errors = get_32aligned_u64(&src->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
    dst->multicast = 0;
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
}
1909
1910 static int
1911 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1912 {
1913 struct dpif_netlink_vport reply;
1914 struct ofpbuf *buf;
1915 int error;
1916
1917 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1918 if (error) {
1919 return error;
1920 } else if (!reply.stats) {
1921 ofpbuf_delete(buf);
1922 return EOPNOTSUPP;
1923 }
1924
1925 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1926
1927 ofpbuf_delete(buf);
1928
1929 return 0;
1930 }
1931
/* Fills 'stats' from the datapath vport layer if possible, caching the
 * resulting error in 'netdev->vport_stats_error' so callers can tell
 * whether 'stats' was populated.  Once a failure has been cached, the
 * vport is not queried again until the cache is invalidated. */
static void
get_stats_via_vport(const struct netdev *netdev_,
                    struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (!netdev->vport_stats_error ||
        !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
        int error;

        error = get_stats_via_vport__(netdev_, stats);
        if (error && error != ENOENT && error != ENODEV) {
            /* ENOENT/ENODEV just mean the device is not a datapath vport,
             * which is an expected case, so only log other errors. */
            VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
                         "(%s)",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
        netdev->vport_stats_error = error;
        netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
    }
}
1952
/* Retrieves current device stats for 'netdev-linux'.
 *
 * Merges vport-layer stats with kernel netdev (netlink) stats: packet and
 * byte counts come from the kernel, error/drop counters are accumulated
 * from both.  When only one source is available, it is used alone. */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; if vport stats succeeded, 'stats' is already
         * filled, so report success. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Use kernel netdev's packet and byte counts since vport's counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO are
         * enabled. */
        stats->rx_packets = dev_stats.rx_packets;
        stats->rx_bytes = dev_stats.rx_bytes;
        stats->tx_packets = dev_stats.tx_packets;
        stats->tx_bytes = dev_stats.tx_bytes;

        stats->rx_errors           += dev_stats.rx_errors;
        stats->tx_errors           += dev_stats.tx_errors;
        stats->rx_dropped          += dev_stats.rx_dropped;
        stats->tx_dropped          += dev_stats.tx_dropped;
        stats->multicast           += dev_stats.multicast;
        stats->collisions          += dev_stats.collisions;
        stats->rx_length_errors    += dev_stats.rx_length_errors;
        stats->rx_over_errors      += dev_stats.rx_over_errors;
        stats->rx_crc_errors       += dev_stats.rx_crc_errors;
        stats->rx_frame_errors     += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors      += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors    += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors   += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors   += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors      += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors    += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2003
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal.
 *
 * Like netdev_linux_get_stats() but swaps rx/tx counters taken from the
 * kernel, because for a tap/internal device the host's transmit side is
 * the switch's receive side and vice versa. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; if vport stats succeeded, 'stats' is already
         * filled, so report success. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer.  For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped += dev_stats.tx_dropped;
        stats->tx_dropped += dev_stats.rx_dropped;

        stats->rx_errors += dev_stats.tx_errors;
        stats->tx_errors += dev_stats.rx_errors;

        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
    }
    /* Packets we dropped ourselves in netdev_linux_tap_batch_send(). */
    stats->tx_dropped += netdev->tx_dropped;
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2066
2067 static int
2068 netdev_internal_get_stats(const struct netdev *netdev_,
2069 struct netdev_stats *stats)
2070 {
2071 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2072 int error;
2073
2074 ovs_mutex_lock(&netdev->mutex);
2075 get_stats_via_vport(netdev_, stats);
2076 error = netdev->vport_stats_error;
2077 ovs_mutex_unlock(&netdev->mutex);
2078
2079 return error;
2080 }
2081
/* Queries the device's link features via the ETHTOOL_GSET ioctl and caches
 * the result (supported, advertised and current NETDEV_F_* bitmaps) in
 * 'netdev'.  No-op if VALID_FEATURES is already set.  The outcome, success
 * or failure, is recorded in netdev->get_features_error.
 *
 * Caller must hold netdev->mutex (this touches cache_valid and the cached
 * feature fields). */
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
    struct ethtool_cmd ecmd;
    uint32_t speed;
    int error;

    if (netdev->cache_valid & VALID_FEATURES) {
        return;
    }

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto out;
    }

    /* Supported features: translate each ethtool SUPPORTED_* bit into the
     * corresponding OVS NETDEV_F_* bit. */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    }
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Half) {
        netdev->supported |= NETDEV_F_100MB_HD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
        netdev->supported |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
        netdev->supported |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
        netdev->supported |= NETDEV_F_40GB_FD;
    }
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    }
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    }
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    }
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    }
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    }

    /* Advertised features: same translation for ADVERTISED_* bits. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
        netdev->advertised |= NETDEV_F_40GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    }
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    }
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    }
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    }
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    }

    /* Current settings.  The 40000/100000/1000000 literals stand in for
     * SPEED_40000 and friends -- presumably kept numeric because older
     * linux/ethtool.h headers lack those macros; confirm before changing. */
    speed = ethtool_cmd_speed(&ecmd);
    if (speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    } else if (speed == 40000) {
        netdev->current = NETDEV_F_40GB_FD;
    } else if (speed == 100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (speed == 1000000) {
        netdev->current = NETDEV_F_1TB_FD;
    } else {
        netdev->current = 0;
    }

    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
    }

    if (ecmd.autoneg) {
        netdev->current |= NETDEV_F_AUTONEG;
    }

out:
    /* Cache the result even on error so we don't re-query a failing
     * device on every call; get_features_error reports the outcome. */
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
}
2233
2234 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
2235 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2236 * Returns 0 if successful, otherwise a positive errno value. */
2237 static int
2238 netdev_linux_get_features(const struct netdev *netdev_,
2239 enum netdev_features *current,
2240 enum netdev_features *advertised,
2241 enum netdev_features *supported,
2242 enum netdev_features *peer)
2243 {
2244 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2245 int error;
2246
2247 ovs_mutex_lock(&netdev->mutex);
2248 if (netdev_linux_netnsid_is_remote(netdev)) {
2249 error = EOPNOTSUPP;
2250 goto exit;
2251 }
2252
2253 netdev_linux_read_features(netdev);
2254 if (!netdev->get_features_error) {
2255 *current = netdev->current;
2256 *advertised = netdev->advertised;
2257 *supported = netdev->supported;
2258 *peer = 0; /* XXX */
2259 }
2260 error = netdev->get_features_error;
2261
2262 exit:
2263 ovs_mutex_unlock(&netdev->mutex);
2264 return error;
2265 }
2266
2267 /* Set the features advertised by 'netdev' to 'advertise'. */
2268 static int
2269 netdev_linux_set_advertisements(struct netdev *netdev_,
2270 enum netdev_features advertise)
2271 {
2272 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2273 struct ethtool_cmd ecmd;
2274 int error;
2275
2276 ovs_mutex_lock(&netdev->mutex);
2277
2278 COVERAGE_INC(netdev_get_ethtool);
2279
2280 if (netdev_linux_netnsid_is_remote(netdev)) {
2281 error = EOPNOTSUPP;
2282 goto exit;
2283 }
2284
2285 memset(&ecmd, 0, sizeof ecmd);
2286 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2287 ETHTOOL_GSET, "ETHTOOL_GSET");
2288 if (error) {
2289 goto exit;
2290 }
2291
2292 ecmd.advertising = 0;
2293 if (advertise & NETDEV_F_10MB_HD) {
2294 ecmd.advertising |= ADVERTISED_10baseT_Half;
2295 }
2296 if (advertise & NETDEV_F_10MB_FD) {
2297 ecmd.advertising |= ADVERTISED_10baseT_Full;
2298 }
2299 if (advertise & NETDEV_F_100MB_HD) {
2300 ecmd.advertising |= ADVERTISED_100baseT_Half;
2301 }
2302 if (advertise & NETDEV_F_100MB_FD) {
2303 ecmd.advertising |= ADVERTISED_100baseT_Full;
2304 }
2305 if (advertise & NETDEV_F_1GB_HD) {
2306 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2307 }
2308 if (advertise & NETDEV_F_1GB_FD) {
2309 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2310 }
2311 if (advertise & NETDEV_F_10GB_FD) {
2312 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2313 }
2314 if (advertise & NETDEV_F_COPPER) {
2315 ecmd.advertising |= ADVERTISED_TP;
2316 }
2317 if (advertise & NETDEV_F_FIBER) {
2318 ecmd.advertising |= ADVERTISED_FIBRE;
2319 }
2320 if (advertise & NETDEV_F_AUTONEG) {
2321 ecmd.advertising |= ADVERTISED_Autoneg;
2322 }
2323 if (advertise & NETDEV_F_PAUSE) {
2324 ecmd.advertising |= ADVERTISED_Pause;
2325 }
2326 if (advertise & NETDEV_F_PAUSE_ASYM) {
2327 ecmd.advertising |= ADVERTISED_Asym_Pause;
2328 }
2329 COVERAGE_INC(netdev_set_ethtool);
2330 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2331 ETHTOOL_SSET, "ETHTOOL_SSET");
2332
2333 exit:
2334 ovs_mutex_unlock(&netdev->mutex);
2335 return error;
2336 }
2337
2338 static struct tc_police
2339 tc_matchall_fill_police(uint32_t kbits_rate, uint32_t kbits_burst)
2340 {
2341 unsigned int bsize = MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 64;
2342 unsigned int bps = ((uint64_t) kbits_rate * 1000) / 8;
2343 struct tc_police police;
2344 struct tc_ratespec rate;
2345 int mtu = 65535;
2346
2347 memset(&rate, 0, sizeof rate);
2348 rate.rate = bps;
2349 rate.cell_log = tc_calc_cell_log(mtu);
2350 rate.mpu = ETH_TOTAL_MIN;
2351
2352 memset(&police, 0, sizeof police);
2353 police.burst = tc_bytes_to_ticks(bps, bsize);
2354 police.action = TC_POLICE_SHOT;
2355 police.rate = rate;
2356 police.mtu = mtu;
2357
2358 return police;
2359 }
2360
2361 static void
2362 nl_msg_put_act_police(struct ofpbuf *request, struct tc_police police)
2363 {
2364 size_t offset;
2365
2366 nl_msg_put_string(request, TCA_ACT_KIND, "police");
2367 offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
2368 nl_msg_put_unspec(request, TCA_POLICE_TBF, &police, sizeof police);
2369 tc_put_rtab(request, TCA_POLICE_RATE, &police.rate);
2370 nl_msg_put_u32(request, TCA_POLICE_RESULT, TC_ACT_UNSPEC);
2371 nl_msg_end_nested(request, offset);
2372 }
2373
/* Installs an ingress matchall filter on 'netdev' whose sole action is a
 * police action built from 'kbits_rate'/'kbits_burst'.  Returns 0 on
 * success, a positive errno value on failure. */
static int
tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate,
                        uint32_t kbits_burst)
{
    uint16_t eth_type = (OVS_FORCE uint16_t) htons(ETH_P_ALL);
    size_t basic_offset, action_offset, inner_offset;
    uint16_t prio = TC_RESERVED_PRIORITY_POLICE;
    int ifindex, index, err = 0;
    struct tc_police pol_act;
    uint32_t block_id = 0;
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct tcmsg *tcmsg;
    uint32_t handle = 1;

    err = get_ifindex(netdev, &ifindex);
    if (err) {
        return err;
    }

    /* block_id is always 0 here, so the request targets the device's
     * ifindex and its ingress qdisc directly. */
    index = block_id ? TCM_IFINDEX_MAGIC_BLOCK : ifindex;
    tcmsg = tc_make_request(index, RTM_NEWTFILTER, NLM_F_CREATE | NLM_F_ECHO,
                            &request);
    tcmsg->tcm_parent = block_id ? : TC_INGRESS_PARENT;
    tcmsg->tcm_info = tc_make_handle(prio, eth_type);
    tcmsg->tcm_handle = handle;

    /* Message layout: TCA_KIND "matchall", then
     * TCA_OPTIONS > TCA_MATCHALL_ACT > [1] > police action.
     * The nested attribute type '1' is presumably the 1-based index of the
     * first action in the list -- confirm against TCA_ACT_* usage. */
    pol_act = tc_matchall_fill_police(kbits_rate, kbits_burst);
    nl_msg_put_string(&request, TCA_KIND, "matchall");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    action_offset = nl_msg_start_nested(&request, TCA_MATCHALL_ACT);
    inner_offset = nl_msg_start_nested(&request, 1);
    nl_msg_put_act_police(&request, pol_act);
    nl_msg_end_nested(&request, inner_offset);
    nl_msg_end_nested(&request, action_offset);
    nl_msg_end_nested(&request, basic_offset);

    err = tc_transact(&request, &reply);
    if (!err) {
        /* Only validate that the echoed reply is well-formed; its contents
         * are not needed. */
        struct tcmsg *tc =
            ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc);
        ofpbuf_delete(reply);
    }

    return err;
}
2420
2421 static int
2422 tc_del_matchall_policer(struct netdev *netdev)
2423 {
2424 uint32_t block_id = 0;
2425 int ifindex;
2426 int err;
2427
2428 err = get_ifindex(netdev, &ifindex);
2429 if (err) {
2430 return err;
2431 }
2432
2433 err = tc_del_filter(ifindex, TC_RESERVED_PRIORITY_POLICE, 1, block_id,
2434 TC_INGRESS);
2435 if (err) {
2436 return err;
2437 }
2438
2439 return 0;
2440 }
2441
/* Attempts to set input rate limiting (policing) policy.  Returns 0 if
 * successful, otherwise a positive errno value.  A 'kbits_rate' of 0
 * removes policing.  Results are cached in 'netdev' so that repeated calls
 * with unchanged settings are cheap. */
static int
netdev_linux_set_policing(struct netdev *netdev_,
                          uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int ifindex;
    int error;

    kbits_burst = (!kbits_rate ? 0       /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
                   : kbits_burst);       /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto out;
    }

    if (netdev->cache_valid & VALID_POLICING) {
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kbits_burst == kbits_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        netdev->cache_valid &= ~VALID_POLICING;
    }

    error = get_ifindex(netdev_, &ifindex);
    if (error) {
        goto out;
    }

    /* Use matchall for policing when offloading ovs with tc-flower. */
    if (netdev_is_flow_api_enabled()) {
        /* Note: a tc_del_matchall_policer() failure is deliberately
         * overwritten when a new policer is then added. */
        error = tc_del_matchall_policer(netdev_);
        if (kbits_rate) {
            error = tc_add_matchall_policer(netdev_, kbits_rate, kbits_burst);
        }
        /* NOTE(review): this path returns without updating kbits_rate,
         * kbits_burst or VALID_POLICING, so the cache shortcut above never
         * applies to matchall policers -- confirm this is intentional. */
        ovs_mutex_unlock(&netdev->mutex);
        return error;
    }

    COVERAGE_INC(netdev_set_policing);
    /* Remove any existing ingress qdisc. */
    error = tc_add_del_qdisc(ifindex, false, 0, TC_INGRESS);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate) {
        /* Re-add an ingress qdisc, then attach the policing action to it. */
        error = tc_add_del_qdisc(ifindex, true, 0, TC_INGRESS);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }
    }

    netdev->kbits_rate = kbits_rate;
    netdev->kbits_burst = kbits_burst;

out:
    /* Cache the result (including ENODEV, which is persistent) so repeated
     * attempts on the same settings are skipped. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2524
2525 static int
2526 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2527 struct sset *types)
2528 {
2529 const struct tc_ops *const *opsp;
2530 for (opsp = tcs; *opsp != NULL; opsp++) {
2531 const struct tc_ops *ops = *opsp;
2532 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2533 sset_add(types, ops->ovs_name);
2534 }
2535 }
2536 return 0;
2537 }
2538
2539 static const struct tc_ops *
2540 tc_lookup_ovs_name(const char *name)
2541 {
2542 const struct tc_ops *const *opsp;
2543
2544 for (opsp = tcs; *opsp != NULL; opsp++) {
2545 const struct tc_ops *ops = *opsp;
2546 if (!strcmp(name, ops->ovs_name)) {
2547 return ops;
2548 }
2549 }
2550 return NULL;
2551 }
2552
2553 static const struct tc_ops *
2554 tc_lookup_linux_name(const char *name)
2555 {
2556 const struct tc_ops *const *opsp;
2557
2558 for (opsp = tcs; *opsp != NULL; opsp++) {
2559 const struct tc_ops *ops = *opsp;
2560 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2561 return ops;
2562 }
2563 }
2564 return NULL;
2565 }
2566
/* Looks up 'queue_id' in the current qdisc's queue hash map, using the
 * caller-computed 'hash' of the id.  Returns the queue, or NULL if not
 * found.  Caller must hold netdev->mutex and netdev->tc must be set. */
static struct tc_queue *
tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
                size_t hash)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct tc_queue *queue;

    HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
        if (queue->queue_id == queue_id) {
            return queue;
        }
    }
    return NULL;
}
2581
2582 static struct tc_queue *
2583 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2584 {
2585 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
2586 }
2587
2588 static int
2589 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2590 const char *type,
2591 struct netdev_qos_capabilities *caps)
2592 {
2593 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2594 if (!ops) {
2595 return EOPNOTSUPP;
2596 }
2597 caps->n_queues = ops->n_queues;
2598 return 0;
2599 }
2600
2601 static int
2602 netdev_linux_get_qos(const struct netdev *netdev_,
2603 const char **typep, struct smap *details)
2604 {
2605 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2606 int error;
2607
2608 ovs_mutex_lock(&netdev->mutex);
2609 if (netdev_linux_netnsid_is_remote(netdev)) {
2610 error = EOPNOTSUPP;
2611 goto exit;
2612 }
2613
2614 error = tc_query_qdisc(netdev_);
2615 if (!error) {
2616 *typep = netdev->tc->ops->ovs_name;
2617 error = (netdev->tc->ops->qdisc_get
2618 ? netdev->tc->ops->qdisc_get(netdev_, details)
2619 : 0);
2620 }
2621
2622 exit:
2623 ovs_mutex_unlock(&netdev->mutex);
2624 return error;
2625 }
2626
/* Configures QoS type 'type' with parameters 'details' on 'netdev_'.
 * If the requested type is already installed, just updates its parameters;
 * otherwise tears down the existing qdisc and installs the new one.
 * Returns 0 on success, otherwise a positive errno value. */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    /* The no-op implementation touches no kernel state, so it needs
     * neither the lock nor a qdisc query. */
    if (new_ops == &tc_ops_noop) {
        return new_ops->tc_install(netdev_, details);
    }

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same implementation: update parameters in place if supported. */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            goto exit;
        }
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc. */
        error = new_ops->tc_install(netdev_, details);
        /* tc_install must set netdev->tc exactly when it succeeds. */
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2674
2675 static int
2676 netdev_linux_get_queue(const struct netdev *netdev_,
2677 unsigned int queue_id, struct smap *details)
2678 {
2679 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2680 int error;
2681
2682 ovs_mutex_lock(&netdev->mutex);
2683 if (netdev_linux_netnsid_is_remote(netdev)) {
2684 error = EOPNOTSUPP;
2685 goto exit;
2686 }
2687
2688 error = tc_query_qdisc(netdev_);
2689 if (!error) {
2690 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2691 error = (queue
2692 ? netdev->tc->ops->class_get(netdev_, queue, details)
2693 : ENOENT);
2694 }
2695
2696 exit:
2697 ovs_mutex_unlock(&netdev->mutex);
2698 return error;
2699 }
2700
2701 static int
2702 netdev_linux_set_queue(struct netdev *netdev_,
2703 unsigned int queue_id, const struct smap *details)
2704 {
2705 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2706 int error;
2707
2708 ovs_mutex_lock(&netdev->mutex);
2709 if (netdev_linux_netnsid_is_remote(netdev)) {
2710 error = EOPNOTSUPP;
2711 goto exit;
2712 }
2713
2714 error = tc_query_qdisc(netdev_);
2715 if (!error) {
2716 error = (queue_id < netdev->tc->ops->n_queues
2717 && netdev->tc->ops->class_set
2718 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2719 : EINVAL);
2720 }
2721
2722 exit:
2723 ovs_mutex_unlock(&netdev->mutex);
2724 return error;
2725 }
2726
2727 static int
2728 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2729 {
2730 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2731 int error;
2732
2733 ovs_mutex_lock(&netdev->mutex);
2734 if (netdev_linux_netnsid_is_remote(netdev)) {
2735 error = EOPNOTSUPP;
2736 goto exit;
2737 }
2738
2739 error = tc_query_qdisc(netdev_);
2740 if (!error) {
2741 if (netdev->tc->ops->class_delete) {
2742 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2743 error = (queue
2744 ? netdev->tc->ops->class_delete(netdev_, queue)
2745 : ENOENT);
2746 } else {
2747 error = EINVAL;
2748 }
2749 }
2750
2751 exit:
2752 ovs_mutex_unlock(&netdev->mutex);
2753 return error;
2754 }
2755
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into 'stats'.
 * Returns 0 on success, ENOENT if the queue does not exist, EOPNOTSUPP if
 * the QoS type has no per-class stats, or another positive errno value. */
static int
netdev_linux_get_queue_stats(const struct netdev *netdev_,
                             unsigned int queue_id,
                             struct netdev_queue_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get_stats) {
            const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
            if (queue) {
                /* Creation time is tracked by OVS, not the kernel, so fill
                 * it in here before delegating to the implementation. */
                stats->created = queue->created;
                error = netdev->tc->ops->class_get_stats(netdev_, queue,
                                                         stats);
            } else {
                error = ENOENT;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2790
/* State for an in-progress rtnetlink dump of a qdisc's classes: 'dump' is
 * the netlink dump handle, 'buf' holds the reply currently being parsed. */
struct queue_dump_state {
    struct nl_dump dump;
    struct ofpbuf buf;
};
2795
/* Begins an RTM_GETTCLASS dump of all traffic classes on 'netdev',
 * initializing '*state'.  Returns true on success, false if the request
 * could not be built (e.g. no ifindex).  On success the caller must
 * eventually call finish_queue_dump(). */
static bool
start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
    if (!tcmsg) {
        return false;
    }
    /* Parent 0: dump classes of every qdisc on the device. */
    tcmsg->tcm_parent = 0;
    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
    /* nl_dump_start() copies the request, so it can be released now. */
    ofpbuf_uninit(&request);

    ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
    return true;
}
2813
2814 static int
2815 finish_queue_dump(struct queue_dump_state *state)
2816 {
2817 ofpbuf_uninit(&state->buf);
2818 return nl_dump_done(&state->dump);
2819 }
2820
/* Iterator state for the queue dump API: a snapshot of queue ids taken at
 * dump start.  'queues' holds 'n_queues' ids; 'cur_queue' is the index of
 * the next id to visit. */
struct netdev_linux_queue_state {
    unsigned int *queues;
    size_t cur_queue;
    size_t n_queues;
};
2826
/* Begins a dump of 'netdev_''s queues, storing iterator state in '*statep'
 * for use by netdev_linux_queue_dump_next()/_done().  Snapshots the queue
 * ids up front so the dump is stable even if queues change underneath it.
 * Returns 0 on success, otherwise a positive errno value. */
static int
netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get) {
            struct netdev_linux_queue_state *state;
            struct tc_queue *queue;
            size_t i;

            *statep = state = xmalloc(sizeof *state);
            state->n_queues = hmap_count(&netdev->tc->queues);
            state->cur_queue = 0;
            state->queues = xmalloc(state->n_queues * sizeof *state->queues);

            /* Copy out the queue ids only; the queues themselves are looked
             * up again (and may have vanished) at dump_next time. */
            i = 0;
            HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
                state->queues[i++] = queue->queue_id;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2864
/* Advances the queue dump begun by netdev_linux_queue_dump_start():
 * stores the next queue's id in '*queue_idp' and its configuration in
 * 'details', returning the class_get() result.  Returns EOF when the
 * snapshot is exhausted.  Queue ids whose queue has disappeared since the
 * snapshot are silently skipped. */
static int
netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
                             unsigned int *queue_idp, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_linux_queue_state *state = state_;
    /* EOF is the "dump complete" sentinel for this API. */
    int error = EOF;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    while (state->cur_queue < state->n_queues) {
        unsigned int queue_id = state->queues[state->cur_queue++];
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);

        if (queue) {
            *queue_idp = queue_id;
            error = netdev->tc->ops->class_get(netdev_, queue, details);
            break;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2894
2895 static int
2896 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2897 void *state_)
2898 {
2899 struct netdev_linux_queue_state *state = state_;
2900
2901 free(state->queues);
2902 free(state);
2903 return 0;
2904 }
2905
/* Dumps statistics for every queue on 'netdev_' by walking an RTM_GETTCLASS
 * netlink dump and invoking 'cb' (with 'aux') once per class.  Returns 0 on
 * success; on error, returns the last failure seen but still finishes the
 * dump so the netlink socket is left in a clean state. */
static int
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
                              netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct queue_dump_state state;

        if (!netdev->tc->ops->class_dump_stats) {
            error = EOPNOTSUPP;
        } else if (!start_queue_dump(netdev_, &state)) {
            error = ENODEV;
        } else {
            struct ofpbuf msg;
            int retval;

            /* Process every reply even after a failure; only the last
             * error is reported. */
            while (nl_dump_next(&state.dump, &msg, &state.buf)) {
                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
                                                           cb, aux);
                if (retval) {
                    error = retval;
                }
            }

            retval = finish_queue_dump(&state);
            if (retval) {
                error = retval;
            }
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2950
2951 static int
2952 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2953 struct in_addr netmask)
2954 {
2955 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2956 int error;
2957
2958 ovs_mutex_lock(&netdev->mutex);
2959 if (netdev_linux_netnsid_is_remote(netdev)) {
2960 error = EOPNOTSUPP;
2961 goto exit;
2962 }
2963
2964 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2965 if (!error) {
2966 if (address.s_addr != INADDR_ANY) {
2967 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2968 "SIOCSIFNETMASK", netmask);
2969 }
2970 }
2971
2972 exit:
2973 ovs_mutex_unlock(&netdev->mutex);
2974 return error;
2975 }
2976
/* Retrieves all addresses assigned to 'netdev_': on success stores arrays
 * of addresses and masks in '*addr' and '*mask' and their count in
 * '*n_cnt', and returns 0.  Otherwise returns a positive errno value
 * (EOPNOTSUPP if the device lives in another network namespace).
 * NOTE(review): the arrays are presumably allocated by netdev_get_addrs()
 * and owned by the caller -- confirm the free contract there. */
static int
netdev_linux_get_addr_list(const struct netdev *netdev_,
                          struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2999
/* Fills '*sa' with an AF_INET sockaddr carrying 'addr' and port 0, zeroing
 * any remaining bytes of the generic sockaddr. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
3012
3013 static int
3014 do_set_addr(struct netdev *netdev,
3015 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
3016 {
3017 struct ifreq ifr;
3018
3019 make_in4_sockaddr(&ifr.ifr_addr, addr);
3020 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
3021 ioctl_name);
3022 }
3023
3024 /* Adds 'router' as a default IP gateway. */
3025 static int
3026 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
3027 {
3028 struct in_addr any = { INADDR_ANY };
3029 struct rtentry rt;
3030 int error;
3031
3032 memset(&rt, 0, sizeof rt);
3033 make_in4_sockaddr(&rt.rt_dst, any);
3034 make_in4_sockaddr(&rt.rt_gateway, router);
3035 make_in4_sockaddr(&rt.rt_genmask, any);
3036 rt.rt_flags = RTF_UP | RTF_GATEWAY;
3037 error = af_inet_ioctl(SIOCADDRT, &rt);
3038 if (error) {
3039 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
3040 }
3041 return error;
3042 }
3043
/* Finds the route to 'host' by scanning /proc/net/route.  On success,
 * stores the gateway in '*next_hop' (0 if 'host' is directly reachable),
 * stores a malloc'd copy of the outgoing device name in '*netdev_name'
 * (caller frees), and returns 0.  Returns ENXIO if no route matches, or
 * another positive errno value on error, with '*netdev_name' set to NULL. */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        /* Skip the header line (ln == 1). */
        if (++ln >= 2) {
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            /* Field layout of /proc/net/route:
             * Iface Destination Gateway Flags RefCnt Use Metric Mask MTU
             * Window IRTT.  Unparsable lines are logged and skipped. */
            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                        fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* The output of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need any endian
             * conversions here. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    fclose(stream);
    return ENXIO;
}
3103
/* Fills 'smap' with driver information for 'netdev_' (driver name, driver
 * version and firmware version), obtained via ETHTOOL_GDRVINFO and cached
 * under VALID_DRVINFO.  Returns 0 on success, otherwise a positive errno
 * value. */
static int
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* netdev_linux_do_ethtool() takes an ethtool_cmd, so view the
         * drvinfo buffer through that type for the call. */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        cmd,
                                        ETHTOOL_GDRVINFO,
                                        "ETHTOOL_GDRVINFO");
        if (!error) {
            netdev->cache_valid |= VALID_DRVINFO;
        }
    }

    if (!error) {
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
        smap_add(smap, "driver_version", netdev->drvinfo.version);
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
3134
/* Status for internal devices: there is no backing hardware, so only a
 * fixed driver name is reported.  Always returns 0. */
static int
netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
                           struct smap *smap)
{
    smap_add(smap, "driver_name", "openvswitch");
    return 0;
}
3142
3143 static uint32_t
3144 netdev_linux_get_block_id(struct netdev *netdev_)
3145 {
3146 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3147 uint32_t block_id = 0;
3148
3149 ovs_mutex_lock(&netdev->mutex);
3150 /* Ensure the linux netdev has had its fields populated. */
3151 if (!(netdev->cache_valid & VALID_IFINDEX)) {
3152 netdev_linux_update_via_netlink(netdev);
3153 }
3154
3155 /* Only assigning block ids to linux netdevs that are LAG masters. */
3156 if (netdev->is_lag_master) {
3157 block_id = netdev->ifindex;
3158 }
3159 ovs_mutex_unlock(&netdev->mutex);
3160
3161 return block_id;
3162 }
3163
3164 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
3165 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
3166 * returns 0. Otherwise, it returns a positive errno value; in particular,
3167 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
3168 static int
3169 netdev_linux_arp_lookup(const struct netdev *netdev,
3170 ovs_be32 ip, struct eth_addr *mac)
3171 {
3172 struct arpreq r;
3173 struct sockaddr_in sin;
3174 int retval;
3175
3176 memset(&r, 0, sizeof r);
3177 memset(&sin, 0, sizeof sin);
3178 sin.sin_family = AF_INET;
3179 sin.sin_addr.s_addr = ip;
3180 sin.sin_port = 0;
3181 memcpy(&r.arp_pa, &sin, sizeof sin);
3182 r.arp_ha.sa_family = ARPHRD_ETHER;
3183 r.arp_flags = 0;
3184 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
3185 COVERAGE_INC(netdev_arp_lookup);
3186 retval = af_inet_ioctl(SIOCGARP, &r);
3187 if (!retval) {
3188 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
3189 } else if (retval != ENXIO) {
3190 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
3191 netdev_get_name(netdev), IP_ARGS(ip),
3192 ovs_strerror(retval));
3193 }
3194 return retval;
3195 }
3196
3197 static unsigned int
3198 nd_to_iff_flags(enum netdev_flags nd)
3199 {
3200 unsigned int iff = 0;
3201 if (nd & NETDEV_UP) {
3202 iff |= IFF_UP;
3203 }
3204 if (nd & NETDEV_PROMISC) {
3205 iff |= IFF_PROMISC;
3206 }
3207 if (nd & NETDEV_LOOPBACK) {
3208 iff |= IFF_LOOPBACK;
3209 }
3210 return iff;
3211 }
3212
3213 static int
3214 iff_to_nd_flags(unsigned int iff)
3215 {
3216 enum netdev_flags nd = 0;
3217 if (iff & IFF_UP) {
3218 nd |= NETDEV_UP;
3219 }
3220 if (iff & IFF_PROMISC) {
3221 nd |= NETDEV_PROMISC;
3222 }
3223 if (iff & IFF_LOOPBACK) {
3224 nd |= NETDEV_LOOPBACK;
3225 }
3226 return nd;
3227 }
3228
/* Clears the flags in 'off' and sets the flags in 'on' on 'netdev', storing
 * the device's previous flags (translated to NETDEV_* form) in '*old_flagsp'.
 * Only issues a set_flags() call if the computed flag word actually differs
 * from the cached one.  Returns 0 on success, a positive errno value from
 * set_flags() on failure.
 *
 * Caller must hold 'netdev->mutex' (enforced by OVS_REQUIRES). */
static int
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
{
    unsigned int old_flags, new_flags;
    int error = 0;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* Refresh the cached flag word to match the kernel's view.
         * NOTE(review): the return value of get_flags() is ignored here, so
         * a failed re-read silently leaves a stale cache -- confirm this is
         * intentional best-effort behavior. */
        get_flags(&netdev->up, &netdev->ifi_flags);
    }

    return error;
}
3247
/* netdev "update_flags" callback.  When 'on' or 'off' request a change, the
 * change is applied through update_flags() (ioctl path); this is refused with
 * EOPNOTSUPP for devices living in a remote network namespace.  When no
 * change is requested, the current flags are preferably read over netlink,
 * falling back to the ioctl path if that fails. */
static int
netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
                          enum netdev_flags on, enum netdev_flags *old_flagsp)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (on || off) {
        /* Changing flags over netlink isn't supported yet. */
        if (netdev_linux_netnsid_is_remote(netdev)) {
            error = EOPNOTSUPP;
            goto exit;
        }
        error = update_flags(netdev, off, on, old_flagsp);
    } else {
        /* Try reading flags over netlink, or fall back to ioctl. */
        if (!netdev_linux_update_via_netlink(netdev)) {
            *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
        } else {
            /* With off == on == 0 this only reports the cached flags. */
            error = update_flags(netdev, off, on, old_flagsp);
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3276
/* netdev_class callbacks shared by all Linux-backed device classes
 * ("system", "tap", "internal").  Each class below instantiates this macro
 * and then overrides/extends it with designated initializers of its own
 * (construct, stats, status, ...). */
#define NETDEV_LINUX_CLASS_COMMON                               \
    .run = netdev_linux_run,                                    \
    .wait = netdev_linux_wait,                                  \
    .alloc = netdev_linux_alloc,                                \
    .destruct = netdev_linux_destruct,                          \
    .dealloc = netdev_linux_dealloc,                            \
    .send = netdev_linux_send,                                  \
    .send_wait = netdev_linux_send_wait,                        \
    .set_etheraddr = netdev_linux_set_etheraddr,                \
    .get_etheraddr = netdev_linux_get_etheraddr,                \
    .get_mtu = netdev_linux_get_mtu,                            \
    .set_mtu = netdev_linux_set_mtu,                            \
    .get_ifindex = netdev_linux_get_ifindex,                    \
    .get_carrier = netdev_linux_get_carrier,                    \
    .get_carrier_resets = netdev_linux_get_carrier_resets,      \
    .set_miimon_interval = netdev_linux_set_miimon_interval,    \
    .set_advertisements = netdev_linux_set_advertisements,      \
    .set_policing = netdev_linux_set_policing,                  \
    .get_qos_types = netdev_linux_get_qos_types,                \
    .get_qos_capabilities = netdev_linux_get_qos_capabilities,  \
    .get_qos = netdev_linux_get_qos,                            \
    .set_qos = netdev_linux_set_qos,                            \
    .get_queue = netdev_linux_get_queue,                        \
    .set_queue = netdev_linux_set_queue,                        \
    .delete_queue = netdev_linux_delete_queue,                  \
    .get_queue_stats = netdev_linux_get_queue_stats,            \
    .queue_dump_start = netdev_linux_queue_dump_start,          \
    .queue_dump_next = netdev_linux_queue_dump_next,            \
    .queue_dump_done = netdev_linux_queue_dump_done,            \
    .dump_queue_stats = netdev_linux_dump_queue_stats,          \
    .set_in4 = netdev_linux_set_in4,                            \
    .get_addr_list = netdev_linux_get_addr_list,                \
    .add_router = netdev_linux_add_router,                      \
    .get_next_hop = netdev_linux_get_next_hop,                  \
    .arp_lookup = netdev_linux_arp_lookup,                      \
    .update_flags = netdev_linux_update_flags,                  \
    .rxq_alloc = netdev_linux_rxq_alloc,                        \
    .rxq_construct = netdev_linux_rxq_construct,                \
    .rxq_destruct = netdev_linux_rxq_destruct,                  \
    .rxq_dealloc = netdev_linux_rxq_dealloc,                    \
    .rxq_recv = netdev_linux_rxq_recv,                          \
    .rxq_wait = netdev_linux_rxq_wait,                          \
    .rxq_drain = netdev_linux_rxq_drain
3320
/* "system" devices: real kernel network devices.  Supports hardware flow
 * offload (LINUX_FLOW_OFFLOAD_API) and tc block ids for LAG masters. */
const struct netdev_class netdev_linux_class = {
    NETDEV_LINUX_CLASS_COMMON,
    LINUX_FLOW_OFFLOAD_API,
    .type = "system",
    .construct = netdev_linux_construct,
    .get_stats = netdev_linux_get_stats,
    .get_features = netdev_linux_get_features,
    .get_status = netdev_linux_get_status,
    .get_block_id = netdev_linux_get_block_id
};
3331
/* "tap" devices: constructed through a tap-specific routine and with a
 * dedicated stats callback; no flow offload API, unlike "system". */
const struct netdev_class netdev_tap_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "tap",
    .construct = netdev_linux_construct_tap,
    .get_stats = netdev_tap_get_stats,
    .get_features = netdev_linux_get_features,
    .get_status = netdev_linux_get_status,
};
3340
/* "internal" devices: OVS-internal ports.  Uses internal-specific stats and
 * status callbacks and no get_features/get_block_id. */
const struct netdev_class netdev_internal_class = {
    NETDEV_LINUX_CLASS_COMMON,
    LINUX_FLOW_OFFLOAD_API,
    .type = "internal",
    .construct = netdev_linux_construct,
    .get_stats = netdev_internal_get_stats,
    .get_status = netdev_internal_get_status,
};
3349 \f
3350
/* CoDel exposes no individually configurable queues. */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET 1
#define TCA_CODEL_LIMIT 2
#define TCA_CODEL_INTERVAL 3

/* CoDel qdisc state attached to a netdev via the embedded 'tc'. */
struct codel {
    struct tc tc;
    uint32_t target;     /* Default 5000; presumably usec -- TODO confirm. */
    uint32_t limit;      /* Queue limit; default 10240 packets. */
    uint32_t interval;   /* Default 100000; presumably usec -- TODO confirm. */
};
3367
3368 static struct codel *
3369 codel_get__(const struct netdev *netdev_)
3370 {
3371 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3372 return CONTAINER_OF(netdev->tc, struct codel, tc);
3373 }
3374
3375 static void
3376 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3377 uint32_t interval)
3378 {
3379 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3380 struct codel *codel;
3381
3382 codel = xmalloc(sizeof *codel);
3383 tc_init(&codel->tc, &tc_ops_codel);
3384 codel->target = target;
3385 codel->limit = limit;
3386 codel->interval = interval;
3387
3388 netdev->tc = &codel->tc;
3389 }
3390
/* Replaces 'netdev''s root qdisc with a codel qdisc carrying the given
 * parameters (0 selects a built-in default: target 5000, limit 10240,
 * interval 100000).  Returns 0 on success, otherwise a positive errno value
 * (ENODEV if the netlink request could not even be built). */
static int
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                    uint32_t interval)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval;
    int error;

    /* Remove any existing qdisc before installing the new one. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Substitute defaults for unset (zero) parameters. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;

    nl_msg_put_string(&request, TCA_KIND, "codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
        "target %u, limit %u, interval %u error %d(%s)",
        netdev_get_name(netdev),
        otarget, olimit, ointerval,
        error, ovs_strerror(error));
    }
    return error;
}
3432
3433 static void
3434 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3435 const struct smap *details, struct codel *codel)
3436 {
3437 codel->target = smap_get_ullong(details, "target", 0);
3438 codel->limit = smap_get_ullong(details, "limit", 0);
3439 codel->interval = smap_get_ullong(details, "interval", 0);
3440
3441 if (!codel->target) {
3442 codel->target = 5000;
3443 }
3444 if (!codel->limit) {
3445 codel->limit = 10240;
3446 }
3447 if (!codel->interval) {
3448 codel->interval = 100000;
3449 }
3450 }
3451
3452 static int
3453 codel_tc_install(struct netdev *netdev, const struct smap *details)
3454 {
3455 int error;
3456 struct codel codel;
3457
3458 codel_parse_qdisc_details__(netdev, details, &codel);
3459 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3460 codel.interval);
3461 if (!error) {
3462 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3463 }
3464 return error;
3465 }
3466
3467 static int
3468 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3469 {
3470 static const struct nl_policy tca_codel_policy[] = {
3471 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3472 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3473 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3474 };
3475
3476 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3477
3478 if (!nl_parse_nested(nl_options, tca_codel_policy,
3479 attrs, ARRAY_SIZE(tca_codel_policy))) {
3480 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3481 return EPROTO;
3482 }
3483
3484 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3485 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3486 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
3487 return 0;
3488 }
3489
3490 static int
3491 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3492 {
3493 struct nlattr *nlattr;
3494 const char * kind;
3495 int error;
3496 struct codel codel;
3497
3498 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3499 if (error != 0) {
3500 return error;
3501 }
3502
3503 error = codel_parse_tca_options__(nlattr, &codel);
3504 if (error != 0) {
3505 return error;
3506 }
3507
3508 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3509 return 0;
3510 }
3511
3512
3513 static void
3514 codel_tc_destroy(struct tc *tc)
3515 {
3516 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3517 tc_destroy(tc);
3518 free(codel);
3519 }
3520
3521 static int
3522 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3523 {
3524 const struct codel *codel = codel_get__(netdev);
3525 smap_add_format(details, "target", "%u", codel->target);
3526 smap_add_format(details, "limit", "%u", codel->limit);
3527 smap_add_format(details, "interval", "%u", codel->interval);
3528 return 0;
3529 }
3530
3531 static int
3532 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3533 {
3534 struct codel codel;
3535
3536 codel_parse_qdisc_details__(netdev, details, &codel);
3537 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3538 codel_get__(netdev)->target = codel.target;
3539 codel_get__(netdev)->limit = codel.limit;
3540 codel_get__(netdev)->interval = codel.interval;
3541 return 0;
3542 }
3543
/* tc_ops vtable binding the "linux-codel" OVS QoS type to the kernel's
 * "codel" qdisc. */
static const struct tc_ops tc_ops_codel = {
    .linux_name = "codel",
    .ovs_name = "linux-codel",
    .n_queues = CODEL_N_QUEUES,
    .tc_install = codel_tc_install,
    .tc_load = codel_tc_load,
    .tc_destroy = codel_tc_destroy,
    .qdisc_get = codel_qdisc_get,
    .qdisc_set = codel_qdisc_set,
};
3554 \f
3555 /* FQ-CoDel traffic control class. */
3556
/* FQ-CoDel exposes no individually configurable queues. */
#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET 1
#define TCA_FQ_CODEL_LIMIT 2
#define TCA_FQ_CODEL_INTERVAL 3
#define TCA_FQ_CODEL_ECN 4
#define TCA_FQ_CODEL_FLOWS 5
#define TCA_FQ_CODEL_QUANTUM 6

/* FQ-CoDel qdisc state attached to a netdev via the embedded 'tc'. */
struct fqcodel {
    struct tc tc;
    uint32_t target;     /* Default 5000; presumably usec -- TODO confirm. */
    uint32_t limit;      /* Queue limit; default 10240 packets. */
    uint32_t interval;   /* See note in fqcodel_parse_qdisc_details__(). */
    uint32_t flows;      /* Number of flow queues; default 1024. */
    uint32_t quantum;    /* Default 1514 bytes (not MTU). */
};
3578
3579 static struct fqcodel *
3580 fqcodel_get__(const struct netdev *netdev_)
3581 {
3582 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3583 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3584 }
3585
3586 static void
3587 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3588 uint32_t interval, uint32_t flows, uint32_t quantum)
3589 {
3590 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3591 struct fqcodel *fqcodel;
3592
3593 fqcodel = xmalloc(sizeof *fqcodel);
3594 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3595 fqcodel->target = target;
3596 fqcodel->limit = limit;
3597 fqcodel->interval = interval;
3598 fqcodel->flows = flows;
3599 fqcodel->quantum = quantum;
3600
3601 netdev->tc = &fqcodel->tc;
3602 }
3603
/* Replaces 'netdev''s root qdisc with an fq_codel qdisc carrying the given
 * parameters (0 selects a built-in default).  Returns 0 on success,
 * otherwise a positive errno value (ENODEV if the netlink request could not
 * even be built). */
static int
fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                      uint32_t interval, uint32_t flows, uint32_t quantum)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval, oflows,  oquantum;
    int error;

    /* Remove any existing qdisc before installing the new one. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Substitute defaults for unset (zero) parameters.  NOTE(review): the
     * interval default here (100000) differs from the 1000000 fallback in
     * fqcodel_parse_qdisc_details__() -- confirm which is intended. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;
    oflows = flows ? flows : 1024;
    oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
                                            not mtu */

    nl_msg_put_string(&request, TCA_KIND, "fq_codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
        "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
        netdev_get_name(netdev),
        otarget, olimit, ointerval, oflows, oquantum,
        error, ovs_strerror(error));
    }
    return error;
}
3650
3651 static void
3652 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3653 const struct smap *details, struct fqcodel *fqcodel)
3654 {
3655 fqcodel->target = smap_get_ullong(details, "target", 0);
3656 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3657 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3658 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3659 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3660
3661 if (!fqcodel->target) {
3662 fqcodel->target = 5000;
3663 }
3664 if (!fqcodel->limit) {
3665 fqcodel->limit = 10240;
3666 }
3667 if (!fqcodel->interval) {
3668 fqcodel->interval = 1000000;
3669 }
3670 if (!fqcodel->flows) {
3671 fqcodel->flows = 1024;
3672 }
3673 if (!fqcodel->quantum) {
3674 fqcodel->quantum = 1514;
3675 }
3676 }
3677
3678 static int
3679 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3680 {
3681 int error;
3682 struct fqcodel fqcodel;
3683
3684 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3685 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3686 fqcodel.interval, fqcodel.flows,
3687 fqcodel.quantum);
3688 if (!error) {
3689 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3690 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3691 }
3692 return error;
3693 }
3694
3695 static int
3696 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3697 {
3698 static const struct nl_policy tca_fqcodel_policy[] = {
3699 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3700 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3701 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3702 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3703 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3704 };
3705
3706 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3707
3708 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3709 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3710 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3711 return EPROTO;
3712 }
3713
3714 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3715 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3716 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3717 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3718 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3719 return 0;
3720 }
3721
3722 static int
3723 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3724 {
3725 struct nlattr *nlattr;
3726 const char * kind;
3727 int error;
3728 struct fqcodel fqcodel;
3729
3730 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3731 if (error != 0) {
3732 return error;
3733 }
3734
3735 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3736 if (error != 0) {
3737 return error;
3738 }
3739
3740 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3741 fqcodel.flows, fqcodel.quantum);
3742 return 0;
3743 }
3744
3745 static void
3746 fqcodel_tc_destroy(struct tc *tc)
3747 {
3748 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3749 tc_destroy(tc);
3750 free(fqcodel);
3751 }
3752
3753 static int
3754 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3755 {
3756 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3757 smap_add_format(details, "target", "%u", fqcodel->target);
3758 smap_add_format(details, "limit", "%u", fqcodel->limit);
3759 smap_add_format(details, "interval", "%u", fqcodel->interval);
3760 smap_add_format(details, "flows", "%u", fqcodel->flows);
3761 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3762 return 0;
3763 }
3764
3765 static int
3766 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3767 {
3768 struct fqcodel fqcodel;
3769
3770 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3771 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3772 fqcodel.flows, fqcodel.quantum);
3773 fqcodel_get__(netdev)->target = fqcodel.target;
3774 fqcodel_get__(netdev)->limit = fqcodel.limit;
3775 fqcodel_get__(netdev)->interval = fqcodel.interval;
3776 fqcodel_get__(netdev)->flows = fqcodel.flows;
3777 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3778 return 0;
3779 }
3780
/* tc_ops vtable binding the "linux-fq_codel" OVS QoS type to the kernel's
 * "fq_codel" qdisc. */
static const struct tc_ops tc_ops_fqcodel = {
    .linux_name = "fq_codel",
    .ovs_name = "linux-fq_codel",
    .n_queues = FQCODEL_N_QUEUES,
    .tc_install = fqcodel_tc_install,
    .tc_load = fqcodel_tc_load,
    .tc_destroy = fqcodel_tc_destroy,
    .qdisc_get = fqcodel_qdisc_get,
    .qdisc_set = fqcodel_qdisc_set,
};
3791 \f
3792 /* SFQ traffic control class. */
3793
/* SFQ exposes no individually configurable queues. */
#define SFQ_N_QUEUES 0x0000

/* SFQ qdisc state attached to a netdev via the embedded 'tc'. */
struct sfq {
    struct tc tc;
    uint32_t quantum;    /* Defaults to the device MTU when available. */
    uint32_t perturb;    /* Hash perturbation period; default 10 --
                          * presumably seconds, TODO confirm. */
};
3801
3802 static struct sfq *
3803 sfq_get__(const struct netdev *netdev_)
3804 {
3805 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3806 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3807 }
3808
3809 static void
3810 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3811 {
3812 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3813 struct sfq *sfq;
3814
3815 sfq = xmalloc(sizeof *sfq);
3816 tc_init(&sfq->tc, &tc_ops_sfq);
3817 sfq->perturb = perturb;
3818 sfq->quantum = quantum;
3819
3820 netdev->tc = &sfq->tc;
3821 }
3822
/* Replaces 'netdev''s root qdisc with an sfq qdisc carrying the given
 * parameters.  A zero 'quantum' falls back to the device MTU (or, if the MTU
 * cannot be read, to the kernel's own default by leaving the field zero); a
 * zero 'perturb' falls back to 10.  Returns 0 on success, otherwise a
 * positive errno value (ENODEV if the netlink request could not even be
 * built). */
static int
sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
{
    struct tc_sfq_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int mtu;
    int mtu_error, error;
    /* Fetch the MTU up front; a failure only matters if 'quantum' is 0. */
    mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);

    /* Remove any existing qdisc before installing the new one. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    if (!quantum) {
        if (!mtu_error) {
            opt.quantum = mtu; /* if we cannot find mtu, use default */
        }
    } else {
        opt.quantum = quantum;
    }

    if (!perturb) {
        opt.perturb_period = 10;
    } else {
        opt.perturb_period = perturb;
    }

    nl_msg_put_string(&request, TCA_KIND, "sfq");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "quantum %u, perturb %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.quantum, opt.perturb_period,
                     error, ovs_strerror(error));
    }
    return error;
}
3871
3872 static void
3873 sfq_parse_qdisc_details__(struct netdev *netdev,
3874 const struct smap *details, struct sfq *sfq)
3875 {
3876 sfq->perturb = smap_get_ullong(details, "perturb", 0);
3877 sfq->quantum = smap_get_ullong(details, "quantum", 0);
3878
3879 if (!sfq->perturb) {
3880 sfq->perturb = 10;
3881 }
3882
3883 if (!sfq->quantum) {
3884 int mtu;
3885 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
3886 sfq->quantum = mtu;
3887 } else {
3888 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3889 "device without mtu");
3890 }
3891 }
3892 }
3893
3894 static int
3895 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3896 {
3897 int error;
3898 struct sfq sfq;
3899
3900 sfq_parse_qdisc_details__(netdev, details, &sfq);
3901 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3902 if (!error) {
3903 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3904 }
3905 return error;
3906 }
3907
3908 static int
3909 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3910 {
3911 const struct tc_sfq_qopt *sfq;
3912 struct nlattr *nlattr;
3913 const char * kind;
3914 int error;
3915
3916 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3917 if (error == 0) {
3918 sfq = nl_attr_get(nlattr);
3919 sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
3920 return 0;
3921 }
3922
3923 return error;
3924 }
3925
3926 static void
3927 sfq_tc_destroy(struct tc *tc)
3928 {
3929 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3930 tc_destroy(tc);
3931 free(sfq);
3932 }
3933
3934 static int
3935 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3936 {
3937 const struct sfq *sfq = sfq_get__(netdev);
3938 smap_add_format(details, "quantum", "%u", sfq->quantum);
3939 smap_add_format(details, "perturb", "%u", sfq->perturb);
3940 return 0;
3941 }
3942
3943 static int
3944 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3945 {
3946 struct sfq sfq;
3947
3948 sfq_parse_qdisc_details__(netdev, details, &sfq);
3949 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3950 sfq_get__(netdev)->quantum = sfq.quantum;
3951 sfq_get__(netdev)->perturb = sfq.perturb;
3952 return 0;
3953 }
3954
/* tc_ops vtable binding the "linux-sfq" OVS QoS type to the kernel's "sfq"
 * qdisc. */
static const struct tc_ops tc_ops_sfq = {
    .linux_name = "sfq",
    .ovs_name = "linux-sfq",
    .n_queues = SFQ_N_QUEUES,
    .tc_install = sfq_tc_install,
    .tc_load = sfq_tc_load,
    .tc_destroy = sfq_tc_destroy,
    .qdisc_get = sfq_qdisc_get,
    .qdisc_set = sfq_qdisc_set,
};
3965 \f
3966 /* netem traffic control class. */
3967
/* netem qdisc state attached to a netdev via the embedded 'tc'. */
struct netem {
    struct tc tc;
    uint32_t latency;    /* Converted via tc_time_to_ticks() on install;
                          * presumably usec -- TODO confirm. */
    uint32_t limit;      /* Packet limit; default 1000. */
    uint32_t loss;       /* Loss as a percentage, 0..100 (validated in
                          * netem_setup_qdisc__()). */
};
3974
3975 static struct netem *
3976 netem_get__(const struct netdev *netdev_)
3977 {
3978 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3979 return CONTAINER_OF(netdev->tc, struct netem, tc);
3980 }
3981
3982 static void
3983 netem_install__(struct netdev *netdev_, uint32_t latency,
3984 uint32_t limit, uint32_t loss)
3985 {
3986 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3987 struct netem *netem;
3988
3989 netem = xmalloc(sizeof *netem);
3990 tc_init(&netem->tc, &tc_ops_netem);
3991 netem->latency = latency;
3992 netem->limit = limit;
3993 netem->loss = loss;
3994
3995 netdev->tc = &netem->tc;
3996 }
3997
/* Replaces 'netdev''s root qdisc with a netem qdisc carrying the given
 * parameters.  A zero 'limit' defaults to 1000 packets; 'loss' is a
 * percentage and values above 100 are rejected with EINVAL.  Returns 0 on
 * success, otherwise a positive errno value (ENODEV if the netlink request
 * could not even be built). */
static int
netem_setup_qdisc__(struct netdev *netdev, uint32_t latency,
                    uint32_t limit, uint32_t loss)
{
    struct tc_netem_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    /* Remove any existing qdisc before installing the new one. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);

    if (!limit) {
        opt.limit = 1000;
    } else {
        opt.limit = limit;
    }

    if (loss) {
        if (loss > 100) {
            VLOG_WARN_RL(&rl,
                         "loss should be a percentage value between 0 to 100, "
                         "loss was %u", loss);
            return EINVAL;
        }
        /* The kernel expects loss scaled so that UINT32_MAX == 100%. */
        opt.loss = floor(UINT32_MAX * (loss / 100.0));
    }

    /* Convert the latency into the kernel's clock-tick units. */
    opt.latency = tc_time_to_ticks(latency);

    nl_msg_put_string(&request, TCA_KIND, "netem");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "latency %u, limit %u, loss %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.latency, opt.limit, opt.loss,
                     error, ovs_strerror(error));
    }
    return error;
}
4050
4051 static void
4052 netem_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
4053 const struct smap *details, struct netem *netem)
4054 {
4055 netem->latency = smap_get_ullong(details, "latency", 0);
4056 netem->limit = smap_get_ullong(details, "limit", 0);
4057 netem->loss = smap_get_ullong(details, "loss", 0);
4058
4059 if (!netem->limit) {
4060 netem->limit = 1000;
4061 }
4062 }
4063
4064 static int
4065 netem_tc_install(struct netdev *netdev, const struct smap *details)
4066 {
4067 int error;
4068 struct netem netem;
4069
4070 netem_parse_qdisc_details__(netdev, details, &netem);
4071 error = netem_setup_qdisc__(netdev, netem.latency,
4072 netem.limit, netem.loss);
4073 if (!error) {
4074 netem_install__(netdev, netem.latency, netem.limit, netem.loss);
4075 }
4076 return error;
4077 }
4078
4079 static int
4080 netem_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4081 {
4082 const struct tc_netem_qopt *netem;
4083 struct nlattr *nlattr;
4084 const char *kind;
4085 int error;
4086
4087 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4088 if (error == 0) {
4089 netem = nl_attr_get(nlattr);
4090 netem_install__(netdev, netem->latency, netem->limit, netem->loss);
4091 return 0;
4092 }
4093
4094 return error;
4095 }
4096
4097 static void
4098 netem_tc_destroy(struct tc *tc)
4099 {
4100 struct netem *netem = CONTAINER_OF(tc, struct netem, tc);
4101 tc_destroy(tc);
4102 free(netem);
4103 }
4104
4105 static int
4106 netem_qdisc_get(const struct netdev *netdev, struct smap *details)
4107 {
4108 const struct netem *netem = netem_get__(netdev);
4109 smap_add_format(details, "latency", "%u", netem->latency);
4110 smap_add_format(details, "limit", "%u", netem->limit);
4111 smap_add_format(details, "loss", "%u", netem->loss);
4112 return 0;
4113 }
4114
4115 static int
4116 netem_qdisc_set(struct netdev *netdev, const struct smap *details)
4117 {
4118 struct netem netem;
4119
4120 netem_parse_qdisc_details__(netdev, details, &netem);
4121 netem_install__(netdev, netem.latency, netem.limit, netem.loss);
4122 netem_get__(netdev)->latency = netem.latency;
4123 netem_get__(netdev)->limit = netem.limit;
4124 netem_get__(netdev)->loss = netem.loss;
4125 return 0;
4126 }
4127
/* tc_ops vtable binding the "linux-netem" OVS QoS type to the kernel's
 * "netem" qdisc. */
static const struct tc_ops tc_ops_netem = {
    .linux_name = "netem",
    .ovs_name = "linux-netem",
    .n_queues = 0,
    .tc_install = netem_tc_install,
    .tc_load = netem_tc_load,
    .tc_destroy = netem_tc_destroy,
    .qdisc_get = netem_qdisc_get,
    .qdisc_set = netem_qdisc_set,
};
4138 \f
4139 /* HTB traffic control class. */
4140
#define HTB_N_QUEUES 0xf000      /* Maximum number of HTB queues. */
#define HTB_RATE2QUANTUM 10      /* r2q divisor passed to the kernel. */

/* HTB qdisc state attached to a netdev via the embedded 'tc'. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};

/* Per-queue HTB class configuration. */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
4156
4157 static struct htb *
4158 htb_get__(const struct netdev *netdev_)
4159 {
4160 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4161 return CONTAINER_OF(netdev->tc, struct htb, tc);
4162 }
4163
4164 static void
4165 htb_install__(struct netdev *netdev_, uint64_t max_rate)
4166 {
4167 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4168 struct htb *htb;
4169
4170 htb = xmalloc(sizeof *htb);
4171 tc_init(&htb->tc, &tc_ops_htb);
4172 htb->max_rate = max_rate;
4173
4174 netdev->tc = &htb->tc;
4175 }
4176
/* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 * Returns 0 on success, otherwise a positive errno value (ENODEV if the
 * netlink request could not even be built). */
static int
htb_setup_qdisc__(struct netdev *netdev)
{
    size_t opt_offset;
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    /* Remove any existing qdisc before installing the new one. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    /* Global HTB parameters: rate-to-quantum divisor and the default class
     * (1) for unclassified traffic. */
    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = HTB_RATE2QUANTUM;
    opt.version = 3;
    opt.defcls = 1;

    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    return tc_transact(&request, NULL);
}
4211
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * 'class' supplies rates in bytes/s and burst in bytes.  Returns 0 on
 * success, otherwise a positive errno value. */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    /* The MTU is needed to size rate tables and the quantum below. */
    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
        opt.quantum = mtu;
    }
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
4271
/* Parses Netlink attributes in 'nl_options' for HTB parameters and stores
 * them in 'class'.  (The values correspond to the linux-htb queue details
 * described in the vswitch database documentation.)
 *
 * Returns 0 on success, EPROTO if the attributes cannot be parsed. */
static int
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
{
    static const struct nl_policy tca_htb_policy[] = {
        [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
                            .min_len = sizeof(struct tc_htb_opt) },
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
    const struct tc_htb_opt *htb;

    if (!nl_parse_nested(nl_options, tca_htb_policy,
                         attrs, ARRAY_SIZE(tca_htb_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HTB class options");
        return EPROTO;
    }

    htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
    class->min_rate = htb->rate.rate;
    class->max_rate = htb->ceil.rate;
    /* The kernel reports the burst in ticks; convert back to bytes. */
    class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
    class->priority = htb->prio;
    return 0;
}
4300
4301 static int
4302 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4303 struct htb_class *options,
4304 struct netdev_queue_stats *stats)
4305 {
4306 struct nlattr *nl_options;
4307 unsigned int handle;
4308 int error;
4309
4310 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4311 if (!error && queue_id) {
4312 unsigned int major = tc_get_major(handle);
4313 unsigned int minor = tc_get_minor(handle);
4314 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4315 *queue_id = minor - 1;
4316 } else {
4317 error = EPROTO;
4318 }
4319 }
4320 if (!error && options) {
4321 error = htb_parse_tca_options__(nl_options, options);
4322 }
4323 return error;
4324 }
4325
/* Fills 'hc' with qdisc-level HTB settings taken from 'details'.
 * "max-rate" is given in bits/s and stored in bytes/s; when it is absent or
 * zero, falls back to the link's advertised speed (100 Mbps if unknown).
 * min_rate mirrors max_rate; burst and priority default to 0. */
static void
htb_parse_qdisc_details__(struct netdev *netdev_,
                          const struct smap *details, struct htb_class *hc)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
    if (!hc->max_rate) {
        enum netdev_features current;

        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }
    hc->min_rate = hc->max_rate;
    hc->burst = 0;
    hc->priority = 0;
}
4344
/* Translates queue settings in 'details' (linux-htb queue keys, rates in
 * bits/s) into 'hc', clamping them against the device MTU and the qdisc-wide
 * max_rate.  Returns 0 on success, otherwise a positive errno value. */
static int
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
{
    const struct htb *htb = htb_get__(netdev);
    int mtu, error;
    unsigned long long int max_rate_bit;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate */
    max_rate_bit = smap_get_ullong(details, "max-rate", 0);
    hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = smap_get_ullong(details, "burst", 0) / 8;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority */
    hc->priority = smap_get_ullong(details, "priority", 0);

    return 0;
}
4389
4390 static int
4391 htb_query_class__(const struct netdev *netdev, unsigned int handle,
4392 unsigned int parent, struct htb_class *options,
4393 struct netdev_queue_stats *stats)
4394 {
4395 struct ofpbuf *reply;
4396 int error;
4397
4398 error = tc_query_class(netdev, handle, parent, &reply);
4399 if (!error) {
4400 error = htb_parse_tcmsg__(reply, NULL, options, stats);
4401 ofpbuf_delete(reply);
4402 }
4403 return error;
4404 }
4405
4406 static int
4407 htb_tc_install(struct netdev *netdev, const struct smap *details)
4408 {
4409 int error;
4410
4411 error = htb_setup_qdisc__(netdev);
4412 if (!error) {
4413 struct htb_class hc;
4414
4415 htb_parse_qdisc_details__(netdev, details, &hc);
4416 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4417 tc_make_handle(1, 0), &hc);
4418 if (!error) {
4419 htb_install__(netdev, hc.max_rate);
4420 }
4421 }
4422 return error;
4423 }
4424
/* Converts a generic tc_queue back to its enclosing htb_class. */
static struct htb_class *
htb_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct htb_class, tc_queue);
}
4430
/* Creates or refreshes the cached htb_class for 'queue_id' in 'netdev''s HTB
 * state, copying the settings from 'hc'. */
static void
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
                   const struct htb_class *hc)
{
    struct htb *htb = htb_get__(netdev);
    size_t hash = hash_int(queue_id, 0);
    struct tc_queue *queue;
    struct htb_class *hcp;

    queue = tc_find_queue__(netdev, queue_id, hash);
    if (queue) {
        hcp = htb_class_cast__(queue);
    } else {
        /* First sighting of this queue: allocate and index it. */
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
    }

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
    hcp->burst = hc->burst;
    hcp->priority = hc->priority;
}
4456
/* tc_load callback for linux-htb: reconstructs OVS's view of an HTB qdisc
 * that already exists in the kernel.  Queries the default class (1:fffe) for
 * the qdisc max-rate, then dumps all classes to populate the queue cache.
 * Returns 0 on success, ENODEV if the class dump cannot be started. */
static int
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct htb_class hc;

    /* Get qdisc options. */
    hc.max_rate = 0;
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    htb_install__(netdev, hc.max_rate);

    /* Get queues. */
    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Unparseable classes are silently skipped. */
        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            htb_update_queue__(netdev, queue_id, &hc);
        }
    }
    finish_queue_dump(&state);

    return 0;
}
4484
4485 static void
4486 htb_tc_destroy(struct tc *tc)
4487 {
4488 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4489 struct htb_class *hc;
4490
4491 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
4492 free(hc);
4493 }
4494 tc_destroy(tc);
4495 free(htb);
4496 }
4497
4498 static int
4499 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
4500 {
4501 const struct htb *htb = htb_get__(netdev);
4502 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
4503 return 0;
4504 }
4505
4506 static int
4507 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
4508 {
4509 struct htb_class hc;
4510 int error;
4511
4512 htb_parse_qdisc_details__(netdev, details, &hc);
4513 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4514 tc_make_handle(1, 0), &hc);
4515 if (!error) {
4516 htb_get__(netdev)->max_rate = hc.max_rate;
4517 }
4518 return error;
4519 }
4520
/* class_get callback for linux-htb: reports 'queue''s cached settings into
 * 'details', converting byte rates back to bits/s.  "max-rate" is omitted
 * when equal to "min-rate", and "priority" when zero. */
static int
htb_class_get(const struct netdev *netdev OVS_UNUSED,
              const struct tc_queue *queue, struct smap *details)
{
    const struct htb_class *hc = htb_class_cast__(queue);

    smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
    if (hc->min_rate != hc->max_rate) {
        smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
    }
    smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
    if (hc->priority) {
        smap_add_format(details, "priority", "%u", hc->priority);
    }
    return 0;
}
4537
4538 static int
4539 htb_class_set(struct netdev *netdev, unsigned int queue_id,
4540 const struct smap *details)
4541 {
4542 struct htb_class hc;
4543 int error;
4544
4545 error = htb_parse_class_details__(netdev, details, &hc);
4546 if (error) {
4547 return error;
4548 }
4549
4550 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4551 tc_make_handle(1, 0xfffe), &hc);
4552 if (error) {
4553 return error;
4554 }
4555
4556 htb_update_queue__(netdev, queue_id, &hc);
4557 return 0;
4558 }
4559
4560 static int
4561 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
4562 {
4563 struct htb_class *hc = htb_class_cast__(queue);
4564 struct htb *htb = htb_get__(netdev);
4565 int error;
4566
4567 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4568 if (!error) {
4569 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
4570 free(hc);
4571 }
4572 return error;
4573 }
4574
/* class_get_stats callback for linux-htb: queries the kernel for the current
 * statistics of 'queue' (class 1:(queue_id + 1) under 1:fffe). */
static int
htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
                    struct netdev_queue_stats *stats)
{
    return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
                             tc_make_handle(1, 0xfffe), NULL, stats);
}
4582
/* class_dump_stats callback for linux-htb: parses one class message in
 * 'nlmsg' and, when its handle maps to a valid queue (1:1 .. 1:HTB_N_QUEUES),
 * invokes 'cb' with the queue id and its statistics. */
static int
htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
                     const struct ofpbuf *nlmsg,
                     netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_queue_stats stats;
    unsigned int handle, major, minor;
    int error;

    error = tc_parse_class(nlmsg, &handle, NULL, &stats);
    if (error) {
        return error;
    }

    major = tc_get_major(handle);
    minor = tc_get_minor(handle);
    if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
        (*cb)(minor - 1, &stats, aux);
    }
    return 0;
}
4604
/* tc_ops implementation for the "linux-htb" QoS type. */
static const struct tc_ops tc_ops_htb = {
    .linux_name = "htb",
    .ovs_name = "linux-htb",
    .n_queues = HTB_N_QUEUES,
    .tc_install = htb_tc_install,
    .tc_load = htb_tc_load,
    .tc_destroy = htb_tc_destroy,
    .qdisc_get = htb_qdisc_get,
    .qdisc_set = htb_qdisc_set,
    .class_get = htb_class_get,
    .class_set = htb_class_set,
    .class_delete = htb_class_delete,
    .class_get_stats = htb_class_get_stats,
    .class_dump_stats = htb_class_dump_stats
};
4620 \f
4621 /* "linux-hfsc" traffic control class. */
4622
4623 #define HFSC_N_QUEUES 0xf000
4624
/* Per-netdev state for an installed HFSC qdisc. */
struct hfsc {
    struct tc tc;
    uint32_t max_rate;          /* In bytes/s. */
};
4629
/* Cached settings for one HFSC class (one OVS queue). */
struct hfsc_class {
    struct tc_queue tc_queue;
    uint32_t min_rate;          /* In bytes/s. */
    uint32_t max_rate;          /* In bytes/s. */
};
4635
/* Returns the struct hfsc embedded in 'netdev_''s current traffic control
 * state.  Only valid while an HFSC qdisc is installed. */
static struct hfsc *
hfsc_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct hfsc, tc);
}
4642
/* Converts a generic tc_queue back to its enclosing hfsc_class. */
static struct hfsc_class *
hfsc_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
}
4648
4649 static void
4650 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4651 {
4652 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4653 struct hfsc *hfsc;
4654
4655 hfsc = xmalloc(sizeof *hfsc);
4656 tc_init(&hfsc->tc, &tc_ops_hfsc);
4657 hfsc->max_rate = max_rate;
4658 netdev->tc = &hfsc->tc;
4659 }
4660
4661 static void
4662 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4663 const struct hfsc_class *hc)
4664 {
4665 size_t hash;
4666 struct hfsc *hfsc;
4667 struct hfsc_class *hcp;
4668 struct tc_queue *queue;
4669
4670 hfsc = hfsc_get__(netdev);
4671 hash = hash_int(queue_id, 0);
4672
4673 queue = tc_find_queue__(netdev, queue_id, hash);
4674 if (queue) {
4675 hcp = hfsc_class_cast__(queue);
4676 } else {
4677 hcp = xmalloc(sizeof *hcp);
4678 queue = &hcp->tc_queue;
4679 queue->queue_id = queue_id;
4680 queue->created = time_msec();
4681 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4682 }
4683
4684 hcp->min_rate = hc->min_rate;
4685 hcp->max_rate = hc->max_rate;
4686 }
4687
/* Extracts the min/max rates from the HFSC class attributes in 'nl_options'
 * into 'class'.  Only the shape of curves that OVS itself installs is
 * accepted: all three service curves must be linear (m1 == 0, d == 0), the
 * real-time and link-share curves must match, and the min rate must not
 * exceed the upper-limit rate.  Returns 0 on success, EPROTO otherwise. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
4746
/* Parses an HFSC class message in 'tcmsg'.  On success, stores the OVS
 * queue id (class minor - 1) in '*queue_id' (if nonnull), the rates in
 * '*options' (if nonnull), and statistics in '*stats' (if nonnull).
 * Returns 0 on success, otherwise a positive errno value. */
static int
hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                   struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    int error;
    unsigned int handle;
    struct nlattr *nl_options;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (error) {
        return error;
    }

    if (queue_id) {
        unsigned int major, minor;

        major = tc_get_major(handle);
        minor = tc_get_minor(handle);
        /* Queue N is mapped to class 1:(N + 1). */
        if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
            *queue_id = minor - 1;
        } else {
            return EPROTO;
        }
    }

    if (options) {
        error = hfsc_parse_tca_options__(nl_options, options);
    }

    return error;
}
4779
4780 static int
4781 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4782 unsigned int parent, struct hfsc_class *options,
4783 struct netdev_queue_stats *stats)
4784 {
4785 int error;
4786 struct ofpbuf *reply;
4787
4788 error = tc_query_class(netdev, handle, parent, &reply);
4789 if (error) {
4790 return error;
4791 }
4792
4793 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4794 ofpbuf_delete(reply);
4795 return error;
4796 }
4797
/* Fills 'class' with qdisc-level HFSC settings taken from 'details'.
 * "max-rate" is given in bits/s and stored in bytes/s; when it is absent or
 * zero, falls back to the link's advertised speed (100 Mbps if unknown).
 * min_rate mirrors max_rate. */
static void
hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
                           struct hfsc_class *class)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
    if (!max_rate) {
        enum netdev_features current;

        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }

    class->min_rate = max_rate;
    class->max_rate = max_rate;
}
4816
4817 static int
4818 hfsc_parse_class_details__(struct netdev *netdev,
4819 const struct smap *details,
4820 struct hfsc_class * class)
4821 {
4822 const struct hfsc *hfsc;
4823 uint32_t min_rate, max_rate;
4824
4825 hfsc = hfsc_get__(netdev);
4826
4827 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4828 min_rate = MAX(min_rate, 1);
4829 min_rate = MIN(min_rate, hfsc->max_rate);
4830
4831 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
4832 max_rate = MAX(max_rate, min_rate);
4833 max_rate = MIN(max_rate, hfsc->max_rate);
4834
4835 class->min_rate = min_rate;
4836 class->max_rate = max_rate;
4837
4838 return 0;
4839 }
4840
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1".
 *
 * Returns 0 on success, otherwise a positive errno value (ENODEV if the
 * device has no ifindex). */
static int
hfsc_setup_qdisc__(struct netdev * netdev)
{
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    /* Discard any pre-existing root qdisc so the new one starts clean. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    opt.defcls = 1;             /* Unclassified traffic goes to class 1:1. */

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    return tc_transact(&request, NULL);
}
4871
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>"
 *
 * 'class' supplies rates in bytes/s.  Returns 0 on success, otherwise a
 * positive errno value. */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Linear service curves only (m1 == 0, d == 0): 'min' carries the
     * guaranteed rate, 'max' the upper limit. */
    min.m1 = 0;
    min.d = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d = 0;
    max.m2 = class->max_rate;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    /* Real-time and link-share curves are both 'min' (the "sc" shorthand);
     * the upper-limit curve is 'max'. */
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
4923
4924 static int
4925 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4926 {
4927 int error;
4928 struct hfsc_class class;
4929
4930 error = hfsc_setup_qdisc__(netdev);
4931
4932 if (error) {
4933 return error;
4934 }
4935
4936 hfsc_parse_qdisc_details__(netdev, details, &class);
4937 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4938 tc_make_handle(1, 0), &class);
4939
4940 if (error) {
4941 return error;
4942 }
4943
4944 hfsc_install__(netdev, class.max_rate);
4945 return 0;
4946 }
4947
/* tc_load callback for linux-hfsc: reconstructs OVS's view of an HFSC qdisc
 * that already exists in the kernel.  Queries the default class (1:fffe) for
 * the qdisc max-rate, then dumps all classes to populate the queue cache.
 * Returns 0 on success, ENODEV if the class dump cannot be started. */
static int
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct hfsc_class hc;

    /* Get qdisc options. */
    hc.max_rate = 0;
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    hfsc_install__(netdev, hc.max_rate);

    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }

    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Unparseable classes are silently skipped. */
        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            hfsc_update_queue__(netdev, queue_id, &hc);
        }
    }

    finish_queue_dump(&state);
    return 0;
}
4974
4975 static void
4976 hfsc_tc_destroy(struct tc *tc)
4977 {
4978 struct hfsc *hfsc;
4979 struct hfsc_class *hc, *next;
4980
4981 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4982
4983 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4984 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4985 free(hc);
4986 }
4987
4988 tc_destroy(tc);
4989 free(hfsc);
4990 }
4991
4992 static int
4993 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4994 {
4995 const struct hfsc *hfsc;
4996 hfsc = hfsc_get__(netdev);
4997 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
4998 return 0;
4999 }
5000
5001 static int
5002 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
5003 {
5004 int error;
5005 struct hfsc_class class;
5006
5007 hfsc_parse_qdisc_details__(netdev, details, &class);
5008 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5009 tc_make_handle(1, 0), &class);
5010
5011 if (!error) {
5012 hfsc_get__(netdev)->max_rate = class.max_rate;
5013 }
5014
5015 return error;
5016 }
5017
5018 static int
5019 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
5020 const struct tc_queue *queue, struct smap *details)
5021 {
5022 const struct hfsc_class *hc;
5023
5024 hc = hfsc_class_cast__(queue);
5025 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
5026 if (hc->min_rate != hc->max_rate) {
5027 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
5028 }
5029 return 0;
5030 }
5031
5032 static int
5033 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
5034 const struct smap *details)
5035 {
5036 int error;
5037 struct hfsc_class class;
5038
5039 error = hfsc_parse_class_details__(netdev, details, &class);
5040 if (error) {
5041 return error;
5042 }
5043
5044 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
5045 tc_make_handle(1, 0xfffe), &class);
5046 if (error) {
5047 return error;
5048 }
5049
5050 hfsc_update_queue__(netdev, queue_id, &class);
5051 return 0;
5052 }
5053
5054 static int
5055 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
5056 {
5057 int error;
5058 struct hfsc *hfsc;
5059 struct hfsc_class *hc;
5060
5061 hc = hfsc_class_cast__(queue);
5062 hfsc = hfsc_get__(netdev);
5063
5064 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
5065 if (!error) {
5066 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5067 free(hc);
5068 }
5069 return error;
5070 }
5071
/* class_get_stats callback for linux-hfsc: queries the kernel for the
 * current statistics of 'queue' (class 1:(queue_id + 1) under 1:fffe). */
static int
hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
                     struct netdev_queue_stats *stats)
{
    return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
                              tc_make_handle(1, 0xfffe), NULL, stats);
}
5079
/* class_dump_stats callback for linux-hfsc: parses one class message in
 * 'nlmsg' and, when its handle maps to a valid queue
 * (1:1 .. 1:HFSC_N_QUEUES), invokes 'cb' with the queue id and its stats. */
static int
hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
                      const struct ofpbuf *nlmsg,
                      netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_queue_stats stats;
    unsigned int handle, major, minor;
    int error;

    error = tc_parse_class(nlmsg, &handle, NULL, &stats);
    if (error) {
        return error;
    }

    major = tc_get_major(handle);
    minor = tc_get_minor(handle);
    if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
        (*cb)(minor - 1, &stats, aux);
    }
    return 0;
}
5101
/* tc_ops implementation for the "linux-hfsc" QoS type. */
static const struct tc_ops tc_ops_hfsc = {
    .linux_name = "hfsc",
    .ovs_name = "linux-hfsc",
    .n_queues = HFSC_N_QUEUES,
    .tc_install = hfsc_tc_install,
    .tc_load = hfsc_tc_load,
    .tc_destroy = hfsc_tc_destroy,
    .qdisc_get = hfsc_qdisc_get,
    .qdisc_set = hfsc_qdisc_set,
    .class_get = hfsc_class_get,
    .class_set = hfsc_class_set,
    .class_delete = hfsc_class_delete,
    .class_get_stats = hfsc_class_get_stats,
    .class_dump_stats = hfsc_class_dump_stats,
};
5117 \f
5118 /* "linux-noop" traffic control class. */
5119
/* Points 'netdev_''s traffic control state at a shared, immutable tc
 * instance.  No qdisc is installed in the kernel.  Note that the initializer
 * deliberately references tc_ops_default, not tc_ops_noop -- TODO(review):
 * confirm this is intended rather than a copy-paste of default_install__. */
static void
noop_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    netdev->tc = CONST_CAST(struct tc *, &tc);
}
5128
/* tc_install callback for linux-noop: records the no-op state; 'details' is
 * ignored and no kernel configuration is performed. */
static int
noop_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
5136
/* tc_load callback for linux-noop: records the no-op state. */
static int
noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
5143
/* tc_ops implementation for the "linux-noop" QoS type: OVS performs no
 * traffic-control configuration at all. */
static const struct tc_ops tc_ops_noop = {
    .ovs_name = "linux-noop",               /* ovs_name */
    .tc_install = noop_tc_install,
    .tc_load = noop_tc_load,
};
5149 \f
5150 /* "linux-default" traffic control class.
5151 *
5152 * This class represents the default, unnamed Linux qdisc. It corresponds to
5153 * the "" (empty string) QoS type in the OVS database. */
5154
/* Points 'netdev_''s traffic control state at the shared, immutable
 * "default" tc instance. */
static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
5165
/* tc_install callback for the default class: records the default state;
 * 'details' is ignored. */
static int
default_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
5173
/* tc_load callback for the default class: records the default state. */
static int
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
5180
/* tc_ops implementation for the default, unnamed Linux qdisc ("" QoS type in
 * the OVS database). */
static const struct tc_ops tc_ops_default = {
    .ovs_name = "",                         /* ovs_name */
    .tc_install = default_tc_install,
    .tc_load = default_tc_load,
};
5186 \f
/* "linux-other" traffic control class.
 *
 * This class corresponds to the "linux-other" QoS type in the OVS database.
 * It is load-only: OVS records that some other qdisc is installed but does
 * not configure or modify it. */
5190
/* tc_load callback for linux-other: records the shared, immutable "other"
 * tc instance.  No install callback exists, so this type can only be
 * observed, never configured, by OVS. */
static int
other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
    return 0;
}
5202
/* tc_ops implementation for the "linux-other" QoS type (load-only; see
 * other_tc_load()). */
static const struct tc_ops tc_ops_other = {
    .ovs_name = "linux-other",
    .tc_load = other_tc_load,
};
5207 \f
5208 /* Traffic control. */
5209
5210 /* Number of kernel "tc" ticks per second. */
5211 static double ticks_per_s;
5212
5213 /* Number of kernel "jiffies" per second. This is used for the purpose of
5214 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
5215 * one jiffy's worth of data.
5216 *
5217 * There are two possibilities here:
5218 *
5219 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
5220 * approximate range of 100 to 1024. That means that we really need to
5221 * make sure that the qdisc can buffer that much data.
5222 *
5223 * - 'buffer_hz' is an absurdly large number. That means that the kernel
5224 * has finely granular timers and there's no need to fudge additional room
5225 * for buffers. (There's no extra effort needed to implement that: the
5226 * large 'buffer_hz' is used as a divisor, so practically any number will
5227 * come out as 0 in the division. Small integer results in the case of
5228 * really high dividends won't have any real effect anyhow.)
5229 */
5230 static unsigned int buffer_hz;
5231
5232 static struct tcmsg *
5233 netdev_linux_tc_make_request(const struct netdev *netdev, int type,
5234 unsigned int flags, struct ofpbuf *request)
5235 {
5236 int ifindex;
5237 int error;
5238
5239 error = get_ifindex(netdev, &ifindex);
5240 if (error) {
5241 return NULL;
5242 }
5243
5244 return tc_make_request(ifindex, type, flags, request);
5245 }
5246
/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
 * of 'kbits_burst'.
 *
 * This function is equivalent to running:
 *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
 *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
 *              mtu 65535 drop
 *
 * The configuration and stats may be seen with the following command:
 *     /sbin/tc -s filter show dev <devname> parent ffff:
 *
 * Returns 0 if successful, otherwise a positive errno value.
 */
static int
tc_add_policer(struct netdev *netdev,
               uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct tc_police tc_police;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    size_t basic_offset;
    size_t police_offset;
    int error;
    int mtu = 65535;

    memset(&tc_police, 0, sizeof tc_police);
    tc_police.action = TC_POLICE_SHOT;  /* Drop packets over the rate. */
    tc_police.mtu = mtu;
    /* kbits_rate is in kbit/s; the kernel rate is in bytes/s. */
    tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);

    /* The following appears wrong in one way: In networking a kilobit is
     * usually 1000 bits but this uses 1024 bits.
     *
     * However if you "fix" those problems then "tc filter show ..." shows
     * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
     * 1,000,000 bits, whereas this actually ends up doing the right thing from
     * tc's point of view.  Whatever. */
    tc_police.burst = tc_bytes_to_ticks(
        tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* Attach the filter to the ingress qdisc (ffff:) at priority 49. */
    tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
    tcmsg->tcm_info = tc_make_handle(49,
                                     (OVS_FORCE uint16_t) htons(ETH_P_ALL));

    nl_msg_put_string(&request, TCA_KIND, "basic");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
    nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
    tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
    nl_msg_end_nested(&request, police_offset);
    nl_msg_end_nested(&request, basic_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        return error;
    }

    return 0;
}
5311
/* Initializes the file-scope 'ticks_per_s' and 'buffer_hz' globals from
 * /proc/net/psched, exactly once per process.  If the file cannot be opened
 * or parsed, the conservative defaults assigned below are kept. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *     (Before that, there are hints that it was 1000000000.)
     *
     *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *     above.
     *
     *                        /proc/net/psched
     *     -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d  ticks_per_s     buffer_hz
     *     ------- --------- ---------- -------------  -----------  -------------
     * [1] 819,200 1,000,000  1,000,000           100      819,200            100
     * [2]   1,000     1,024  1,000,000 1,000,000,000      976,562  1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000      976,562  1,000,000,000
     * [4]   1,000     1,024  1,000,000           100      976,562            100
     * [5]   1,000        64  1,000,000 1,000,000,000   15,625,000  1,000,000,000
     * [6]   1,000        64  1,000,000           249   15,625,000            249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    if (!ovsthread_once_start(&once)) {
        return;
    }

    /* Defaults; kept if any step below fails. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    /* A zero in any of the first three fields would make the arithmetic
     * below meaningless (division by zero or a zero rate). */
    if (!a || !b || !c) {
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
5394
5395 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5396 * rate of 'rate' bytes per second. */
5397 static unsigned int
5398 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
5399 {
5400 read_psched();
5401 return (rate * ticks) / ticks_per_s;
5402 }
5403
5404 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
5405 * rate of 'rate' bytes per second. */
5406 static unsigned int
5407 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
5408 {
5409 read_psched();
5410 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
5411 }
5412
5413 /* Returns the number of bytes that need to be reserved for qdisc buffering at
5414 * a transmission rate of 'rate' bytes per second. */
5415 static unsigned int
5416 tc_buffer_per_jiffy(unsigned int rate)
5417 {
5418 read_psched();
5419 return rate / buffer_hz;
5420 }
5421
5422 static uint32_t
5423 tc_time_to_ticks(uint32_t time) {
5424 read_psched();
5425 return time * (ticks_per_s / 1000000);
5426 }
5427
5428 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5429 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5430 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5431 * stores NULL into it if it is absent.
5432 *
5433 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5434 * 'msg'.
5435 *
5436 * Returns 0 if successful, otherwise a positive errno value. */
5437 static int
5438 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
5439 struct nlattr **options)
5440 {
5441 static const struct nl_policy tca_policy[] = {
5442 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
5443 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
5444 };
5445 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5446
5447 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5448 tca_policy, ta, ARRAY_SIZE(ta))) {
5449 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
5450 goto error;
5451 }
5452
5453 if (kind) {
5454 *kind = nl_attr_get_string(ta[TCA_KIND]);
5455 }
5456
5457 if (options) {
5458 *options = ta[TCA_OPTIONS];
5459 }
5460
5461 return 0;
5462
5463 error:
5464 if (kind) {
5465 *kind = NULL;
5466 }
5467 if (options) {
5468 *options = NULL;
5469 }
5470 return EPROTO;
5471 }
5472
/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
 * into '*options', and its queue statistics into '*stats'.  Any of the output
 * arguments may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        /* The class handle lives in the fixed-size tcmsg header, not in a
         * Netlink attribute. */
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    /* Null/zero the requested outputs so that a careless caller reading them
     * after a failure sees well-defined values. */
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
5547
5548 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5549 * on 'netdev'. */
5550 static int
5551 tc_query_class(const struct netdev *netdev,
5552 unsigned int handle, unsigned int parent,
5553 struct ofpbuf **replyp)
5554 {
5555 struct ofpbuf request;
5556 struct tcmsg *tcmsg;
5557 int error;
5558
5559 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
5560 &request);
5561 if (!tcmsg) {
5562 return ENODEV;
5563 }
5564 tcmsg->tcm_handle = handle;
5565 tcmsg->tcm_parent = parent;
5566
5567 error = tc_transact(&request, replyp);
5568 if (error) {
5569 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5570 netdev_get_name(netdev),
5571 tc_get_major(handle), tc_get_minor(handle),
5572 tc_get_major(parent), tc_get_minor(parent),
5573 ovs_strerror(error));
5574 }
5575 return error;
5576 }
5577
5578 /* Equivalent to "tc class del dev <name> handle <handle>". */
5579 static int
5580 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5581 {
5582 struct ofpbuf request;
5583 struct tcmsg *tcmsg;
5584 int error;
5585
5586 tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5587 if (!tcmsg) {
5588 return ENODEV;
5589 }
5590 tcmsg->tcm_handle = handle;
5591 tcmsg->tcm_parent = 0;
5592
5593 error = tc_transact(&request, NULL);
5594 if (error) {
5595 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5596 netdev_get_name(netdev),
5597 tc_get_major(handle), tc_get_minor(handle),
5598 ovs_strerror(error));
5599 }
5600 return error;
5601 }
5602
5603 /* Equivalent to "tc qdisc del dev <name> root". */
5604 static int
5605 tc_del_qdisc(struct netdev *netdev_)
5606 {
5607 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5608 struct ofpbuf request;
5609 struct tcmsg *tcmsg;
5610 int error;
5611
5612 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5613 if (!tcmsg) {
5614 return ENODEV;
5615 }
5616 tcmsg->tcm_handle = tc_make_handle(1, 0);
5617 tcmsg->tcm_parent = TC_H_ROOT;
5618
5619 error = tc_transact(&request, NULL);
5620 if (error == EINVAL) {
5621 /* EINVAL probably means that the default qdisc was in use, in which
5622 * case we've accomplished our purpose. */
5623 error = 0;
5624 }
5625 if (!error && netdev->tc) {
5626 if (netdev->tc->ops->tc_destroy) {
5627 netdev->tc->ops->tc_destroy(netdev->tc);
5628 }
5629 netdev->tc = NULL;
5630 }
5631 return error;
5632 }
5633
5634 static bool
5635 getqdisc_is_safe(void)
5636 {
5637 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5638 static bool safe = false;
5639
5640 if (ovsthread_once_start(&once)) {
5641 struct utsname utsname;
5642 int major, minor;
5643
5644 if (uname(&utsname) == -1) {
5645 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5646 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5647 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5648 } else if (major < 2 || (major == 2 && minor < 35)) {
5649 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5650 utsname.release);
5651 } else {
5652 safe = true;
5653 }
5654 ovsthread_once_done(&once);
5655 }
5656 return safe;
5657 }
5658
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value. */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    if (netdev->tc) {
        /* Already queried and cached. */
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            /* Unparseable reply: treat the qdisc as unmanageable. */
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    /* A successful load must have populated 'netdev->tc', and vice versa. */
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
5738
5739 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5740 approximate the time to transmit packets of various lengths. For an MTU of
5741 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5742 represents two possible packet lengths; for a MTU of 513 through 1024, four
5743 possible lengths; and so on.
5744
5745 Returns, for the specified 'mtu', the number of bits that packet lengths
5746 need to be shifted right to fit within such a 256-entry table. */
5747 static int
5748 tc_calc_cell_log(unsigned int mtu)
5749 {
5750 int cell_log;
5751
5752 if (!mtu) {
5753 mtu = ETH_PAYLOAD_MAX;
5754 }
5755 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5756
5757 for (cell_log = 0; mtu >= 256; cell_log++) {
5758 mtu >>= 1;
5759 }
5760
5761 return cell_log;
5762 }
5763
5764 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5765 * of 'mtu'. */
5766 static void
5767 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5768 {
5769 memset(rate, 0, sizeof *rate);
5770 rate->cell_log = tc_calc_cell_log(mtu);
5771 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5772 /* rate->cell_align = 0; */ /* distro headers. */
5773 rate->mpu = ETH_TOTAL_MIN;
5774 rate->rate = Bps;
5775 }
5776
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        /* Each slot covers packet sizes ending at (i + 1) << cell_log,
         * clamped up to the minimum billable packet size 'mpu'. */
        unsigned int size = (i + 1) << rate->cell_log;

        rtab[i] = tc_bytes_to_ticks(rate->rate,
                                    size < rate->mpu ? rate->mpu : size);
    }
}
5796
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst;

    /* Never configure a burst smaller than one jiffy of traffic plus an MTU,
     * or the class cannot sustain its own rate. */
    min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
5807 \f
5808 /* Linux-only functions declared in netdev-linux.h */
5809
/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'.  If
 * 'enable' is true, the bit is set.  Otherwise, it is cleared.
 *
 * Performs a read-modify-write of the device's ethtool flags and then reads
 * them back to verify the device actually applied the change.  Returns 0 on
 * success (including when the flag already had the requested value),
 * EOPNOTSUPP if the device accepted the ioctl but did not apply the change,
 * or another positive errno value if an ethtool call failed. */
int
netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
                              const char *flag_name, bool enable)
{
    const char *netdev_name = netdev_get_name(netdev);
    struct ethtool_value evalue;
    uint32_t new_flags;
    int error;

    /* Read the current flags. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    /* Compute the desired flags; nothing to do if they already match. */
    COVERAGE_INC(netdev_set_ethtool);
    new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
    if (new_flags == evalue.data) {
        return 0;
    }
    evalue.data = new_flags;
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
    if (error) {
        return error;
    }

    /* Read back to confirm the device took the new value (the set call may
     * succeed without the flag actually changing). */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    if (new_flags != evalue.data) {
        VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
                     "device %s failed", enable ? "enable" : "disable",
                     flag_name, netdev_name);
        return EOPNOTSUPP;
    }

    return 0;
}
5861 \f
5862 /* Utility functions. */
5863
/* Copies 'src' into 'dst', performing format conversion in the process.
 * 'src' holds the kernel's 32-bit link counters (as carried by an IFLA_STATS
 * attribute). */
static void
netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
                                  const struct rtnl_link_stats *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
5891
/* Copies 'src' into 'dst', performing format conversion in the process.
 * 'src' holds the kernel's 64-bit link counters (as carried by an
 * IFLA_STATS64 attribute); preferred over the 32-bit variant above when the
 * kernel provides it. */
static void
netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
                                    const struct rtnl_link_stats64 *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
5919
5920 static int
5921 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5922 {
5923 struct ofpbuf request;
5924 struct ofpbuf *reply;
5925 int error;
5926
5927 /* Filtering all counters by default */
5928 memset(stats, 0xFF, sizeof(struct netdev_stats));
5929
5930 ofpbuf_init(&request, 0);
5931 nl_msg_put_nlmsghdr(&request,
5932 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5933 RTM_GETLINK, NLM_F_REQUEST);
5934 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5935 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5936 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5937 ofpbuf_uninit(&request);
5938 if (error) {
5939 return error;
5940 }
5941
5942 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5943 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5944 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5945 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5946 error = 0;
5947 } else {
5948 a = nl_attr_find(reply, 0, IFLA_STATS);
5949 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5950 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5951 error = 0;
5952 } else {
5953 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5954 error = EPROTO;
5955 }
5956 }
5957 } else {
5958 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5959 error = EPROTO;
5960 }
5961
5962
5963 ofpbuf_delete(reply);
5964 return error;
5965 }
5966
5967 static int
5968 get_flags(const struct netdev *dev, unsigned int *flags)
5969 {
5970 struct ifreq ifr;
5971 int error;
5972
5973 *flags = 0;
5974 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5975 if (!error) {
5976 *flags = ifr.ifr_flags;
5977 }
5978 return error;
5979 }
5980
5981 static int
5982 set_flags(const char *name, unsigned int flags)
5983 {
5984 struct ifreq ifr;
5985
5986 ifr.ifr_flags = flags;
5987 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5988 }
5989
5990 int
5991 linux_get_ifindex(const char *netdev_name)
5992 {
5993 struct ifreq ifr;
5994 int error;
5995
5996 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5997 COVERAGE_INC(netdev_get_ifindex);
5998
5999 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
6000 if (error) {
6001 /* ENODEV probably means that a vif disappeared asynchronously and
6002 * hasn't been removed from the database yet, so reduce the log level
6003 * to INFO for that case. */
6004 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
6005 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
6006 netdev_name, ovs_strerror(error));
6007 return -error;
6008 }
6009 return ifr.ifr_ifindex;
6010 }
6011
6012 static int
6013 get_ifindex(const struct netdev *netdev_, int *ifindexp)
6014 {
6015 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6016
6017 if (!(netdev->cache_valid & VALID_IFINDEX)) {
6018 netdev_linux_update_via_netlink(netdev);
6019 }
6020
6021 if (!(netdev->cache_valid & VALID_IFINDEX)) {
6022 /* Fall back to ioctl if netlink fails */
6023 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
6024
6025 if (ifindex < 0) {
6026 netdev->get_ifindex_error = -ifindex;
6027 netdev->ifindex = 0;
6028 } else {
6029 netdev->get_ifindex_error = 0;
6030 netdev->ifindex = ifindex;
6031 }
6032 netdev->cache_valid |= VALID_IFINDEX;
6033 }
6034
6035 *ifindexp = netdev->ifindex;
6036 return netdev->get_ifindex_error;
6037 }
6038
/* Refreshes cached state for 'netdev' (interface flags, MTU, Ethernet
 * address, ifindex, and LAG-master status) from a single RTM_GETLINK reply,
 * incrementing the netdev's change sequence if anything changed.  Returns 0
 * if successful, otherwise a positive errno value. */
static int
netdev_linux_update_via_netlink(struct netdev_linux *netdev)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct rtnetlink_change chg;
    struct rtnetlink_change *change = &chg;
    int error;

    ofpbuf_init(&request, 0);
    /* Size hint covers the ifinfomsg header plus the IFLA_IFNAME attribute
     * and an optional IFLA_IF_NETNSID u32 attribute. */
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ) +
                        NL_A_U32_SIZE, RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));

    /* The correct identifiers for a Linux device are netnsid and ifindex,
     * but ifindex changes as the port is moved to another network namespace
     * and the interface name statically stored in ovsdb. */
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
    if (netdev_linux_netnsid_is_remote(netdev)) {
        /* The device lives in another network namespace: tell the kernel
         * which one to look it up in. */
        nl_msg_put_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
    }
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        ofpbuf_delete(reply);
        return error;
    }

    if (rtnetlink_parse(reply, change)
        && change->nlmsg_type == RTM_NEWLINK) {
        bool changed = false;
        error = 0;

        /* Update netdev from rtnl msg and increment its seq if needed. */
        if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
            /* Carrier toggled: count it even though only the flag word is
             * stored below. */
            netdev->carrier_resets++;
            changed = true;
        }
        if (change->ifi_flags != netdev->ifi_flags) {
            netdev->ifi_flags = change->ifi_flags;
            changed = true;
        }
        if (change->mtu && change->mtu != netdev->mtu) {
            netdev->mtu = change->mtu;
            netdev->cache_valid |= VALID_MTU;
            netdev->netdev_mtu_error = 0;
            changed = true;
        }
        if (!eth_addr_is_zero(change->mac)
            && !eth_addr_equals(change->mac, netdev->etheraddr)) {
            netdev->etheraddr = change->mac;
            netdev->cache_valid |= VALID_ETHERADDR;
            netdev->ether_addr_error = 0;
            changed = true;
        }
        if (change->if_index != netdev->ifindex) {
            netdev->ifindex = change->if_index;
            netdev->cache_valid |= VALID_IFINDEX;
            netdev->get_ifindex_error = 0;
            changed = true;
        }
        if (change->master && netdev_linux_kind_is_lag(change->master)) {
            netdev->is_lag_master = true;
        }
        if (changed) {
            netdev_change_seq_changed(&netdev->up);
        }
    } else {
        /* The reply did not parse as an RTM_NEWLINK message. */
        error = EINVAL;
    }

    ofpbuf_delete(reply);
    return error;
}
6114
6115 static int
6116 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
6117 {
6118 struct ifreq ifr;
6119 int hwaddr_family;
6120 int error;
6121
6122 memset(&ifr, 0, sizeof ifr);
6123 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6124 COVERAGE_INC(netdev_get_hwaddr);
6125 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
6126 if (error) {
6127 /* ENODEV probably means that a vif disappeared asynchronously and
6128 * hasn't been removed from the database yet, so reduce the log level
6129 * to INFO for that case. */
6130 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
6131 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
6132 netdev_name, ovs_strerror(error));
6133 return error;
6134 }
6135 hwaddr_family = ifr.ifr_hwaddr.sa_family;
6136 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
6137 hwaddr_family != ARPHRD_NONE) {
6138 VLOG_INFO("%s device has unknown hardware address family %d",
6139 netdev_name, hwaddr_family);
6140 return EINVAL;
6141 }
6142 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
6143 return 0;
6144 }
6145
6146 static int
6147 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
6148 {
6149 struct ifreq ifr;
6150 int error;
6151
6152 memset(&ifr, 0, sizeof ifr);
6153 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6154 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
6155 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
6156 COVERAGE_INC(netdev_set_hwaddr);
6157 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
6158 if (error) {
6159 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
6160 netdev_name, ovs_strerror(error));
6161 }
6162 return error;
6163 }
6164
6165 static int
6166 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
6167 int cmd, const char *cmd_name)
6168 {
6169 struct ifreq ifr;
6170 int error;
6171
6172 memset(&ifr, 0, sizeof ifr);
6173 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
6174 ifr.ifr_data = (caddr_t) ecmd;
6175
6176 ecmd->cmd = cmd;
6177 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
6178 if (error) {
6179 if (error != EOPNOTSUPP) {
6180 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
6181 "failed: %s", cmd_name, name, ovs_strerror(error));
6182 } else {
6183 /* The device doesn't support this operation. That's pretty
6184 * common, so there's no point in logging anything. */
6185 }
6186 }
6187 return error;
6188 }
6189
/* Returns an AF_PACKET raw socket or a negative errno value.
 *
 * The socket is created once per process, set non-blocking, and shared by all
 * callers; it is never closed.  If creation or configuration fails, the
 * negated errno is cached and returned to every subsequent caller. */
static int
af_packet_sock(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static int sock;

    if (ovsthread_once_start(&once)) {
        sock = socket(AF_PACKET, SOCK_RAW, 0);
        if (sock >= 0) {
            int error = set_nonblocking(sock);
            if (error) {
                close(sock);
                /* Remember the failure as a negative errno for later calls. */
                sock = -error;
            }
        } else {
            sock = -errno;
            VLOG_ERR("failed to create packet socket: %s",
                     ovs_strerror(errno));
        }
        ovsthread_once_done(&once);
    }

    return sock;
}