]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
dpif-linux: Add some internal "const" qualifiers.
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
275707c3 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d
BP
22#include <fcntl.h>
23#include <arpa/inet.h>
24#include <inttypes.h>
32383c3b 25#include <linux/filter.h>
c1c9c9c4 26#include <linux/gen_stats.h>
bb7d0e22 27#include <linux/if_ether.h>
8b61709d
BP
28#include <linux/if_tun.h>
29#include <linux/types.h>
30#include <linux/ethtool.h>
63331829 31#include <linux/mii.h>
f8500004 32#include <linux/pkt_cls.h>
6f42c8ea 33#include <linux/pkt_sched.h>
e9e28be3 34#include <linux/rtnetlink.h>
8b61709d
BP
35#include <linux/sockios.h>
36#include <linux/version.h>
37#include <sys/types.h>
38#include <sys/ioctl.h>
39#include <sys/socket.h>
40#include <netpacket/packet.h>
8b61709d
BP
41#include <net/if.h>
42#include <net/if_arp.h>
43#include <net/if_packet.h>
44#include <net/route.h>
45#include <netinet/in.h>
e9e28be3 46#include <poll.h>
8b61709d
BP
47#include <stdlib.h>
48#include <string.h>
49#include <unistd.h>
e9e28be3
BP
50
51#include "coverage.h"
9fe3b9a2 52#include "dpif-linux.h"
8b61709d
BP
53#include "dynamic-string.h"
54#include "fatal-signal.h"
93b13be8
BP
55#include "hash.h"
56#include "hmap.h"
8b61709d 57#include "netdev-provider.h"
7fbef77a 58#include "netdev-vport.h"
45c8d3a1 59#include "netlink-notifier.h"
2fe27d5a 60#include "netlink-socket.h"
c060c4cf 61#include "netlink.h"
e9e28be3 62#include "ofpbuf.h"
8b61709d
BP
63#include "openflow/openflow.h"
64#include "packets.h"
65#include "poll-loop.h"
21d6e22e 66#include "rtnetlink-link.h"
8b61709d 67#include "shash.h"
c060c4cf 68#include "socket-util.h"
19993ef3 69#include "sset.h"
1670c579 70#include "timer.h"
c060c4cf 71#include "unaligned.h"
e9e28be3 72#include "vlog.h"
5136ce49 73
d98e6007 74VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea 75
d76f09ea
BP
76COVERAGE_DEFINE(netdev_set_policing);
77COVERAGE_DEFINE(netdev_arp_lookup);
78COVERAGE_DEFINE(netdev_get_ifindex);
79COVERAGE_DEFINE(netdev_get_hwaddr);
80COVERAGE_DEFINE(netdev_set_hwaddr);
ab985a77
BP
81COVERAGE_DEFINE(netdev_get_ethtool);
82COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 83
8b61709d
BP
84\f
85/* These were introduced in Linux 2.6.14, so they might be missing if we have
86 * old headers. */
87#ifndef ADVERTISED_Pause
88#define ADVERTISED_Pause (1 << 13)
89#endif
90#ifndef ADVERTISED_Asym_Pause
91#define ADVERTISED_Asym_Pause (1 << 14)
92#endif
93
e47bd51a
JP
94/* These were introduced in Linux 2.6.24, so they might be missing if we
95 * have old headers. */
96#ifndef ETHTOOL_GFLAGS
97#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98#endif
99#ifndef ETHTOOL_SFLAGS
100#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
101#endif
102
c1c9c9c4
BP
103/* This was introduced in Linux 2.6.25, so it might be missing if we have old
104 * headers. */
105#ifndef TC_RTAB_SIZE
106#define TC_RTAB_SIZE 1024
107#endif
108
2ee6545f 109static struct nln_notifier *netdev_linux_cache_notifier = NULL;
46415c90 110static int cache_notifier_refcount;
8b61709d
BP
111
/* Bits for 'cache_valid' in struct netdev_linux.  A set bit means that the
 * corresponding cached field(s) of the device currently hold valid data. */
enum {
    VALID_IFINDEX          = 0x001,
    VALID_ETHERADDR        = 0x002,
    VALID_IN4              = 0x004,
    VALID_IN6              = 0x008,
    VALID_MTU              = 0x010,
    VALID_POLICING         = 0x020,
    VALID_VPORT_STAT_ERROR = 0x040,
    VALID_DRVINFO          = 0x080,
    VALID_FEATURES         = 0x100,
};
123
149f577a
JG
/* State that is specific to tap devices. */
struct tap_state {
    int fd;                     /* Descriptor for the tap device, as stored by
                                 * netdev_linux_create_tap(). */
};
c1c9c9c4
BP
127\f
128/* Traffic control. */
129
130/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
131 * network device.
132 *
133 * Each TC implementation subclasses this with whatever additional data it
134 * needs. */
c1c9c9c4
BP
135struct tc {
136 const struct tc_ops *ops;
93b13be8
BP
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
140};
c1c9c9c4 141
559eb230
BP
142#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
143
93b13be8
BP
144/* One traffic control queue.
145 *
146 * Each TC implementation subclasses this with whatever additional data it
147 * needs. */
148struct tc_queue {
149 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
150 unsigned int queue_id; /* OpenFlow queue ID. */
c1c9c9c4
BP
151};
152
153/* A particular kind of traffic control. Each implementation generally maps to
154 * one particular Linux qdisc class.
155 *
156 * The functions below return 0 if successful or a positive errno value on
157 * failure, except where otherwise noted. All of them must be provided, except
158 * where otherwise noted. */
159struct tc_ops {
160 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
161 * This is null for tc_ops_default and tc_ops_other, for which there are no
162 * appropriate values. */
163 const char *linux_name;
164
165 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
166 const char *ovs_name;
167
168 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
169 * queues. The queues are numbered 0 through n_queues - 1. */
170 unsigned int n_queues;
171
172 /* Called to install this TC class on 'netdev'. The implementation should
173 * make the Netlink calls required to set up 'netdev' with the right qdisc
174 * and configure it according to 'details'. The implementation may assume
175 * that the current qdisc is the default; that is, there is no need for it
176 * to delete the current qdisc before installing itself.
177 *
178 * The contents of 'details' should be documented as valid for 'ovs_name'
179 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
180 * (which is built as ovs-vswitchd.conf.db(8)).
181 *
182 * This function must return 0 if and only if it sets 'netdev->tc' to an
183 * initialized 'struct tc'.
184 *
185 * (This function is null for tc_ops_other, which cannot be installed. For
186 * other TC classes it should always be nonnull.) */
79f1cbe9 187 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
188
189 /* Called when the netdev code determines (through a Netlink query) that
190 * this TC class's qdisc is installed on 'netdev', but we didn't install
191 * it ourselves and so don't know any of the details.
192 *
193 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
194 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
195 * implementation should parse the other attributes of 'nlmsg' as
196 * necessary to determine its configuration. If necessary it should also
197 * use Netlink queries to determine the configuration of queues on
198 * 'netdev'.
199 *
200 * This function must return 0 if and only if it sets 'netdev->tc' to an
201 * initialized 'struct tc'. */
202 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
203
204 /* Destroys the data structures allocated by the implementation as part of
205 * 'tc'. (This includes destroying 'tc->queues' by calling
206 * tc_destroy(tc).
207 *
208 * The implementation should not need to perform any Netlink calls. If
209 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
210 * (But it may not be desirable.)
211 *
212 * This function may be null if 'tc' is trivial. */
213 void (*tc_destroy)(struct tc *tc);
214
215 /* Retrieves details of 'netdev->tc' configuration into 'details'.
216 *
217 * The implementation should not need to perform any Netlink calls, because
218 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
219 * cached the configuration.
220 *
221 * The contents of 'details' should be documented as valid for 'ovs_name'
222 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
223 * (which is built as ovs-vswitchd.conf.db(8)).
224 *
225 * This function may be null if 'tc' is not configurable.
226 */
79f1cbe9 227 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
228
229 /* Reconfigures 'netdev->tc' according to 'details', performing any
230 * required Netlink calls to complete the reconfiguration.
231 *
232 * The contents of 'details' should be documented as valid for 'ovs_name'
233 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
234 * (which is built as ovs-vswitchd.conf.db(8)).
235 *
236 * This function may be null if 'tc' is not configurable.
237 */
79f1cbe9 238 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 239
93b13be8
BP
240 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
241 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
242 *
243 * The contents of 'details' should be documented as valid for 'ovs_name'
244 * in the "other_config" column in the "Queue" table in
245 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
246 *
247 * The implementation should not need to perform any Netlink calls, because
248 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
249 * cached the queue configuration.
250 *
251 * This function may be null if 'tc' does not have queues ('n_queues' is
252 * 0). */
93b13be8 253 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 254 struct smap *details);
c1c9c9c4
BP
255
256 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
257 * 'details', perfoming any required Netlink calls to complete the
258 * reconfiguration. The caller ensures that 'queue_id' is less than
259 * 'n_queues'.
260 *
261 * The contents of 'details' should be documented as valid for 'ovs_name'
262 * in the "other_config" column in the "Queue" table in
263 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
264 *
265 * This function may be null if 'tc' does not have queues or its queues are
266 * not configurable. */
267 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 268 const struct smap *details);
c1c9c9c4 269
93b13be8
BP
270 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
271 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
272 *
273 * This function may be null if 'tc' does not have queues or its queues
274 * cannot be deleted. */
93b13be8 275 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 276
93b13be8
BP
277 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
278 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
279 *
280 * On success, initializes '*stats'.
281 *
282 * This function may be null if 'tc' does not have queues or if it cannot
283 * report queue statistics. */
93b13be8
BP
284 int (*class_get_stats)(const struct netdev *netdev,
285 const struct tc_queue *queue,
c1c9c9c4
BP
286 struct netdev_queue_stats *stats);
287
288 /* Extracts queue stats from 'nlmsg', which is a response to a
289 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
290 *
291 * This function may be null if 'tc' does not have queues or if it cannot
292 * report queue statistics. */
293 int (*class_dump_stats)(const struct netdev *netdev,
294 const struct ofpbuf *nlmsg,
295 netdev_dump_queue_stats_cb *cb, void *aux);
296};
297
298static void
299tc_init(struct tc *tc, const struct tc_ops *ops)
300{
301 tc->ops = ops;
93b13be8 302 hmap_init(&tc->queues);
c1c9c9c4
BP
303}
304
305static void
306tc_destroy(struct tc *tc)
307{
93b13be8 308 hmap_destroy(&tc->queues);
c1c9c9c4
BP
309}
310
311static const struct tc_ops tc_ops_htb;
a339aa81 312static const struct tc_ops tc_ops_hfsc;
c1c9c9c4
BP
313static const struct tc_ops tc_ops_default;
314static const struct tc_ops tc_ops_other;
315
559eb230 316static const struct tc_ops *const tcs[] = {
c1c9c9c4 317 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 318 &tc_ops_hfsc, /* Hierarchical fair service curve. */
c1c9c9c4
BP
319 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
320 &tc_ops_other, /* Some other qdisc. */
321 NULL
322};
149f577a 323
c1c9c9c4
BP
324static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
325static unsigned int tc_get_major(unsigned int handle);
326static unsigned int tc_get_minor(unsigned int handle);
327
328static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
329static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
330static unsigned int tc_buffer_per_jiffy(unsigned int rate);
331
332static struct tcmsg *tc_make_request(const struct netdev *, int type,
333 unsigned int flags, struct ofpbuf *);
334static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
f8500004
JP
335static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
336static int tc_add_policer(struct netdev *netdev, int kbits_rate,
337 int kbits_burst);
c1c9c9c4
BP
338
339static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
340 struct nlattr **options);
341static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
342 struct nlattr **options,
343 struct netdev_queue_stats *);
344static int tc_query_class(const struct netdev *,
345 unsigned int handle, unsigned int parent,
346 struct ofpbuf **replyp);
347static int tc_delete_class(const struct netdev *, unsigned int handle);
348
349static int tc_del_qdisc(struct netdev *netdev);
350static int tc_query_qdisc(const struct netdev *netdev);
351
352static int tc_calc_cell_log(unsigned int mtu);
353static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
354static void tc_put_rtab(struct ofpbuf *, uint16_t type,
355 const struct tc_ratespec *rate);
356static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
357\f
b5d57fc8
BP
358struct netdev_linux {
359 struct netdev up;
149f577a 360
8b61709d 361 struct shash_node *shash_node;
149f577a 362 unsigned int cache_valid;
ac4d3bcb 363 unsigned int change_seq;
8b61709d 364
1670c579
EJ
365 bool miimon; /* Link status of last poll. */
366 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
367 struct timer miimon_timer;
368
8722022c
BP
369 /* The following are figured out "on demand" only. They are only valid
370 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d
BP
371 int ifindex;
372 uint8_t etheraddr[ETH_ADDR_LEN];
f1acd62b 373 struct in_addr address, netmask;
8b61709d
BP
374 struct in6_addr in6;
375 int mtu;
059e5f4f 376 unsigned int ifi_flags;
65c3058c 377 long long int carrier_resets;
80a86fbe
BP
378 uint32_t kbits_rate; /* Policing data. */
379 uint32_t kbits_burst;
bba1e6f3
PS
380 int vport_stats_error; /* Cached error code from vport_get_stats().
381 0 or an errno value. */
90a6637d 382 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 383 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 384 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 385 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 386 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
51f87458 387
a00ca915
EJ
388 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
391 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
90a6637d 392
4f925bd3 393 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 394 struct tc *tc;
149f577a
JG
395
396 union {
397 struct tap_state tap;
398 } state;
8b61709d
BP
399};
400
796223f5
BP
401struct netdev_rx_linux {
402 struct netdev_rx up;
403 bool is_tap;
5b7448ed 404 int fd;
149f577a 405};
8b61709d 406
796223f5
BP
407static const struct netdev_rx_class netdev_rx_linux_class;
408
76c308b5
BP
409/* Sockets used for ioctl operations. */
410static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
8b61709d 411
8b61709d
BP
412/* This is set pretty low because we probably won't learn anything from the
413 * additional log messages. */
414static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
415
15b3596a 416static int netdev_linux_init(void);
6f643e49 417
0b0544d7 418static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 419 int cmd, const char *cmd_name);
149f577a
JG
420static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
421 const char *cmd_name);
f1acd62b
BP
422static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
423 int cmd, const char *cmd_name);
b5d57fc8 424static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 425static int set_flags(const char *, unsigned int flags);
8b61709d
BP
426static int do_get_ifindex(const char *netdev_name);
427static int get_ifindex(const struct netdev *, int *ifindexp);
428static int do_set_addr(struct netdev *netdev,
429 int ioctl_nr, const char *ioctl_name,
430 struct in_addr addr);
431static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
44445cac 432static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
8b61709d
BP
433static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
434static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
488d734d 435static int af_packet_sock(void);
1670c579
EJ
436static void netdev_linux_miimon_run(void);
437static void netdev_linux_miimon_wait(void);
8b61709d 438
15b3596a
JG
439static bool
440is_netdev_linux_class(const struct netdev_class *netdev_class)
441{
442 return netdev_class->init == netdev_linux_init;
443}
444
796223f5
BP
445static bool
446is_tap_netdev(const struct netdev *netdev)
447{
b5d57fc8 448 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577
JP
449}
450
8b61709d
BP
451static struct netdev_linux *
452netdev_linux_cast(const struct netdev *netdev)
453{
b5d57fc8 454 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
15b3596a 455
180c6d0b 456 return CONTAINER_OF(netdev, struct netdev_linux, up);
8b61709d 457}
796223f5
BP
458
459static struct netdev_rx_linux *
460netdev_rx_linux_cast(const struct netdev_rx *rx)
461{
462 netdev_rx_assert_class(rx, &netdev_rx_linux_class);
463 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
464}
ff4ed3c9 465\f
8b61709d
BP
466static int
467netdev_linux_init(void)
468{
469 static int status = -1;
470 if (status < 0) {
ff4ed3c9 471 /* Create AF_INET socket. */
8b61709d
BP
472 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
473 status = af_inet_sock >= 0 ? 0 : errno;
474 if (status) {
10a89ef0 475 VLOG_ERR("failed to create inet socket: %s", ovs_strerror(status));
8b61709d
BP
476 }
477 }
478 return status;
479}
480
/* Periodic work for this netdev provider: runs the rtnetlink link notifier,
 * then MII monitoring. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();

    netdev_linux_miimon_run();
}
487
/* Wait counterpart of netdev_linux_run(): calls the wait functions of the
 * rtnetlink link notifier and of MII monitoring, in that order. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();

    netdev_linux_miimon_wait();
}
494
ac4d3bcb 495static void
b5d57fc8
BP
496netdev_linux_changed(struct netdev_linux *dev,
497 unsigned int ifi_flags, unsigned int mask)
ac4d3bcb
EJ
498{
499 dev->change_seq++;
500 if (!dev->change_seq) {
501 dev->change_seq++;
502 }
8aa77183
BP
503
504 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
505 dev->carrier_resets++;
506 }
507 dev->ifi_flags = ifi_flags;
508
4f925bd3
PS
509 dev->cache_valid &= mask;
510}
511
512static void
b5d57fc8
BP
513netdev_linux_update(struct netdev_linux *dev,
514 const struct rtnetlink_link_change *change)
4f925bd3
PS
515{
516 if (change->nlmsg_type == RTM_NEWLINK) {
517 /* Keep drv-info */
b5d57fc8 518 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
90a6637d 519
c7b1b0a5 520 /* Update netdev from rtnl-change msg. */
90a6637d
PS
521 if (change->mtu) {
522 dev->mtu = change->mtu;
523 dev->cache_valid |= VALID_MTU;
524 dev->netdev_mtu_error = 0;
525 }
526
44445cac
PS
527 if (!eth_addr_is_zero(change->addr)) {
528 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
529 dev->cache_valid |= VALID_ETHERADDR;
530 dev->ether_addr_error = 0;
531 }
532
c7b1b0a5
PS
533 dev->ifindex = change->ifi_index;
534 dev->cache_valid |= VALID_IFINDEX;
535 dev->get_ifindex_error = 0;
536
4f925bd3 537 } else {
b5d57fc8 538 netdev_linux_changed(dev, change->ifi_flags, 0);
4f925bd3 539 }
ac4d3bcb
EJ
540}
541
8b61709d 542static void
21d6e22e 543netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
67a4917b 544 void *aux OVS_UNUSED)
8b61709d 545{
b5d57fc8 546 struct netdev_linux *dev;
8b61709d 547 if (change) {
b5d57fc8
BP
548 struct netdev *base_dev = netdev_from_name(change->ifname);
549 if (base_dev && is_netdev_linux_class(netdev_get_class(base_dev))) {
550 netdev_linux_update(netdev_linux_cast(base_dev), change);
8b61709d
BP
551 }
552 } else {
46415c90 553 struct shash device_shash;
8b61709d 554 struct shash_node *node;
46415c90
JG
555
556 shash_init(&device_shash);
b5d57fc8 557 netdev_get_devices(&netdev_linux_class, &device_shash);
46415c90 558 SHASH_FOR_EACH (node, &device_shash) {
059e5f4f 559 unsigned int flags;
3a183124 560
149f577a 561 dev = node->data;
3a183124 562
180c6d0b 563 get_flags(&dev->up, &flags);
b5d57fc8 564 netdev_linux_changed(dev, flags, 0);
8b61709d 565 }
46415c90 566 shash_destroy(&device_shash);
8b61709d
BP
567 }
568}
569
570static int
1f6e0fbd 571cache_notifier_ref(void)
6c88d577 572{
46415c90 573 if (!cache_notifier_refcount) {
cb22974d 574 ovs_assert(!netdev_linux_cache_notifier);
2ee6545f
EJ
575
576 netdev_linux_cache_notifier =
577 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
578
579 if (!netdev_linux_cache_notifier) {
580 return EINVAL;
149f577a
JG
581 }
582 }
46415c90 583 cache_notifier_refcount++;
6c88d577 584
1f6e0fbd
BP
585 return 0;
586}
587
588static void
589cache_notifier_unref(void)
590{
cb22974d 591 ovs_assert(cache_notifier_refcount > 0);
1f6e0fbd 592 if (!--cache_notifier_refcount) {
cb22974d 593 ovs_assert(netdev_linux_cache_notifier);
1f6e0fbd
BP
594 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
595 netdev_linux_cache_notifier = NULL;
596 }
597}
598
599/* Creates system and internal devices. */
600static int
601netdev_linux_create(const struct netdev_class *class, const char *name,
b5d57fc8 602 struct netdev **netdevp)
1f6e0fbd 603{
b5d57fc8 604 struct netdev_linux *netdev;
1f6e0fbd
BP
605 int error;
606
607 error = cache_notifier_ref();
608 if (error) {
609 return error;
610 }
611
b5d57fc8
BP
612 netdev = xzalloc(sizeof *netdev);
613 netdev->change_seq = 1;
614 netdev_init(&netdev->up, name, class);
615 error = get_flags(&netdev->up, &netdev->ifi_flags);
616 if (error == ENODEV) {
617 if (class != &netdev_internal_class) {
618 /* The device does not exist, so don't allow it to be opened. */
619 netdev_uninit(&netdev->up, false);
620 cache_notifier_unref();
621 free(netdev);
622 return ENODEV;
623 } else {
624 /* "Internal" netdevs have to be created as netdev objects before
625 * they exist in the kernel, because creating them in the kernel
626 * happens by passing a netdev object to dpif_port_add().
627 * Therefore, ignore the error. */
628 }
629 }
46415c90 630
b5d57fc8 631 *netdevp = &netdev->up;
a740f0de
JG
632 return 0;
633}
634
5b7448ed
JG
635/* For most types of netdevs we open the device for each call of
636 * netdev_open(). However, this is not the case with tap devices,
637 * since it is only possible to open the device once. In this
638 * situation we share a single file descriptor, and consequently
639 * buffers, across all readers. Therefore once data is read it will
640 * be unavailable to other reads for tap devices. */
a740f0de 641static int
b8dcf5e9 642netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
b5d57fc8 643 const char *name, struct netdev **netdevp)
a740f0de 644{
b5d57fc8 645 struct netdev_linux *netdev;
a740f0de
JG
646 struct tap_state *state;
647 static const char tap_dev[] = "/dev/net/tun";
648 struct ifreq ifr;
649 int error;
650
b5d57fc8
BP
651 netdev = xzalloc(sizeof *netdev);
652 state = &netdev->state.tap;
a740f0de 653
1f6e0fbd
BP
654 error = cache_notifier_ref();
655 if (error) {
656 goto error;
657 }
658
6c88d577 659 /* Open tap device. */
149f577a
JG
660 state->fd = open(tap_dev, O_RDWR);
661 if (state->fd < 0) {
6c88d577 662 error = errno;
10a89ef0 663 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
1f6e0fbd 664 goto error_unref_notifier;
6c88d577
JP
665 }
666
667 /* Create tap device. */
668 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 669 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
149f577a 670 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
6c88d577 671 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 672 ovs_strerror(errno));
6c88d577 673 error = errno;
1f6e0fbd 674 goto error_unref_notifier;
6c88d577
JP
675 }
676
677 /* Make non-blocking. */
149f577a 678 error = set_nonblocking(state->fd);
a740f0de 679 if (error) {
1f6e0fbd 680 goto error_unref_notifier;
a740f0de
JG
681 }
682
b5d57fc8
BP
683 netdev_init(&netdev->up, name, &netdev_tap_class);
684 *netdevp = &netdev->up;
a740f0de
JG
685 return 0;
686
1f6e0fbd
BP
687error_unref_notifier:
688 cache_notifier_unref();
a740f0de 689error:
b5d57fc8 690 free(netdev);
a740f0de
JG
691 return error;
692}
693
a740f0de 694static void
b5d57fc8 695destroy_tap(struct netdev_linux *netdev)
a740f0de 696{
b5d57fc8 697 struct tap_state *state = &netdev->state.tap;
149f577a
JG
698
699 if (state->fd >= 0) {
700 close(state->fd);
a740f0de
JG
701 }
702}
703
b5d57fc8 704/* Destroys the netdev device 'netdev_'. */
6c88d577 705static void
b5d57fc8 706netdev_linux_destroy(struct netdev *netdev_)
6c88d577 707{
b5d57fc8 708 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6c88d577 709
b5d57fc8
BP
710 if (netdev->tc && netdev->tc->ops->tc_destroy) {
711 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4
BP
712 }
713
b5d57fc8
BP
714 if (netdev_get_class(netdev_) == &netdev_tap_class) {
715 destroy_tap(netdev);
6c88d577 716 }
b5d57fc8 717 free(netdev);
1f6e0fbd
BP
718
719 cache_notifier_unref();
6c88d577
JP
720}
721
7b6b0ef4 722static int
796223f5 723netdev_linux_rx_open(struct netdev *netdev_, struct netdev_rx **rxp)
7b6b0ef4
BP
724{
725 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796223f5
BP
726 bool is_tap = is_tap_netdev(netdev_);
727 struct netdev_rx_linux *rx;
7b6b0ef4
BP
728 int error;
729 int fd;
730
796223f5 731 if (is_tap) {
b5d57fc8 732 fd = netdev->state.tap.fd;
796223f5
BP
733 } else {
734 struct sockaddr_ll sll;
735 int ifindex;
32383c3b
MM
736 /* Result of tcpdump -dd inbound */
737 static struct sock_filter filt[] = {
738 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
739 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
740 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
741 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
742 };
743 static struct sock_fprog fprog = { ARRAY_SIZE(filt), filt };
7b6b0ef4 744
796223f5
BP
745 /* Create file descriptor. */
746 fd = socket(PF_PACKET, SOCK_RAW, 0);
747 if (fd < 0) {
748 error = errno;
10a89ef0 749 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
750 goto error;
751 }
33d82a56 752
796223f5
BP
753 /* Set non-blocking mode. */
754 error = set_nonblocking(fd);
755 if (error) {
756 goto error;
757 }
7b6b0ef4 758
796223f5 759 /* Get ethernet device index. */
180c6d0b 760 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
761 if (error) {
762 goto error;
763 }
7b6b0ef4 764
796223f5
BP
765 /* Bind to specific ethernet device. */
766 memset(&sll, 0, sizeof sll);
767 sll.sll_family = AF_PACKET;
768 sll.sll_ifindex = ifindex;
769 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
770 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
771 error = errno;
772 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 773 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
774 goto error;
775 }
32383c3b
MM
776
777 /* Filter for only inbound packets. */
778 error = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
779 sizeof fprog);
780 if (error) {
781 error = errno;
782 VLOG_ERR("%s: failed attach filter (%s)",
10a89ef0 783 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
784 goto error;
785 }
7b6b0ef4
BP
786 }
787
796223f5 788 rx = xmalloc(sizeof *rx);
b5d57fc8 789 netdev_rx_init(&rx->up, netdev_, &netdev_rx_linux_class);
796223f5
BP
790 rx->is_tap = is_tap;
791 rx->fd = fd;
7b6b0ef4 792
796223f5 793 *rxp = &rx->up;
7b6b0ef4
BP
794 return 0;
795
796error:
797 if (fd >= 0) {
798 close(fd);
799 }
800 return error;
801}
802
796223f5
BP
803static void
804netdev_rx_linux_destroy(struct netdev_rx *rx_)
8b61709d 805{
796223f5 806 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
8b61709d 807
796223f5
BP
808 if (!rx->is_tap) {
809 close(rx->fd);
8b61709d 810 }
796223f5
BP
811 free(rx);
812}
8b61709d 813
796223f5
BP
814static int
815netdev_rx_linux_recv(struct netdev_rx *rx_, void *data, size_t size)
816{
817 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
818 ssize_t retval;
8e8cddf7 819
796223f5
BP
820 do {
821 retval = (rx->is_tap
822 ? read(rx->fd, data, size)
823 : recv(rx->fd, data, size, MSG_TRUNC));
824 } while (retval < 0 && errno == EINTR);
825
bb5c1468
Z
826 if (retval >= 0) {
827 return retval > size ? -EMSGSIZE : retval;
796223f5
BP
828 } else {
829 if (errno != EAGAIN) {
830 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
10a89ef0 831 ovs_strerror(errno), netdev_rx_get_name(rx_));
8b61709d 832 }
796223f5 833 return -errno;
8b61709d
BP
834 }
835}
836
8b61709d 837static void
796223f5 838netdev_rx_linux_wait(struct netdev_rx *rx_)
8b61709d 839{
796223f5
BP
840 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
841 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
842}
843
8b61709d 844static int
796223f5 845netdev_rx_linux_drain(struct netdev_rx *rx_)
8b61709d 846{
796223f5
BP
847 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
848 if (rx->is_tap) {
8b61709d 849 struct ifreq ifr;
796223f5 850 int error = netdev_linux_do_ioctl(netdev_rx_get_name(rx_), &ifr,
8b61709d
BP
851 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
852 if (error) {
853 return error;
854 }
796223f5 855 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
856 return 0;
857 } else {
796223f5 858 return drain_rcvbuf(rx->fd);
8b61709d
BP
859 }
860}
861
862/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
863 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
864 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
865 * the packet is too big or too small to transmit on the device.
866 *
867 * The caller retains ownership of 'buffer' in all cases.
868 *
869 * The kernel maintains a packet transmission queue, so the caller is not
870 * expected to do additional queuing of packets. */
871static int
872netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
873{
f23347ea
BP
874 for (;;) {
875 ssize_t retval;
8b61709d 876
796223f5 877 if (!is_tap_netdev(netdev_)) {
f23347ea
BP
878 /* Use our AF_PACKET socket to send to this device. */
879 struct sockaddr_ll sll;
880 struct msghdr msg;
881 struct iovec iov;
882 int ifindex;
883 int error;
488d734d
BP
884 int sock;
885
886 sock = af_packet_sock();
887 if (sock < 0) {
c4c7a3d7 888 return -sock;
488d734d 889 }
f23347ea
BP
890
891 error = get_ifindex(netdev_, &ifindex);
892 if (error) {
893 return error;
894 }
8b61709d 895
f23347ea
BP
896 /* We don't bother setting most fields in sockaddr_ll because the
897 * kernel ignores them for SOCK_RAW. */
898 memset(&sll, 0, sizeof sll);
899 sll.sll_family = AF_PACKET;
900 sll.sll_ifindex = ifindex;
76c308b5 901
ebc56baa 902 iov.iov_base = CONST_CAST(void *, data);
f23347ea 903 iov.iov_len = size;
76c308b5 904
f23347ea
BP
905 msg.msg_name = &sll;
906 msg.msg_namelen = sizeof sll;
907 msg.msg_iov = &iov;
908 msg.msg_iovlen = 1;
909 msg.msg_control = NULL;
910 msg.msg_controllen = 0;
911 msg.msg_flags = 0;
912
488d734d 913 retval = sendmsg(sock, &msg, 0);
f23347ea 914 } else {
796223f5
BP
915 /* Use the tap fd to send to this device. This is essential for
916 * tap devices, because packets sent to a tap device with an
917 * AF_PACKET socket will loop back to be *received* again on the
32383c3b
MM
918 * tap device. This doesn't occur on other interface types
919 * because we attach a socket filter to the rx socket. */
b5d57fc8 920 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796223f5 921
b5d57fc8 922 retval = write(netdev->state.tap.fd, data, size);
f23347ea 923 }
76c308b5 924
8b61709d
BP
925 if (retval < 0) {
926 /* The Linux AF_PACKET implementation never blocks waiting for room
927 * for packets, instead returning ENOBUFS. Translate this into
928 * EAGAIN for the caller. */
929 if (errno == ENOBUFS) {
930 return EAGAIN;
931 } else if (errno == EINTR) {
932 continue;
933 } else if (errno != EAGAIN) {
934 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
10a89ef0 935 netdev_get_name(netdev_), ovs_strerror(errno));
8b61709d
BP
936 }
937 return errno;
938 } else if (retval != size) {
939 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
940 "%zu) on %s", retval, size, netdev_get_name(netdev_));
941 return EMSGSIZE;
942 } else {
943 return 0;
944 }
945 }
946}
947
/* Registers with the poll loop to wake up from the next call to poll_block()
 * when the packet transmission queue has sufficient room to transmit a
 * packet with netdev_send().
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  Thus, this function is
 * unlikely to ever be used.  It is included for completeness.
 *
 * As implemented, only tap devices request a wakeup (an immediate one);
 * nothing is registered for other devices. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (!is_tap_netdev(netdev)) {
        return;
    }

    /* TAP device always accepts packets. */
    poll_immediate_wake();
}
963
964/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
965 * otherwise a positive errno value. */
966static int
967netdev_linux_set_etheraddr(struct netdev *netdev_,
968 const uint8_t mac[ETH_ADDR_LEN])
969{
b5d57fc8 970 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4b609110 971 struct netdev_saved_flags *sf = NULL;
eb395f2e
BP
972 int error;
973
b5d57fc8
BP
974 if (netdev->cache_valid & VALID_ETHERADDR) {
975 if (netdev->ether_addr_error) {
976 return netdev->ether_addr_error;
44445cac 977 }
b5d57fc8 978 if (eth_addr_equals(netdev->etheraddr, mac)) {
44445cac
PS
979 return 0;
980 }
b5d57fc8 981 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
982 }
983
7eb1bd81 984 /* Tap devices must be brought down before setting the address. */
796223f5 985 if (is_tap_netdev(netdev_)) {
bbd5b6f4 986 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
7eb1bd81 987 }
44445cac
PS
988 error = set_etheraddr(netdev_get_name(netdev_), mac);
989 if (!error || error == ENODEV) {
b5d57fc8
BP
990 netdev->ether_addr_error = error;
991 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 992 if (!error) {
b5d57fc8 993 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e 994 }
8b61709d 995 }
44445cac 996
4b609110 997 netdev_restore_flags(sf);
7eb1bd81 998
8b61709d
BP
999 return error;
1000}
1001
44445cac 1002/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d
BP
1003static int
1004netdev_linux_get_etheraddr(const struct netdev *netdev_,
1005 uint8_t mac[ETH_ADDR_LEN])
1006{
b5d57fc8 1007 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
44445cac 1008
b5d57fc8 1009 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
8b61709d 1010 int error = get_etheraddr(netdev_get_name(netdev_),
b5d57fc8 1011 netdev->etheraddr);
44445cac 1012
b5d57fc8
BP
1013 netdev->ether_addr_error = error;
1014 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1015 }
44445cac 1016
b5d57fc8
BP
1017 if (!netdev->ether_addr_error) {
1018 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
44445cac
PS
1019 }
1020
b5d57fc8 1021 return netdev->ether_addr_error;
8b61709d
BP
1022}
1023
1024/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1025 * in bytes, not including the hardware header; thus, this is typically 1500
1026 * bytes for Ethernet devices. */
1027static int
1028netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1029{
b5d57fc8
BP
1030 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1031 if (!(netdev->cache_valid & VALID_MTU)) {
8b61709d
BP
1032 struct ifreq ifr;
1033 int error;
1034
149f577a
JG
1035 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1036 SIOCGIFMTU, "SIOCGIFMTU");
90a6637d 1037
b5d57fc8
BP
1038 netdev->netdev_mtu_error = error;
1039 netdev->mtu = ifr.ifr_mtu;
1040 netdev->cache_valid |= VALID_MTU;
8b61709d 1041 }
90a6637d 1042
b5d57fc8
BP
1043 if (!netdev->netdev_mtu_error) {
1044 *mtup = netdev->mtu;
90a6637d 1045 }
b5d57fc8 1046 return netdev->netdev_mtu_error;
8b61709d
BP
1047}
1048
9b020780
PS
1049/* Sets the maximum size of transmitted (MTU) for given device using linux
1050 * networking ioctl interface.
1051 */
1052static int
1053netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1054{
b5d57fc8 1055 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1056 struct ifreq ifr;
1057 int error;
1058
b5d57fc8
BP
1059 if (netdev->cache_valid & VALID_MTU) {
1060 if (netdev->netdev_mtu_error) {
1061 return netdev->netdev_mtu_error;
90a6637d 1062 }
b5d57fc8 1063 if (netdev->mtu == mtu) {
90a6637d
PS
1064 return 0;
1065 }
b5d57fc8 1066 netdev->cache_valid &= ~VALID_MTU;
153e5481 1067 }
9b020780
PS
1068 ifr.ifr_mtu = mtu;
1069 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1070 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1071 if (!error || error == ENODEV) {
b5d57fc8
BP
1072 netdev->netdev_mtu_error = error;
1073 netdev->mtu = ifr.ifr_mtu;
1074 netdev->cache_valid |= VALID_MTU;
9b020780 1075 }
90a6637d 1076 return error;
9b020780
PS
1077}
1078
/* Returns the ifindex of 'netdev', if successful, as a positive number.
 * On failure, returns a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1089
8b61709d
BP
1090static int
1091netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1092{
b5d57fc8 1093 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1094
b5d57fc8
BP
1095 if (netdev->miimon_interval > 0) {
1096 *carrier = netdev->miimon;
3a183124 1097 } else {
b5d57fc8 1098 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1099 }
8b61709d 1100
3a183124 1101 return 0;
8b61709d
BP
1102}
1103
65c3058c
EJ
1104static long long int
1105netdev_linux_get_carrier_resets(const struct netdev *netdev)
1106{
b5d57fc8 1107 return netdev_linux_cast(netdev)->carrier_resets;
65c3058c
EJ
1108}
1109
63331829 1110static int
1670c579
EJ
1111netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1112 struct mii_ioctl_data *data)
63331829 1113{
63331829 1114 struct ifreq ifr;
782e6111 1115 int error;
63331829 1116
63331829 1117 memset(&ifr, 0, sizeof ifr);
782e6111 1118 memcpy(&ifr.ifr_data, data, sizeof *data);
1670c579 1119 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1120 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1121
782e6111
EJ
1122 return error;
1123}
1124
1125static int
1670c579 1126netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1127{
782e6111
EJ
1128 struct mii_ioctl_data data;
1129 int error;
63331829 1130
782e6111
EJ
1131 *miimon = false;
1132
1133 memset(&data, 0, sizeof data);
1670c579 1134 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1135 if (!error) {
1136 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1137 data.reg_num = MII_BMSR;
1670c579 1138 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1139 &data);
63331829
EJ
1140
1141 if (!error) {
782e6111 1142 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1143 } else {
1144 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1145 }
1146 } else {
1147 struct ethtool_cmd ecmd;
63331829
EJ
1148
1149 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1150 name);
1151
ab985a77 1152 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1153 memset(&ecmd, 0, sizeof ecmd);
1154 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1155 "ETHTOOL_GLINK");
1156 if (!error) {
782e6111
EJ
1157 struct ethtool_value eval;
1158
1159 memcpy(&eval, &ecmd, sizeof eval);
1160 *miimon = !!eval.data;
63331829
EJ
1161 } else {
1162 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1163 }
1164 }
1165
1166 return error;
1167}
1168
1670c579
EJ
1169static int
1170netdev_linux_set_miimon_interval(struct netdev *netdev_,
1171 long long int interval)
1172{
b5d57fc8 1173 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579
EJ
1174
1175 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8
BP
1176 if (netdev->miimon_interval != interval) {
1177 netdev->miimon_interval = interval;
1178 timer_set_expired(&netdev->miimon_timer);
1670c579
EJ
1179 }
1180
1181 return 0;
1182}
1183
1184static void
1185netdev_linux_miimon_run(void)
1186{
1187 struct shash device_shash;
1188 struct shash_node *node;
1189
1190 shash_init(&device_shash);
b5d57fc8 1191 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1192 SHASH_FOR_EACH (node, &device_shash) {
b5d57fc8 1193 struct netdev_linux *dev = node->data;
1670c579
EJ
1194 bool miimon;
1195
1196 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1197 continue;
1198 }
1199
180c6d0b 1200 netdev_linux_get_miimon(dev->up.name, &miimon);
1670c579 1201 if (miimon != dev->miimon) {
1670c579 1202 dev->miimon = miimon;
b5d57fc8 1203 netdev_linux_changed(dev, dev->ifi_flags, 0);
1670c579
EJ
1204 }
1205
1206 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1207 }
1208
1209 shash_destroy(&device_shash);
1210}
1211
1212static void
1213netdev_linux_miimon_wait(void)
1214{
1215 struct shash device_shash;
1216 struct shash_node *node;
1217
1218 shash_init(&device_shash);
b5d57fc8 1219 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1220 SHASH_FOR_EACH (node, &device_shash) {
b5d57fc8 1221 struct netdev_linux *dev = node->data;
1670c579
EJ
1222
1223 if (dev->miimon_interval > 0) {
1224 timer_wait(&dev->miimon_timer);
1225 }
1226 }
1227 shash_destroy(&device_shash);
1228}
1229
8b61709d
BP
1230/* Check whether we can we use RTM_GETLINK to get network device statistics.
1231 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1232 * enabled. */
1233static bool
1234check_for_working_netlink_stats(void)
1235{
1236 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1237 * preferable, so if that works, we'll use it. */
1238 int ifindex = do_get_ifindex("lo");
1239 if (ifindex < 0) {
1240 VLOG_WARN("failed to get ifindex for lo, "
1241 "obtaining netdev stats from proc");
1242 return false;
1243 } else {
1244 struct netdev_stats stats;
1245 int error = get_stats_via_netlink(ifindex, &stats);
1246 if (!error) {
1247 VLOG_DBG("obtaining netdev stats via rtnetlink");
1248 return true;
1249 } else {
1250 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1251 "via proc (you are probably running a pre-2.6.19 "
10a89ef0 1252 "kernel)", ovs_strerror(error));
8b61709d
BP
1253 return false;
1254 }
1255 }
1256}
1257
/* Exchanges the values stored in '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_b = *b;

    *b = *a;
    *a = old_b;
}
1265
c060c4cf
EJ
1266/* Copies 'src' into 'dst', performing format conversion in the process.
1267 *
1268 * 'src' is allowed to be misaligned. */
1269static void
1270netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1271 const struct ovs_vport_stats *src)
1272{
1273 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1274 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1275 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1276 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1277 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1278 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1279 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1280 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1281 dst->multicast = 0;
1282 dst->collisions = 0;
1283 dst->rx_length_errors = 0;
1284 dst->rx_over_errors = 0;
1285 dst->rx_crc_errors = 0;
1286 dst->rx_frame_errors = 0;
1287 dst->rx_fifo_errors = 0;
1288 dst->rx_missed_errors = 0;
1289 dst->tx_aborted_errors = 0;
1290 dst->tx_carrier_errors = 0;
1291 dst->tx_fifo_errors = 0;
1292 dst->tx_heartbeat_errors = 0;
1293 dst->tx_window_errors = 0;
1294}
1295
1296static int
1297get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1298{
1299 struct dpif_linux_vport reply;
1300 struct ofpbuf *buf;
1301 int error;
1302
1303 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1304 if (error) {
1305 return error;
1306 } else if (!reply.stats) {
1307 ofpbuf_delete(buf);
1308 return EOPNOTSUPP;
1309 }
1310
1311 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1312
1313 ofpbuf_delete(buf);
1314
1315 return 0;
1316}
1317
f613a0d7
PS
1318static void
1319get_stats_via_vport(const struct netdev *netdev_,
1320 struct netdev_stats *stats)
8b61709d 1321{
b5d57fc8 1322 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1323
b5d57fc8
BP
1324 if (!netdev->vport_stats_error ||
1325 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1326 int error;
7fbef77a 1327
c060c4cf 1328 error = get_stats_via_vport__(netdev_, stats);
bcb1f5a1 1329 if (error && error != ENOENT) {
a57a8488 1330 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
1331 "(%s)",
1332 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 1333 }
b5d57fc8
BP
1334 netdev->vport_stats_error = error;
1335 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1336 }
f613a0d7 1337}
8b61709d 1338
f613a0d7
PS
1339static int
1340netdev_linux_sys_get_stats(const struct netdev *netdev_,
1341 struct netdev_stats *stats)
1342{
23882115
BP
1343 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1344 static int use_netlink_stats;
f613a0d7
PS
1345 int error;
1346
23882115 1347 if (ovsthread_once_start(&once)) {
f613a0d7 1348 use_netlink_stats = check_for_working_netlink_stats();
23882115 1349 ovsthread_once_done(&once);
f613a0d7
PS
1350 }
1351
1352 if (use_netlink_stats) {
1353 int ifindex;
1354
1355 error = get_ifindex(netdev_, &ifindex);
1356 if (!error) {
1357 error = get_stats_via_netlink(ifindex, stats);
7fbef77a 1358 }
f613a0d7
PS
1359 } else {
1360 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1361 }
7fbef77a 1362
f613a0d7
PS
1363 if (error) {
1364 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1365 netdev_get_name(netdev_), error);
1366 }
1367 return error;
1368
1369}
1370
1371/* Retrieves current device stats for 'netdev-linux'. */
1372static int
1373netdev_linux_get_stats(const struct netdev *netdev_,
1374 struct netdev_stats *stats)
1375{
b5d57fc8 1376 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1377 struct netdev_stats dev_stats;
1378 int error;
1379
1380 get_stats_via_vport(netdev_, stats);
1381
1382 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1383
1384 if (error) {
b5d57fc8 1385 if (netdev->vport_stats_error) {
f613a0d7 1386 return error;
7fbef77a 1387 } else {
f613a0d7
PS
1388 return 0;
1389 }
1390 }
1391
b5d57fc8 1392 if (netdev->vport_stats_error) {
f613a0d7
PS
1393 /* stats not available from OVS then use ioctl stats. */
1394 *stats = dev_stats;
1395 } else {
1396 stats->rx_errors += dev_stats.rx_errors;
1397 stats->tx_errors += dev_stats.tx_errors;
1398 stats->rx_dropped += dev_stats.rx_dropped;
1399 stats->tx_dropped += dev_stats.tx_dropped;
1400 stats->multicast += dev_stats.multicast;
1401 stats->collisions += dev_stats.collisions;
1402 stats->rx_length_errors += dev_stats.rx_length_errors;
1403 stats->rx_over_errors += dev_stats.rx_over_errors;
1404 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1405 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1406 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1407 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1408 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1409 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1410 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1411 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1412 stats->tx_window_errors += dev_stats.tx_window_errors;
1413 }
1414 return 0;
1415}
1416
1417/* Retrieves current device stats for 'netdev-tap' netdev or
1418 * netdev-internal. */
1419static int
bba1e6f3 1420netdev_tap_get_stats(const struct netdev *netdev_,
f613a0d7
PS
1421 struct netdev_stats *stats)
1422{
b5d57fc8 1423 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1424 struct netdev_stats dev_stats;
1425 int error;
1426
1427 get_stats_via_vport(netdev_, stats);
1428
1429 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1430 if (error) {
b5d57fc8 1431 if (netdev->vport_stats_error) {
f613a0d7
PS
1432 return error;
1433 } else {
1434 return 0;
8b61709d 1435 }
8b61709d 1436 }
fe6b0e03
JG
1437
1438 /* If this port is an internal port then the transmit and receive stats
1439 * will appear to be swapped relative to the other ports since we are the
1440 * one sending the data, not a remote computer. For consistency, we swap
7fbef77a
JG
1441 * them back here. This does not apply if we are getting stats from the
1442 * vport layer because it always tracks stats from the perspective of the
1443 * switch. */
b5d57fc8 1444 if (netdev->vport_stats_error) {
f613a0d7 1445 *stats = dev_stats;
92df599c
JG
1446 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1447 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1448 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1449 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1450 stats->rx_length_errors = 0;
1451 stats->rx_over_errors = 0;
1452 stats->rx_crc_errors = 0;
1453 stats->rx_frame_errors = 0;
1454 stats->rx_fifo_errors = 0;
1455 stats->rx_missed_errors = 0;
1456 stats->tx_aborted_errors = 0;
1457 stats->tx_carrier_errors = 0;
1458 stats->tx_fifo_errors = 0;
1459 stats->tx_heartbeat_errors = 0;
1460 stats->tx_window_errors = 0;
f613a0d7
PS
1461 } else {
1462 stats->rx_dropped += dev_stats.tx_dropped;
1463 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1464
f613a0d7
PS
1465 stats->rx_errors += dev_stats.tx_errors;
1466 stats->tx_errors += dev_stats.rx_errors;
1467
1468 stats->multicast += dev_stats.multicast;
1469 stats->collisions += dev_stats.collisions;
1470 }
1471 return 0;
8b61709d
BP
1472}
1473
bba1e6f3
PS
1474static int
1475netdev_internal_get_stats(const struct netdev *netdev_,
1476 struct netdev_stats *stats)
1477{
b5d57fc8 1478 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
bba1e6f3
PS
1479
1480 get_stats_via_vport(netdev_, stats);
b5d57fc8 1481 return netdev->vport_stats_error;
bba1e6f3
PS
1482}
1483
2f31a822
EJ
1484static int
1485netdev_internal_set_stats(struct netdev *netdev,
1486 const struct netdev_stats *stats)
1487{
1488 struct ovs_vport_stats vport_stats;
1489 struct dpif_linux_vport vport;
1490 int err;
1491
1492 vport_stats.rx_packets = stats->rx_packets;
1493 vport_stats.tx_packets = stats->tx_packets;
1494 vport_stats.rx_bytes = stats->rx_bytes;
1495 vport_stats.tx_bytes = stats->tx_bytes;
1496 vport_stats.rx_errors = stats->rx_errors;
1497 vport_stats.tx_errors = stats->tx_errors;
1498 vport_stats.rx_dropped = stats->rx_dropped;
1499 vport_stats.tx_dropped = stats->tx_dropped;
1500
1501 dpif_linux_vport_init(&vport);
1502 vport.cmd = OVS_VPORT_CMD_SET;
1503 vport.name = netdev_get_name(netdev);
1504 vport.stats = &vport_stats;
1505
1506 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1507
1508 /* If the vport layer doesn't know about the device, that doesn't mean it
1509 * doesn't exist (after all were able to open it when netdev_open() was
1510 * called), it just means that it isn't attached and we'll be getting
1511 * stats a different way. */
1512 if (err == ENODEV) {
1513 err = EOPNOTSUPP;
1514 }
1515
1516 return err;
1517}
1518
51f87458 1519static void
b5d57fc8 1520netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
1521{
1522 struct ethtool_cmd ecmd;
6c038611 1523 uint32_t speed;
8b61709d
BP
1524 int error;
1525
b5d57fc8 1526 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
1527 return;
1528 }
1529
ab985a77 1530 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1531 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 1532 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
1533 ETHTOOL_GSET, "ETHTOOL_GSET");
1534 if (error) {
51f87458 1535 goto out;
8b61709d
BP
1536 }
1537
1538 /* Supported features. */
b5d57fc8 1539 netdev->supported = 0;
8b61709d 1540 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 1541 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
1542 }
1543 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 1544 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
1545 }
1546 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 1547 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
1548 }
1549 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 1550 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
1551 }
1552 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 1553 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d
BP
1554 }
1555 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
b5d57fc8 1556 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d
BP
1557 }
1558 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
b5d57fc8 1559 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d
BP
1560 }
1561 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 1562 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
1563 }
1564 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 1565 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
1566 }
1567 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 1568 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
1569 }
1570 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 1571 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
1572 }
1573 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 1574 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1575 }
1576
1577 /* Advertised features. */
b5d57fc8 1578 netdev->advertised = 0;
8b61709d 1579 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 1580 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
1581 }
1582 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 1583 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
1584 }
1585 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 1586 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
1587 }
1588 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 1589 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
1590 }
1591 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 1592 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d
BP
1593 }
1594 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
b5d57fc8 1595 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d
BP
1596 }
1597 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
b5d57fc8 1598 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d
BP
1599 }
1600 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 1601 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
1602 }
1603 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 1604 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
1605 }
1606 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 1607 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
1608 }
1609 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 1610 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
1611 }
1612 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 1613 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1614 }
1615
1616 /* Current settings. */
2a529ead 1617 speed = ecmd.speed;
6c038611 1618 if (speed == SPEED_10) {
b5d57fc8 1619 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 1620 } else if (speed == SPEED_100) {
b5d57fc8 1621 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 1622 } else if (speed == SPEED_1000) {
b5d57fc8 1623 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 1624 } else if (speed == SPEED_10000) {
b5d57fc8 1625 netdev->current = NETDEV_F_10GB_FD;
6c038611 1626 } else if (speed == 40000) {
b5d57fc8 1627 netdev->current = NETDEV_F_40GB_FD;
6c038611 1628 } else if (speed == 100000) {
b5d57fc8 1629 netdev->current = NETDEV_F_100GB_FD;
6c038611 1630 } else if (speed == 1000000) {
b5d57fc8 1631 netdev->current = NETDEV_F_1TB_FD;
8b61709d 1632 } else {
b5d57fc8 1633 netdev->current = 0;
8b61709d
BP
1634 }
1635
1636 if (ecmd.port == PORT_TP) {
b5d57fc8 1637 netdev->current |= NETDEV_F_COPPER;
8b61709d 1638 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 1639 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
1640 }
1641
1642 if (ecmd.autoneg) {
b5d57fc8 1643 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
1644 }
1645
1646 /* Peer advertisements. */
b5d57fc8 1647 netdev->peer = 0; /* XXX */
8b61709d 1648
51f87458 1649out:
b5d57fc8
BP
1650 netdev->cache_valid |= VALID_FEATURES;
1651 netdev->get_features_error = error;
51f87458
PS
1652}
1653
1654/* Stores the features supported by 'netdev' into each of '*current',
1655 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1656 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1657 * errno value. */
1658static int
1659netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
1660 enum netdev_features *current,
1661 enum netdev_features *advertised,
1662 enum netdev_features *supported,
1663 enum netdev_features *peer)
51f87458 1664{
b5d57fc8 1665 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
51f87458 1666
b5d57fc8 1667 netdev_linux_read_features(netdev);
51f87458 1668
b5d57fc8
BP
1669 if (!netdev->get_features_error) {
1670 *current = netdev->current;
1671 *advertised = netdev->advertised;
1672 *supported = netdev->supported;
1673 *peer = netdev->peer;
51f87458 1674 }
b5d57fc8 1675 return netdev->get_features_error;
8b61709d
BP
1676}
1677
1678/* Set the features advertised by 'netdev' to 'advertise'. */
1679static int
6c038611
BP
1680netdev_linux_set_advertisements(struct netdev *netdev,
1681 enum netdev_features advertise)
8b61709d
BP
1682{
1683 struct ethtool_cmd ecmd;
1684 int error;
1685
ab985a77 1686 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1687 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1688 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1689 ETHTOOL_GSET, "ETHTOOL_GSET");
1690 if (error) {
1691 return error;
1692 }
1693
1694 ecmd.advertising = 0;
6c038611 1695 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
1696 ecmd.advertising |= ADVERTISED_10baseT_Half;
1697 }
6c038611 1698 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
1699 ecmd.advertising |= ADVERTISED_10baseT_Full;
1700 }
6c038611 1701 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
1702 ecmd.advertising |= ADVERTISED_100baseT_Half;
1703 }
6c038611 1704 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
1705 ecmd.advertising |= ADVERTISED_100baseT_Full;
1706 }
6c038611 1707 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
1708 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1709 }
6c038611 1710 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
1711 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1712 }
6c038611 1713 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
1714 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1715 }
6c038611 1716 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
1717 ecmd.advertising |= ADVERTISED_TP;
1718 }
6c038611 1719 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
1720 ecmd.advertising |= ADVERTISED_FIBRE;
1721 }
6c038611 1722 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
1723 ecmd.advertising |= ADVERTISED_Autoneg;
1724 }
6c038611 1725 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
1726 ecmd.advertising |= ADVERTISED_Pause;
1727 }
6c038611 1728 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
1729 ecmd.advertising |= ADVERTISED_Asym_Pause;
1730 }
ab985a77 1731 COVERAGE_INC(netdev_set_ethtool);
0b0544d7 1732 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1733 ETHTOOL_SSET, "ETHTOOL_SSET");
1734}
1735
f8500004
JP
1736/* Attempts to set input rate limiting (policing) policy. Returns 0 if
1737 * successful, otherwise a positive errno value. */
8b61709d 1738static int
b5d57fc8 1739netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
1740 uint32_t kbits_rate, uint32_t kbits_burst)
1741{
b5d57fc8
BP
1742 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1743 const char *netdev_name = netdev_get_name(netdev_);
f8500004 1744 int error;
8b61709d 1745
8e460221 1746
80a86fbe
BP
1747 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1748 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1749 : kbits_burst); /* Stick with user-specified value. */
1750
b5d57fc8
BP
1751 if (netdev->cache_valid & VALID_POLICING) {
1752 if (netdev->netdev_policing_error) {
1753 return netdev->netdev_policing_error;
c9f71668
PS
1754 }
1755
b5d57fc8
BP
1756 if (netdev->kbits_rate == kbits_rate &&
1757 netdev->kbits_burst == kbits_burst) {
c9f71668
PS
1758 /* Assume that settings haven't changed since we last set them. */
1759 return 0;
1760 }
b5d57fc8 1761 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
1762 }
1763
ac8c3412 1764 COVERAGE_INC(netdev_set_policing);
f8500004 1765 /* Remove any existing ingress qdisc. */
b5d57fc8 1766 error = tc_add_del_ingress_qdisc(netdev_, false);
f8500004
JP
1767 if (error) {
1768 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 1769 netdev_name, ovs_strerror(error));
c9f71668 1770 goto out;
f8500004
JP
1771 }
1772
8b61709d 1773 if (kbits_rate) {
b5d57fc8 1774 error = tc_add_del_ingress_qdisc(netdev_, true);
f8500004
JP
1775 if (error) {
1776 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 1777 netdev_name, ovs_strerror(error));
c9f71668 1778 goto out;
8b61709d
BP
1779 }
1780
b5d57fc8 1781 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
1782 if (error){
1783 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 1784 netdev_name, ovs_strerror(error));
c9f71668 1785 goto out;
8b61709d 1786 }
8b61709d
BP
1787 }
1788
b5d57fc8
BP
1789 netdev->kbits_rate = kbits_rate;
1790 netdev->kbits_burst = kbits_burst;
f8500004 1791
c9f71668
PS
1792out:
1793 if (!error || error == ENODEV) {
b5d57fc8
BP
1794 netdev->netdev_policing_error = error;
1795 netdev->cache_valid |= VALID_POLICING;
c9f71668
PS
1796 }
1797 return error;
8b61709d
BP
1798}
1799
c1c9c9c4
BP
1800static int
1801netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 1802 struct sset *types)
c1c9c9c4 1803{
559eb230 1804 const struct tc_ops *const *opsp;
c1c9c9c4
BP
1805
1806 for (opsp = tcs; *opsp != NULL; opsp++) {
1807 const struct tc_ops *ops = *opsp;
1808 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 1809 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
1810 }
1811 }
1812 return 0;
1813}
1814
1815static const struct tc_ops *
1816tc_lookup_ovs_name(const char *name)
1817{
559eb230 1818 const struct tc_ops *const *opsp;
c1c9c9c4
BP
1819
1820 for (opsp = tcs; *opsp != NULL; opsp++) {
1821 const struct tc_ops *ops = *opsp;
1822 if (!strcmp(name, ops->ovs_name)) {
1823 return ops;
1824 }
1825 }
1826 return NULL;
1827}
1828
1829static const struct tc_ops *
1830tc_lookup_linux_name(const char *name)
1831{
559eb230 1832 const struct tc_ops *const *opsp;
c1c9c9c4
BP
1833
1834 for (opsp = tcs; *opsp != NULL; opsp++) {
1835 const struct tc_ops *ops = *opsp;
1836 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1837 return ops;
1838 }
1839 }
1840 return NULL;
1841}
1842
93b13be8 1843static struct tc_queue *
b5d57fc8 1844tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
1845 size_t hash)
1846{
b5d57fc8 1847 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
1848 struct tc_queue *queue;
1849
b5d57fc8 1850 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
1851 if (queue->queue_id == queue_id) {
1852 return queue;
1853 }
1854 }
1855 return NULL;
1856}
1857
/* Looks up queue 'queue_id' on 'netdev', hashing the ID with hash_int() and
 * basis 0.  Returns the queue, or NULL if tc_find_queue__() finds none. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1863
c1c9c9c4
BP
1864static int
1865netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1866 const char *type,
1867 struct netdev_qos_capabilities *caps)
1868{
1869 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1870 if (!ops) {
1871 return EOPNOTSUPP;
1872 }
1873 caps->n_queues = ops->n_queues;
1874 return 0;
1875}
1876
1877static int
b5d57fc8 1878netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 1879 const char **typep, struct smap *details)
c1c9c9c4 1880{
b5d57fc8 1881 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
1882 int error;
1883
b5d57fc8 1884 error = tc_query_qdisc(netdev_);
c1c9c9c4
BP
1885 if (error) {
1886 return error;
1887 }
1888
b5d57fc8
BP
1889 *typep = netdev->tc->ops->ovs_name;
1890 return (netdev->tc->ops->qdisc_get
1891 ? netdev->tc->ops->qdisc_get(netdev_, details)
c1c9c9c4
BP
1892 : 0);
1893}
1894
1895static int
b5d57fc8 1896netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 1897 const char *type, const struct smap *details)
c1c9c9c4 1898{
b5d57fc8 1899 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
1900 const struct tc_ops *new_ops;
1901 int error;
1902
1903 new_ops = tc_lookup_ovs_name(type);
1904 if (!new_ops || !new_ops->tc_install) {
1905 return EOPNOTSUPP;
1906 }
1907
b5d57fc8 1908 error = tc_query_qdisc(netdev_);
c1c9c9c4
BP
1909 if (error) {
1910 return error;
1911 }
1912
b5d57fc8
BP
1913 if (new_ops == netdev->tc->ops) {
1914 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
1915 } else {
1916 /* Delete existing qdisc. */
b5d57fc8 1917 error = tc_del_qdisc(netdev_);
c1c9c9c4
BP
1918 if (error) {
1919 return error;
1920 }
b5d57fc8 1921 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
1922
1923 /* Install new qdisc. */
b5d57fc8
BP
1924 error = new_ops->tc_install(netdev_, details);
1925 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
1926
1927 return error;
1928 }
1929}
1930
1931static int
b5d57fc8 1932netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 1933 unsigned int queue_id, struct smap *details)
c1c9c9c4 1934{
b5d57fc8 1935 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
1936 int error;
1937
b5d57fc8 1938 error = tc_query_qdisc(netdev_);
c1c9c9c4
BP
1939 if (error) {
1940 return error;
93b13be8 1941 } else {
b5d57fc8 1942 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
93b13be8 1943 return (queue
b5d57fc8 1944 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 1945 : ENOENT);
c1c9c9c4 1946 }
c1c9c9c4
BP
1947}
1948
1949static int
b5d57fc8 1950netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 1951 unsigned int queue_id, const struct smap *details)
c1c9c9c4 1952{
b5d57fc8 1953 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
1954 int error;
1955
b5d57fc8 1956 error = tc_query_qdisc(netdev_);
c1c9c9c4
BP
1957 if (error) {
1958 return error;
b5d57fc8
BP
1959 } else if (queue_id >= netdev->tc->ops->n_queues
1960 || !netdev->tc->ops->class_set) {
c1c9c9c4
BP
1961 return EINVAL;
1962 }
1963
b5d57fc8 1964 return netdev->tc->ops->class_set(netdev_, queue_id, details);
c1c9c9c4
BP
1965}
1966
1967static int
b5d57fc8 1968netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 1969{
b5d57fc8 1970 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
1971 int error;
1972
b5d57fc8 1973 error = tc_query_qdisc(netdev_);
c1c9c9c4
BP
1974 if (error) {
1975 return error;
b5d57fc8 1976 } else if (!netdev->tc->ops->class_delete) {
c1c9c9c4 1977 return EINVAL;
93b13be8 1978 } else {
b5d57fc8 1979 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
93b13be8 1980 return (queue
b5d57fc8 1981 ? netdev->tc->ops->class_delete(netdev_, queue)
93b13be8 1982 : ENOENT);
c1c9c9c4 1983 }
c1c9c9c4
BP
1984}
1985
1986static int
b5d57fc8 1987netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
1988 unsigned int queue_id,
1989 struct netdev_queue_stats *stats)
1990{
b5d57fc8 1991 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
1992 int error;
1993
b5d57fc8 1994 error = tc_query_qdisc(netdev_);
c1c9c9c4
BP
1995 if (error) {
1996 return error;
b5d57fc8 1997 } else if (!netdev->tc->ops->class_get_stats) {
c1c9c9c4 1998 return EOPNOTSUPP;
93b13be8 1999 } else {
b5d57fc8 2000 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
93b13be8 2001 return (queue
b5d57fc8 2002 ? netdev->tc->ops->class_get_stats(netdev_, queue, stats)
93b13be8 2003 : ENOENT);
c1c9c9c4 2004 }
c1c9c9c4
BP
2005}
2006
23a98ffe 2007static bool
c1c9c9c4
BP
2008start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2009{
2010 struct ofpbuf request;
2011 struct tcmsg *tcmsg;
2012
2013 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2014 if (!tcmsg) {
2015 return false;
2016 }
3c4de644 2017 tcmsg->tcm_parent = 0;
a88b4e04 2018 nl_dump_start(dump, NETLINK_ROUTE, &request);
c1c9c9c4 2019 ofpbuf_uninit(&request);
23a98ffe 2020 return true;
c1c9c9c4
BP
2021}
2022
2023static int
b5d57fc8 2024netdev_linux_dump_queues(const struct netdev *netdev_,
c1c9c9c4
BP
2025 netdev_dump_queues_cb *cb, void *aux)
2026{
b5d57fc8 2027 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f486e840 2028 struct tc_queue *queue, *next_queue;
79f1cbe9 2029 struct smap details;
c1c9c9c4 2030 int last_error;
c1c9c9c4
BP
2031 int error;
2032
b5d57fc8 2033 error = tc_query_qdisc(netdev_);
c1c9c9c4
BP
2034 if (error) {
2035 return error;
b5d57fc8 2036 } else if (!netdev->tc->ops->class_get) {
c1c9c9c4
BP
2037 return EOPNOTSUPP;
2038 }
2039
2040 last_error = 0;
79f1cbe9 2041 smap_init(&details);
f486e840 2042 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
b5d57fc8 2043 &netdev->tc->queues) {
79f1cbe9 2044 smap_clear(&details);
c1c9c9c4 2045
b5d57fc8 2046 error = netdev->tc->ops->class_get(netdev_, queue, &details);
c1c9c9c4 2047 if (!error) {
93b13be8 2048 (*cb)(queue->queue_id, &details, aux);
c1c9c9c4
BP
2049 } else {
2050 last_error = error;
2051 }
2052 }
79f1cbe9 2053 smap_destroy(&details);
c1c9c9c4
BP
2054
2055 return last_error;
2056}
2057
2058static int
b5d57fc8 2059netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2060 netdev_dump_queue_stats_cb *cb, void *aux)
2061{
b5d57fc8 2062 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2063 struct nl_dump dump;
2064 struct ofpbuf msg;
2065 int last_error;
2066 int error;
2067
b5d57fc8 2068 error = tc_query_qdisc(netdev_);
c1c9c9c4
BP
2069 if (error) {
2070 return error;
b5d57fc8 2071 } else if (!netdev->tc->ops->class_dump_stats) {
c1c9c9c4
BP
2072 return EOPNOTSUPP;
2073 }
2074
2075 last_error = 0;
b5d57fc8 2076 if (!start_queue_dump(netdev_, &dump)) {
23a98ffe
BP
2077 return ENODEV;
2078 }
c1c9c9c4 2079 while (nl_dump_next(&dump, &msg)) {
b5d57fc8 2080 error = netdev->tc->ops->class_dump_stats(netdev_, &msg, cb, aux);
c1c9c9c4
BP
2081 if (error) {
2082 last_error = error;
2083 }
2084 }
2085
2086 error = nl_dump_done(&dump);
2087 return error ? error : last_error;
2088}
2089
8b61709d 2090static int
f1acd62b
BP
2091netdev_linux_get_in4(const struct netdev *netdev_,
2092 struct in_addr *address, struct in_addr *netmask)
8b61709d 2093{
b5d57fc8 2094 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
149f577a 2095
b5d57fc8 2096 if (!(netdev->cache_valid & VALID_IN4)) {
8b61709d
BP
2097 int error;
2098
b5d57fc8 2099 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
8b61709d
BP
2100 SIOCGIFADDR, "SIOCGIFADDR");
2101 if (error) {
2102 return error;
2103 }
2104
b5d57fc8 2105 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
f1acd62b
BP
2106 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2107 if (error) {
2108 return error;
2109 }
2110
b5d57fc8 2111 netdev->cache_valid |= VALID_IN4;
8b61709d 2112 }
b5d57fc8
BP
2113 *address = netdev->address;
2114 *netmask = netdev->netmask;
f1acd62b 2115 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
8b61709d
BP
2116}
2117
8b61709d 2118static int
f1acd62b
BP
2119netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2120 struct in_addr netmask)
8b61709d 2121{
b5d57fc8 2122 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2123 int error;
2124
f1acd62b 2125 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2126 if (!error) {
b5d57fc8
BP
2127 netdev->cache_valid |= VALID_IN4;
2128 netdev->address = address;
2129 netdev->netmask = netmask;
f1acd62b 2130 if (address.s_addr != INADDR_ANY) {
8b61709d 2131 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2132 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2133 }
2134 }
2135 return error;
2136}
2137
/* Parses 'line' in the format matched below: 32 hex digits of IPv6 address,
 * four hex fields that are skipped, and an interface name of at most 16
 * characters.  On a full match stores the 16 address bytes in 'in6' and the
 * name in 'ifname' and returns true; otherwise returns false, with the
 * outputs possibly partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *b = in6->s6_addr;
    int n_converted;

#define X8 "%2"SCNx8
    n_converted = sscanf(line,
                         " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         &b[0], &b[1], &b[2], &b[3],
                         &b[4], &b[5], &b[6], &b[7],
                         &b[8], &b[9], &b[10], &b[11],
                         &b[12], &b[13], &b[14], &b[15],
                         ifname);

    /* 16 address bytes plus the interface name. */
    return n_converted == 17;
}
2153
2154/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2155 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2156static int
2157netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2158{
b5d57fc8
BP
2159 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2160 if (!(netdev->cache_valid & VALID_IN6)) {
8b61709d
BP
2161 FILE *file;
2162 char line[128];
2163
b5d57fc8 2164 netdev->in6 = in6addr_any;
8b61709d
BP
2165
2166 file = fopen("/proc/net/if_inet6", "r");
2167 if (file != NULL) {
2168 const char *name = netdev_get_name(netdev_);
2169 while (fgets(line, sizeof line, file)) {
2a022368 2170 struct in6_addr in6_tmp;
8b61709d 2171 char ifname[16 + 1];
2a022368 2172 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
2173 && !strcmp(name, ifname))
2174 {
b5d57fc8 2175 netdev->in6 = in6_tmp;
8b61709d
BP
2176 break;
2177 }
2178 }
2179 fclose(file);
2180 }
b5d57fc8 2181 netdev->cache_valid |= VALID_IN6;
8b61709d 2182 }
b5d57fc8 2183 *in6 = netdev->in6;
8b61709d
BP
2184 return 0;
2185}
2186
/* Fills '*sa' with an AF_INET socket address carrying 'addr' and port 0.
 * '*sa' is zeroed first, and the address is then copied in as a
 * 'struct sockaddr_in'. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2199
2200static int
2201do_set_addr(struct netdev *netdev,
2202 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2203{
2204 struct ifreq ifr;
71d7c22f 2205 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
8b61709d 2206 make_in4_sockaddr(&ifr.ifr_addr, addr);
149f577a
JG
2207
2208 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2209 ioctl_name);
8b61709d
BP
2210}
2211
2212/* Adds 'router' as a default IP gateway. */
2213static int
67a4917b 2214netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2215{
2216 struct in_addr any = { INADDR_ANY };
2217 struct rtentry rt;
2218 int error;
2219
2220 memset(&rt, 0, sizeof rt);
2221 make_in4_sockaddr(&rt.rt_dst, any);
2222 make_in4_sockaddr(&rt.rt_gateway, router);
2223 make_in4_sockaddr(&rt.rt_genmask, any);
2224 rt.rt_flags = RTF_UP | RTF_GATEWAY;
8b61709d
BP
2225 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2226 if (error) {
10a89ef0 2227 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
2228 }
2229 return error;
2230}
2231
f1acd62b
BP
2232static int
2233netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2234 char **netdev_name)
2235{
2236 static const char fn[] = "/proc/net/route";
2237 FILE *stream;
2238 char line[256];
2239 int ln;
2240
2241 *netdev_name = NULL;
2242 stream = fopen(fn, "r");
2243 if (stream == NULL) {
10a89ef0 2244 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
2245 return errno;
2246 }
2247
2248 ln = 0;
2249 while (fgets(line, sizeof line, stream)) {
2250 if (++ln >= 2) {
2251 char iface[17];
dbba996b 2252 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2253 int refcnt, metric, mtu;
2254 unsigned int flags, use, window, irtt;
2255
2256 if (sscanf(line,
2257 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2258 " %d %u %u\n",
2259 iface, &dest, &gateway, &flags, &refcnt,
2260 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2261
d295e8e9 2262 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2263 fn, ln, line);
2264 continue;
2265 }
2266 if (!(flags & RTF_UP)) {
2267 /* Skip routes that aren't up. */
2268 continue;
2269 }
2270
2271 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2272 * network byte order, so we don't need need any endian
f1acd62b
BP
2273 * conversions here. */
2274 if ((dest & mask) == (host->s_addr & mask)) {
2275 if (!gateway) {
2276 /* The host is directly reachable. */
2277 next_hop->s_addr = 0;
2278 } else {
2279 /* To reach the host, we must go through a gateway. */
2280 next_hop->s_addr = gateway;
2281 }
2282 *netdev_name = xstrdup(iface);
2283 fclose(stream);
2284 return 0;
2285 }
2286 }
2287 }
2288
2289 fclose(stream);
2290 return ENXIO;
2291}
2292
e210037e 2293static int
b5d57fc8 2294netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 2295{
b5d57fc8 2296 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
2297 int error = 0;
2298
b5d57fc8
BP
2299 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2300 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
2301
2302 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
2303 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2304 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
2305 cmd,
2306 ETHTOOL_GDRVINFO,
2307 "ETHTOOL_GDRVINFO");
2308 if (!error) {
b5d57fc8 2309 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
2310 }
2311 }
e210037e 2312
e210037e 2313 if (!error) {
b5d57fc8
BP
2314 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2315 smap_add(smap, "driver_version", netdev->drvinfo.version);
2316 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 2317 }
e210037e
AE
2318 return error;
2319}
2320
4f925bd3 2321static int
275707c3
EJ
2322netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2323 struct smap *smap)
4f925bd3 2324{
79f1cbe9 2325 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2326 return 0;
2327}
2328
8b61709d
BP
2329/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2330 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2331 * returns 0. Otherwise, it returns a positive errno value; in particular,
2332 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2333static int
2334netdev_linux_arp_lookup(const struct netdev *netdev,
dbba996b 2335 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
8b61709d
BP
2336{
2337 struct arpreq r;
c100e025 2338 struct sockaddr_in sin;
8b61709d
BP
2339 int retval;
2340
2341 memset(&r, 0, sizeof r);
f2cc621b 2342 memset(&sin, 0, sizeof sin);
c100e025
BP
2343 sin.sin_family = AF_INET;
2344 sin.sin_addr.s_addr = ip;
2345 sin.sin_port = 0;
2346 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2347 r.arp_ha.sa_family = ARPHRD_ETHER;
2348 r.arp_flags = 0;
71d7c22f 2349 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d
BP
2350 COVERAGE_INC(netdev_arp_lookup);
2351 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2352 if (!retval) {
2353 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2354 } else if (retval != ENXIO) {
2355 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
2356 netdev_get_name(netdev), IP_ARGS(ip),
2357 ovs_strerror(retval));
8b61709d
BP
2358 }
2359 return retval;
2360}
2361
2362static int
2363nd_to_iff_flags(enum netdev_flags nd)
2364{
2365 int iff = 0;
2366 if (nd & NETDEV_UP) {
2367 iff |= IFF_UP;
2368 }
2369 if (nd & NETDEV_PROMISC) {
2370 iff |= IFF_PROMISC;
2371 }
2372 return iff;
2373}
2374
2375static int
2376iff_to_nd_flags(int iff)
2377{
2378 enum netdev_flags nd = 0;
2379 if (iff & IFF_UP) {
2380 nd |= NETDEV_UP;
2381 }
2382 if (iff & IFF_PROMISC) {
2383 nd |= NETDEV_PROMISC;
2384 }
2385 return nd;
2386}
2387
2388static int
b5d57fc8 2389netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
8b61709d
BP
2390 enum netdev_flags on, enum netdev_flags *old_flagsp)
2391{
b5d57fc8 2392 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 2393 int old_flags, new_flags;
c37d4da4
EJ
2394 int error = 0;
2395
b5d57fc8 2396 old_flags = netdev->ifi_flags;
c37d4da4
EJ
2397 *old_flagsp = iff_to_nd_flags(old_flags);
2398 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2399 if (new_flags != old_flags) {
b5d57fc8
BP
2400 error = set_flags(netdev_get_name(netdev_), new_flags);
2401 get_flags(netdev_, &netdev->ifi_flags);
8b61709d
BP
2402 }
2403 return error;
2404}
2405
ac4d3bcb
EJ
2406static unsigned int
2407netdev_linux_change_seq(const struct netdev *netdev)
2408{
b5d57fc8 2409 return netdev_linux_cast(netdev)->change_seq;
ac4d3bcb
EJ
2410}
2411
/* Expands to a positional initializer for a 'struct netdev_class' that uses
 * the netdev_linux_* implementations for every slot except the six that vary
 * between the classes defined with this macro: the class NAME and the CREATE,
 * GET_STATS, SET_STATS, GET_FEATURES and GET_STATUS callbacks.
 *
 * The initializer is positional, so the order of entries below must not be
 * changed. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS,      \
                           GET_FEATURES, GET_STATUS)                \
{                                                                   \
    NAME,                                                           \
                                                                    \
    netdev_linux_init,                                              \
    netdev_linux_run,                                               \
    netdev_linux_wait,                                              \
                                                                    \
    CREATE,                                                         \
    netdev_linux_destroy,                                           \
    NULL,                       /* get_config */                    \
    NULL,                       /* set_config */                    \
    NULL,                       /* get_tunnel_config */             \
                                                                    \
    netdev_linux_rx_open,                                           \
                                                                    \
    netdev_linux_send,                                              \
    netdev_linux_send_wait,                                         \
                                                                    \
    netdev_linux_set_etheraddr,                                     \
    netdev_linux_get_etheraddr,                                     \
    netdev_linux_get_mtu,                                           \
    netdev_linux_set_mtu,                                           \
    netdev_linux_get_ifindex,                                       \
    netdev_linux_get_carrier,                                       \
    netdev_linux_get_carrier_resets,                                \
    netdev_linux_set_miimon_interval,                               \
    GET_STATS,                                                      \
    SET_STATS,                                                      \
                                                                    \
    GET_FEATURES,                                                   \
    netdev_linux_set_advertisements,                                \
                                                                    \
    netdev_linux_set_policing,                                      \
    netdev_linux_get_qos_types,                                     \
    netdev_linux_get_qos_capabilities,                              \
    netdev_linux_get_qos,                                           \
    netdev_linux_set_qos,                                           \
    netdev_linux_get_queue,                                         \
    netdev_linux_set_queue,                                         \
    netdev_linux_delete_queue,                                      \
    netdev_linux_get_queue_stats,                                   \
    netdev_linux_dump_queues,                                       \
    netdev_linux_dump_queue_stats,                                  \
                                                                    \
    netdev_linux_get_in4,                                           \
    netdev_linux_set_in4,                                           \
    netdev_linux_get_in6,                                           \
    netdev_linux_add_router,                                        \
    netdev_linux_get_next_hop,                                      \
    GET_STATUS,                                                     \
    netdev_linux_arp_lookup,                                        \
                                                                    \
    netdev_linux_update_flags,                                      \
                                                                    \
    netdev_linux_change_seq                                         \
}
2470
2471const struct netdev_class netdev_linux_class =
2472 NETDEV_LINUX_CLASS(
2473 "system",
2474 netdev_linux_create,
f613a0d7 2475 netdev_linux_get_stats,
4f925bd3 2476 NULL, /* set_stats */
51f87458 2477 netdev_linux_get_features,
275707c3 2478 netdev_linux_get_status);
c3827f61
BP
2479
2480const struct netdev_class netdev_tap_class =
2481 NETDEV_LINUX_CLASS(
2482 "tap",
2483 netdev_linux_create_tap,
bba1e6f3 2484 netdev_tap_get_stats,
4f925bd3 2485 NULL, /* set_stats */
51f87458 2486 netdev_linux_get_features,
275707c3 2487 netdev_linux_get_status);
c3827f61
BP
2488
2489const struct netdev_class netdev_internal_class =
2490 NETDEV_LINUX_CLASS(
2491 "internal",
2492 netdev_linux_create,
bba1e6f3 2493 netdev_internal_get_stats,
2f31a822 2494 netdev_internal_set_stats,
51f87458 2495 NULL, /* get_features */
275707c3 2496 netdev_internal_get_status);
796223f5
BP
2497
2498static const struct netdev_rx_class netdev_rx_linux_class = {
2499 netdev_rx_linux_destroy,
2500 netdev_rx_linux_recv,
2501 netdev_rx_linux_wait,
2502 netdev_rx_linux_drain,
2503};
8b61709d 2504\f
c1c9c9c4 2505/* HTB traffic control class. */
559843ed 2506
c1c9c9c4 2507#define HTB_N_QUEUES 0xf000
8b61709d 2508
c1c9c9c4
BP
2509struct htb {
2510 struct tc tc;
2511 unsigned int max_rate; /* In bytes/s. */
2512};
8b61709d 2513
c1c9c9c4 2514struct htb_class {
93b13be8 2515 struct tc_queue tc_queue;
c1c9c9c4
BP
2516 unsigned int min_rate; /* In bytes/s. */
2517 unsigned int max_rate; /* In bytes/s. */
2518 unsigned int burst; /* In bytes. */
2519 unsigned int priority; /* Lower values are higher priorities. */
2520};
8b61709d 2521
c1c9c9c4 2522static struct htb *
b5d57fc8 2523htb_get__(const struct netdev *netdev_)
c1c9c9c4 2524{
b5d57fc8
BP
2525 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2526 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
2527}
2528
24045e35 2529static void
b5d57fc8 2530htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 2531{
b5d57fc8 2532 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2533 struct htb *htb;
2534
2535 htb = xmalloc(sizeof *htb);
2536 tc_init(&htb->tc, &tc_ops_htb);
2537 htb->max_rate = max_rate;
2538
b5d57fc8 2539 netdev->tc = &htb->tc;
c1c9c9c4
BP
2540}
2541
2542/* Create an HTB qdisc.
2543 *
a339aa81 2544 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
2545static int
2546htb_setup_qdisc__(struct netdev *netdev)
2547{
2548 size_t opt_offset;
2549 struct tc_htb_glob opt;
2550 struct ofpbuf request;
2551 struct tcmsg *tcmsg;
2552
2553 tc_del_qdisc(netdev);
2554
2555 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2556 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
2557 if (!tcmsg) {
2558 return ENODEV;
2559 }
c1c9c9c4
BP
2560 tcmsg->tcm_handle = tc_make_handle(1, 0);
2561 tcmsg->tcm_parent = TC_H_ROOT;
2562
2563 nl_msg_put_string(&request, TCA_KIND, "htb");
2564
2565 memset(&opt, 0, sizeof opt);
2566 opt.rate2quantum = 10;
2567 opt.version = 3;
4ecf12d5 2568 opt.defcls = 1;
c1c9c9c4
BP
2569
2570 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2571 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2572 nl_msg_end_nested(&request, opt_offset);
2573
2574 return tc_transact(&request, NULL);
2575}
2576
2577/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2578 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2579static int
2580htb_setup_class__(struct netdev *netdev, unsigned int handle,
2581 unsigned int parent, struct htb_class *class)
2582{
2583 size_t opt_offset;
2584 struct tc_htb_opt opt;
2585 struct ofpbuf request;
2586 struct tcmsg *tcmsg;
2587 int error;
2588 int mtu;
2589
9b020780
PS
2590 error = netdev_get_mtu(netdev, &mtu);
2591 if (error) {
f915f1a8
BP
2592 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2593 netdev_get_name(netdev));
9b020780 2594 return error;
f915f1a8 2595 }
c1c9c9c4
BP
2596
2597 memset(&opt, 0, sizeof opt);
2598 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2599 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2600 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2601 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2602 opt.prio = class->priority;
2603
2604 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
2605 if (!tcmsg) {
2606 return ENODEV;
2607 }
c1c9c9c4
BP
2608 tcmsg->tcm_handle = handle;
2609 tcmsg->tcm_parent = parent;
2610
2611 nl_msg_put_string(&request, TCA_KIND, "htb");
2612 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2613 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2614 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2615 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2616 nl_msg_end_nested(&request, opt_offset);
2617
2618 error = tc_transact(&request, NULL);
2619 if (error) {
2620 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2621 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2622 netdev_get_name(netdev),
2623 tc_get_major(handle), tc_get_minor(handle),
2624 tc_get_major(parent), tc_get_minor(parent),
2625 class->min_rate, class->max_rate,
10a89ef0 2626 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
2627 }
2628 return error;
2629}
2630
2631/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2632 * description of them into 'details'. The description complies with the
2633 * specification given in the vswitch database documentation for linux-htb
2634 * queue details. */
2635static int
2636htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2637{
2638 static const struct nl_policy tca_htb_policy[] = {
2639 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2640 .min_len = sizeof(struct tc_htb_opt) },
2641 };
2642
2643 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2644 const struct tc_htb_opt *htb;
2645
2646 if (!nl_parse_nested(nl_options, tca_htb_policy,
2647 attrs, ARRAY_SIZE(tca_htb_policy))) {
2648 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2649 return EPROTO;
2650 }
2651
2652 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2653 class->min_rate = htb->rate.rate;
2654 class->max_rate = htb->ceil.rate;
2655 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2656 class->priority = htb->prio;
2657 return 0;
2658}
2659
2660static int
2661htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2662 struct htb_class *options,
2663 struct netdev_queue_stats *stats)
2664{
2665 struct nlattr *nl_options;
2666 unsigned int handle;
2667 int error;
2668
2669 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2670 if (!error && queue_id) {
17ee3c1f
BP
2671 unsigned int major = tc_get_major(handle);
2672 unsigned int minor = tc_get_minor(handle);
2673 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2674 *queue_id = minor - 1;
c1c9c9c4
BP
2675 } else {
2676 error = EPROTO;
2677 }
2678 }
2679 if (!error && options) {
2680 error = htb_parse_tca_options__(nl_options, options);
2681 }
2682 return error;
2683}
2684
2685static void
2686htb_parse_qdisc_details__(struct netdev *netdev,
79f1cbe9 2687 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
2688{
2689 const char *max_rate_s;
2690
79f1cbe9 2691 max_rate_s = smap_get(details, "max-rate");
c1c9c9c4
BP
2692 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2693 if (!hc->max_rate) {
a00ca915 2694 enum netdev_features current;
c1c9c9c4
BP
2695
2696 netdev_get_features(netdev, &current, NULL, NULL, NULL);
d02a5f8e 2697 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
2698 }
2699 hc->min_rate = hc->max_rate;
2700 hc->burst = 0;
2701 hc->priority = 0;
2702}
2703
2704static int
2705htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 2706 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
2707{
2708 const struct htb *htb = htb_get__(netdev);
79f1cbe9
EJ
2709 const char *min_rate_s = smap_get(details, "min-rate");
2710 const char *max_rate_s = smap_get(details, "max-rate");
2711 const char *burst_s = smap_get(details, "burst");
2712 const char *priority_s = smap_get(details, "priority");
9b020780 2713 int mtu, error;
c1c9c9c4 2714
9b020780
PS
2715 error = netdev_get_mtu(netdev, &mtu);
2716 if (error) {
f915f1a8
BP
2717 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2718 netdev_get_name(netdev));
9b020780 2719 return error;
f915f1a8
BP
2720 }
2721
4f104611
EJ
2722 /* HTB requires at least an mtu sized min-rate to send any traffic even
2723 * on uncongested links. */
c45ab5e9 2724 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4f104611 2725 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
2726 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2727
2728 /* max-rate */
2729 hc->max_rate = (max_rate_s
2730 ? strtoull(max_rate_s, NULL, 10) / 8
2731 : htb->max_rate);
2732 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2733 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2734
2735 /* burst
2736 *
2737 * According to hints in the documentation that I've read, it is important
2738 * that 'burst' be at least as big as the largest frame that might be
2739 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2740 * but having it a bit too small is a problem. Since netdev_get_mtu()
2741 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2742 * the MTU. We actually add 64, instead of 14, as a guard against
2743 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
2744 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2745 hc->burst = MAX(hc->burst, mtu + 64);
2746
2747 /* priority */
2748 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2749
2750 return 0;
2751}
2752
/* Queries class 'handle' under 'parent' on 'netdev' with tc_query_class() and
 * parses the reply into 'options' and 'stats' (either of which is passed
 * straight to htb_parse_tcmsg__()).  The reply buffer is freed before
 * returning.  Returns 0 on success, otherwise the query or parse error. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2768
2769static int
79f1cbe9 2770htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
2771{
2772 int error;
2773
2774 error = htb_setup_qdisc__(netdev);
2775 if (!error) {
2776 struct htb_class hc;
2777
2778 htb_parse_qdisc_details__(netdev, details, &hc);
2779 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2780 tc_make_handle(1, 0), &hc);
2781 if (!error) {
2782 htb_install__(netdev, hc.max_rate);
2783 }
2784 }
2785 return error;
2786}
2787
93b13be8
BP
2788static struct htb_class *
2789htb_class_cast__(const struct tc_queue *queue)
2790{
2791 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2792}
2793
c1c9c9c4
BP
2794static void
2795htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2796 const struct htb_class *hc)
2797{
2798 struct htb *htb = htb_get__(netdev);
93b13be8
BP
2799 size_t hash = hash_int(queue_id, 0);
2800 struct tc_queue *queue;
c1c9c9c4
BP
2801 struct htb_class *hcp;
2802
93b13be8
BP
2803 queue = tc_find_queue__(netdev, queue_id, hash);
2804 if (queue) {
2805 hcp = htb_class_cast__(queue);
2806 } else {
c1c9c9c4 2807 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
2808 queue = &hcp->tc_queue;
2809 queue->queue_id = queue_id;
2810 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 2811 }
93b13be8
BP
2812
2813 hcp->min_rate = hc->min_rate;
2814 hcp->max_rate = hc->max_rate;
2815 hcp->burst = hc->burst;
2816 hcp->priority = hc->priority;
c1c9c9c4
BP
2817}
2818
2819static int
2820htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2821{
c1c9c9c4
BP
2822 struct ofpbuf msg;
2823 struct nl_dump dump;
2824 struct htb_class hc;
c1c9c9c4
BP
2825
2826 /* Get qdisc options. */
2827 hc.max_rate = 0;
2828 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 2829 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
2830
2831 /* Get queues. */
23a98ffe
BP
2832 if (!start_queue_dump(netdev, &dump)) {
2833 return ENODEV;
2834 }
c1c9c9c4
BP
2835 while (nl_dump_next(&dump, &msg)) {
2836 unsigned int queue_id;
2837
2838 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2839 htb_update_queue__(netdev, queue_id, &hc);
2840 }
2841 }
2842 nl_dump_done(&dump);
2843
2844 return 0;
2845}
2846
2847static void
2848htb_tc_destroy(struct tc *tc)
2849{
2850 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 2851 struct htb_class *hc, *next;
c1c9c9c4 2852
4e8e4213 2853 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 2854 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
2855 free(hc);
2856 }
2857 tc_destroy(tc);
2858 free(htb);
2859}
2860
2861static int
79f1cbe9 2862htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
2863{
2864 const struct htb *htb = htb_get__(netdev);
79f1cbe9 2865 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
2866 return 0;
2867}
2868
2869static int
79f1cbe9 2870htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
2871{
2872 struct htb_class hc;
2873 int error;
2874
2875 htb_parse_qdisc_details__(netdev, details, &hc);
2876 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2877 tc_make_handle(1, 0), &hc);
2878 if (!error) {
2879 htb_get__(netdev)->max_rate = hc.max_rate;
2880 }
2881 return error;
2882}
2883
2884static int
93b13be8 2885htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 2886 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 2887{
93b13be8 2888 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2889
79f1cbe9 2890 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 2891 if (hc->min_rate != hc->max_rate) {
79f1cbe9 2892 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 2893 }
79f1cbe9 2894 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 2895 if (hc->priority) {
79f1cbe9 2896 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
2897 }
2898 return 0;
2899}
2900
2901static int
2902htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 2903 const struct smap *details)
c1c9c9c4
BP
2904{
2905 struct htb_class hc;
2906 int error;
2907
2908 error = htb_parse_class_details__(netdev, details, &hc);
2909 if (error) {
2910 return error;
2911 }
2912
17ee3c1f 2913 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
2914 tc_make_handle(1, 0xfffe), &hc);
2915 if (error) {
2916 return error;
2917 }
2918
2919 htb_update_queue__(netdev, queue_id, &hc);
2920 return 0;
2921}
2922
2923static int
93b13be8 2924htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 2925{
93b13be8 2926 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2927 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
2928 int error;
2929
93b13be8 2930 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 2931 if (!error) {
93b13be8 2932 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 2933 free(hc);
c1c9c9c4
BP
2934 }
2935 return error;
2936}
2937
2938static int
93b13be8 2939htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
2940 struct netdev_queue_stats *stats)
2941{
93b13be8 2942 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
2943 tc_make_handle(1, 0xfffe), NULL, stats);
2944}
2945
2946static int
2947htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2948 const struct ofpbuf *nlmsg,
2949 netdev_dump_queue_stats_cb *cb, void *aux)
2950{
2951 struct netdev_queue_stats stats;
17ee3c1f 2952 unsigned int handle, major, minor;
c1c9c9c4
BP
2953 int error;
2954
2955 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2956 if (error) {
2957 return error;
2958 }
2959
17ee3c1f
BP
2960 major = tc_get_major(handle);
2961 minor = tc_get_minor(handle);
2962 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 2963 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
2964 }
2965 return 0;
2966}
2967
2968static const struct tc_ops tc_ops_htb = {
2969 "htb", /* linux_name */
2970 "linux-htb", /* ovs_name */
2971 HTB_N_QUEUES, /* n_queues */
2972 htb_tc_install,
2973 htb_tc_load,
2974 htb_tc_destroy,
2975 htb_qdisc_get,
2976 htb_qdisc_set,
2977 htb_class_get,
2978 htb_class_set,
2979 htb_class_delete,
2980 htb_class_get_stats,
2981 htb_class_dump_stats
2982};
2983\f
a339aa81
EJ
2984/* "linux-hfsc" traffic control class. */
2985
2986#define HFSC_N_QUEUES 0xf000
2987
/* Per-netdev state for the "linux-hfsc" qdisc. */
struct hfsc {
    struct tc tc;               /* Generic tc state; holds the queue map. */
    uint32_t max_rate;          /* Qdisc-wide maximum rate.  Reported to the
                                 * database multiplied by 8 and parsed from it
                                 * divided by 8. */
};
2992
/* One HFSC class, i.e. one queue. */
struct hfsc_class {
    struct tc_queue tc_queue;   /* Entry in the owning tc's queue map. */
    uint32_t min_rate;          /* Sent to the kernel as the 'm2' of the RSC
                                 * and FSC service curves. */
    uint32_t max_rate;          /* Sent to the kernel as the 'm2' of the USC
                                 * service curve. */
};
2998
2999static struct hfsc *
b5d57fc8 3000hfsc_get__(const struct netdev *netdev_)
a339aa81 3001{
b5d57fc8
BP
3002 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3003 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
3004}
3005
3006static struct hfsc_class *
3007hfsc_class_cast__(const struct tc_queue *queue)
3008{
3009 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3010}
3011
24045e35 3012static void
b5d57fc8 3013hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 3014{
b5d57fc8 3015 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
3016 struct hfsc *hfsc;
3017
a339aa81
EJ
3018 hfsc = xmalloc(sizeof *hfsc);
3019 tc_init(&hfsc->tc, &tc_ops_hfsc);
3020 hfsc->max_rate = max_rate;
b5d57fc8 3021 netdev->tc = &hfsc->tc;
a339aa81
EJ
3022}
3023
3024static void
3025hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3026 const struct hfsc_class *hc)
3027{
3028 size_t hash;
3029 struct hfsc *hfsc;
3030 struct hfsc_class *hcp;
3031 struct tc_queue *queue;
3032
3033 hfsc = hfsc_get__(netdev);
3034 hash = hash_int(queue_id, 0);
3035
3036 queue = tc_find_queue__(netdev, queue_id, hash);
3037 if (queue) {
3038 hcp = hfsc_class_cast__(queue);
3039 } else {
3040 hcp = xmalloc(sizeof *hcp);
3041 queue = &hcp->tc_queue;
3042 queue->queue_id = queue_id;
3043 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3044 }
3045
3046 hcp->min_rate = hc->min_rate;
3047 hcp->max_rate = hc->max_rate;
3048}
3049
3050static int
3051hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3052{
3053 const struct tc_service_curve *rsc, *fsc, *usc;
3054 static const struct nl_policy tca_hfsc_policy[] = {
3055 [TCA_HFSC_RSC] = {
3056 .type = NL_A_UNSPEC,
3057 .optional = false,
3058 .min_len = sizeof(struct tc_service_curve),
3059 },
3060 [TCA_HFSC_FSC] = {
3061 .type = NL_A_UNSPEC,
3062 .optional = false,
3063 .min_len = sizeof(struct tc_service_curve),
3064 },
3065 [TCA_HFSC_USC] = {
3066 .type = NL_A_UNSPEC,
3067 .optional = false,
3068 .min_len = sizeof(struct tc_service_curve),
3069 },
3070 };
3071 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3072
3073 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3074 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3075 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3076 return EPROTO;
3077 }
3078
3079 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3080 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3081 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3082
3083 if (rsc->m1 != 0 || rsc->d != 0 ||
3084 fsc->m1 != 0 || fsc->d != 0 ||
3085 usc->m1 != 0 || usc->d != 0) {
3086 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3087 "Non-linear service curves are not supported.");
3088 return EPROTO;
3089 }
3090
3091 if (rsc->m2 != fsc->m2) {
3092 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3093 "Real-time service curves are not supported ");
3094 return EPROTO;
3095 }
3096
3097 if (rsc->m2 > usc->m2) {
3098 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3099 "Min-rate service curve is greater than "
3100 "the max-rate service curve.");
3101 return EPROTO;
3102 }
3103
3104 class->min_rate = fsc->m2;
3105 class->max_rate = usc->m2;
3106 return 0;
3107}
3108
3109static int
3110hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3111 struct hfsc_class *options,
3112 struct netdev_queue_stats *stats)
3113{
3114 int error;
3115 unsigned int handle;
3116 struct nlattr *nl_options;
3117
3118 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3119 if (error) {
3120 return error;
3121 }
3122
3123 if (queue_id) {
3124 unsigned int major, minor;
3125
3126 major = tc_get_major(handle);
3127 minor = tc_get_minor(handle);
3128 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3129 *queue_id = minor - 1;
3130 } else {
3131 return EPROTO;
3132 }
3133 }
3134
3135 if (options) {
3136 error = hfsc_parse_tca_options__(nl_options, options);
3137 }
3138
3139 return error;
3140}
3141
3142static int
3143hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3144 unsigned int parent, struct hfsc_class *options,
3145 struct netdev_queue_stats *stats)
3146{
3147 int error;
3148 struct ofpbuf *reply;
3149
3150 error = tc_query_class(netdev, handle, parent, &reply);
3151 if (error) {
3152 return error;
3153 }
3154
3155 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3156 ofpbuf_delete(reply);
3157 return error;
3158}
3159
3160static void
79f1cbe9 3161hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
a339aa81
EJ
3162 struct hfsc_class *class)
3163{
3164 uint32_t max_rate;
3165 const char *max_rate_s;
3166
79f1cbe9 3167 max_rate_s = smap_get(details, "max-rate");
a339aa81
EJ
3168 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3169
3170 if (!max_rate) {
a00ca915 3171 enum netdev_features current;
a339aa81
EJ
3172
3173 netdev_get_features(netdev, &current, NULL, NULL, NULL);
d02a5f8e 3174 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
3175 }
3176
3177 class->min_rate = max_rate;
3178 class->max_rate = max_rate;
3179}
3180
3181static int
3182hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 3183 const struct smap *details,
a339aa81
EJ
3184 struct hfsc_class * class)
3185{
3186 const struct hfsc *hfsc;
3187 uint32_t min_rate, max_rate;
3188 const char *min_rate_s, *max_rate_s;
3189
3190 hfsc = hfsc_get__(netdev);
79f1cbe9
EJ
3191 min_rate_s = smap_get(details, "min-rate");
3192 max_rate_s = smap_get(details, "max-rate");
a339aa81 3193
c45ab5e9 3194 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
79398bad 3195 min_rate = MAX(min_rate, 1);
a339aa81
EJ
3196 min_rate = MIN(min_rate, hfsc->max_rate);
3197
3198 max_rate = (max_rate_s
3199 ? strtoull(max_rate_s, NULL, 10) / 8
3200 : hfsc->max_rate);
3201 max_rate = MAX(max_rate, min_rate);
3202 max_rate = MIN(max_rate, hfsc->max_rate);
3203
3204 class->min_rate = min_rate;
3205 class->max_rate = max_rate;
3206
3207 return 0;
3208}
3209
3210/* Create an HFSC qdisc.
3211 *
3212 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3213static int
3214hfsc_setup_qdisc__(struct netdev * netdev)
3215{
3216 struct tcmsg *tcmsg;
3217 struct ofpbuf request;
3218 struct tc_hfsc_qopt opt;
3219
3220 tc_del_qdisc(netdev);
3221
3222 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3223 NLM_F_EXCL | NLM_F_CREATE, &request);
3224
3225 if (!tcmsg) {
3226 return ENODEV;
3227 }
3228
3229 tcmsg->tcm_handle = tc_make_handle(1, 0);
3230 tcmsg->tcm_parent = TC_H_ROOT;
3231
3232 memset(&opt, 0, sizeof opt);
3233 opt.defcls = 1;
3234
3235 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3236 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3237
3238 return tc_transact(&request, NULL);
3239}
3240
3241/* Create an HFSC class.
3242 *
3243 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3244 * sc rate <min_rate> ul rate <max_rate>" */
3245static int
3246hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3247 unsigned int parent, struct hfsc_class *class)
3248{
3249 int error;
3250 size_t opt_offset;
3251 struct tcmsg *tcmsg;
3252 struct ofpbuf request;
3253 struct tc_service_curve min, max;
3254
3255 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3256
3257 if (!tcmsg) {
3258 return ENODEV;
3259 }
3260
3261 tcmsg->tcm_handle = handle;
3262 tcmsg->tcm_parent = parent;
3263
3264 min.m1 = 0;
3265 min.d = 0;
3266 min.m2 = class->min_rate;
3267
3268 max.m1 = 0;
3269 max.d = 0;
3270 max.m2 = class->max_rate;
3271
3272 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3273 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3274 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3275 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3276 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3277 nl_msg_end_nested(&request, opt_offset);
3278
3279 error = tc_transact(&request, NULL);
3280 if (error) {
3281 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3282 "min-rate %ubps, max-rate %ubps (%s)",
3283 netdev_get_name(netdev),
3284 tc_get_major(handle), tc_get_minor(handle),
3285 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 3286 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
3287 }
3288
3289 return error;
3290}
3291
3292static int
79f1cbe9 3293hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
3294{
3295 int error;
3296 struct hfsc_class class;
3297
3298 error = hfsc_setup_qdisc__(netdev);
3299
3300 if (error) {
3301 return error;
3302 }
3303
3304 hfsc_parse_qdisc_details__(netdev, details, &class);
3305 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3306 tc_make_handle(1, 0), &class);
3307
3308 if (error) {
3309 return error;
3310 }
3311
3312 hfsc_install__(netdev, class.max_rate);
3313 return 0;
3314}
3315
3316static int
3317hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3318{
3319 struct ofpbuf msg;
a339aa81
EJ
3320 struct nl_dump dump;
3321 struct hfsc_class hc;
3322
3323 hc.max_rate = 0;
3324 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3325 hfsc_install__(netdev, hc.max_rate);
a339aa81
EJ
3326
3327 if (!start_queue_dump(netdev, &dump)) {
3328 return ENODEV;
3329 }
3330
3331 while (nl_dump_next(&dump, &msg)) {
3332 unsigned int queue_id;
3333
3334 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3335 hfsc_update_queue__(netdev, queue_id, &hc);
3336 }
3337 }
3338
3339 nl_dump_done(&dump);
3340 return 0;
3341}
3342
3343static void
3344hfsc_tc_destroy(struct tc *tc)
3345{
3346 struct hfsc *hfsc;
3347 struct hfsc_class *hc, *next;
3348
3349 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3350
3351 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3352 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3353 free(hc);
3354 }
3355
3356 tc_destroy(tc);
3357 free(hfsc);
3358}
3359
3360static int
79f1cbe9 3361hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
3362{
3363 const struct hfsc *hfsc;
3364 hfsc = hfsc_get__(netdev);
79f1cbe9 3365 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
3366 return 0;
3367}
3368
3369static int
79f1cbe9 3370hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
3371{
3372 int error;
3373 struct hfsc_class class;
3374
3375 hfsc_parse_qdisc_details__(netdev, details, &class);
3376 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3377 tc_make_handle(1, 0), &class);
3378
3379 if (!error) {
3380 hfsc_get__(netdev)->max_rate = class.max_rate;
3381 }
3382
3383 return error;
3384}
3385
3386static int
3387hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 3388 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
3389{
3390 const struct hfsc_class *hc;
3391
3392 hc = hfsc_class_cast__(queue);
79f1cbe9 3393 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 3394 if (hc->min_rate != hc->max_rate) {
79f1cbe9 3395 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
3396 }
3397 return 0;
3398}
3399
3400static int
3401hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 3402 const struct smap *details)
a339aa81
EJ
3403{
3404 int error;
3405 struct hfsc_class class;
3406
3407 error = hfsc_parse_class_details__(netdev, details, &class);
3408 if (error) {
3409 return error;
3410 }
3411
3412 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3413 tc_make_handle(1, 0xfffe), &class);
3414 if (error) {
3415 return error;
3416 }
3417
3418 hfsc_update_queue__(netdev, queue_id, &class);
3419 return 0;
3420}
3421
3422static int
3423hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3424{
3425 int error;
3426 struct hfsc *hfsc;
3427 struct hfsc_class *hc;
3428
3429 hc = hfsc_class_cast__(queue);
3430 hfsc = hfsc_get__(netdev);
3431
3432 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3433 if (!error) {
3434 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3435 free(hc);
3436 }
3437 return error;
3438}
3439
3440static int
3441hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3442 struct netdev_queue_stats *stats)
3443{
3444 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3445 tc_make_handle(1, 0xfffe), NULL, stats);
3446}
3447
3448static int
3449hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3450 const struct ofpbuf *nlmsg,
3451 netdev_dump_queue_stats_cb *cb, void *aux)
3452{
3453 struct netdev_queue_stats stats;
3454 unsigned int handle, major, minor;
3455 int error;
3456
3457 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3458 if (error) {
3459 return error;
3460 }
3461
3462 major = tc_get_major(handle);
3463 minor = tc_get_minor(handle);
3464 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3465 (*cb)(minor - 1, &stats, aux);
3466 }
3467 return 0;
3468}
3469
3470static const struct tc_ops tc_ops_hfsc = {
3471 "hfsc", /* linux_name */
3472 "linux-hfsc", /* ovs_name */
3473 HFSC_N_QUEUES, /* n_queues */
3474 hfsc_tc_install, /* tc_install */
3475 hfsc_tc_load, /* tc_load */
3476 hfsc_tc_destroy, /* tc_destroy */
3477 hfsc_qdisc_get, /* qdisc_get */
3478 hfsc_qdisc_set, /* qdisc_set */
3479 hfsc_class_get, /* class_get */
3480 hfsc_class_set, /* class_set */
3481 hfsc_class_delete, /* class_delete */
3482 hfsc_class_get_stats, /* class_get_stats */
3483 hfsc_class_dump_stats /* class_dump_stats */
3484};
3485\f
c1c9c9c4
BP
3486/* "linux-default" traffic control class.
3487 *
3488 * This class represents the default, unnamed Linux qdisc. It corresponds to
3489 * the "" (empty string) QoS type in the OVS database. */
3490
3491static void
b5d57fc8 3492default_install__(struct netdev *netdev_)
c1c9c9c4 3493{
b5d57fc8 3494 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 3495 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 3496
559eb230
BP
3497 /* Nothing but a tc class implementation is allowed to write to a tc. This
3498 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 3499 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
3500}
3501
3502static int
3503default_tc_install(struct netdev *netdev,
79f1cbe9 3504 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
3505{
3506 default_install__(netdev);
3507 return 0;
3508}
3509
3510static int
3511default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3512{
3513 default_install__(netdev);
3514 return 0;
3515}
3516
3517static const struct tc_ops tc_ops_default = {
3518 NULL, /* linux_name */
3519 "", /* ovs_name */
3520 0, /* n_queues */
3521 default_tc_install,
3522 default_tc_load,
3523 NULL, /* tc_destroy */
3524 NULL, /* qdisc_get */
3525 NULL, /* qdisc_set */
3526 NULL, /* class_get */
3527 NULL, /* class_set */
3528 NULL, /* class_delete */
3529 NULL, /* class_get_stats */
3530 NULL /* class_dump_stats */
3531};
3532\f
3533/* "linux-other" traffic control class.
3534 *
3535 * */
3536
3537static int
b5d57fc8 3538other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 3539{
b5d57fc8 3540 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 3541 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 3542
559eb230
BP
3543 /* Nothing but a tc class implementation is allowed to write to a tc. This
3544 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 3545 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
3546 return 0;
3547}
3548
3549static const struct tc_ops tc_ops_other = {
3550 NULL, /* linux_name */
3551 "linux-other", /* ovs_name */
3552 0, /* n_queues */
3553 NULL, /* tc_install */
3554 other_tc_load,
3555 NULL, /* tc_destroy */
3556 NULL, /* qdisc_get */
3557 NULL, /* qdisc_set */
3558 NULL, /* class_get */
3559 NULL, /* class_set */
3560 NULL, /* class_delete */
3561 NULL, /* class_get_stats */
3562 NULL /* class_dump_stats */
3563};
3564\f
3565/* Traffic control. */
3566
3567/* Number of kernel "tc" ticks per second. */
3568static double ticks_per_s;
3569
3570/* Number of kernel "jiffies" per second. This is used for the purpose of
3571 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3572 * one jiffy's worth of data.
3573 *
3574 * There are two possibilities here:
3575 *
3576 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3577 * approximate range of 100 to 1024. That means that we really need to
3578 * make sure that the qdisc can buffer that much data.
3579 *
3580 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3581 * has finely granular timers and there's no need to fudge additional room
3582 * for buffers. (There's no extra effort needed to implement that: the
3583 * large 'buffer_hz' is used as a divisor, so practically any number will
3584 * come out as 0 in the division. Small integer results in the case of
3585 * really high dividends won't have any real effect anyhow.)
3586 */
3587static unsigned int buffer_hz;
3588
/* Combines 'major' and 'minor' into the tc handle 'major':'minor', with
 * 'major' in the upper 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
3595
/* Extracts the major number (the upper 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3602
/* Extracts the minor number (the lower 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3609
3610static struct tcmsg *
3611tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3612 struct ofpbuf *request)
3613{
3614 struct tcmsg *tcmsg;
3615 int ifindex;
3616 int error;
3617
3618 error = get_ifindex(netdev, &ifindex);
3619 if (error) {
3620 return NULL;
3621 }
3622
3623 ofpbuf_init(request, 512);
3624 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3625 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3626 tcmsg->tcm_family = AF_UNSPEC;
3627 tcmsg->tcm_ifindex = ifindex;
3628 /* Caller should fill in tcmsg->tcm_handle. */
3629 /* Caller should fill in tcmsg->tcm_parent. */
3630
3631 return tcmsg;
3632}
3633
3634static int
3635tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3636{
a88b4e04 3637 int error = nl_transact(NETLINK_ROUTE, request, replyp);
c1c9c9c4
BP
3638 ofpbuf_uninit(request);
3639 return error;
3640}
3641
f8500004
JP
3642/* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3643 * policing configuration.
3644 *
3645 * This function is equivalent to running the following when 'add' is true:
3646 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3647 *
3648 * This function is equivalent to running the following when 'add' is false:
3649 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3650 *
3651 * The configuration and stats may be seen with the following command:
3652 * /sbin/tc -s qdisc show dev <devname>
3653 *
3654 * Returns 0 if successful, otherwise a positive errno value.
3655 */
3656static int
3657tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3658{
3659 struct ofpbuf request;
3660 struct tcmsg *tcmsg;
3661 int error;
3662 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3663 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3664
3665 tcmsg = tc_make_request(netdev, type, flags, &request);
3666 if (!tcmsg) {
3667 return ENODEV;
3668 }
3669 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3670 tcmsg->tcm_parent = TC_H_INGRESS;
3671 nl_msg_put_string(&request, TCA_KIND, "ingress");
3672 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3673
3674 error = tc_transact(&request, NULL);
3675 if (error) {
3676 /* If we're deleting the qdisc, don't worry about some of the
3677 * error conditions. */
3678 if (!add && (error == ENOENT || error == EINVAL)) {
3679 return 0;
3680 }
3681 return error;
3682 }
3683
3684 return 0;
3685}
3686
3687/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3688 * of 'kbits_burst'.
3689 *
3690 * This function is equivalent to running:
3691 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3692 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3693 * mtu 65535 drop
3694 *
3695 * The configuration and stats may be seen with the following command:
3696 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3697 *
3698 * Returns 0 if successful, otherwise a positive errno value.
3699 */
3700static int
3701tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3702{
3703 struct tc_police tc_police;
3704 struct ofpbuf request;
3705 struct tcmsg *tcmsg;
3706 size_t basic_offset;
3707 size_t police_offset;
3708 int error;
3709 int mtu = 65535;
3710
3711 memset(&tc_police, 0, sizeof tc_police);
3712 tc_police.action = TC_POLICE_SHOT;
3713 tc_police.mtu = mtu;
e5c08015 3714 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
f8500004
JP
3715 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3716 kbits_burst * 1024);
3717
3718 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3719 NLM_F_EXCL | NLM_F_CREATE, &request);
3720 if (!tcmsg) {
3721 return ENODEV;
3722 }
3723 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3724 tcmsg->tcm_info = tc_make_handle(49,
3725 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3726
3727 nl_msg_put_string(&request, TCA_KIND, "basic");
3728 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3729 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3730 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3731 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3732 nl_msg_end_nested(&request, police_offset);
3733 nl_msg_end_nested(&request, basic_offset);
3734
3735 error = tc_transact(&request, NULL);
3736 if (error) {
3737 return error;
3738 }
3739
3740 return 0;
3741}
3742
c1c9c9c4
BP
3743static void
3744read_psched(void)
3745{
3746 /* The values in psched are not individually very meaningful, but they are
3747 * important. The tables below show some values seen in the wild.
3748 *
3749 * Some notes:
3750 *
3751 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3752 * (Before that, there are hints that it was 1000000000.)
3753 *
3754 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3755 * above.
3756 *
3757 * /proc/net/psched
3758 * -----------------------------------
3759 * [1] 000c8000 000f4240 000f4240 00000064
3760 * [2] 000003e8 00000400 000f4240 3b9aca00
3761 * [3] 000003e8 00000400 000f4240 3b9aca00
3762 * [4] 000003e8 00000400 000f4240 00000064
3763 * [5] 000003e8 00000040 000f4240 3b9aca00
3764 * [6] 000003e8 00000040 000f4240 000000f9
3765 *
3766 * a b c d ticks_per_s buffer_hz
3767 * ------- --------- ---------- ------------- ----------- -------------
3768 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3769 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3770 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3771 * [4] 1,000 1,024 1,000,000 100 976,562 100
3772 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3773 * [6] 1,000 64 1,000,000 249 15,625,000 249
3774 *
3775 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3776 * [2] 2.6.26-1-686-bigmem from Debian lenny
3777 * [3] 2.6.26-2-sparc64 from Debian lenny
3778 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3779 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3780 * [6] 2.6.34 from kernel.org on KVM
3781 */
23882115 3782 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
3783 static const char fn[] = "/proc/net/psched";
3784 unsigned int a, b, c, d;
3785 FILE *stream;
3786
23882115
BP
3787 if (!ovsthread_once_start(&once)) {
3788 return;
3789 }
3790
c1c9c9c4
BP
3791 ticks_per_s = 1.0;
3792 buffer_hz = 100;
3793
3794 stream = fopen(fn, "r");
3795 if (!stream) {
10a89ef0 3796 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 3797 goto exit;
c1c9c9c4
BP
3798 }
3799
3800 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3801 VLOG_WARN("%s: read failed", fn);
3802 fclose(stream);
23882115 3803 goto exit;
c1c9c9c4
BP
3804 }
3805 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3806 fclose(stream);
3807
3808 if (!a || !c) {
3809 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 3810 goto exit;
c1c9c9c4
BP
3811 }
3812
3813 ticks_per_s = (double) a * c / b;
3814 if (c == 1000000) {
3815 buffer_hz = d;
3816 } else {
3817 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3818 fn, a, b, c, d);
3819 }
3820 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
3821
3822exit:
3823 ovsthread_once_done(&once);
c1c9c9c4
BP
3824}
3825
3826/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3827 * rate of 'rate' bytes per second. */
3828static unsigned int
3829tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3830{
23882115 3831 read_psched();
c1c9c9c4
BP
3832 return (rate * ticks) / ticks_per_s;
3833}
3834
3835/* Returns the number of ticks that it would take to transmit 'size' bytes at a
3836 * rate of 'rate' bytes per second. */
3837static unsigned int
3838tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3839{
23882115 3840 read_psched();
015c93a4 3841 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
3842}
3843
3844/* Returns the number of bytes that need to be reserved for qdisc buffering at
3845 * a transmission rate of 'rate' bytes per second. */
3846static unsigned int
3847tc_buffer_per_jiffy(unsigned int rate)
3848{
23882115 3849 read_psched();
c1c9c9c4
BP
3850 return rate / buffer_hz;
3851}
3852
3853/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3854 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3855 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3856 * stores NULL into it if it is absent.
3857 *
3858 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3859 * 'msg'.
3860 *
3861 * Returns 0 if successful, otherwise a positive errno value. */
3862static int
3863tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3864 struct nlattr **options)
3865{
3866 static const struct nl_policy tca_policy[] = {
3867 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3868 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3869 };
3870 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3871
3872 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3873 tca_policy, ta, ARRAY_SIZE(ta))) {
3874 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3875 goto error;
3876 }
3877
3878 if (kind) {
3879 *kind = nl_attr_get_string(ta[TCA_KIND]);
3880 }
3881
3882 if (options) {
3883 *options = ta[TCA_OPTIONS];
3884 }
3885
3886 return 0;
3887
3888error:
3889 if (kind) {
3890 *kind = NULL;
3891 }
3892 if (options) {
3893 *options = NULL;
3894 }
3895 return EPROTO;
3896}
3897
3898/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3899 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3900 * into '*options', and its queue statistics into '*stats'. Any of the output
3901 * arguments may be null.
3902 *
3903 * Returns 0 if successful, otherwise a positive errno value. */
3904static int
3905tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3906 struct nlattr **options, struct netdev_queue_stats *stats)
3907{
3908 static const struct nl_policy tca_policy[] = {
3909 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3910 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3911 };
3912 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3913
3914 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3915 tca_policy, ta, ARRAY_SIZE(ta))) {
3916 VLOG_WARN_RL(&rl, "failed to parse class message");
3917 goto error;
3918 }
3919
3920 if (handlep) {
3921 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3922 *handlep = tc->tcm_handle;
3923 }
3924
3925 if (options) {
3926 *options = ta[TCA_OPTIONS];
3927 }
3928
3929 if (stats) {
3930 const struct gnet_stats_queue *gsq;
3931 struct gnet_stats_basic gsb;
3932
3933 static const struct nl_policy stats_policy[] = {
3934 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3935 .min_len = sizeof gsb },
3936 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3937 .min_len = sizeof *gsq },
3938 };
3939 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3940
3941 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3942 sa, ARRAY_SIZE(sa))) {
3943 VLOG_WARN_RL(&rl, "failed to parse class stats");
3944 goto error;
3945 }
3946
3947 /* Alignment issues screw up the length of struct gnet_stats_basic on
3948 * some arch/bitsize combinations. Newer versions of Linux have a
3949 * struct gnet_stats_basic_packed, but we can't depend on that. The
3950 * easiest thing to do is just to make a copy. */
3951 memset(&gsb, 0, sizeof gsb);
3952 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3953 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3954 stats->tx_bytes = gsb.bytes;
3955 stats->tx_packets = gsb.packets;
3956
3957 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3958 stats->tx_errors = gsq->drops;
3959 }
3960
3961 return 0;
3962
3963error:
3964 if (options) {
3965 *options = NULL;
3966 }
3967 if (stats) {
3968 memset(stats, 0, sizeof *stats);
3969 }
3970 return EPROTO;
3971}
3972
3973/* Queries the kernel for class with identifier 'handle' and parent 'parent'
3974 * on 'netdev'. */
3975static int
3976tc_query_class(const struct netdev *netdev,
3977 unsigned int handle, unsigned int parent,
3978 struct ofpbuf **replyp)
3979{
3980 struct ofpbuf request;
3981 struct tcmsg *tcmsg;
3982 int error;
3983
3984 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
3985 if (!tcmsg) {
3986 return ENODEV;
3987 }
c1c9c9c4
BP
3988 tcmsg->tcm_handle = handle;
3989 tcmsg->tcm_parent = parent;
3990
3991 error = tc_transact(&request, replyp);
3992 if (error) {
3993 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3994 netdev_get_name(netdev),
3995 tc_get_major(handle), tc_get_minor(handle),
3996 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 3997 ovs_strerror(error));
c1c9c9c4
BP
3998 }
3999 return error;
4000}
4001
4002/* Equivalent to "tc class del dev <name> handle <handle>". */
4003static int
4004tc_delete_class(const struct netdev *netdev, unsigned int handle)
4005{
4006 struct ofpbuf request;
4007 struct tcmsg *tcmsg;
4008 int error;
4009
4010 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
4011 if (!tcmsg) {
4012 return ENODEV;
4013 }
c1c9c9c4
BP
4014 tcmsg->tcm_handle = handle;
4015 tcmsg->tcm_parent = 0;
4016
4017 error = tc_transact(&request, NULL);
4018 if (error) {
4019 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4020 netdev_get_name(netdev),
4021 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 4022 ovs_strerror(error));
c1c9c9c4
BP
4023 }
4024 return error;
4025}
4026
4027/* Equivalent to "tc qdisc del dev <name> root". */
4028static int
b5d57fc8 4029tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 4030{
b5d57fc8 4031 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
4032 struct ofpbuf request;
4033 struct tcmsg *tcmsg;
4034 int error;
4035
b5d57fc8 4036 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
4037 if (!tcmsg) {
4038 return ENODEV;
4039 }
c1c9c9c4
BP
4040 tcmsg->tcm_handle = tc_make_handle(1, 0);
4041 tcmsg->tcm_parent = TC_H_ROOT;
4042
4043 error = tc_transact(&request, NULL);
4044 if (error == EINVAL) {
4045 /* EINVAL probably means that the default qdisc was in use, in which
4046 * case we've accomplished our purpose. */
4047 error = 0;
4048 }
b5d57fc8
BP
4049 if (!error && netdev->tc) {
4050 if (netdev->tc->ops->tc_destroy) {
4051 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 4052 }
b5d57fc8 4053 netdev->tc = NULL;
c1c9c9c4
BP
4054 }
4055 return error;
4056}
4057
4058/* If 'netdev''s qdisc type and parameters are not yet known, queries the
4059 * kernel to determine what they are. Returns 0 if successful, otherwise a
4060 * positive errno value. */
4061static int
b5d57fc8 4062tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 4063{
b5d57fc8 4064 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
4065 struct ofpbuf request, *qdisc;
4066 const struct tc_ops *ops;
4067 struct tcmsg *tcmsg;
4068 int load_error;
4069 int error;
4070
b5d57fc8 4071 if (netdev->tc) {
c1c9c9c4
BP
4072 return 0;
4073 }
4074
4075 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4076 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4077 * 2.6.35 without that fix backported to it.
4078 *
4079 * To avoid the OOPS, we must not make a request that would attempt to dump
4080 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4081 * few others. There are a few ways that I can see to do this, but most of
4082 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4083 * technique chosen here is to assume that any non-default qdisc that we
4084 * create will have a class with handle 1:0. The built-in qdiscs only have
4085 * a class with handle 0:0.
4086 *
4087 * We could check for Linux 2.6.35+ and use a more straightforward method
4088 * there. */
b5d57fc8 4089 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
4090 if (!tcmsg) {
4091 return ENODEV;
4092 }
c1c9c9c4
BP
4093 tcmsg->tcm_handle = tc_make_handle(1, 0);
4094 tcmsg->tcm_parent = 0;
4095
4096 /* Figure out what tc class to instantiate. */
4097 error = tc_transact(&request, &qdisc);
4098 if (!error) {
4099 const char *kind;
4100
4101 error = tc_parse_qdisc(qdisc, &kind, NULL);
4102 if (error) {
4103 ops = &tc_ops_other;
4104 } else {
4105 ops = tc_lookup_linux_name(kind);
4106 if (!ops) {
4107 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4108 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4109
4110 ops = &tc_ops_other;
4111 }
4112 }
4113 } else if (error == ENOENT) {
4114 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4115 * other entity that doesn't have a handle 1:0. We will assume
4116 * that it's the system default qdisc. */
4117 ops = &tc_ops_default;
4118 error = 0;
4119 } else {
4120 /* Who knows? Maybe the device got deleted. */
4121 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 4122 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
4123 ops = &tc_ops_other;
4124 }
4125
4126 /* Instantiate it. */
b5d57fc8
BP
4127 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4128 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
4129 ofpbuf_delete(qdisc);
4130
4131 return error ? error : load_error;
4132}
4133
4134/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4135 approximate the time to transmit packets of various lengths. For an MTU of
4136 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4137 represents two possible packet lengths; for a MTU of 513 through 1024, four
4138 possible lengths; and so on.
4139
4140 Returns, for the specified 'mtu', the number of bits that packet lengths
4141 need to be shifted right to fit within such a 256-entry table. */
4142static int
4143tc_calc_cell_log(unsigned int mtu)
4144{
4145 int cell_log;
4146
4147 if (!mtu) {
4148 mtu = ETH_PAYLOAD_MAX;
4149 }
4150 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4151
4152 for (cell_log = 0; mtu >= 256; cell_log++) {
4153 mtu >>= 1;
4154 }
4155
4156 return cell_log;
4157}
4158
4159/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4160 * of 'mtu'. */
4161static void
4162tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4163{
4164 memset(rate, 0, sizeof *rate);
4165 rate->cell_log = tc_calc_cell_log(mtu);
4166 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4167 /* rate->cell_align = 0; */ /* distro headers. */
4168 rate->mpu = ETH_TOTAL_MIN;
4169 rate->rate = Bps;
4170}
4171
4172/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4173 * attribute of the specified "type".
4174 *
4175 * See tc_calc_cell_log() above for a description of "rtab"s. */
4176static void
4177tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4178{
4179 uint32_t *rtab;
4180 unsigned int i;
4181
4182 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4183 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4184 unsigned packet_size = (i + 1) << rate->cell_log;
4185 if (packet_size < rate->mpu) {
4186 packet_size = rate->mpu;
4187 }
4188 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4189 }
4190}
4191
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, in
 * ticks, for a rate of 'Bps' bytes per second and the given 'mtu'.
 * 'burst_bytes' is the burst size the user asked for; pass 0 if none was
 * requested.  The burst actually used is never smaller than one jiffy's worth
 * of data at 'Bps' plus 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > floor_bytes ? burst_bytes : floor_bytes;

    return tc_bytes_to_ticks(Bps, burst);
}
d3980822 4202\f
aaf2fb1a
BP
4203/* Linux-only functions declared in netdev-linux.h */
4204
025e874a
BP
4205/* Returns a fd for an AF_INET socket or a negative errno value. */
4206int
4207netdev_linux_get_af_inet_sock(void)
4208{
4209 int error = netdev_linux_init();
4210 return error ? -error : af_inet_sock;
4211}
4212
aaf2fb1a
BP
4213/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4214 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4215int
4216netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4217 const char *flag_name, bool enable)
4218{
4219 const char *netdev_name = netdev_get_name(netdev);
4220 struct ethtool_value evalue;
4221 uint32_t new_flags;
4222 int error;
4223
ab985a77 4224 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
4225 memset(&evalue, 0, sizeof evalue);
4226 error = netdev_linux_do_ethtool(netdev_name,
4227 (struct ethtool_cmd *)&evalue,
4228 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4229 if (error) {
4230 return error;
4231 }
4232
ab985a77 4233 COVERAGE_INC(netdev_set_ethtool);
aaf2fb1a
BP
4234 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4235 error = netdev_linux_do_ethtool(netdev_name,
4236 (struct ethtool_cmd *)&evalue,
4237 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4238 if (error) {
4239 return error;
4240 }
4241
ab985a77 4242 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
4243 memset(&evalue, 0, sizeof evalue);
4244 error = netdev_linux_do_ethtool(netdev_name,
4245 (struct ethtool_cmd *)&evalue,
4246 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4247 if (error) {
4248 return error;
4249 }
4250
4251 if (new_flags != evalue.data) {
4252 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4253 "device %s failed", enable ? "enable" : "disable",
4254 flag_name, netdev_name);
4255 return EOPNOTSUPP;
4256 }
4257
4258 return 0;
4259}
4260\f
4261/* Utility functions. */
4262
d3980822 4263/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 4264static void
d3980822
BP
4265netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4266 const struct rtnl_link_stats *src)
4267{
f613a0d7
PS
4268 dst->rx_packets = src->rx_packets;
4269 dst->tx_packets = src->tx_packets;
4270 dst->rx_bytes = src->rx_bytes;
4271 dst->tx_bytes = src->tx_bytes;
4272 dst->rx_errors = src->rx_errors;
4273 dst->tx_errors = src->tx_errors;
4274 dst->rx_dropped = src->rx_dropped;
4275 dst->tx_dropped = src->tx_dropped;
4276 dst->multicast = src->multicast;
4277 dst->collisions = src->collisions;
4278 dst->rx_length_errors = src->rx_length_errors;
4279 dst->rx_over_errors = src->rx_over_errors;
4280 dst->rx_crc_errors = src->rx_crc_errors;
4281 dst->rx_frame_errors = src->rx_frame_errors;
4282 dst->rx_fifo_errors = src->rx_fifo_errors;
4283 dst->rx_missed_errors = src->rx_missed_errors;
4284 dst->tx_aborted_errors = src->tx_aborted_errors;
4285 dst->tx_carrier_errors = src->tx_carrier_errors;
4286 dst->tx_fifo_errors = src->tx_fifo_errors;
4287 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4288 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
4289}
4290
c1c9c9c4
BP
4291static int
4292get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4293{
4294 /* Policy for RTNLGRP_LINK messages.
4295 *
4296 * There are *many* more fields in these messages, but currently we only
4297 * care about these fields. */
4298 static const struct nl_policy rtnlgrp_link_policy[] = {
4299 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4300 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4301 .min_len = sizeof(struct rtnl_link_stats) },
4302 };
4303
4304 struct ofpbuf request;
4305 struct ofpbuf *reply;
4306 struct ifinfomsg *ifi;
c1c9c9c4
BP
4307 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4308 int error;
4309
4310 ofpbuf_init(&request, 0);
4311 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4312 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4313 ifi->ifi_family = PF_UNSPEC;
4314 ifi->ifi_index = ifindex;
a88b4e04 4315 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
4316 ofpbuf_uninit(&request);
4317 if (error) {
4318 return error;
4319 }
4320
4321 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4322 rtnlgrp_link_policy,
4323 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4324 ofpbuf_delete(reply);
4325 return EPROTO;
4326 }
4327
4328 if (!attrs[IFLA_STATS]) {
4329 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4330 ofpbuf_delete(reply);
4331 return EPROTO;
4332 }
8b61709d 4333
d3980822 4334 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
8b61709d 4335
576e26d7
BP
4336 ofpbuf_delete(reply);
4337
8b61709d
BP
4338 return 0;
4339}
4340
4341static int
4342get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4343{
4344 static const char fn[] = "/proc/net/dev";
4345 char line[1024];
4346 FILE *stream;
4347 int ln;
4348
4349 stream = fopen(fn, "r");
4350 if (!stream) {
10a89ef0 4351 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
8b61709d
BP
4352 return errno;
4353 }
4354
4355 ln = 0;
4356 while (fgets(line, sizeof line, stream)) {
4357 if (++ln >= 3) {
4358 char devname[16];
4359#define X64 "%"SCNu64
4360 if (sscanf(line,
4361 " %15[^:]:"
4362 X64 X64 X64 X64 X64 X64 X64 "%*u"
4363 X64 X64 X64 X64 X64 X64 X64 "%*u",
4364 devname,
4365 &stats->rx_bytes,
4366 &stats->rx_packets,
4367 &stats->rx_errors,
4368 &stats->rx_dropped,
4369 &stats->rx_fifo_errors,
4370 &stats->rx_frame_errors,
4371 &stats->multicast,
4372 &stats->tx_bytes,
4373 &stats->tx_packets,
4374 &stats->tx_errors,
4375 &stats->tx_dropped,
4376 &stats->tx_fifo_errors,
4377 &stats->collisions,
4378 &stats->tx_carrier_errors) != 15) {
4379 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4380 } else if (!strcmp(devname, netdev_name)) {
4381 stats->rx_length_errors = UINT64_MAX;
4382 stats->rx_over_errors = UINT64_MAX;
4383 stats->rx_crc_errors = UINT64_MAX;
4384 stats->rx_missed_errors = UINT64_MAX;
4385 stats->tx_aborted_errors = UINT64_MAX;
4386 stats->tx_heartbeat_errors = UINT64_MAX;
4387 stats->tx_window_errors = UINT64_MAX;
4388 fclose(stream);
4389 return 0;
4390 }
4391 }
4392 }
4393 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4394 fclose(stream);
4395 return ENODEV;
4396}
c1c9c9c4 4397
3a183124 4398static int
b5d57fc8 4399get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
4400{
4401 struct ifreq ifr;
4402 int error;
4403
755be9ea
EJ
4404 *flags = 0;
4405 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
149f577a 4406 "SIOCGIFFLAGS");
755be9ea
EJ
4407 if (!error) {
4408 *flags = ifr.ifr_flags;
4409 }
8b61709d
BP
4410 return error;
4411}
4412
4413static int
4b609110 4414set_flags(const char *name, unsigned int flags)
8b61709d
BP
4415{
4416 struct ifreq ifr;
4417
4418 ifr.ifr_flags = flags;
4b609110 4419 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
4420}
4421
4422static int
4423do_get_ifindex(const char *netdev_name)
4424{
4425 struct ifreq ifr;
4426
71d7c22f 4427 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4428 COVERAGE_INC(netdev_get_ifindex);
4429 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4430 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
10a89ef0 4431 netdev_name, ovs_strerror(errno));
8b61709d
BP
4432 return -errno;
4433 }
4434 return ifr.ifr_ifindex;
4435}
4436
4437static int
4438get_ifindex(const struct netdev *netdev_, int *ifindexp)
4439{
b5d57fc8 4440 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 4441
b5d57fc8 4442 if (!(netdev->cache_valid & VALID_IFINDEX)) {
8b61709d 4443 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 4444
8b61709d 4445 if (ifindex < 0) {
b5d57fc8
BP
4446 netdev->get_ifindex_error = -ifindex;
4447 netdev->ifindex = 0;
c7b1b0a5 4448 } else {
b5d57fc8
BP
4449 netdev->get_ifindex_error = 0;
4450 netdev->ifindex = ifindex;
8b61709d 4451 }
b5d57fc8 4452 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 4453 }
c7b1b0a5 4454
b5d57fc8
BP
4455 *ifindexp = netdev->ifindex;
4456 return netdev->get_ifindex_error;
8b61709d
BP
4457}
4458
4459static int
4460get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4461{
4462 struct ifreq ifr;
4463 int hwaddr_family;
4464
4465 memset(&ifr, 0, sizeof ifr);
71d7c22f 4466 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4467 COVERAGE_INC(netdev_get_hwaddr);
4468 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
78857dfb
BP
4469 /* ENODEV probably means that a vif disappeared asynchronously and
4470 * hasn't been removed from the database yet, so reduce the log level
4471 * to INFO for that case. */
4472 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4473 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
10a89ef0 4474 netdev_name, ovs_strerror(errno));
8b61709d
BP
4475 return errno;
4476 }
4477 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4478 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4479 VLOG_WARN("%s device has unknown hardware address family %d",
4480 netdev_name, hwaddr_family);
4481 }
4482 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4483 return 0;
4484}
4485
4486static int
44445cac 4487set_etheraddr(const char *netdev_name,
8b61709d
BP
4488 const uint8_t mac[ETH_ADDR_LEN])
4489{
4490 struct ifreq ifr;
4491
4492 memset(&ifr, 0, sizeof ifr);
71d7c22f 4493 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 4494 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
8b61709d
BP
4495 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4496 COVERAGE_INC(netdev_set_hwaddr);
4497 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4498 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
10a89ef0 4499 netdev_name, ovs_strerror(errno));
8b61709d
BP
4500 return errno;
4501 }
4502 return 0;
4503}
4504
4505static int
0b0544d7 4506netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
4507 int cmd, const char *cmd_name)
4508{
4509 struct ifreq ifr;
4510
4511 memset(&ifr, 0, sizeof ifr);
71d7c22f 4512 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
4513 ifr.ifr_data = (caddr_t) ecmd;
4514
4515 ecmd->cmd = cmd;
8b61709d
BP
4516 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4517 return 0;
4518 } else {
4519 if (errno != EOPNOTSUPP) {
4520 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
10a89ef0 4521 "failed: %s", cmd_name, name, ovs_strerror(errno));
8b61709d
BP
4522 } else {
4523 /* The device doesn't support this operation. That's pretty
4524 * common, so there's no point in logging anything. */
4525 }
4526 return errno;
4527 }
4528}
4529
4530static int
149f577a
JG
4531netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4532 const char *cmd_name)
8b61709d 4533{
71d7c22f 4534 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
8b61709d 4535 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
149f577a 4536 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
10a89ef0 4537 ovs_strerror(errno));
8b61709d
BP
4538 return errno;
4539 }
4540 return 0;
4541}
f1acd62b
BP
4542
4543static int
4544netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4545 int cmd, const char *cmd_name)
4546{
4547 struct ifreq ifr;
4548 int error;
4549
4550 ifr.ifr_addr.sa_family = AF_INET;
149f577a 4551 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b
BP
4552 if (!error) {
4553 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4554 *ip = sin->sin_addr;
4555 }
4556 return error;
4557}
488d734d
BP
4558
4559/* Returns an AF_PACKET raw socket or a negative errno value. */
4560static int
4561af_packet_sock(void)
4562{
23882115
BP
4563 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4564 static int sock;
488d734d 4565
23882115 4566 if (ovsthread_once_start(&once)) {
488d734d
BP
4567 sock = socket(AF_PACKET, SOCK_RAW, 0);
4568 if (sock >= 0) {
8450059e
BP
4569 int error = set_nonblocking(sock);
4570 if (error) {
4571 close(sock);
4572 sock = -error;
4573 }
488d734d
BP
4574 } else {
4575 sock = -errno;
10a89ef0
BP
4576 VLOG_ERR("failed to create packet socket: %s",
4577 ovs_strerror(errno));
488d734d 4578 }
23882115 4579 ovsthread_once_done(&once);
488d734d
BP
4580 }
4581
4582 return sock;
4583}