]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
debian: Ensure that /var/run/openvswitch exists in controller init script.
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
782e6111 2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
8b61709d 21#include <assert.h>
e9e28be3 22#include <errno.h>
8b61709d
BP
23#include <fcntl.h>
24#include <arpa/inet.h>
25#include <inttypes.h>
c1c9c9c4 26#include <linux/gen_stats.h>
8b61709d 27#include <linux/if_tun.h>
a740f0de 28#include <linux/ip.h>
8b61709d
BP
29#include <linux/types.h>
30#include <linux/ethtool.h>
63331829 31#include <linux/mii.h>
6f42c8ea 32#include <linux/pkt_sched.h>
e9e28be3 33#include <linux/rtnetlink.h>
8b61709d
BP
34#include <linux/sockios.h>
35#include <linux/version.h>
36#include <sys/types.h>
37#include <sys/ioctl.h>
38#include <sys/socket.h>
39#include <netpacket/packet.h>
40#include <net/ethernet.h>
41#include <net/if.h>
a740f0de 42#include <linux/if_tunnel.h>
8b61709d
BP
43#include <net/if_arp.h>
44#include <net/if_packet.h>
45#include <net/route.h>
46#include <netinet/in.h>
e9e28be3 47#include <poll.h>
8b61709d
BP
48#include <stdlib.h>
49#include <string.h>
50#include <unistd.h>
e9e28be3
BP
51
52#include "coverage.h"
9fe3b9a2 53#include "dpif-linux.h"
8b61709d
BP
54#include "dynamic-string.h"
55#include "fatal-signal.h"
93b13be8
BP
56#include "hash.h"
57#include "hmap.h"
8b61709d 58#include "netdev-provider.h"
7fbef77a 59#include "netdev-vport.h"
e9e28be3 60#include "netlink.h"
2fe27d5a 61#include "netlink-socket.h"
e9e28be3 62#include "ofpbuf.h"
8b61709d
BP
63#include "openflow/openflow.h"
64#include "packets.h"
65#include "poll-loop.h"
559843ed 66#include "rtnetlink.h"
21d6e22e 67#include "rtnetlink-link.h"
8b61709d
BP
68#include "socket-util.h"
69#include "shash.h"
19993ef3 70#include "sset.h"
1670c579 71#include "timer.h"
e9e28be3 72#include "vlog.h"
5136ce49 73
d98e6007 74VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea
BP
75
76COVERAGE_DEFINE(netdev_get_vlan_vid);
77COVERAGE_DEFINE(netdev_set_policing);
78COVERAGE_DEFINE(netdev_arp_lookup);
79COVERAGE_DEFINE(netdev_get_ifindex);
80COVERAGE_DEFINE(netdev_get_hwaddr);
81COVERAGE_DEFINE(netdev_set_hwaddr);
82COVERAGE_DEFINE(netdev_ethtool);
8b61709d
BP
83\f
84/* These were introduced in Linux 2.6.14, so they might be missing if we have
85 * old headers. */
86#ifndef ADVERTISED_Pause
87#define ADVERTISED_Pause (1 << 13)
88#endif
89#ifndef ADVERTISED_Asym_Pause
90#define ADVERTISED_Asym_Pause (1 << 14)
91#endif
92
c1c9c9c4
BP
93/* This was introduced in Linux 2.6.25, so it might be missing if we have old
94 * headers. */
95#ifndef TC_RTAB_SIZE
96#define TC_RTAB_SIZE 1024
97#endif
98
149f577a 99static struct rtnetlink_notifier netdev_linux_cache_notifier;
46415c90 100static int cache_notifier_refcount;
8b61709d
BP
101
/* Bits for 'cache_valid' in struct netdev_dev_linux.  A cached field of that
 * structure holds meaningful data only while its bit here is set. */
enum {
    VALID_IFINDEX          = 1 << 0,
    VALID_ETHERADDR        = 1 << 1,
    VALID_IN4              = 1 << 2,
    VALID_IN6              = 1 << 3,
    VALID_MTU              = 1 << 4,
    VALID_CARRIER          = 1 << 5,
    VALID_IS_PSEUDO        = 1 << 6,    /* Covers is_internal and is_tap. */
    VALID_POLICING         = 1 << 7,
    VALID_HAVE_VPORT_STATS = 1 << 8
};
113
149f577a
JG
/* Per-device state kept for tap devices. */
struct tap_state {
    int fd;                     /* Descriptor from opening /dev/net/tun. */
    bool opened;                /* True once a netdev handle has been handed
                                 * 'fd' by netdev_linux_open(). */
};
c1c9c9c4
BP
118\f
119/* Traffic control. */
120
121/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
122 * network device.
123 *
124 * Each TC implementation subclasses this with whatever additional data it
125 * needs. */
c1c9c9c4
BP
126struct tc {
127 const struct tc_ops *ops;
93b13be8
BP
128 struct hmap queues; /* Contains "struct tc_queue"s.
129 * Read by generic TC layer.
130 * Written only by TC implementation. */
131};
c1c9c9c4 132
93b13be8
BP
133/* One traffic control queue.
134 *
135 * Each TC implementation subclasses this with whatever additional data it
136 * needs. */
137struct tc_queue {
138 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
139 unsigned int queue_id; /* OpenFlow queue ID. */
c1c9c9c4
BP
140};
141
142/* A particular kind of traffic control. Each implementation generally maps to
143 * one particular Linux qdisc class.
144 *
145 * The functions below return 0 if successful or a positive errno value on
146 * failure, except where otherwise noted. All of them must be provided, except
147 * where otherwise noted. */
148struct tc_ops {
149 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
150 * This is null for tc_ops_default and tc_ops_other, for which there are no
151 * appropriate values. */
152 const char *linux_name;
153
154 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
155 const char *ovs_name;
156
157 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
158 * queues. The queues are numbered 0 through n_queues - 1. */
159 unsigned int n_queues;
160
161 /* Called to install this TC class on 'netdev'. The implementation should
162 * make the Netlink calls required to set up 'netdev' with the right qdisc
163 * and configure it according to 'details'. The implementation may assume
164 * that the current qdisc is the default; that is, there is no need for it
165 * to delete the current qdisc before installing itself.
166 *
167 * The contents of 'details' should be documented as valid for 'ovs_name'
168 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
169 * (which is built as ovs-vswitchd.conf.db(8)).
170 *
171 * This function must return 0 if and only if it sets 'netdev->tc' to an
172 * initialized 'struct tc'.
173 *
174 * (This function is null for tc_ops_other, which cannot be installed. For
175 * other TC classes it should always be nonnull.) */
176 int (*tc_install)(struct netdev *netdev, const struct shash *details);
177
178 /* Called when the netdev code determines (through a Netlink query) that
179 * this TC class's qdisc is installed on 'netdev', but we didn't install
180 * it ourselves and so don't know any of the details.
181 *
182 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
183 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
184 * implementation should parse the other attributes of 'nlmsg' as
185 * necessary to determine its configuration. If necessary it should also
186 * use Netlink queries to determine the configuration of queues on
187 * 'netdev'.
188 *
189 * This function must return 0 if and only if it sets 'netdev->tc' to an
190 * initialized 'struct tc'. */
191 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
192
193 /* Destroys the data structures allocated by the implementation as part of
194 * 'tc'. (This includes destroying 'tc->queues' by calling
195 * tc_destroy(tc).)
196 *
197 * The implementation should not need to perform any Netlink calls. If
198 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
199 * (But it may not be desirable.)
200 *
201 * This function may be null if 'tc' is trivial. */
202 void (*tc_destroy)(struct tc *tc);
203
204 /* Retrieves details of 'netdev->tc' configuration into 'details'.
205 *
206 * The implementation should not need to perform any Netlink calls, because
207 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
208 * cached the configuration.
209 *
210 * The contents of 'details' should be documented as valid for 'ovs_name'
211 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
212 * (which is built as ovs-vswitchd.conf.db(8)).
213 *
214 * This function may be null if 'tc' is not configurable.
215 */
216 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
217
218 /* Reconfigures 'netdev->tc' according to 'details', performing any
219 * required Netlink calls to complete the reconfiguration.
220 *
221 * The contents of 'details' should be documented as valid for 'ovs_name'
222 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
223 * (which is built as ovs-vswitchd.conf.db(8)).
224 *
225 * This function may be null if 'tc' is not configurable.
226 */
227 int (*qdisc_set)(struct netdev *, const struct shash *details);
228
93b13be8
BP
229 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
230 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
231 *
232 * The contents of 'details' should be documented as valid for 'ovs_name'
233 * in the "other_config" column in the "Queue" table in
234 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
235 *
236 * The implementation should not need to perform any Netlink calls, because
237 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
238 * cached the queue configuration.
239 *
240 * This function may be null if 'tc' does not have queues ('n_queues' is
241 * 0). */
93b13be8 242 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
243 struct shash *details);
244
245 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
246 * 'details', performing any required Netlink calls to complete the
247 * reconfiguration. The caller ensures that 'queue_id' is less than
248 * 'n_queues'.
249 *
250 * The contents of 'details' should be documented as valid for 'ovs_name'
251 * in the "other_config" column in the "Queue" table in
252 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
253 *
254 * This function may be null if 'tc' does not have queues or its queues are
255 * not configurable. */
256 int (*class_set)(struct netdev *, unsigned int queue_id,
257 const struct shash *details);
258
93b13be8
BP
259 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
260 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
261 *
262 * This function may be null if 'tc' does not have queues or its queues
263 * cannot be deleted. */
93b13be8 264 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 265
93b13be8
BP
266 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
267 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
268 *
269 * On success, initializes '*stats'.
270 *
271 * This function may be null if 'tc' does not have queues or if it cannot
272 * report queue statistics. */
93b13be8
BP
273 int (*class_get_stats)(const struct netdev *netdev,
274 const struct tc_queue *queue,
c1c9c9c4
BP
275 struct netdev_queue_stats *stats);
276
277 /* Extracts queue stats from 'nlmsg', which is a response to a
278 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
279 *
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
282 int (*class_dump_stats)(const struct netdev *netdev,
283 const struct ofpbuf *nlmsg,
284 netdev_dump_queue_stats_cb *cb, void *aux);
285};
286
287static void
288tc_init(struct tc *tc, const struct tc_ops *ops)
289{
290 tc->ops = ops;
93b13be8 291 hmap_init(&tc->queues);
c1c9c9c4
BP
292}
293
294static void
295tc_destroy(struct tc *tc)
296{
93b13be8 297 hmap_destroy(&tc->queues);
c1c9c9c4
BP
298}
299
300static const struct tc_ops tc_ops_htb;
a339aa81 301static const struct tc_ops tc_ops_hfsc;
c1c9c9c4
BP
302static const struct tc_ops tc_ops_default;
303static const struct tc_ops tc_ops_other;
304
305static const struct tc_ops *tcs[] = {
306 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 307 &tc_ops_hfsc, /* Hierarchical fair service curve. */
c1c9c9c4
BP
308 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
309 &tc_ops_other, /* Some other qdisc. */
310 NULL
311};
149f577a 312
c1c9c9c4
BP
313static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
314static unsigned int tc_get_major(unsigned int handle);
315static unsigned int tc_get_minor(unsigned int handle);
316
317static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
318static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
319static unsigned int tc_buffer_per_jiffy(unsigned int rate);
320
321static struct tcmsg *tc_make_request(const struct netdev *, int type,
322 unsigned int flags, struct ofpbuf *);
323static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
324
325static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
326 struct nlattr **options);
327static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
328 struct nlattr **options,
329 struct netdev_queue_stats *);
330static int tc_query_class(const struct netdev *,
331 unsigned int handle, unsigned int parent,
332 struct ofpbuf **replyp);
333static int tc_delete_class(const struct netdev *, unsigned int handle);
334
335static int tc_del_qdisc(struct netdev *netdev);
336static int tc_query_qdisc(const struct netdev *netdev);
337
338static int tc_calc_cell_log(unsigned int mtu);
339static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
340static void tc_put_rtab(struct ofpbuf *, uint16_t type,
341 const struct tc_ratespec *rate);
342static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
343\f
149f577a
JG
344struct netdev_dev_linux {
345 struct netdev_dev netdev_dev;
346
8b61709d 347 struct shash_node *shash_node;
149f577a 348 unsigned int cache_valid;
ac4d3bcb 349 unsigned int change_seq;
8b61709d 350
1670c579
EJ
351 bool miimon; /* Link status of last poll. */
352 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
353 struct timer miimon_timer;
354
8722022c
BP
355 /* The following are figured out "on demand" only. They are only valid
356 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d
BP
357 int ifindex;
358 uint8_t etheraddr[ETH_ADDR_LEN];
f1acd62b 359 struct in_addr address, netmask;
8b61709d
BP
360 struct in6_addr in6;
361 int mtu;
362 int carrier;
8722022c
BP
363 bool is_internal; /* Is this an openvswitch internal device? */
364 bool is_tap; /* Is this a tuntap device? */
80a86fbe
BP
365 uint32_t kbits_rate; /* Policing data. */
366 uint32_t kbits_burst;
7fbef77a 367 bool have_vport_stats;
c1c9c9c4 368 struct tc *tc;
149f577a
JG
369
370 union {
371 struct tap_state tap;
372 } state;
8b61709d
BP
373};
374
149f577a
JG
375struct netdev_linux {
376 struct netdev netdev;
5b7448ed 377 int fd;
149f577a 378};
8b61709d 379
76c308b5
BP
380/* Sockets used for ioctl operations. */
381static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
8b61709d 382
ff4ed3c9
BP
383/* A Netlink routing socket that is not subscribed to any multicast groups. */
384static struct nl_sock *rtnl_sock;
385
8b61709d
BP
386/* This is set pretty low because we probably won't learn anything from the
387 * additional log messages. */
388static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
389
15b3596a 390static int netdev_linux_init(void);
6f643e49 391
0b0544d7 392static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 393 int cmd, const char *cmd_name);
149f577a
JG
394static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
395 const char *cmd_name);
f1acd62b
BP
396static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
397 int cmd, const char *cmd_name);
8b61709d
BP
398static int get_flags(const struct netdev *, int *flagsp);
399static int set_flags(struct netdev *, int flags);
400static int do_get_ifindex(const char *netdev_name);
401static int get_ifindex(const struct netdev *, int *ifindexp);
402static int do_set_addr(struct netdev *netdev,
403 int ioctl_nr, const char *ioctl_name,
404 struct in_addr addr);
405static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
406static int set_etheraddr(const char *netdev_name, int hwaddr_family,
407 const uint8_t[ETH_ADDR_LEN]);
408static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
409static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
488d734d 410static int af_packet_sock(void);
1670c579
EJ
411static void netdev_linux_miimon_run(void);
412static void netdev_linux_miimon_wait(void);
8b61709d 413
15b3596a
JG
414static bool
415is_netdev_linux_class(const struct netdev_class *netdev_class)
416{
417 return netdev_class->init == netdev_linux_init;
418}
419
149f577a
JG
420static struct netdev_dev_linux *
421netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
6c88d577 422{
15b3596a
JG
423 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
424 assert(is_netdev_linux_class(netdev_class));
425
149f577a 426 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
6c88d577
JP
427}
428
8b61709d
BP
429static struct netdev_linux *
430netdev_linux_cast(const struct netdev *netdev)
431{
15b3596a
JG
432 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
433 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
434 assert(is_netdev_linux_class(netdev_class));
435
8b61709d
BP
436 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
437}
ff4ed3c9 438\f
8b61709d
BP
439static int
440netdev_linux_init(void)
441{
442 static int status = -1;
443 if (status < 0) {
ff4ed3c9 444 /* Create AF_INET socket. */
8b61709d
BP
445 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
446 status = af_inet_sock >= 0 ? 0 : errno;
447 if (status) {
448 VLOG_ERR("failed to create inet socket: %s", strerror(status));
449 }
ff4ed3c9
BP
450
451 /* Create rtnetlink socket. */
452 if (!status) {
cceb11f5 453 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
ff4ed3c9
BP
454 if (status) {
455 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
456 strerror(status));
457 }
458 }
8b61709d
BP
459 }
460 return status;
461}
462
/* Periodic work for this provider: runs the rtnetlink link notifier, then
 * the MII monitor. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_notifier_run();
    netdev_linux_miimon_run();
}
469
/* Counterpart of netdev_linux_run(): invokes the wait functions of the
 * rtnetlink link notifier and of the MII monitor. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_notifier_wait();
    netdev_linux_miimon_wait();
}
476
ac4d3bcb
EJ
477static void
478netdev_dev_linux_changed(struct netdev_dev_linux *dev)
479{
480 dev->change_seq++;
481 if (!dev->change_seq) {
482 dev->change_seq++;
483 }
484 dev->cache_valid = 0;
485}
486
8b61709d 487static void
21d6e22e 488netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
67a4917b 489 void *aux OVS_UNUSED)
8b61709d 490{
149f577a 491 struct netdev_dev_linux *dev;
8b61709d 492 if (change) {
46415c90
JG
493 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
494 if (base_dev) {
15b3596a
JG
495 const struct netdev_class *netdev_class =
496 netdev_dev_get_class(base_dev);
497
498 if (is_netdev_linux_class(netdev_class)) {
499 dev = netdev_dev_linux_cast(base_dev);
ac4d3bcb 500 netdev_dev_linux_changed(dev);
15b3596a 501 }
8b61709d
BP
502 }
503 } else {
46415c90 504 struct shash device_shash;
8b61709d 505 struct shash_node *node;
46415c90
JG
506
507 shash_init(&device_shash);
508 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
509 SHASH_FOR_EACH (node, &device_shash) {
149f577a 510 dev = node->data;
ac4d3bcb 511 netdev_dev_linux_changed(dev);
8b61709d 512 }
46415c90 513 shash_destroy(&device_shash);
8b61709d
BP
514 }
515}
516
c3827f61 517/* Creates system and internal devices. */
8b61709d 518static int
c3827f61 519netdev_linux_create(const struct netdev_class *class,
b8dcf5e9
BP
520 const char *name, const struct shash *args,
521 struct netdev_dev **netdev_devp)
6c88d577 522{
149f577a
JG
523 struct netdev_dev_linux *netdev_dev;
524 int error;
6c88d577
JP
525
526 if (!shash_is_empty(args)) {
c3827f61
BP
527 VLOG_WARN("%s: arguments for %s devices should be empty",
528 name, class->type);
6c88d577
JP
529 }
530
46415c90 531 if (!cache_notifier_refcount) {
21d6e22e
EJ
532 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
533 netdev_linux_cache_cb, NULL);
149f577a
JG
534 if (error) {
535 return error;
536 }
537 }
46415c90 538 cache_notifier_refcount++;
6c88d577 539
149f577a 540 netdev_dev = xzalloc(sizeof *netdev_dev);
ac4d3bcb 541 netdev_dev->change_seq = 1;
6d9e6eb4 542 netdev_dev_init(&netdev_dev->netdev_dev, name, args, class);
46415c90 543
149f577a 544 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
545 return 0;
546}
547
5b7448ed
JG
548/* For most types of netdevs we open the device for each call of
549 * netdev_open(). However, this is not the case with tap devices,
550 * since it is only possible to open the device once. In this
551 * situation we share a single file descriptor, and consequently
552 * buffers, across all readers. Therefore once data is read it will
553 * be unavailable to other reads for tap devices. */
a740f0de 554static int
b8dcf5e9
BP
555netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
556 const char *name, const struct shash *args,
557 struct netdev_dev **netdev_devp)
a740f0de 558{
149f577a 559 struct netdev_dev_linux *netdev_dev;
a740f0de
JG
560 struct tap_state *state;
561 static const char tap_dev[] = "/dev/net/tun";
562 struct ifreq ifr;
563 int error;
564
565 if (!shash_is_empty(args)) {
149f577a 566 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
6c88d577
JP
567 }
568
149f577a
JG
569 netdev_dev = xzalloc(sizeof *netdev_dev);
570 state = &netdev_dev->state.tap;
a740f0de 571
6c88d577 572 /* Open tap device. */
149f577a
JG
573 state->fd = open(tap_dev, O_RDWR);
574 if (state->fd < 0) {
6c88d577
JP
575 error = errno;
576 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
577 goto error;
578 }
579
580 /* Create tap device. */
581 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 582 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
149f577a 583 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
6c88d577
JP
584 VLOG_WARN("%s: creating tap device failed: %s", name,
585 strerror(errno));
586 error = errno;
587 goto error;
588 }
589
590 /* Make non-blocking. */
149f577a 591 error = set_nonblocking(state->fd);
a740f0de
JG
592 if (error) {
593 goto error;
594 }
595
6d9e6eb4 596 netdev_dev_init(&netdev_dev->netdev_dev, name, args, &netdev_tap_class);
149f577a 597 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
598 return 0;
599
600error:
149f577a 601 free(netdev_dev);
a740f0de
JG
602 return error;
603}
604
a740f0de 605static void
149f577a 606destroy_tap(struct netdev_dev_linux *netdev_dev)
a740f0de 607{
149f577a
JG
608 struct tap_state *state = &netdev_dev->state.tap;
609
610 if (state->fd >= 0) {
611 close(state->fd);
a740f0de
JG
612 }
613}
614
149f577a 615/* Destroys the netdev device 'netdev_dev_'. */
6c88d577 616static void
149f577a 617netdev_linux_destroy(struct netdev_dev *netdev_dev_)
6c88d577 618{
149f577a 619 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
d2bb2799 620 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
6c88d577 621
c1c9c9c4
BP
622 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
623 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
624 }
625
d2bb2799 626 if (class == &netdev_linux_class || class == &netdev_internal_class) {
46415c90 627 cache_notifier_refcount--;
149f577a 628
46415c90 629 if (!cache_notifier_refcount) {
21d6e22e 630 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
149f577a 631 }
d2bb2799 632 } else if (class == &netdev_tap_class) {
149f577a 633 destroy_tap(netdev_dev);
d2bb2799
BP
634 } else {
635 NOT_REACHED();
6c88d577 636 }
149f577a 637
658797c8 638 free(netdev_dev);
6c88d577
JP
639}
640
8b61709d 641static int
7b6b0ef4 642netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
8b61709d 643{
5b7448ed 644 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
8b61709d
BP
645 struct netdev_linux *netdev;
646 enum netdev_flags flags;
647 int error;
648
649 /* Allocate network device. */
ec6fde61 650 netdev = xzalloc(sizeof *netdev);
49a6a163 651 netdev->fd = -1;
5b7448ed 652 netdev_init(&netdev->netdev, netdev_dev_);
8b61709d 653
c3827f61
BP
654 /* Verify that the device really exists, by attempting to read its flags.
655 * (The flags might be cached, in which case this won't actually do an
656 * ioctl.)
657 *
658 * Don't do this for "internal" netdevs, though, because those have to be
659 * created as netdev objects before they exist in the kernel, because
660 * creating them in the kernel happens by passing a netdev object to
661 * dpif_port_add(). */
662 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
663 error = netdev_get_flags(&netdev->netdev, &flags);
664 if (error == ENODEV) {
665 goto error;
666 }
8b61709d
BP
667 }
668
61b999dd
JG
669 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
670 !netdev_dev->state.tap.opened) {
671
672 /* We assume that the first user of the tap device is the primary user
673 * and give them the tap FD. Subsequent users probably just expect
674 * this to be a system device so open it normally to avoid send/receive
675 * directions appearing to be reversed. */
5b7448ed 676 netdev->fd = netdev_dev->state.tap.fd;
61b999dd 677 netdev_dev->state.tap.opened = true;
8b61709d
BP
678 }
679
680 *netdevp = &netdev->netdev;
681 return 0;
682
683error:
149f577a 684 netdev_uninit(&netdev->netdev, true);
8b61709d
BP
685 return error;
686}
687
688/* Closes and destroys 'netdev'. */
689static void
690netdev_linux_close(struct netdev *netdev_)
691{
692 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
693
49a6a163 694 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
5b7448ed 695 close(netdev->fd);
8b61709d
BP
696 }
697 free(netdev);
698}
e9e28be3 699
/* Adds the names of all known network devices, as reported by
 * if_nameindex(), to 'sset'.
 *
 * Returns 0 on success, otherwise the positive errno value left by
 * if_nameindex(). */
static int
netdev_linux_enumerate(struct sset *sset)
{
    struct if_nameindex *names;
    size_t i;

    names = if_nameindex();
    if (!names) {
        /* Capture errno before logging, which may overwrite it. */
        int error = errno;

        VLOG_WARN("could not obtain list of network device names: %s",
                  strerror(error));
        return error;
    }

    for (i = 0; names[i].if_name != NULL; i++) {
        sset_add(sset, names[i].if_name);
    }
    if_freenameindex(names);
    return 0;
}
721
7b6b0ef4
BP
722static int
723netdev_linux_listen(struct netdev *netdev_)
724{
725 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
726 struct sockaddr_ll sll;
727 int ifindex;
728 int error;
729 int fd;
730
731 if (netdev->fd >= 0) {
732 return 0;
733 }
734
735 /* Create file descriptor. */
736 fd = socket(PF_PACKET, SOCK_RAW, 0);
737 if (fd < 0) {
738 error = errno;
739 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
740 goto error;
741 }
742
743 /* Set non-blocking mode. */
744 error = set_nonblocking(fd);
745 if (error) {
746 goto error;
747 }
748
749 /* Get ethernet device index. */
750 error = get_ifindex(&netdev->netdev, &ifindex);
751 if (error) {
752 goto error;
753 }
754
755 /* Bind to specific ethernet device. */
756 memset(&sll, 0, sizeof sll);
757 sll.sll_family = AF_PACKET;
758 sll.sll_ifindex = ifindex;
759 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
760 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
761 error = errno;
762 VLOG_ERR("%s: failed to bind raw socket (%s)",
763 netdev_get_name(netdev_), strerror(error));
764 goto error;
765 }
766
767 netdev->fd = fd;
768 return 0;
769
770error:
771 if (fd >= 0) {
772 close(fd);
773 }
774 return error;
775}
776
8b61709d
BP
777static int
778netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
779{
780 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
781
5b7448ed 782 if (netdev->fd < 0) {
7b6b0ef4 783 /* Device is not listening. */
c0e5f6ca 784 return -EAGAIN;
8b61709d
BP
785 }
786
787 for (;;) {
5b7448ed 788 ssize_t retval = read(netdev->fd, data, size);
8b61709d
BP
789 if (retval >= 0) {
790 return retval;
791 } else if (errno != EINTR) {
792 if (errno != EAGAIN) {
793 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
794 strerror(errno), netdev_get_name(netdev_));
795 }
c0e5f6ca 796 return -errno;
8b61709d
BP
797 }
798 }
799}
800
801/* Registers with the poll loop to wake up from the next call to poll_block()
802 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
803static void
804netdev_linux_recv_wait(struct netdev *netdev_)
805{
806 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed
JG
807 if (netdev->fd >= 0) {
808 poll_fd_wait(netdev->fd, POLLIN);
8b61709d
BP
809 }
810}
811
812/* Discards all packets waiting to be received from 'netdev'. */
813static int
814netdev_linux_drain(struct netdev *netdev_)
815{
816 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 817 if (netdev->fd < 0) {
8b61709d 818 return 0;
5b7448ed 819 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
8b61709d 820 struct ifreq ifr;
149f577a 821 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
8b61709d
BP
822 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
823 if (error) {
824 return error;
825 }
5b7448ed 826 drain_fd(netdev->fd, ifr.ifr_qlen);
8b61709d
BP
827 return 0;
828 } else {
5b7448ed 829 return drain_rcvbuf(netdev->fd);
8b61709d
BP
830 }
831}
832
833/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
834 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
835 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
836 * the packet is too big or too small to transmit on the device.
837 *
838 * The caller retains ownership of 'buffer' in all cases.
839 *
840 * The kernel maintains a packet transmission queue, so the caller is not
841 * expected to do additional queuing of packets. */
842static int
843netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
844{
f23347ea
BP
845 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
846 for (;;) {
847 ssize_t retval;
8b61709d 848
f23347ea
BP
849 if (netdev->fd < 0) {
850 /* Use our AF_PACKET socket to send to this device. */
851 struct sockaddr_ll sll;
852 struct msghdr msg;
853 struct iovec iov;
854 int ifindex;
855 int error;
488d734d
BP
856 int sock;
857
858 sock = af_packet_sock();
859 if (sock < 0) {
860 return sock;
861 }
f23347ea
BP
862
863 error = get_ifindex(netdev_, &ifindex);
864 if (error) {
865 return error;
866 }
8b61709d 867
f23347ea
BP
868 /* We don't bother setting most fields in sockaddr_ll because the
869 * kernel ignores them for SOCK_RAW. */
870 memset(&sll, 0, sizeof sll);
871 sll.sll_family = AF_PACKET;
872 sll.sll_ifindex = ifindex;
76c308b5 873
f23347ea
BP
874 iov.iov_base = (void *) data;
875 iov.iov_len = size;
76c308b5 876
f23347ea
BP
877 msg.msg_name = &sll;
878 msg.msg_namelen = sizeof sll;
879 msg.msg_iov = &iov;
880 msg.msg_iovlen = 1;
881 msg.msg_control = NULL;
882 msg.msg_controllen = 0;
883 msg.msg_flags = 0;
884
488d734d 885 retval = sendmsg(sock, &msg, 0);
f23347ea
BP
886 } else {
887 /* Use the netdev's own fd to send to this device. This is
888 * essential for tap devices, because packets sent to a tap device
889 * with an AF_PACKET socket will loop back to be *received* again
890 * on the tap device. */
891 retval = write(netdev->fd, data, size);
892 }
76c308b5 893
8b61709d
BP
894 if (retval < 0) {
895 /* The Linux AF_PACKET implementation never blocks waiting for room
896 * for packets, instead returning ENOBUFS. Translate this into
897 * EAGAIN for the caller. */
898 if (errno == ENOBUFS) {
899 return EAGAIN;
900 } else if (errno == EINTR) {
901 continue;
902 } else if (errno != EAGAIN) {
903 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
904 netdev_get_name(netdev_), strerror(errno));
905 }
906 return errno;
907 } else if (retval != size) {
908 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
909 "%zu) on %s", retval, size, netdev_get_name(netdev_));
910 return EMSGSIZE;
911 } else {
912 return 0;
913 }
914 }
915}
916
917/* Registers with the poll loop to wake up from the next call to poll_block()
918 * when the packet transmission queue has sufficient room to transmit a packet
919 * with netdev_send().
920 *
921 * The kernel maintains a packet transmission queue, so the client is not
922 * expected to do additional queuing of packets. Thus, this function is
923 * unlikely to ever be used. It is included for completeness. */
924static void
925netdev_linux_send_wait(struct netdev *netdev_)
926{
927 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 928 if (netdev->fd < 0) {
8b61709d 929 /* Nothing to do. */
5b7448ed
JG
930 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
931 poll_fd_wait(netdev->fd, POLLOUT);
8b61709d
BP
932 } else {
933 /* TAP device always accepts packets.*/
934 poll_immediate_wake();
935 }
936}
937
938/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
939 * otherwise a positive errno value. */
940static int
941netdev_linux_set_etheraddr(struct netdev *netdev_,
942 const uint8_t mac[ETH_ADDR_LEN])
943{
149f577a
JG
944 struct netdev_dev_linux *netdev_dev =
945 netdev_dev_linux_cast(netdev_get_dev(netdev_));
eb395f2e
BP
946 int error;
947
149f577a
JG
948 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
949 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
eb395f2e
BP
950 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
951 if (!error) {
149f577a
JG
952 netdev_dev->cache_valid |= VALID_ETHERADDR;
953 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e
BP
954 }
955 } else {
956 error = 0;
8b61709d
BP
957 }
958 return error;
959}
960
961/* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
962 * free the returned buffer. */
963static int
964netdev_linux_get_etheraddr(const struct netdev *netdev_,
965 uint8_t mac[ETH_ADDR_LEN])
966{
149f577a
JG
967 struct netdev_dev_linux *netdev_dev =
968 netdev_dev_linux_cast(netdev_get_dev(netdev_));
969 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
8b61709d 970 int error = get_etheraddr(netdev_get_name(netdev_),
149f577a 971 netdev_dev->etheraddr);
8b61709d
BP
972 if (error) {
973 return error;
974 }
149f577a 975 netdev_dev->cache_valid |= VALID_ETHERADDR;
8b61709d 976 }
149f577a 977 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
8b61709d
BP
978 return 0;
979}
980
981/* Returns the maximum size of transmitted (and received) packets on 'netdev',
982 * in bytes, not including the hardware header; thus, this is typically 1500
983 * bytes for Ethernet devices. */
984static int
985netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
986{
149f577a
JG
987 struct netdev_dev_linux *netdev_dev =
988 netdev_dev_linux_cast(netdev_get_dev(netdev_));
989 if (!(netdev_dev->cache_valid & VALID_MTU)) {
8b61709d
BP
990 struct ifreq ifr;
991 int error;
992
149f577a
JG
993 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
994 SIOCGIFMTU, "SIOCGIFMTU");
8b61709d
BP
995 if (error) {
996 return error;
997 }
149f577a
JG
998 netdev_dev->mtu = ifr.ifr_mtu;
999 netdev_dev->cache_valid |= VALID_MTU;
8b61709d 1000 }
149f577a 1001 *mtup = netdev_dev->mtu;
8b61709d
BP
1002 return 0;
1003}
1004
9ab3d9a3
BP
/* Looks up the ifindex of 'netdev'.  On success returns it (a positive
 * number); on failure returns the negated error from get_ifindex(). */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1015
8b61709d
BP
1016static int
1017netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1018{
149f577a
JG
1019 struct netdev_dev_linux *netdev_dev =
1020 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1021 int error = 0;
1022 char *fn = NULL;
1023 int fd = -1;
1024
1670c579
EJ
1025 if (netdev_dev->miimon_interval > 0) {
1026 *carrier = netdev_dev->miimon;
1027 return 0;
1028 }
1029
149f577a 1030 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
8b61709d
BP
1031 char line[8];
1032 int retval;
1033
149f577a
JG
1034 fn = xasprintf("/sys/class/net/%s/carrier",
1035 netdev_get_name(netdev_));
8b61709d
BP
1036 fd = open(fn, O_RDONLY);
1037 if (fd < 0) {
1038 error = errno;
1039 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1040 goto exit;
1041 }
1042
1043 retval = read(fd, line, sizeof line);
1044 if (retval < 0) {
1045 error = errno;
1046 if (error == EINVAL) {
1047 /* This is the normal return value when we try to check carrier
1048 * if the network device is not up. */
1049 } else {
1050 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1051 }
1052 goto exit;
1053 } else if (retval == 0) {
1054 error = EPROTO;
1055 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
1056 goto exit;
1057 }
1058
1059 if (line[0] != '0' && line[0] != '1') {
1060 error = EPROTO;
1061 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1062 fn, line[0]);
1063 goto exit;
1064 }
149f577a
JG
1065 netdev_dev->carrier = line[0] != '0';
1066 netdev_dev->cache_valid |= VALID_CARRIER;
8b61709d 1067 }
149f577a 1068 *carrier = netdev_dev->carrier;
8b61709d
BP
1069 error = 0;
1070
1071exit:
1072 if (fd >= 0) {
1073 close(fd);
1074 }
1075 free(fn);
1076 return error;
1077}
1078
63331829 1079static int
1670c579
EJ
1080netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1081 struct mii_ioctl_data *data)
63331829 1082{
63331829 1083 struct ifreq ifr;
782e6111 1084 int error;
63331829 1085
63331829 1086 memset(&ifr, 0, sizeof ifr);
782e6111 1087 memcpy(&ifr.ifr_data, data, sizeof *data);
1670c579 1088 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1089 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1090
782e6111
EJ
1091 return error;
1092}
1093
1094static int
1670c579 1095netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1096{
782e6111
EJ
1097 struct mii_ioctl_data data;
1098 int error;
63331829 1099
782e6111
EJ
1100 *miimon = false;
1101
1102 memset(&data, 0, sizeof data);
1670c579 1103 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1104 if (!error) {
1105 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1106 data.reg_num = MII_BMSR;
1670c579 1107 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1108 &data);
63331829
EJ
1109
1110 if (!error) {
782e6111 1111 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1112 } else {
1113 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1114 }
1115 } else {
1116 struct ethtool_cmd ecmd;
63331829
EJ
1117
1118 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1119 name);
1120
1121 memset(&ecmd, 0, sizeof ecmd);
1122 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1123 "ETHTOOL_GLINK");
1124 if (!error) {
782e6111
EJ
1125 struct ethtool_value eval;
1126
1127 memcpy(&eval, &ecmd, sizeof eval);
1128 *miimon = !!eval.data;
63331829
EJ
1129 } else {
1130 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1131 }
1132 }
1133
1134 return error;
1135}
1136
1670c579
EJ
1137static int
1138netdev_linux_set_miimon_interval(struct netdev *netdev_,
1139 long long int interval)
1140{
1141 struct netdev_dev_linux *netdev_dev;
1142
1143 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1144
1145 interval = interval > 0 ? MAX(interval, 100) : 0;
1146 if (netdev_dev->miimon_interval != interval) {
1147 netdev_dev->miimon_interval = interval;
1148 timer_set_expired(&netdev_dev->miimon_timer);
1149 }
1150
1151 return 0;
1152}
1153
1154static void
1155netdev_linux_miimon_run(void)
1156{
1157 struct shash device_shash;
1158 struct shash_node *node;
1159
1160 shash_init(&device_shash);
1161 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1162 SHASH_FOR_EACH (node, &device_shash) {
1163 struct netdev_dev_linux *dev = node->data;
1164 bool miimon;
1165
1166 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1167 continue;
1168 }
1169
1170 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1171 if (miimon != dev->miimon) {
1670c579 1172 dev->miimon = miimon;
ac4d3bcb 1173 netdev_dev_linux_changed(dev);
1670c579
EJ
1174 }
1175
1176 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1177 }
1178
1179 shash_destroy(&device_shash);
1180}
1181
1182static void
1183netdev_linux_miimon_wait(void)
1184{
1185 struct shash device_shash;
1186 struct shash_node *node;
1187
1188 shash_init(&device_shash);
1189 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1190 SHASH_FOR_EACH (node, &device_shash) {
1191 struct netdev_dev_linux *dev = node->data;
1192
1193 if (dev->miimon_interval > 0) {
1194 timer_wait(&dev->miimon_timer);
1195 }
1196 }
1197 shash_destroy(&device_shash);
1198}
1199
8b61709d
BP
1200/* Check whether we can we use RTM_GETLINK to get network device statistics.
1201 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1202 * enabled. */
1203static bool
1204check_for_working_netlink_stats(void)
1205{
1206 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1207 * preferable, so if that works, we'll use it. */
1208 int ifindex = do_get_ifindex("lo");
1209 if (ifindex < 0) {
1210 VLOG_WARN("failed to get ifindex for lo, "
1211 "obtaining netdev stats from proc");
1212 return false;
1213 } else {
1214 struct netdev_stats stats;
1215 int error = get_stats_via_netlink(ifindex, &stats);
1216 if (!error) {
1217 VLOG_DBG("obtaining netdev stats via rtnetlink");
1218 return true;
1219 } else {
1220 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1221 "via proc (you are probably running a pre-2.6.19 "
1222 "kernel)", strerror(error));
1223 return false;
1224 }
1225 }
1226}
1227
8722022c
BP
1228/* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1229static void
1230netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1231{
1232 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1233 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1234 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
d295e8e9 1235
8722022c 1236 netdev_dev->is_tap = !strcmp(type, "tap");
9fe3b9a2
BP
1237 netdev_dev->is_internal = (!netdev_dev->is_tap
1238 && dpif_linux_is_internal_device(name));
8722022c
BP
1239 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1240 }
1241}
1242
92df599c
JG
/* Exchanges the values stored in '*a' and '*b'.  'a' and 'b' may point to
 * the same object, in which case its value is unchanged. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t first = *a;
    const uint64_t second = *b;

    *a = second;
    *b = first;
}
1250
7fbef77a 1251/* Retrieves current device stats for 'netdev'. */
8b61709d 1252static int
149f577a
JG
1253netdev_linux_get_stats(const struct netdev *netdev_,
1254 struct netdev_stats *stats)
8b61709d 1255{
149f577a
JG
1256 struct netdev_dev_linux *netdev_dev =
1257 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1258 static int use_netlink_stats = -1;
1259 int error;
1260
7fbef77a
JG
1261 if (netdev_dev->have_vport_stats ||
1262 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1263
1264 error = netdev_vport_get_stats(netdev_, stats);
1265 netdev_dev->have_vport_stats = !error;
1266 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
8b61709d 1267 }
8b61709d 1268
7fbef77a
JG
1269 if (!netdev_dev->have_vport_stats) {
1270 if (use_netlink_stats < 0) {
1271 use_netlink_stats = check_for_working_netlink_stats();
1272 }
1273 if (use_netlink_stats) {
1274 int ifindex;
1275
1276 error = get_ifindex(netdev_, &ifindex);
1277 if (!error) {
1278 error = get_stats_via_netlink(ifindex, stats);
1279 }
1280 } else {
1281 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
8b61709d 1282 }
8b61709d 1283 }
fe6b0e03
JG
1284
1285 /* If this port is an internal port then the transmit and receive stats
1286 * will appear to be swapped relative to the other ports since we are the
1287 * one sending the data, not a remote computer. For consistency, we swap
7fbef77a
JG
1288 * them back here. This does not apply if we are getting stats from the
1289 * vport layer because it always tracks stats from the perspective of the
1290 * switch. */
92df599c 1291 netdev_linux_update_is_pseudo(netdev_dev);
7fbef77a
JG
1292 if (!error && !netdev_dev->have_vport_stats &&
1293 (netdev_dev->is_internal || netdev_dev->is_tap)) {
92df599c
JG
1294 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1295 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1296 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1297 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1298 stats->rx_length_errors = 0;
1299 stats->rx_over_errors = 0;
1300 stats->rx_crc_errors = 0;
1301 stats->rx_frame_errors = 0;
1302 stats->rx_fifo_errors = 0;
1303 stats->rx_missed_errors = 0;
1304 stats->tx_aborted_errors = 0;
1305 stats->tx_carrier_errors = 0;
1306 stats->tx_fifo_errors = 0;
1307 stats->tx_heartbeat_errors = 0;
1308 stats->tx_window_errors = 0;
1309 }
1310
8b61709d
BP
1311 return error;
1312}
1313
1314/* Stores the features supported by 'netdev' into each of '*current',
1315 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1316 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
7671589a 1317 * successful, otherwise a positive errno value. */
8b61709d 1318static int
6f2f5cce 1319netdev_linux_get_features(const struct netdev *netdev,
8b61709d
BP
1320 uint32_t *current, uint32_t *advertised,
1321 uint32_t *supported, uint32_t *peer)
1322{
1323 struct ethtool_cmd ecmd;
1324 int error;
1325
1326 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1327 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1328 ETHTOOL_GSET, "ETHTOOL_GSET");
1329 if (error) {
1330 return error;
1331 }
1332
1333 /* Supported features. */
1334 *supported = 0;
1335 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1336 *supported |= OFPPF_10MB_HD;
1337 }
1338 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1339 *supported |= OFPPF_10MB_FD;
1340 }
1341 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1342 *supported |= OFPPF_100MB_HD;
1343 }
1344 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1345 *supported |= OFPPF_100MB_FD;
1346 }
1347 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1348 *supported |= OFPPF_1GB_HD;
1349 }
1350 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1351 *supported |= OFPPF_1GB_FD;
1352 }
1353 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1354 *supported |= OFPPF_10GB_FD;
1355 }
1356 if (ecmd.supported & SUPPORTED_TP) {
1357 *supported |= OFPPF_COPPER;
1358 }
1359 if (ecmd.supported & SUPPORTED_FIBRE) {
1360 *supported |= OFPPF_FIBER;
1361 }
1362 if (ecmd.supported & SUPPORTED_Autoneg) {
1363 *supported |= OFPPF_AUTONEG;
1364 }
1365 if (ecmd.supported & SUPPORTED_Pause) {
1366 *supported |= OFPPF_PAUSE;
1367 }
1368 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1369 *supported |= OFPPF_PAUSE_ASYM;
1370 }
1371
1372 /* Advertised features. */
1373 *advertised = 0;
1374 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1375 *advertised |= OFPPF_10MB_HD;
1376 }
1377 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1378 *advertised |= OFPPF_10MB_FD;
1379 }
1380 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1381 *advertised |= OFPPF_100MB_HD;
1382 }
1383 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1384 *advertised |= OFPPF_100MB_FD;
1385 }
1386 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1387 *advertised |= OFPPF_1GB_HD;
1388 }
1389 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1390 *advertised |= OFPPF_1GB_FD;
1391 }
1392 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1393 *advertised |= OFPPF_10GB_FD;
1394 }
1395 if (ecmd.advertising & ADVERTISED_TP) {
1396 *advertised |= OFPPF_COPPER;
1397 }
1398 if (ecmd.advertising & ADVERTISED_FIBRE) {
1399 *advertised |= OFPPF_FIBER;
1400 }
1401 if (ecmd.advertising & ADVERTISED_Autoneg) {
1402 *advertised |= OFPPF_AUTONEG;
1403 }
1404 if (ecmd.advertising & ADVERTISED_Pause) {
1405 *advertised |= OFPPF_PAUSE;
1406 }
1407 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1408 *advertised |= OFPPF_PAUSE_ASYM;
1409 }
1410
1411 /* Current settings. */
1412 if (ecmd.speed == SPEED_10) {
1413 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1414 } else if (ecmd.speed == SPEED_100) {
1415 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1416 } else if (ecmd.speed == SPEED_1000) {
1417 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1418 } else if (ecmd.speed == SPEED_10000) {
1419 *current = OFPPF_10GB_FD;
1420 } else {
1421 *current = 0;
1422 }
1423
1424 if (ecmd.port == PORT_TP) {
1425 *current |= OFPPF_COPPER;
1426 } else if (ecmd.port == PORT_FIBRE) {
1427 *current |= OFPPF_FIBER;
1428 }
1429
1430 if (ecmd.autoneg) {
1431 *current |= OFPPF_AUTONEG;
1432 }
1433
1434 /* Peer advertisements. */
1435 *peer = 0; /* XXX */
1436
1437 return 0;
1438}
1439
1440/* Set the features advertised by 'netdev' to 'advertise'. */
1441static int
1442netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1443{
1444 struct ethtool_cmd ecmd;
1445 int error;
1446
1447 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1448 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1449 ETHTOOL_GSET, "ETHTOOL_GSET");
1450 if (error) {
1451 return error;
1452 }
1453
1454 ecmd.advertising = 0;
1455 if (advertise & OFPPF_10MB_HD) {
1456 ecmd.advertising |= ADVERTISED_10baseT_Half;
1457 }
1458 if (advertise & OFPPF_10MB_FD) {
1459 ecmd.advertising |= ADVERTISED_10baseT_Full;
1460 }
1461 if (advertise & OFPPF_100MB_HD) {
1462 ecmd.advertising |= ADVERTISED_100baseT_Half;
1463 }
1464 if (advertise & OFPPF_100MB_FD) {
1465 ecmd.advertising |= ADVERTISED_100baseT_Full;
1466 }
1467 if (advertise & OFPPF_1GB_HD) {
1468 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1469 }
1470 if (advertise & OFPPF_1GB_FD) {
1471 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1472 }
1473 if (advertise & OFPPF_10GB_FD) {
1474 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1475 }
1476 if (advertise & OFPPF_COPPER) {
1477 ecmd.advertising |= ADVERTISED_TP;
1478 }
1479 if (advertise & OFPPF_FIBER) {
1480 ecmd.advertising |= ADVERTISED_FIBRE;
1481 }
1482 if (advertise & OFPPF_AUTONEG) {
1483 ecmd.advertising |= ADVERTISED_Autoneg;
1484 }
1485 if (advertise & OFPPF_PAUSE) {
1486 ecmd.advertising |= ADVERTISED_Pause;
1487 }
1488 if (advertise & OFPPF_PAUSE_ASYM) {
1489 ecmd.advertising |= ADVERTISED_Asym_Pause;
1490 }
0b0544d7 1491 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1492 ETHTOOL_SSET, "ETHTOOL_SSET");
1493}
1494
1495/* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1496 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1497 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1498 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1499 * sets '*vlan_vid' to -1. */
1500static int
1501netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1502{
1503 const char *netdev_name = netdev_get_name(netdev);
1504 struct ds line = DS_EMPTY_INITIALIZER;
1505 FILE *stream = NULL;
1506 int error;
1507 char *fn;
1508
1509 COVERAGE_INC(netdev_get_vlan_vid);
1510 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1511 stream = fopen(fn, "r");
1512 if (!stream) {
1513 error = errno;
1514 goto done;
1515 }
1516
1517 if (ds_get_line(&line, stream)) {
1518 if (ferror(stream)) {
1519 error = errno;
1520 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1521 } else {
1522 error = EPROTO;
1523 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1524 }
1525 goto done;
1526 }
1527
1528 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1529 error = EPROTO;
1530 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1531 fn, ds_cstr(&line));
1532 goto done;
1533 }
1534
1535 error = 0;
1536
1537done:
1538 free(fn);
1539 if (stream) {
1540 fclose(stream);
1541 }
1542 ds_destroy(&line);
1543 if (error) {
1544 *vlan_vid = -1;
1545 }
1546 return error;
1547}
1548
/* printf()-style command templates for ingress policing.
 *
 * POLICE_ADD_CMD takes the device name (a string).
 *
 * POLICE_CONFIG_CMD takes the device name, then the rate and the burst size,
 * both as uint32_t; they are formatted with PRIu32 so that values of 2**31 or
 * more are not printed as negative numbers. */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD                                               \
    "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 "      \
    "u32 match ip src 0.0.0.0/0 police rate %" PRIu32 "kbit "           \
    "burst %" PRIu32 "k mtu 65535 drop flowid :1"
8b61709d 1551
8e460221 1552/* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
6f42c8ea
BP
1553 * positive errno value.
1554 *
1555 * This function is equivalent to running
1556 * /sbin/tc qdisc del dev %s handle ffff: ingress
1557 * but it is much, much faster.
1558 */
8e460221
BP
1559static int
1560netdev_linux_remove_policing(struct netdev *netdev)
1561{
80a86fbe
BP
1562 struct netdev_dev_linux *netdev_dev =
1563 netdev_dev_linux_cast(netdev_get_dev(netdev));
8e460221 1564 const char *netdev_name = netdev_get_name(netdev);
8e460221 1565
6f42c8ea 1566 struct ofpbuf request;
6f42c8ea 1567 struct tcmsg *tcmsg;
6f42c8ea
BP
1568 int error;
1569
c1c9c9c4 1570 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
1571 if (!tcmsg) {
1572 return ENODEV;
1573 }
c1c9c9c4 1574 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
6f42c8ea
BP
1575 tcmsg->tcm_parent = TC_H_INGRESS;
1576 nl_msg_put_string(&request, TCA_KIND, "ingress");
1577 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
c1c9c9c4
BP
1578
1579 error = tc_transact(&request, NULL);
4d10512c 1580 if (error && error != ENOENT && error != EINVAL) {
6f42c8ea
BP
1581 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1582 netdev_name, strerror(error));
1583 return error;
1584 }
1585
80a86fbe
BP
1586 netdev_dev->kbits_rate = 0;
1587 netdev_dev->kbits_burst = 0;
1588 netdev_dev->cache_valid |= VALID_POLICING;
8e460221
BP
1589 return 0;
1590}
1591
8b61709d
BP
1592/* Attempts to set input rate limiting (policing) policy. */
1593static int
1594netdev_linux_set_policing(struct netdev *netdev,
1595 uint32_t kbits_rate, uint32_t kbits_burst)
1596{
80a86fbe
BP
1597 struct netdev_dev_linux *netdev_dev =
1598 netdev_dev_linux_cast(netdev_get_dev(netdev));
8b61709d
BP
1599 const char *netdev_name = netdev_get_name(netdev);
1600 char command[1024];
1601
1602 COVERAGE_INC(netdev_set_policing);
8e460221 1603
80a86fbe
BP
1604 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1605 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1606 : kbits_burst); /* Stick with user-specified value. */
1607
1608 if (netdev_dev->cache_valid & VALID_POLICING
1609 && netdev_dev->kbits_rate == kbits_rate
1610 && netdev_dev->kbits_burst == kbits_burst) {
1611 /* Assume that settings haven't changed since we last set them. */
1612 return 0;
1613 }
1614
8e460221 1615 netdev_linux_remove_policing(netdev);
8b61709d 1616 if (kbits_rate) {
8b61709d
BP
1617 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1618 if (system(command) != 0) {
1619 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1620 return -1;
1621 }
1622
1623 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1624 kbits_rate, kbits_burst);
1625 if (system(command) != 0) {
1626 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1627 netdev_name);
1628 return -1;
1629 }
80a86fbe
BP
1630
1631 netdev_dev->kbits_rate = kbits_rate;
1632 netdev_dev->kbits_burst = kbits_burst;
1633 netdev_dev->cache_valid |= VALID_POLICING;
8b61709d
BP
1634 }
1635
1636 return 0;
1637}
1638
c1c9c9c4
BP
1639static int
1640netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 1641 struct sset *types)
c1c9c9c4
BP
1642{
1643 const struct tc_ops **opsp;
1644
1645 for (opsp = tcs; *opsp != NULL; opsp++) {
1646 const struct tc_ops *ops = *opsp;
1647 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 1648 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
1649 }
1650 }
1651 return 0;
1652}
1653
1654static const struct tc_ops *
1655tc_lookup_ovs_name(const char *name)
1656{
1657 const struct tc_ops **opsp;
1658
1659 for (opsp = tcs; *opsp != NULL; opsp++) {
1660 const struct tc_ops *ops = *opsp;
1661 if (!strcmp(name, ops->ovs_name)) {
1662 return ops;
1663 }
1664 }
1665 return NULL;
1666}
1667
1668static const struct tc_ops *
1669tc_lookup_linux_name(const char *name)
1670{
1671 const struct tc_ops **opsp;
1672
1673 for (opsp = tcs; *opsp != NULL; opsp++) {
1674 const struct tc_ops *ops = *opsp;
1675 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1676 return ops;
1677 }
1678 }
1679 return NULL;
1680}
1681
93b13be8
BP
1682static struct tc_queue *
1683tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1684 size_t hash)
1685{
1686 struct netdev_dev_linux *netdev_dev =
1687 netdev_dev_linux_cast(netdev_get_dev(netdev));
1688 struct tc_queue *queue;
1689
4e8e4213 1690 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
93b13be8
BP
1691 if (queue->queue_id == queue_id) {
1692 return queue;
1693 }
1694 }
1695 return NULL;
1696}
1697
/* Returns the queue with ID 'queue_id' on 'netdev', or NULL if there is
 * none.  The queue map is keyed on hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    const size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1703
c1c9c9c4
BP
1704static int
1705netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1706 const char *type,
1707 struct netdev_qos_capabilities *caps)
1708{
1709 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1710 if (!ops) {
1711 return EOPNOTSUPP;
1712 }
1713 caps->n_queues = ops->n_queues;
1714 return 0;
1715}
1716
1717static int
1718netdev_linux_get_qos(const struct netdev *netdev,
1719 const char **typep, struct shash *details)
1720{
1721 struct netdev_dev_linux *netdev_dev =
1722 netdev_dev_linux_cast(netdev_get_dev(netdev));
1723 int error;
1724
1725 error = tc_query_qdisc(netdev);
1726 if (error) {
1727 return error;
1728 }
1729
1730 *typep = netdev_dev->tc->ops->ovs_name;
1731 return (netdev_dev->tc->ops->qdisc_get
1732 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1733 : 0);
1734}
1735
1736static int
1737netdev_linux_set_qos(struct netdev *netdev,
1738 const char *type, const struct shash *details)
1739{
1740 struct netdev_dev_linux *netdev_dev =
1741 netdev_dev_linux_cast(netdev_get_dev(netdev));
1742 const struct tc_ops *new_ops;
1743 int error;
1744
1745 new_ops = tc_lookup_ovs_name(type);
1746 if (!new_ops || !new_ops->tc_install) {
1747 return EOPNOTSUPP;
1748 }
1749
1750 error = tc_query_qdisc(netdev);
1751 if (error) {
1752 return error;
1753 }
1754
1755 if (new_ops == netdev_dev->tc->ops) {
1756 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1757 } else {
1758 /* Delete existing qdisc. */
1759 error = tc_del_qdisc(netdev);
1760 if (error) {
1761 return error;
1762 }
1763 assert(netdev_dev->tc == NULL);
1764
1765 /* Install new qdisc. */
1766 error = new_ops->tc_install(netdev, details);
1767 assert((error == 0) == (netdev_dev->tc != NULL));
1768
1769 return error;
1770 }
1771}
1772
1773static int
1774netdev_linux_get_queue(const struct netdev *netdev,
1775 unsigned int queue_id, struct shash *details)
1776{
1777 struct netdev_dev_linux *netdev_dev =
1778 netdev_dev_linux_cast(netdev_get_dev(netdev));
1779 int error;
1780
1781 error = tc_query_qdisc(netdev);
1782 if (error) {
1783 return error;
93b13be8
BP
1784 } else {
1785 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1786 return (queue
1787 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1788 : ENOENT);
c1c9c9c4 1789 }
c1c9c9c4
BP
1790}
1791
1792static int
1793netdev_linux_set_queue(struct netdev *netdev,
1794 unsigned int queue_id, const struct shash *details)
1795{
1796 struct netdev_dev_linux *netdev_dev =
1797 netdev_dev_linux_cast(netdev_get_dev(netdev));
1798 int error;
1799
1800 error = tc_query_qdisc(netdev);
1801 if (error) {
1802 return error;
1803 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1804 || !netdev_dev->tc->ops->class_set) {
1805 return EINVAL;
1806 }
1807
1808 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1809}
1810
1811static int
1812netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1813{
1814 struct netdev_dev_linux *netdev_dev =
1815 netdev_dev_linux_cast(netdev_get_dev(netdev));
1816 int error;
1817
1818 error = tc_query_qdisc(netdev);
1819 if (error) {
1820 return error;
1821 } else if (!netdev_dev->tc->ops->class_delete) {
1822 return EINVAL;
93b13be8
BP
1823 } else {
1824 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1825 return (queue
1826 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1827 : ENOENT);
c1c9c9c4 1828 }
c1c9c9c4
BP
1829}
1830
1831static int
1832netdev_linux_get_queue_stats(const struct netdev *netdev,
1833 unsigned int queue_id,
1834 struct netdev_queue_stats *stats)
1835{
1836 struct netdev_dev_linux *netdev_dev =
1837 netdev_dev_linux_cast(netdev_get_dev(netdev));
1838 int error;
1839
1840 error = tc_query_qdisc(netdev);
1841 if (error) {
1842 return error;
c1c9c9c4
BP
1843 } else if (!netdev_dev->tc->ops->class_get_stats) {
1844 return EOPNOTSUPP;
93b13be8
BP
1845 } else {
1846 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1847 return (queue
1848 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1849 : ENOENT);
c1c9c9c4 1850 }
c1c9c9c4
BP
1851}
1852
23a98ffe 1853static bool
c1c9c9c4
BP
1854start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1855{
1856 struct ofpbuf request;
1857 struct tcmsg *tcmsg;
1858
1859 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
1860 if (!tcmsg) {
1861 return false;
1862 }
3c4de644 1863 tcmsg->tcm_parent = 0;
c1c9c9c4
BP
1864 nl_dump_start(dump, rtnl_sock, &request);
1865 ofpbuf_uninit(&request);
23a98ffe 1866 return true;
c1c9c9c4
BP
1867}
1868
1869static int
1870netdev_linux_dump_queues(const struct netdev *netdev,
1871 netdev_dump_queues_cb *cb, void *aux)
1872{
1873 struct netdev_dev_linux *netdev_dev =
1874 netdev_dev_linux_cast(netdev_get_dev(netdev));
93b13be8 1875 struct tc_queue *queue;
c1c9c9c4
BP
1876 struct shash details;
1877 int last_error;
c1c9c9c4
BP
1878 int error;
1879
1880 error = tc_query_qdisc(netdev);
1881 if (error) {
1882 return error;
1883 } else if (!netdev_dev->tc->ops->class_get) {
1884 return EOPNOTSUPP;
1885 }
1886
1887 last_error = 0;
1888 shash_init(&details);
4e8e4213 1889 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
c1c9c9c4
BP
1890 shash_clear(&details);
1891
93b13be8 1892 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
c1c9c9c4 1893 if (!error) {
93b13be8 1894 (*cb)(queue->queue_id, &details, aux);
c1c9c9c4
BP
1895 } else {
1896 last_error = error;
1897 }
1898 }
1899 shash_destroy(&details);
1900
1901 return last_error;
1902}
1903
1904static int
1905netdev_linux_dump_queue_stats(const struct netdev *netdev,
1906 netdev_dump_queue_stats_cb *cb, void *aux)
1907{
1908 struct netdev_dev_linux *netdev_dev =
1909 netdev_dev_linux_cast(netdev_get_dev(netdev));
1910 struct nl_dump dump;
1911 struct ofpbuf msg;
1912 int last_error;
1913 int error;
1914
1915 error = tc_query_qdisc(netdev);
1916 if (error) {
1917 return error;
1918 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1919 return EOPNOTSUPP;
1920 }
1921
1922 last_error = 0;
23a98ffe
BP
1923 if (!start_queue_dump(netdev, &dump)) {
1924 return ENODEV;
1925 }
c1c9c9c4
BP
1926 while (nl_dump_next(&dump, &msg)) {
1927 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1928 if (error) {
1929 last_error = error;
1930 }
1931 }
1932
1933 error = nl_dump_done(&dump);
1934 return error ? error : last_error;
1935}
1936
8b61709d 1937static int
f1acd62b
BP
1938netdev_linux_get_in4(const struct netdev *netdev_,
1939 struct in_addr *address, struct in_addr *netmask)
8b61709d 1940{
149f577a
JG
1941 struct netdev_dev_linux *netdev_dev =
1942 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1943
1944 if (!(netdev_dev->cache_valid & VALID_IN4)) {
8b61709d
BP
1945 int error;
1946
149f577a 1947 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
8b61709d
BP
1948 SIOCGIFADDR, "SIOCGIFADDR");
1949 if (error) {
1950 return error;
1951 }
1952
149f577a 1953 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
f1acd62b
BP
1954 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1955 if (error) {
1956 return error;
1957 }
1958
149f577a 1959 netdev_dev->cache_valid |= VALID_IN4;
8b61709d 1960 }
149f577a
JG
1961 *address = netdev_dev->address;
1962 *netmask = netdev_dev->netmask;
f1acd62b 1963 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
8b61709d
BP
1964}
1965
8b61709d 1966static int
f1acd62b
BP
1967netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1968 struct in_addr netmask)
8b61709d 1969{
149f577a
JG
1970 struct netdev_dev_linux *netdev_dev =
1971 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1972 int error;
1973
f1acd62b 1974 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 1975 if (!error) {
149f577a
JG
1976 netdev_dev->cache_valid |= VALID_IN4;
1977 netdev_dev->address = address;
1978 netdev_dev->netmask = netmask;
f1acd62b 1979 if (address.s_addr != INADDR_ANY) {
8b61709d 1980 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 1981 "SIOCSIFNETMASK", netmask);
8b61709d
BP
1982 }
1983 }
1984 return error;
1985}
1986
/* Parses 'line', one line in the format of /proc/net/if_inet6: 32 hex digits
 * of IPv6 address, four hex fields that are skipped, and an interface name of
 * at most 16 characters.  Stores the address in '*in6' and the name in
 * 'ifname'.
 *
 * Returns true only if all 16 address bytes and the name were converted. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *octet = in6->s6_addr;
    int n_fields;

#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &octet[0], &octet[1], &octet[2], &octet[3],
                      &octet[4], &octet[5], &octet[6], &octet[7],
                      &octet[8], &octet[9], &octet[10], &octet[11],
                      &octet[12], &octet[13], &octet[14], &octet[15],
                      ifname);

    /* 16 address bytes plus the interface name. */
    return n_fields == 17;
}
2002
2003/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2004 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2005static int
2006netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2007{
149f577a
JG
2008 struct netdev_dev_linux *netdev_dev =
2009 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2010 if (!(netdev_dev->cache_valid & VALID_IN6)) {
8b61709d
BP
2011 FILE *file;
2012 char line[128];
2013
149f577a 2014 netdev_dev->in6 = in6addr_any;
8b61709d
BP
2015
2016 file = fopen("/proc/net/if_inet6", "r");
2017 if (file != NULL) {
2018 const char *name = netdev_get_name(netdev_);
2019 while (fgets(line, sizeof line, file)) {
2a022368 2020 struct in6_addr in6_tmp;
8b61709d 2021 char ifname[16 + 1];
2a022368 2022 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
2023 && !strcmp(name, ifname))
2024 {
2a022368 2025 netdev_dev->in6 = in6_tmp;
8b61709d
BP
2026 break;
2027 }
2028 }
2029 fclose(file);
2030 }
149f577a 2031 netdev_dev->cache_valid |= VALID_IN6;
8b61709d 2032 }
149f577a 2033 *in6 = netdev_dev->in6;
8b61709d
BP
2034 return 0;
2035}
2036
/* Fills '*sa' with an AF_INET socket address holding 'addr' and port 0.  All
 * other bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Start from an all-zeros generic address, then overlay the IPv4 form. */
    memset(sa, 0, sizeof *sa);

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memcpy(sa, &in4, sizeof in4);
}
2049
2050static int
2051do_set_addr(struct netdev *netdev,
2052 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2053{
2054 struct ifreq ifr;
71d7c22f 2055 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
8b61709d 2056 make_in4_sockaddr(&ifr.ifr_addr, addr);
149f577a
JG
2057
2058 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2059 ioctl_name);
8b61709d
BP
2060}
2061
2062/* Adds 'router' as a default IP gateway. */
2063static int
67a4917b 2064netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2065{
2066 struct in_addr any = { INADDR_ANY };
2067 struct rtentry rt;
2068 int error;
2069
2070 memset(&rt, 0, sizeof rt);
2071 make_in4_sockaddr(&rt.rt_dst, any);
2072 make_in4_sockaddr(&rt.rt_gateway, router);
2073 make_in4_sockaddr(&rt.rt_genmask, any);
2074 rt.rt_flags = RTF_UP | RTF_GATEWAY;
8b61709d
BP
2075 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2076 if (error) {
2077 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2078 }
2079 return error;
2080}
2081
f1acd62b
BP
2082static int
2083netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2084 char **netdev_name)
2085{
2086 static const char fn[] = "/proc/net/route";
2087 FILE *stream;
2088 char line[256];
2089 int ln;
2090
2091 *netdev_name = NULL;
2092 stream = fopen(fn, "r");
2093 if (stream == NULL) {
2094 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2095 return errno;
2096 }
2097
2098 ln = 0;
2099 while (fgets(line, sizeof line, stream)) {
2100 if (++ln >= 2) {
2101 char iface[17];
dbba996b 2102 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2103 int refcnt, metric, mtu;
2104 unsigned int flags, use, window, irtt;
2105
2106 if (sscanf(line,
2107 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2108 " %d %u %u\n",
2109 iface, &dest, &gateway, &flags, &refcnt,
2110 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2111
d295e8e9 2112 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2113 fn, ln, line);
2114 continue;
2115 }
2116 if (!(flags & RTF_UP)) {
2117 /* Skip routes that aren't up. */
2118 continue;
2119 }
2120
2121 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2122 * network byte order, so we don't need need any endian
f1acd62b
BP
2123 * conversions here. */
2124 if ((dest & mask) == (host->s_addr & mask)) {
2125 if (!gateway) {
2126 /* The host is directly reachable. */
2127 next_hop->s_addr = 0;
2128 } else {
2129 /* To reach the host, we must go through a gateway. */
2130 next_hop->s_addr = gateway;
2131 }
2132 *netdev_name = xstrdup(iface);
2133 fclose(stream);
2134 return 0;
2135 }
2136 }
2137 }
2138
2139 fclose(stream);
2140 return ENXIO;
2141}
2142
e210037e
AE
/* Queries 'netdev' with ETHTOOL_GDRVINFO and, if that succeeds, adds
 * "driver_name", "driver_version", and "firmware_version" entries to 'sh'.
 * Each value is a copy made with xstrdup().
 *
 * Returns 0 if successful, otherwise the error from the ethtool query, in
 * which case 'sh' is not modified. */
static int
netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
{
    struct ethtool_drvinfo info;
    int error;

    memset(&info, 0, sizeof info);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev),
                                    (struct ethtool_cmd *)&info,
                                    ETHTOOL_GDRVINFO,
                                    "ETHTOOL_GDRVINFO");
    if (error) {
        return error;
    }

    shash_add(sh, "driver_name", xstrdup(info.driver));
    shash_add(sh, "driver_version", xstrdup(info.version));
    shash_add(sh, "firmware_version", xstrdup(info.fw_version));
    return 0;
}
2162
8b61709d
BP
2163/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2164 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2165 * returns 0. Otherwise, it returns a positive errno value; in particular,
2166 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2167static int
2168netdev_linux_arp_lookup(const struct netdev *netdev,
dbba996b 2169 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
8b61709d
BP
2170{
2171 struct arpreq r;
c100e025 2172 struct sockaddr_in sin;
8b61709d
BP
2173 int retval;
2174
2175 memset(&r, 0, sizeof r);
f2cc621b 2176 memset(&sin, 0, sizeof sin);
c100e025
BP
2177 sin.sin_family = AF_INET;
2178 sin.sin_addr.s_addr = ip;
2179 sin.sin_port = 0;
2180 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2181 r.arp_ha.sa_family = ARPHRD_ETHER;
2182 r.arp_flags = 0;
71d7c22f 2183 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d
BP
2184 COVERAGE_INC(netdev_arp_lookup);
2185 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2186 if (!retval) {
2187 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2188 } else if (retval != ENXIO) {
2189 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
149f577a 2190 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
8b61709d
BP
2191 }
2192 return retval;
2193}
2194
2195static int
2196nd_to_iff_flags(enum netdev_flags nd)
2197{
2198 int iff = 0;
2199 if (nd & NETDEV_UP) {
2200 iff |= IFF_UP;
2201 }
2202 if (nd & NETDEV_PROMISC) {
2203 iff |= IFF_PROMISC;
2204 }
2205 return iff;
2206}
2207
2208static int
2209iff_to_nd_flags(int iff)
2210{
2211 enum netdev_flags nd = 0;
2212 if (iff & IFF_UP) {
2213 nd |= NETDEV_UP;
2214 }
2215 if (iff & IFF_PROMISC) {
2216 nd |= NETDEV_PROMISC;
2217 }
2218 return nd;
2219}
2220
2221static int
2222netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2223 enum netdev_flags on, enum netdev_flags *old_flagsp)
2224{
2225 int old_flags, new_flags;
2226 int error;
2227
2228 error = get_flags(netdev, &old_flags);
2229 if (!error) {
2230 *old_flagsp = iff_to_nd_flags(old_flags);
2231 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2232 if (new_flags != old_flags) {
2233 error = set_flags(netdev, new_flags);
2234 }
2235 }
2236 return error;
2237}
2238
ac4d3bcb
EJ
2239static unsigned int
2240netdev_linux_change_seq(const struct netdev *netdev)
2241{
2242 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
2243}
2244
c3827f61
BP
/* Expands to a positional initializer for a 'struct netdev_class' named NAME.
 * Only the create, enumerate, and set_stats slots vary between the classes
 * defined with this macro; every other slot is either NULL or one of the
 * netdev_linux_*() functions.  The initializer is positional, so the order of
 * entries must match the member order of 'struct netdev_class'. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS)  \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* set_config */                \
    NULL,                       /* config_equal */              \
                                                                \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    ENUMERATE,                                                  \
                                                                \
    netdev_linux_listen,                                        \
    netdev_linux_recv,                                          \
    netdev_linux_recv_wait,                                     \
    netdev_linux_drain,                                         \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_set_miimon_interval,                           \
    netdev_linux_get_stats,                                     \
    SET_STATS,                                                  \
                                                                \
    netdev_linux_get_features,                                  \
    netdev_linux_set_advertisements,                            \
    netdev_linux_get_vlan_vid,                                  \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    netdev_linux_get_status,                                    \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_change_seq                                     \
}
2308
2309const struct netdev_class netdev_linux_class =
2310 NETDEV_LINUX_CLASS(
2311 "system",
2312 netdev_linux_create,
2313 netdev_linux_enumerate,
98563392 2314 NULL); /* set_stats */
c3827f61
BP
2315
2316const struct netdev_class netdev_tap_class =
2317 NETDEV_LINUX_CLASS(
2318 "tap",
2319 netdev_linux_create_tap,
2320 NULL, /* enumerate */
2321 NULL); /* set_stats */
2322
2323const struct netdev_class netdev_internal_class =
2324 NETDEV_LINUX_CLASS(
2325 "internal",
2326 netdev_linux_create,
2327 NULL, /* enumerate */
2328 netdev_vport_set_stats);
8b61709d 2329\f
c1c9c9c4 2330/* HTB traffic control class. */
559843ed 2331
c1c9c9c4 2332#define HTB_N_QUEUES 0xf000
8b61709d 2333
c1c9c9c4
BP
2334struct htb {
2335 struct tc tc;
2336 unsigned int max_rate; /* In bytes/s. */
2337};
8b61709d 2338
c1c9c9c4 2339struct htb_class {
93b13be8 2340 struct tc_queue tc_queue;
c1c9c9c4
BP
2341 unsigned int min_rate; /* In bytes/s. */
2342 unsigned int max_rate; /* In bytes/s. */
2343 unsigned int burst; /* In bytes. */
2344 unsigned int priority; /* Lower values are higher priorities. */
2345};
8b61709d 2346
c1c9c9c4
BP
2347static struct htb *
2348htb_get__(const struct netdev *netdev)
2349{
2350 struct netdev_dev_linux *netdev_dev =
2351 netdev_dev_linux_cast(netdev_get_dev(netdev));
2352 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2353}
2354
24045e35 2355static void
c1c9c9c4
BP
2356htb_install__(struct netdev *netdev, uint64_t max_rate)
2357{
2358 struct netdev_dev_linux *netdev_dev =
2359 netdev_dev_linux_cast(netdev_get_dev(netdev));
2360 struct htb *htb;
2361
2362 htb = xmalloc(sizeof *htb);
2363 tc_init(&htb->tc, &tc_ops_htb);
2364 htb->max_rate = max_rate;
2365
2366 netdev_dev->tc = &htb->tc;
c1c9c9c4
BP
2367}
2368
2369/* Create an HTB qdisc.
2370 *
a339aa81 2371 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
2372static int
2373htb_setup_qdisc__(struct netdev *netdev)
2374{
2375 size_t opt_offset;
2376 struct tc_htb_glob opt;
2377 struct ofpbuf request;
2378 struct tcmsg *tcmsg;
2379
2380 tc_del_qdisc(netdev);
2381
2382 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2383 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
2384 if (!tcmsg) {
2385 return ENODEV;
2386 }
c1c9c9c4
BP
2387 tcmsg->tcm_handle = tc_make_handle(1, 0);
2388 tcmsg->tcm_parent = TC_H_ROOT;
2389
2390 nl_msg_put_string(&request, TCA_KIND, "htb");
2391
2392 memset(&opt, 0, sizeof opt);
2393 opt.rate2quantum = 10;
2394 opt.version = 3;
4ecf12d5 2395 opt.defcls = 1;
c1c9c9c4
BP
2396
2397 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2398 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2399 nl_msg_end_nested(&request, opt_offset);
2400
2401 return tc_transact(&request, NULL);
2402}
2403
2404/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2405 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2406static int
2407htb_setup_class__(struct netdev *netdev, unsigned int handle,
2408 unsigned int parent, struct htb_class *class)
2409{
2410 size_t opt_offset;
2411 struct tc_htb_opt opt;
2412 struct ofpbuf request;
2413 struct tcmsg *tcmsg;
2414 int error;
2415 int mtu;
2416
2417 netdev_get_mtu(netdev, &mtu);
f915f1a8
BP
2418 if (mtu == INT_MAX) {
2419 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2420 netdev_get_name(netdev));
2421 return EINVAL;
2422 }
c1c9c9c4
BP
2423
2424 memset(&opt, 0, sizeof opt);
2425 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2426 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2427 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2428 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2429 opt.prio = class->priority;
2430
2431 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
2432 if (!tcmsg) {
2433 return ENODEV;
2434 }
c1c9c9c4
BP
2435 tcmsg->tcm_handle = handle;
2436 tcmsg->tcm_parent = parent;
2437
2438 nl_msg_put_string(&request, TCA_KIND, "htb");
2439 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2440 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2441 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2442 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2443 nl_msg_end_nested(&request, opt_offset);
2444
2445 error = tc_transact(&request, NULL);
2446 if (error) {
2447 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2448 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2449 netdev_get_name(netdev),
2450 tc_get_major(handle), tc_get_minor(handle),
2451 tc_get_major(parent), tc_get_minor(parent),
2452 class->min_rate, class->max_rate,
2453 class->burst, class->priority, strerror(error));
2454 }
2455 return error;
2456}
2457
2458/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2459 * description of them into 'details'. The description complies with the
2460 * specification given in the vswitch database documentation for linux-htb
2461 * queue details. */
2462static int
2463htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2464{
2465 static const struct nl_policy tca_htb_policy[] = {
2466 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2467 .min_len = sizeof(struct tc_htb_opt) },
2468 };
2469
2470 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2471 const struct tc_htb_opt *htb;
2472
2473 if (!nl_parse_nested(nl_options, tca_htb_policy,
2474 attrs, ARRAY_SIZE(tca_htb_policy))) {
2475 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2476 return EPROTO;
2477 }
2478
2479 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2480 class->min_rate = htb->rate.rate;
2481 class->max_rate = htb->ceil.rate;
2482 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2483 class->priority = htb->prio;
2484 return 0;
2485}
2486
2487static int
2488htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2489 struct htb_class *options,
2490 struct netdev_queue_stats *stats)
2491{
2492 struct nlattr *nl_options;
2493 unsigned int handle;
2494 int error;
2495
2496 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2497 if (!error && queue_id) {
17ee3c1f
BP
2498 unsigned int major = tc_get_major(handle);
2499 unsigned int minor = tc_get_minor(handle);
2500 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2501 *queue_id = minor - 1;
c1c9c9c4
BP
2502 } else {
2503 error = EPROTO;
2504 }
2505 }
2506 if (!error && options) {
2507 error = htb_parse_tca_options__(nl_options, options);
2508 }
2509 return error;
2510}
2511
2512static void
2513htb_parse_qdisc_details__(struct netdev *netdev,
2514 const struct shash *details, struct htb_class *hc)
2515{
2516 const char *max_rate_s;
2517
2518 max_rate_s = shash_find_data(details, "max-rate");
2519 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2520 if (!hc->max_rate) {
2521 uint32_t current;
2522
2523 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2524 hc->max_rate = netdev_features_to_bps(current) / 8;
2525 }
2526 hc->min_rate = hc->max_rate;
2527 hc->burst = 0;
2528 hc->priority = 0;
2529}
2530
2531static int
2532htb_parse_class_details__(struct netdev *netdev,
2533 const struct shash *details, struct htb_class *hc)
2534{
2535 const struct htb *htb = htb_get__(netdev);
2536 const char *min_rate_s = shash_find_data(details, "min-rate");
2537 const char *max_rate_s = shash_find_data(details, "max-rate");
2538 const char *burst_s = shash_find_data(details, "burst");
2539 const char *priority_s = shash_find_data(details, "priority");
2540 int mtu;
2541
f915f1a8
BP
2542 netdev_get_mtu(netdev, &mtu);
2543 if (mtu == INT_MAX) {
2544 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2545 netdev_get_name(netdev));
2546 return EINVAL;
2547 }
2548
4f104611
EJ
2549 /* HTB requires at least an mtu sized min-rate to send any traffic even
2550 * on uncongested links. */
c45ab5e9 2551 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4f104611 2552 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
2553 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2554
2555 /* max-rate */
2556 hc->max_rate = (max_rate_s
2557 ? strtoull(max_rate_s, NULL, 10) / 8
2558 : htb->max_rate);
2559 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2560 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2561
2562 /* burst
2563 *
2564 * According to hints in the documentation that I've read, it is important
2565 * that 'burst' be at least as big as the largest frame that might be
2566 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2567 * but having it a bit too small is a problem. Since netdev_get_mtu()
2568 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2569 * the MTU. We actually add 64, instead of 14, as a guard against
2570 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
2571 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2572 hc->burst = MAX(hc->burst, mtu + 64);
2573
2574 /* priority */
2575 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2576
2577 return 0;
2578}
2579
2580static int
2581htb_query_class__(const struct netdev *netdev, unsigned int handle,
2582 unsigned int parent, struct htb_class *options,
2583 struct netdev_queue_stats *stats)
2584{
2585 struct ofpbuf *reply;
2586 int error;
2587
2588 error = tc_query_class(netdev, handle, parent, &reply);
2589 if (!error) {
2590 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2591 ofpbuf_delete(reply);
2592 }
2593 return error;
2594}
2595
2596static int
2597htb_tc_install(struct netdev *netdev, const struct shash *details)
2598{
2599 int error;
2600
2601 error = htb_setup_qdisc__(netdev);
2602 if (!error) {
2603 struct htb_class hc;
2604
2605 htb_parse_qdisc_details__(netdev, details, &hc);
2606 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2607 tc_make_handle(1, 0), &hc);
2608 if (!error) {
2609 htb_install__(netdev, hc.max_rate);
2610 }
2611 }
2612 return error;
2613}
2614
93b13be8
BP
2615static struct htb_class *
2616htb_class_cast__(const struct tc_queue *queue)
2617{
2618 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2619}
2620
c1c9c9c4
BP
2621static void
2622htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2623 const struct htb_class *hc)
2624{
2625 struct htb *htb = htb_get__(netdev);
93b13be8
BP
2626 size_t hash = hash_int(queue_id, 0);
2627 struct tc_queue *queue;
c1c9c9c4
BP
2628 struct htb_class *hcp;
2629
93b13be8
BP
2630 queue = tc_find_queue__(netdev, queue_id, hash);
2631 if (queue) {
2632 hcp = htb_class_cast__(queue);
2633 } else {
c1c9c9c4 2634 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
2635 queue = &hcp->tc_queue;
2636 queue->queue_id = queue_id;
2637 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 2638 }
93b13be8
BP
2639
2640 hcp->min_rate = hc->min_rate;
2641 hcp->max_rate = hc->max_rate;
2642 hcp->burst = hc->burst;
2643 hcp->priority = hc->priority;
c1c9c9c4
BP
2644}
2645
2646static int
2647htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2648{
c1c9c9c4
BP
2649 struct ofpbuf msg;
2650 struct nl_dump dump;
2651 struct htb_class hc;
c1c9c9c4
BP
2652
2653 /* Get qdisc options. */
2654 hc.max_rate = 0;
2655 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 2656 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
2657
2658 /* Get queues. */
23a98ffe
BP
2659 if (!start_queue_dump(netdev, &dump)) {
2660 return ENODEV;
2661 }
c1c9c9c4
BP
2662 while (nl_dump_next(&dump, &msg)) {
2663 unsigned int queue_id;
2664
2665 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2666 htb_update_queue__(netdev, queue_id, &hc);
2667 }
2668 }
2669 nl_dump_done(&dump);
2670
2671 return 0;
2672}
2673
2674static void
2675htb_tc_destroy(struct tc *tc)
2676{
2677 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 2678 struct htb_class *hc, *next;
c1c9c9c4 2679
4e8e4213 2680 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 2681 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
2682 free(hc);
2683 }
2684 tc_destroy(tc);
2685 free(htb);
2686}
2687
2688static int
2689htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2690{
2691 const struct htb *htb = htb_get__(netdev);
2692 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2693 return 0;
2694}
2695
2696static int
2697htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2698{
2699 struct htb_class hc;
2700 int error;
2701
2702 htb_parse_qdisc_details__(netdev, details, &hc);
2703 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2704 tc_make_handle(1, 0), &hc);
2705 if (!error) {
2706 htb_get__(netdev)->max_rate = hc.max_rate;
2707 }
2708 return error;
2709}
2710
2711static int
93b13be8
BP
2712htb_class_get(const struct netdev *netdev OVS_UNUSED,
2713 const struct tc_queue *queue, struct shash *details)
c1c9c9c4 2714{
93b13be8 2715 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4
BP
2716
2717 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2718 if (hc->min_rate != hc->max_rate) {
2719 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2720 }
2721 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2722 if (hc->priority) {
2723 shash_add(details, "priority", xasprintf("%u", hc->priority));
2724 }
2725 return 0;
2726}
2727
2728static int
2729htb_class_set(struct netdev *netdev, unsigned int queue_id,
2730 const struct shash *details)
2731{
2732 struct htb_class hc;
2733 int error;
2734
2735 error = htb_parse_class_details__(netdev, details, &hc);
2736 if (error) {
2737 return error;
2738 }
2739
17ee3c1f 2740 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
2741 tc_make_handle(1, 0xfffe), &hc);
2742 if (error) {
2743 return error;
2744 }
2745
2746 htb_update_queue__(netdev, queue_id, &hc);
2747 return 0;
2748}
2749
2750static int
93b13be8 2751htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 2752{
93b13be8 2753 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2754 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
2755 int error;
2756
93b13be8 2757 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 2758 if (!error) {
93b13be8 2759 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 2760 free(hc);
c1c9c9c4
BP
2761 }
2762 return error;
2763}
2764
2765static int
93b13be8 2766htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
2767 struct netdev_queue_stats *stats)
2768{
93b13be8 2769 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
2770 tc_make_handle(1, 0xfffe), NULL, stats);
2771}
2772
2773static int
2774htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2775 const struct ofpbuf *nlmsg,
2776 netdev_dump_queue_stats_cb *cb, void *aux)
2777{
2778 struct netdev_queue_stats stats;
17ee3c1f 2779 unsigned int handle, major, minor;
c1c9c9c4
BP
2780 int error;
2781
2782 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2783 if (error) {
2784 return error;
2785 }
2786
17ee3c1f
BP
2787 major = tc_get_major(handle);
2788 minor = tc_get_minor(handle);
2789 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 2790 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
2791 }
2792 return 0;
2793}
2794
2795static const struct tc_ops tc_ops_htb = {
2796 "htb", /* linux_name */
2797 "linux-htb", /* ovs_name */
2798 HTB_N_QUEUES, /* n_queues */
2799 htb_tc_install,
2800 htb_tc_load,
2801 htb_tc_destroy,
2802 htb_qdisc_get,
2803 htb_qdisc_set,
2804 htb_class_get,
2805 htb_class_set,
2806 htb_class_delete,
2807 htb_class_get_stats,
2808 htb_class_dump_stats
2809};
2810\f
a339aa81
EJ
2811/* "linux-hfsc" traffic control class. */
2812
2813#define HFSC_N_QUEUES 0xf000
2814
2815struct hfsc {
2816 struct tc tc;
2817 uint32_t max_rate;
2818};
2819
2820struct hfsc_class {
2821 struct tc_queue tc_queue;
2822 uint32_t min_rate;
2823 uint32_t max_rate;
2824};
2825
2826static struct hfsc *
2827hfsc_get__(const struct netdev *netdev)
2828{
2829 struct netdev_dev_linux *netdev_dev;
2830 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2831 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2832}
2833
2834static struct hfsc_class *
2835hfsc_class_cast__(const struct tc_queue *queue)
2836{
2837 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2838}
2839
24045e35 2840static void
a339aa81
EJ
2841hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2842{
2843 struct netdev_dev_linux * netdev_dev;
2844 struct hfsc *hfsc;
2845
2846 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2847 hfsc = xmalloc(sizeof *hfsc);
2848 tc_init(&hfsc->tc, &tc_ops_hfsc);
2849 hfsc->max_rate = max_rate;
2850 netdev_dev->tc = &hfsc->tc;
a339aa81
EJ
2851}
2852
2853static void
2854hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2855 const struct hfsc_class *hc)
2856{
2857 size_t hash;
2858 struct hfsc *hfsc;
2859 struct hfsc_class *hcp;
2860 struct tc_queue *queue;
2861
2862 hfsc = hfsc_get__(netdev);
2863 hash = hash_int(queue_id, 0);
2864
2865 queue = tc_find_queue__(netdev, queue_id, hash);
2866 if (queue) {
2867 hcp = hfsc_class_cast__(queue);
2868 } else {
2869 hcp = xmalloc(sizeof *hcp);
2870 queue = &hcp->tc_queue;
2871 queue->queue_id = queue_id;
2872 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2873 }
2874
2875 hcp->min_rate = hc->min_rate;
2876 hcp->max_rate = hc->max_rate;
2877}
2878
2879static int
2880hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2881{
2882 const struct tc_service_curve *rsc, *fsc, *usc;
2883 static const struct nl_policy tca_hfsc_policy[] = {
2884 [TCA_HFSC_RSC] = {
2885 .type = NL_A_UNSPEC,
2886 .optional = false,
2887 .min_len = sizeof(struct tc_service_curve),
2888 },
2889 [TCA_HFSC_FSC] = {
2890 .type = NL_A_UNSPEC,
2891 .optional = false,
2892 .min_len = sizeof(struct tc_service_curve),
2893 },
2894 [TCA_HFSC_USC] = {
2895 .type = NL_A_UNSPEC,
2896 .optional = false,
2897 .min_len = sizeof(struct tc_service_curve),
2898 },
2899 };
2900 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2901
2902 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2903 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2904 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2905 return EPROTO;
2906 }
2907
2908 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2909 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2910 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2911
2912 if (rsc->m1 != 0 || rsc->d != 0 ||
2913 fsc->m1 != 0 || fsc->d != 0 ||
2914 usc->m1 != 0 || usc->d != 0) {
2915 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2916 "Non-linear service curves are not supported.");
2917 return EPROTO;
2918 }
2919
2920 if (rsc->m2 != fsc->m2) {
2921 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2922 "Real-time service curves are not supported ");
2923 return EPROTO;
2924 }
2925
2926 if (rsc->m2 > usc->m2) {
2927 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2928 "Min-rate service curve is greater than "
2929 "the max-rate service curve.");
2930 return EPROTO;
2931 }
2932
2933 class->min_rate = fsc->m2;
2934 class->max_rate = usc->m2;
2935 return 0;
2936}
2937
2938static int
2939hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2940 struct hfsc_class *options,
2941 struct netdev_queue_stats *stats)
2942{
2943 int error;
2944 unsigned int handle;
2945 struct nlattr *nl_options;
2946
2947 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2948 if (error) {
2949 return error;
2950 }
2951
2952 if (queue_id) {
2953 unsigned int major, minor;
2954
2955 major = tc_get_major(handle);
2956 minor = tc_get_minor(handle);
2957 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2958 *queue_id = minor - 1;
2959 } else {
2960 return EPROTO;
2961 }
2962 }
2963
2964 if (options) {
2965 error = hfsc_parse_tca_options__(nl_options, options);
2966 }
2967
2968 return error;
2969}
2970
2971static int
2972hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2973 unsigned int parent, struct hfsc_class *options,
2974 struct netdev_queue_stats *stats)
2975{
2976 int error;
2977 struct ofpbuf *reply;
2978
2979 error = tc_query_class(netdev, handle, parent, &reply);
2980 if (error) {
2981 return error;
2982 }
2983
2984 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2985 ofpbuf_delete(reply);
2986 return error;
2987}
2988
2989static void
2990hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2991 struct hfsc_class *class)
2992{
2993 uint32_t max_rate;
2994 const char *max_rate_s;
2995
2996 max_rate_s = shash_find_data(details, "max-rate");
2997 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2998
2999 if (!max_rate) {
3000 uint32_t current;
3001
3002 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3003 max_rate = netdev_features_to_bps(current) / 8;
3004 }
3005
3006 class->min_rate = max_rate;
3007 class->max_rate = max_rate;
3008}
3009
3010static int
3011hfsc_parse_class_details__(struct netdev *netdev,
3012 const struct shash *details,
3013 struct hfsc_class * class)
3014{
3015 const struct hfsc *hfsc;
3016 uint32_t min_rate, max_rate;
3017 const char *min_rate_s, *max_rate_s;
3018
3019 hfsc = hfsc_get__(netdev);
3020 min_rate_s = shash_find_data(details, "min-rate");
3021 max_rate_s = shash_find_data(details, "max-rate");
3022
c45ab5e9 3023 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
79398bad 3024 min_rate = MAX(min_rate, 1);
a339aa81
EJ
3025 min_rate = MIN(min_rate, hfsc->max_rate);
3026
3027 max_rate = (max_rate_s
3028 ? strtoull(max_rate_s, NULL, 10) / 8
3029 : hfsc->max_rate);
3030 max_rate = MAX(max_rate, min_rate);
3031 max_rate = MIN(max_rate, hfsc->max_rate);
3032
3033 class->min_rate = min_rate;
3034 class->max_rate = max_rate;
3035
3036 return 0;
3037}
3038
3039/* Create an HFSC qdisc.
3040 *
3041 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3042static int
3043hfsc_setup_qdisc__(struct netdev * netdev)
3044{
3045 struct tcmsg *tcmsg;
3046 struct ofpbuf request;
3047 struct tc_hfsc_qopt opt;
3048
3049 tc_del_qdisc(netdev);
3050
3051 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3052 NLM_F_EXCL | NLM_F_CREATE, &request);
3053
3054 if (!tcmsg) {
3055 return ENODEV;
3056 }
3057
3058 tcmsg->tcm_handle = tc_make_handle(1, 0);
3059 tcmsg->tcm_parent = TC_H_ROOT;
3060
3061 memset(&opt, 0, sizeof opt);
3062 opt.defcls = 1;
3063
3064 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3065 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3066
3067 return tc_transact(&request, NULL);
3068}
3069
3070/* Create an HFSC class.
3071 *
3072 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3073 * sc rate <min_rate> ul rate <max_rate>" */
3074static int
3075hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3076 unsigned int parent, struct hfsc_class *class)
3077{
3078 int error;
3079 size_t opt_offset;
3080 struct tcmsg *tcmsg;
3081 struct ofpbuf request;
3082 struct tc_service_curve min, max;
3083
3084 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3085
3086 if (!tcmsg) {
3087 return ENODEV;
3088 }
3089
3090 tcmsg->tcm_handle = handle;
3091 tcmsg->tcm_parent = parent;
3092
3093 min.m1 = 0;
3094 min.d = 0;
3095 min.m2 = class->min_rate;
3096
3097 max.m1 = 0;
3098 max.d = 0;
3099 max.m2 = class->max_rate;
3100
3101 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3102 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3103 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3104 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3105 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3106 nl_msg_end_nested(&request, opt_offset);
3107
3108 error = tc_transact(&request, NULL);
3109 if (error) {
3110 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3111 "min-rate %ubps, max-rate %ubps (%s)",
3112 netdev_get_name(netdev),
3113 tc_get_major(handle), tc_get_minor(handle),
3114 tc_get_major(parent), tc_get_minor(parent),
3115 class->min_rate, class->max_rate, strerror(error));
3116 }
3117
3118 return error;
3119}
3120
3121static int
3122hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3123{
3124 int error;
3125 struct hfsc_class class;
3126
3127 error = hfsc_setup_qdisc__(netdev);
3128
3129 if (error) {
3130 return error;
3131 }
3132
3133 hfsc_parse_qdisc_details__(netdev, details, &class);
3134 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3135 tc_make_handle(1, 0), &class);
3136
3137 if (error) {
3138 return error;
3139 }
3140
3141 hfsc_install__(netdev, class.max_rate);
3142 return 0;
3143}
3144
3145static int
3146hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3147{
3148 struct ofpbuf msg;
a339aa81
EJ
3149 struct nl_dump dump;
3150 struct hfsc_class hc;
3151
3152 hc.max_rate = 0;
3153 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3154 hfsc_install__(netdev, hc.max_rate);
a339aa81
EJ
3155
3156 if (!start_queue_dump(netdev, &dump)) {
3157 return ENODEV;
3158 }
3159
3160 while (nl_dump_next(&dump, &msg)) {
3161 unsigned int queue_id;
3162
3163 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3164 hfsc_update_queue__(netdev, queue_id, &hc);
3165 }
3166 }
3167
3168 nl_dump_done(&dump);
3169 return 0;
3170}
3171
3172static void
3173hfsc_tc_destroy(struct tc *tc)
3174{
3175 struct hfsc *hfsc;
3176 struct hfsc_class *hc, *next;
3177
3178 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3179
3180 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3181 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3182 free(hc);
3183 }
3184
3185 tc_destroy(tc);
3186 free(hfsc);
3187}
3188
3189static int
3190hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3191{
3192 const struct hfsc *hfsc;
3193 hfsc = hfsc_get__(netdev);
3194 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3195 return 0;
3196}
3197
3198static int
3199hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3200{
3201 int error;
3202 struct hfsc_class class;
3203
3204 hfsc_parse_qdisc_details__(netdev, details, &class);
3205 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3206 tc_make_handle(1, 0), &class);
3207
3208 if (!error) {
3209 hfsc_get__(netdev)->max_rate = class.max_rate;
3210 }
3211
3212 return error;
3213}
3214
3215static int
3216hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3217 const struct tc_queue *queue, struct shash *details)
3218{
3219 const struct hfsc_class *hc;
3220
3221 hc = hfsc_class_cast__(queue);
3222 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3223 if (hc->min_rate != hc->max_rate) {
3224 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3225 }
3226 return 0;
3227}
3228
3229static int
3230hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3231 const struct shash *details)
3232{
3233 int error;
3234 struct hfsc_class class;
3235
3236 error = hfsc_parse_class_details__(netdev, details, &class);
3237 if (error) {
3238 return error;
3239 }
3240
3241 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3242 tc_make_handle(1, 0xfffe), &class);
3243 if (error) {
3244 return error;
3245 }
3246
3247 hfsc_update_queue__(netdev, queue_id, &class);
3248 return 0;
3249}
3250
3251static int
3252hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3253{
3254 int error;
3255 struct hfsc *hfsc;
3256 struct hfsc_class *hc;
3257
3258 hc = hfsc_class_cast__(queue);
3259 hfsc = hfsc_get__(netdev);
3260
3261 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3262 if (!error) {
3263 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3264 free(hc);
3265 }
3266 return error;
3267}
3268
3269static int
3270hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3271 struct netdev_queue_stats *stats)
3272{
3273 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3274 tc_make_handle(1, 0xfffe), NULL, stats);
3275}
3276
3277static int
3278hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3279 const struct ofpbuf *nlmsg,
3280 netdev_dump_queue_stats_cb *cb, void *aux)
3281{
3282 struct netdev_queue_stats stats;
3283 unsigned int handle, major, minor;
3284 int error;
3285
3286 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3287 if (error) {
3288 return error;
3289 }
3290
3291 major = tc_get_major(handle);
3292 minor = tc_get_minor(handle);
3293 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3294 (*cb)(minor - 1, &stats, aux);
3295 }
3296 return 0;
3297}
3298
3299static const struct tc_ops tc_ops_hfsc = {
3300 "hfsc", /* linux_name */
3301 "linux-hfsc", /* ovs_name */
3302 HFSC_N_QUEUES, /* n_queues */
3303 hfsc_tc_install, /* tc_install */
3304 hfsc_tc_load, /* tc_load */
3305 hfsc_tc_destroy, /* tc_destroy */
3306 hfsc_qdisc_get, /* qdisc_get */
3307 hfsc_qdisc_set, /* qdisc_set */
3308 hfsc_class_get, /* class_get */
3309 hfsc_class_set, /* class_set */
3310 hfsc_class_delete, /* class_delete */
3311 hfsc_class_get_stats, /* class_get_stats */
3312 hfsc_class_dump_stats /* class_dump_stats */
3313};
3314\f
c1c9c9c4
BP
3315/* "linux-default" traffic control class.
3316 *
3317 * This class represents the default, unnamed Linux qdisc. It corresponds to
3318 * the "" (empty string) QoS type in the OVS database. */
3319
3320static void
3321default_install__(struct netdev *netdev)
3322{
3323 struct netdev_dev_linux *netdev_dev =
3324 netdev_dev_linux_cast(netdev_get_dev(netdev));
3325 static struct tc *tc;
3326
3327 if (!tc) {
3328 tc = xmalloc(sizeof *tc);
3329 tc_init(tc, &tc_ops_default);
3330 }
3331 netdev_dev->tc = tc;
3332}
3333
3334static int
3335default_tc_install(struct netdev *netdev,
3336 const struct shash *details OVS_UNUSED)
3337{
3338 default_install__(netdev);
3339 return 0;
3340}
3341
3342static int
3343default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3344{
3345 default_install__(netdev);
3346 return 0;
3347}
3348
3349static const struct tc_ops tc_ops_default = {
3350 NULL, /* linux_name */
3351 "", /* ovs_name */
3352 0, /* n_queues */
3353 default_tc_install,
3354 default_tc_load,
3355 NULL, /* tc_destroy */
3356 NULL, /* qdisc_get */
3357 NULL, /* qdisc_set */
3358 NULL, /* class_get */
3359 NULL, /* class_set */
3360 NULL, /* class_delete */
3361 NULL, /* class_get_stats */
3362 NULL /* class_dump_stats */
3363};
3364\f
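/* "linux-other" traffic control class.
 *
 * This is the fallback class that tc_query_qdisc() selects when a device's
 * qdisc cannot be queried or parsed, or when its kind has no matching tc_ops.
 * It has no queues and cannot be installed, only loaded. */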
3368
3369static int
3370other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3371{
3372 struct netdev_dev_linux *netdev_dev =
3373 netdev_dev_linux_cast(netdev_get_dev(netdev));
3374 static struct tc *tc;
3375
3376 if (!tc) {
3377 tc = xmalloc(sizeof *tc);
3378 tc_init(tc, &tc_ops_other);
3379 }
3380 netdev_dev->tc = tc;
3381 return 0;
3382}
3383
3384static const struct tc_ops tc_ops_other = {
3385 NULL, /* linux_name */
3386 "linux-other", /* ovs_name */
3387 0, /* n_queues */
3388 NULL, /* tc_install */
3389 other_tc_load,
3390 NULL, /* tc_destroy */
3391 NULL, /* qdisc_get */
3392 NULL, /* qdisc_set */
3393 NULL, /* class_get */
3394 NULL, /* class_set */
3395 NULL, /* class_delete */
3396 NULL, /* class_get_stats */
3397 NULL /* class_dump_stats */
3398};
3399\f
3400/* Traffic control. */
3401
3402/* Number of kernel "tc" ticks per second. */
3403static double ticks_per_s;
3404
3405/* Number of kernel "jiffies" per second. This is used for the purpose of
3406 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3407 * one jiffy's worth of data.
3408 *
3409 * There are two possibilities here:
3410 *
3411 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3412 * approximate range of 100 to 1024. That means that we really need to
3413 * make sure that the qdisc can buffer that much data.
3414 *
3415 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3416 * has finely granular timers and there's no need to fudge additional room
3417 * for buffers. (There's no extra effort needed to implement that: the
3418 * large 'buffer_hz' is used as a divisor, so practically any number will
3419 * come out as 0 in the division. Small integer results in the case of
3420 * really high dividends won't have any real effect anyhow.)
3421 */
3422static unsigned int buffer_hz;
3423
/* Combines 'major' and 'minor' into the tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
3430
/* Extracts the major number from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3437
/* Extracts the minor number from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
3444
3445static struct tcmsg *
3446tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3447 struct ofpbuf *request)
3448{
3449 struct tcmsg *tcmsg;
3450 int ifindex;
3451 int error;
3452
3453 error = get_ifindex(netdev, &ifindex);
3454 if (error) {
3455 return NULL;
3456 }
3457
3458 ofpbuf_init(request, 512);
3459 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3460 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3461 tcmsg->tcm_family = AF_UNSPEC;
3462 tcmsg->tcm_ifindex = ifindex;
3463 /* Caller should fill in tcmsg->tcm_handle. */
3464 /* Caller should fill in tcmsg->tcm_parent. */
3465
3466 return tcmsg;
3467}
3468
3469static int
3470tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3471{
3472 int error = nl_sock_transact(rtnl_sock, request, replyp);
3473 ofpbuf_uninit(request);
3474 return error;
3475}
3476
3477static void
3478read_psched(void)
3479{
3480 /* The values in psched are not individually very meaningful, but they are
3481 * important. The tables below show some values seen in the wild.
3482 *
3483 * Some notes:
3484 *
3485 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3486 * (Before that, there are hints that it was 1000000000.)
3487 *
3488 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3489 * above.
3490 *
3491 * /proc/net/psched
3492 * -----------------------------------
3493 * [1] 000c8000 000f4240 000f4240 00000064
3494 * [2] 000003e8 00000400 000f4240 3b9aca00
3495 * [3] 000003e8 00000400 000f4240 3b9aca00
3496 * [4] 000003e8 00000400 000f4240 00000064
3497 * [5] 000003e8 00000040 000f4240 3b9aca00
3498 * [6] 000003e8 00000040 000f4240 000000f9
3499 *
3500 * a b c d ticks_per_s buffer_hz
3501 * ------- --------- ---------- ------------- ----------- -------------
3502 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3503 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3504 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3505 * [4] 1,000 1,024 1,000,000 100 976,562 100
3506 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3507 * [6] 1,000 64 1,000,000 249 15,625,000 249
3508 *
3509 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3510 * [2] 2.6.26-1-686-bigmem from Debian lenny
3511 * [3] 2.6.26-2-sparc64 from Debian lenny
3512 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3513 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3514 * [6] 2.6.34 from kernel.org on KVM
3515 */
3516 static const char fn[] = "/proc/net/psched";
3517 unsigned int a, b, c, d;
3518 FILE *stream;
3519
3520 ticks_per_s = 1.0;
3521 buffer_hz = 100;
3522
3523 stream = fopen(fn, "r");
3524 if (!stream) {
3525 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3526 return;
3527 }
3528
3529 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3530 VLOG_WARN("%s: read failed", fn);
3531 fclose(stream);
3532 return;
3533 }
3534 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3535 fclose(stream);
3536
3537 if (!a || !c) {
3538 VLOG_WARN("%s: invalid scheduler parameters", fn);
3539 return;
3540 }
3541
3542 ticks_per_s = (double) a * c / b;
3543 if (c == 1000000) {
3544 buffer_hz = d;
3545 } else {
3546 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3547 fn, a, b, c, d);
3548 }
3549 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3550}
3551
3552/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3553 * rate of 'rate' bytes per second. */
3554static unsigned int
3555tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3556{
3557 if (!buffer_hz) {
3558 read_psched();
3559 }
3560 return (rate * ticks) / ticks_per_s;
3561}
3562
3563/* Returns the number of ticks that it would take to transmit 'size' bytes at a
3564 * rate of 'rate' bytes per second. */
3565static unsigned int
3566tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3567{
3568 if (!buffer_hz) {
3569 read_psched();
3570 }
015c93a4 3571 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
3572}
3573
3574/* Returns the number of bytes that need to be reserved for qdisc buffering at
3575 * a transmission rate of 'rate' bytes per second. */
3576static unsigned int
3577tc_buffer_per_jiffy(unsigned int rate)
3578{
3579 if (!buffer_hz) {
3580 read_psched();
3581 }
3582 return rate / buffer_hz;
3583}
3584
3585/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3586 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3587 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3588 * stores NULL into it if it is absent.
3589 *
3590 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3591 * 'msg'.
3592 *
3593 * Returns 0 if successful, otherwise a positive errno value. */
3594static int
3595tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3596 struct nlattr **options)
3597{
3598 static const struct nl_policy tca_policy[] = {
3599 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3600 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3601 };
3602 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3603
3604 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3605 tca_policy, ta, ARRAY_SIZE(ta))) {
3606 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3607 goto error;
3608 }
3609
3610 if (kind) {
3611 *kind = nl_attr_get_string(ta[TCA_KIND]);
3612 }
3613
3614 if (options) {
3615 *options = ta[TCA_OPTIONS];
3616 }
3617
3618 return 0;
3619
3620error:
3621 if (kind) {
3622 *kind = NULL;
3623 }
3624 if (options) {
3625 *options = NULL;
3626 }
3627 return EPROTO;
3628}
3629
3630/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3631 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3632 * into '*options', and its queue statistics into '*stats'. Any of the output
3633 * arguments may be null.
3634 *
3635 * Returns 0 if successful, otherwise a positive errno value. */
3636static int
3637tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3638 struct nlattr **options, struct netdev_queue_stats *stats)
3639{
3640 static const struct nl_policy tca_policy[] = {
3641 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3642 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3643 };
3644 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3645
3646 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3647 tca_policy, ta, ARRAY_SIZE(ta))) {
3648 VLOG_WARN_RL(&rl, "failed to parse class message");
3649 goto error;
3650 }
3651
3652 if (handlep) {
3653 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3654 *handlep = tc->tcm_handle;
3655 }
3656
3657 if (options) {
3658 *options = ta[TCA_OPTIONS];
3659 }
3660
3661 if (stats) {
3662 const struct gnet_stats_queue *gsq;
3663 struct gnet_stats_basic gsb;
3664
3665 static const struct nl_policy stats_policy[] = {
3666 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3667 .min_len = sizeof gsb },
3668 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3669 .min_len = sizeof *gsq },
3670 };
3671 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3672
3673 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3674 sa, ARRAY_SIZE(sa))) {
3675 VLOG_WARN_RL(&rl, "failed to parse class stats");
3676 goto error;
3677 }
3678
3679 /* Alignment issues screw up the length of struct gnet_stats_basic on
3680 * some arch/bitsize combinations. Newer versions of Linux have a
3681 * struct gnet_stats_basic_packed, but we can't depend on that. The
3682 * easiest thing to do is just to make a copy. */
3683 memset(&gsb, 0, sizeof gsb);
3684 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3685 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3686 stats->tx_bytes = gsb.bytes;
3687 stats->tx_packets = gsb.packets;
3688
3689 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3690 stats->tx_errors = gsq->drops;
3691 }
3692
3693 return 0;
3694
3695error:
3696 if (options) {
3697 *options = NULL;
3698 }
3699 if (stats) {
3700 memset(stats, 0, sizeof *stats);
3701 }
3702 return EPROTO;
3703}
3704
3705/* Queries the kernel for class with identifier 'handle' and parent 'parent'
3706 * on 'netdev'. */
3707static int
3708tc_query_class(const struct netdev *netdev,
3709 unsigned int handle, unsigned int parent,
3710 struct ofpbuf **replyp)
3711{
3712 struct ofpbuf request;
3713 struct tcmsg *tcmsg;
3714 int error;
3715
3716 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
3717 if (!tcmsg) {
3718 return ENODEV;
3719 }
c1c9c9c4
BP
3720 tcmsg->tcm_handle = handle;
3721 tcmsg->tcm_parent = parent;
3722
3723 error = tc_transact(&request, replyp);
3724 if (error) {
3725 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3726 netdev_get_name(netdev),
3727 tc_get_major(handle), tc_get_minor(handle),
3728 tc_get_major(parent), tc_get_minor(parent),
3729 strerror(error));
3730 }
3731 return error;
3732}
3733
3734/* Equivalent to "tc class del dev <name> handle <handle>". */
3735static int
3736tc_delete_class(const struct netdev *netdev, unsigned int handle)
3737{
3738 struct ofpbuf request;
3739 struct tcmsg *tcmsg;
3740 int error;
3741
3742 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
3743 if (!tcmsg) {
3744 return ENODEV;
3745 }
c1c9c9c4
BP
3746 tcmsg->tcm_handle = handle;
3747 tcmsg->tcm_parent = 0;
3748
3749 error = tc_transact(&request, NULL);
3750 if (error) {
3751 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3752 netdev_get_name(netdev),
3753 tc_get_major(handle), tc_get_minor(handle),
3754 strerror(error));
3755 }
3756 return error;
3757}
3758
3759/* Equivalent to "tc qdisc del dev <name> root". */
3760static int
3761tc_del_qdisc(struct netdev *netdev)
3762{
3763 struct netdev_dev_linux *netdev_dev =
3764 netdev_dev_linux_cast(netdev_get_dev(netdev));
3765 struct ofpbuf request;
3766 struct tcmsg *tcmsg;
3767 int error;
3768
3769 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
3770 if (!tcmsg) {
3771 return ENODEV;
3772 }
c1c9c9c4
BP
3773 tcmsg->tcm_handle = tc_make_handle(1, 0);
3774 tcmsg->tcm_parent = TC_H_ROOT;
3775
3776 error = tc_transact(&request, NULL);
3777 if (error == EINVAL) {
3778 /* EINVAL probably means that the default qdisc was in use, in which
3779 * case we've accomplished our purpose. */
3780 error = 0;
3781 }
3782 if (!error && netdev_dev->tc) {
3783 if (netdev_dev->tc->ops->tc_destroy) {
3784 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3785 }
3786 netdev_dev->tc = NULL;
3787 }
3788 return error;
3789}
3790
3791/* If 'netdev''s qdisc type and parameters are not yet known, queries the
3792 * kernel to determine what they are. Returns 0 if successful, otherwise a
3793 * positive errno value. */
3794static int
3795tc_query_qdisc(const struct netdev *netdev)
3796{
3797 struct netdev_dev_linux *netdev_dev =
3798 netdev_dev_linux_cast(netdev_get_dev(netdev));
3799 struct ofpbuf request, *qdisc;
3800 const struct tc_ops *ops;
3801 struct tcmsg *tcmsg;
3802 int load_error;
3803 int error;
3804
3805 if (netdev_dev->tc) {
3806 return 0;
3807 }
3808
3809 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3810 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3811 * 2.6.35 without that fix backported to it.
3812 *
3813 * To avoid the OOPS, we must not make a request that would attempt to dump
3814 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3815 * few others. There are a few ways that I can see to do this, but most of
3816 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3817 * technique chosen here is to assume that any non-default qdisc that we
3818 * create will have a class with handle 1:0. The built-in qdiscs only have
3819 * a class with handle 0:0.
3820 *
3821 * We could check for Linux 2.6.35+ and use a more straightforward method
3822 * there. */
3823 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
3824 if (!tcmsg) {
3825 return ENODEV;
3826 }
c1c9c9c4
BP
3827 tcmsg->tcm_handle = tc_make_handle(1, 0);
3828 tcmsg->tcm_parent = 0;
3829
3830 /* Figure out what tc class to instantiate. */
3831 error = tc_transact(&request, &qdisc);
3832 if (!error) {
3833 const char *kind;
3834
3835 error = tc_parse_qdisc(qdisc, &kind, NULL);
3836 if (error) {
3837 ops = &tc_ops_other;
3838 } else {
3839 ops = tc_lookup_linux_name(kind);
3840 if (!ops) {
3841 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3842 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3843
3844 ops = &tc_ops_other;
3845 }
3846 }
3847 } else if (error == ENOENT) {
3848 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3849 * other entity that doesn't have a handle 1:0. We will assume
3850 * that it's the system default qdisc. */
3851 ops = &tc_ops_default;
3852 error = 0;
3853 } else {
3854 /* Who knows? Maybe the device got deleted. */
3855 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3856 netdev_get_name(netdev), strerror(error));
3857 ops = &tc_ops_other;
3858 }
3859
3860 /* Instantiate it. */
3861 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3862 assert((load_error == 0) == (netdev_dev->tc != NULL));
3863 ofpbuf_delete(qdisc);
3864
3865 return error ? error : load_error;
3866}
3867
3868/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3869 approximate the time to transmit packets of various lengths. For an MTU of
3870 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3871 represents two possible packet lengths; for a MTU of 513 through 1024, four
3872 possible lengths; and so on.
3873
3874 Returns, for the specified 'mtu', the number of bits that packet lengths
3875 need to be shifted right to fit within such a 256-entry table. */
3876static int
3877tc_calc_cell_log(unsigned int mtu)
3878{
3879 int cell_log;
3880
3881 if (!mtu) {
3882 mtu = ETH_PAYLOAD_MAX;
3883 }
3884 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3885
3886 for (cell_log = 0; mtu >= 256; cell_log++) {
3887 mtu >>= 1;
3888 }
3889
3890 return cell_log;
3891}
3892
3893/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3894 * of 'mtu'. */
3895static void
3896tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3897{
3898 memset(rate, 0, sizeof *rate);
3899 rate->cell_log = tc_calc_cell_log(mtu);
3900 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3901 /* rate->cell_align = 0; */ /* distro headers. */
3902 rate->mpu = ETH_TOTAL_MIN;
3903 rate->rate = Bps;
3904}
3905
3906/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3907 * attribute of the specified "type".
3908 *
3909 * See tc_calc_cell_log() above for a description of "rtab"s. */
3910static void
3911tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3912{
3913 uint32_t *rtab;
3914 unsigned int i;
3915
3916 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3917 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3918 unsigned packet_size = (i + 1) << rate->cell_log;
3919 if (packet_size < rate->mpu) {
3920 packet_size = rate->mpu;
3921 }
3922 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3923 }
3924}
3925
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes;

    floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, floor_bytes));
}
d3980822
BP
3936\f
3937/* Public utility functions. */
3938
3939#define COPY_NETDEV_STATS \
3940 dst->rx_packets = src->rx_packets; \
3941 dst->tx_packets = src->tx_packets; \
3942 dst->rx_bytes = src->rx_bytes; \
3943 dst->tx_bytes = src->tx_bytes; \
3944 dst->rx_errors = src->rx_errors; \
3945 dst->tx_errors = src->tx_errors; \
3946 dst->rx_dropped = src->rx_dropped; \
3947 dst->tx_dropped = src->tx_dropped; \
3948 dst->multicast = src->multicast; \
3949 dst->collisions = src->collisions; \
3950 dst->rx_length_errors = src->rx_length_errors; \
3951 dst->rx_over_errors = src->rx_over_errors; \
3952 dst->rx_crc_errors = src->rx_crc_errors; \
3953 dst->rx_frame_errors = src->rx_frame_errors; \
3954 dst->rx_fifo_errors = src->rx_fifo_errors; \
3955 dst->rx_missed_errors = src->rx_missed_errors; \
3956 dst->tx_aborted_errors = src->tx_aborted_errors; \
3957 dst->tx_carrier_errors = src->tx_carrier_errors; \
3958 dst->tx_fifo_errors = src->tx_fifo_errors; \
3959 dst->tx_heartbeat_errors = src->tx_heartbeat_errors; \
3960 dst->tx_window_errors = src->tx_window_errors
3961
3962/* Copies 'src' into 'dst', performing format conversion in the process. */
3963void
3964netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
3965 const struct rtnl_link_stats *src)
3966{
3967 COPY_NETDEV_STATS;
3968}
3969
3970/* Copies 'src' into 'dst', performing format conversion in the process. */
3971void
3972netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
3973 const struct rtnl_link_stats64 *src)
3974{
3975 COPY_NETDEV_STATS;
3976}
3977
3978/* Copies 'src' into 'dst', performing format conversion in the process. */
3979void
3980netdev_stats_to_rtnl_link_stats64(struct rtnl_link_stats64 *dst,
3981 const struct netdev_stats *src)
3982{
3983 COPY_NETDEV_STATS;
7afa4f1d
BP
3984 dst->rx_compressed = 0;
3985 dst->tx_compressed = 0;
d3980822 3986}
c1c9c9c4
BP
3987\f
3988/* Utility functions. */
3989
3990static int
3991get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3992{
3993 /* Policy for RTNLGRP_LINK messages.
3994 *
3995 * There are *many* more fields in these messages, but currently we only
3996 * care about these fields. */
3997 static const struct nl_policy rtnlgrp_link_policy[] = {
3998 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3999 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4000 .min_len = sizeof(struct rtnl_link_stats) },
4001 };
4002
4003 struct ofpbuf request;
4004 struct ofpbuf *reply;
4005 struct ifinfomsg *ifi;
c1c9c9c4
BP
4006 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4007 int error;
4008
4009 ofpbuf_init(&request, 0);
4010 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4011 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4012 ifi->ifi_family = PF_UNSPEC;
4013 ifi->ifi_index = ifindex;
4014 error = nl_sock_transact(rtnl_sock, &request, &reply);
4015 ofpbuf_uninit(&request);
4016 if (error) {
4017 return error;
4018 }
4019
4020 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4021 rtnlgrp_link_policy,
4022 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4023 ofpbuf_delete(reply);
4024 return EPROTO;
4025 }
4026
4027 if (!attrs[IFLA_STATS]) {
4028 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4029 ofpbuf_delete(reply);
4030 return EPROTO;
4031 }
8b61709d 4032
d3980822 4033 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
8b61709d 4034
576e26d7
BP
4035 ofpbuf_delete(reply);
4036
8b61709d
BP
4037 return 0;
4038}
4039
4040static int
4041get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4042{
4043 static const char fn[] = "/proc/net/dev";
4044 char line[1024];
4045 FILE *stream;
4046 int ln;
4047
4048 stream = fopen(fn, "r");
4049 if (!stream) {
4050 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4051 return errno;
4052 }
4053
4054 ln = 0;
4055 while (fgets(line, sizeof line, stream)) {
4056 if (++ln >= 3) {
4057 char devname[16];
4058#define X64 "%"SCNu64
4059 if (sscanf(line,
4060 " %15[^:]:"
4061 X64 X64 X64 X64 X64 X64 X64 "%*u"
4062 X64 X64 X64 X64 X64 X64 X64 "%*u",
4063 devname,
4064 &stats->rx_bytes,
4065 &stats->rx_packets,
4066 &stats->rx_errors,
4067 &stats->rx_dropped,
4068 &stats->rx_fifo_errors,
4069 &stats->rx_frame_errors,
4070 &stats->multicast,
4071 &stats->tx_bytes,
4072 &stats->tx_packets,
4073 &stats->tx_errors,
4074 &stats->tx_dropped,
4075 &stats->tx_fifo_errors,
4076 &stats->collisions,
4077 &stats->tx_carrier_errors) != 15) {
4078 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4079 } else if (!strcmp(devname, netdev_name)) {
4080 stats->rx_length_errors = UINT64_MAX;
4081 stats->rx_over_errors = UINT64_MAX;
4082 stats->rx_crc_errors = UINT64_MAX;
4083 stats->rx_missed_errors = UINT64_MAX;
4084 stats->tx_aborted_errors = UINT64_MAX;
4085 stats->tx_heartbeat_errors = UINT64_MAX;
4086 stats->tx_window_errors = UINT64_MAX;
4087 fclose(stream);
4088 return 0;
4089 }
4090 }
4091 }
4092 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4093 fclose(stream);
4094 return ENODEV;
4095}
c1c9c9c4 4096
8b61709d
BP
4097static int
4098get_flags(const struct netdev *netdev, int *flags)
4099{
4100 struct ifreq ifr;
4101 int error;
4102
149f577a
JG
4103 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4104 "SIOCGIFFLAGS");
8b61709d
BP
4105 *flags = ifr.ifr_flags;
4106 return error;
4107}
4108
4109static int
4110set_flags(struct netdev *netdev, int flags)
4111{
4112 struct ifreq ifr;
4113
4114 ifr.ifr_flags = flags;
149f577a
JG
4115 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4116 "SIOCSIFFLAGS");
8b61709d
BP
4117}
4118
4119static int
4120do_get_ifindex(const char *netdev_name)
4121{
4122 struct ifreq ifr;
4123
71d7c22f 4124 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4125 COVERAGE_INC(netdev_get_ifindex);
4126 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4127 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4128 netdev_name, strerror(errno));
4129 return -errno;
4130 }
4131 return ifr.ifr_ifindex;
4132}
4133
4134static int
4135get_ifindex(const struct netdev *netdev_, int *ifindexp)
4136{
149f577a
JG
4137 struct netdev_dev_linux *netdev_dev =
4138 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 4139 *ifindexp = 0;
149f577a 4140 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
8b61709d
BP
4141 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4142 if (ifindex < 0) {
4143 return -ifindex;
4144 }
149f577a
JG
4145 netdev_dev->cache_valid |= VALID_IFINDEX;
4146 netdev_dev->ifindex = ifindex;
8b61709d 4147 }
149f577a 4148 *ifindexp = netdev_dev->ifindex;
8b61709d
BP
4149 return 0;
4150}
4151
4152static int
4153get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4154{
4155 struct ifreq ifr;
4156 int hwaddr_family;
4157
4158 memset(&ifr, 0, sizeof ifr);
71d7c22f 4159 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4160 COVERAGE_INC(netdev_get_hwaddr);
4161 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
78857dfb
BP
4162 /* ENODEV probably means that a vif disappeared asynchronously and
4163 * hasn't been removed from the database yet, so reduce the log level
4164 * to INFO for that case. */
4165 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4166 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4167 netdev_name, strerror(errno));
8b61709d
BP
4168 return errno;
4169 }
4170 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4171 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4172 VLOG_WARN("%s device has unknown hardware address family %d",
4173 netdev_name, hwaddr_family);
4174 }
4175 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4176 return 0;
4177}
4178
4179static int
4180set_etheraddr(const char *netdev_name, int hwaddr_family,
4181 const uint8_t mac[ETH_ADDR_LEN])
4182{
4183 struct ifreq ifr;
4184
4185 memset(&ifr, 0, sizeof ifr);
71d7c22f 4186 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4187 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4188 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4189 COVERAGE_INC(netdev_set_hwaddr);
4190 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4191 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4192 netdev_name, strerror(errno));
4193 return errno;
4194 }
4195 return 0;
4196}
4197
4198static int
0b0544d7 4199netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
4200 int cmd, const char *cmd_name)
4201{
4202 struct ifreq ifr;
4203
4204 memset(&ifr, 0, sizeof ifr);
71d7c22f 4205 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
4206 ifr.ifr_data = (caddr_t) ecmd;
4207
4208 ecmd->cmd = cmd;
4209 COVERAGE_INC(netdev_ethtool);
4210 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4211 return 0;
4212 } else {
4213 if (errno != EOPNOTSUPP) {
4214 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
0b0544d7 4215 "failed: %s", cmd_name, name, strerror(errno));
8b61709d
BP
4216 } else {
4217 /* The device doesn't support this operation. That's pretty
4218 * common, so there's no point in logging anything. */
4219 }
4220 return errno;
4221 }
4222}
4223
4224static int
149f577a
JG
4225netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4226 const char *cmd_name)
8b61709d 4227{
71d7c22f 4228 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
8b61709d 4229 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
149f577a
JG
4230 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4231 strerror(errno));
8b61709d
BP
4232 return errno;
4233 }
4234 return 0;
4235}
f1acd62b
BP
4236
4237static int
4238netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4239 int cmd, const char *cmd_name)
4240{
4241 struct ifreq ifr;
4242 int error;
4243
4244 ifr.ifr_addr.sa_family = AF_INET;
149f577a 4245 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b
BP
4246 if (!error) {
4247 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4248 *ip = sin->sin_addr;
4249 }
4250 return error;
4251}
488d734d
BP
4252
/* Returns an AF_PACKET raw socket or a negative errno value.
 *
 * The socket is created on the first call only.  The outcome of that attempt,
 * whether a file descriptor or a negative errno value, is kept in a static
 * variable and returned unchanged by every later call. */
static int
af_packet_sock(void)
{
    /* INT_MIN means that creation has not been attempted yet. */
    static int sock = INT_MIN;

    if (sock != INT_MIN) {
        return sock;
    }

    sock = socket(AF_PACKET, SOCK_RAW, 0);
    if (sock < 0) {
        sock = -errno;
        VLOG_ERR("failed to create packet socket: %s", strerror(errno));
    } else {
        set_nonblocking(sock);
    }
    return sock;
}