]> git.proxmox.com Git - mirror_ovs.git/blame - lib/netdev-linux.c
ofproto: Warn about excessive rule counts in OpenFlow tables.
[mirror_ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
55bc98d6 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d 22#include <fcntl.h>
55bc98d6 23#include <arpa/inet.h>
8b61709d 24#include <inttypes.h>
32383c3b 25#include <linux/filter.h>
c1c9c9c4 26#include <linux/gen_stats.h>
bb7d0e22 27#include <linux/if_ether.h>
8b61709d
BP
28#include <linux/if_tun.h>
29#include <linux/types.h>
30#include <linux/ethtool.h>
63331829 31#include <linux/mii.h>
f8500004 32#include <linux/pkt_cls.h>
6f42c8ea 33#include <linux/pkt_sched.h>
e9e28be3 34#include <linux/rtnetlink.h>
8b61709d 35#include <linux/sockios.h>
8b61709d
BP
36#include <sys/types.h>
37#include <sys/ioctl.h>
38#include <sys/socket.h>
55bc98d6 39#include <netpacket/packet.h>
8b61709d
BP
40#include <net/if.h>
41#include <net/if_arp.h>
55bc98d6 42#include <net/if_packet.h>
8b61709d
BP
43#include <net/route.h>
44#include <netinet/in.h>
e9e28be3 45#include <poll.h>
8b61709d
BP
46#include <stdlib.h>
47#include <string.h>
48#include <unistd.h>
e9e28be3
BP
49
50#include "coverage.h"
9fe3b9a2 51#include "dpif-linux.h"
df1e5a3b 52#include "dpif-netdev.h"
8b61709d
BP
53#include "dynamic-string.h"
54#include "fatal-signal.h"
93b13be8
BP
55#include "hash.h"
56#include "hmap.h"
8b61709d 57#include "netdev-provider.h"
7fbef77a 58#include "netdev-vport.h"
45c8d3a1 59#include "netlink-notifier.h"
2fe27d5a 60#include "netlink-socket.h"
c060c4cf 61#include "netlink.h"
e9e28be3 62#include "ofpbuf.h"
8b61709d 63#include "openflow/openflow.h"
19c8e9c1 64#include "ovs-atomic.h"
91088554 65#include "packet-dpif.h"
8b61709d
BP
66#include "packets.h"
67#include "poll-loop.h"
21d6e22e 68#include "rtnetlink-link.h"
8b61709d 69#include "shash.h"
c060c4cf 70#include "socket-util.h"
19993ef3 71#include "sset.h"
1670c579 72#include "timer.h"
c060c4cf 73#include "unaligned.h"
e9e28be3 74#include "vlog.h"
5136ce49 75
d98e6007 76VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea 77
d76f09ea
BP
78COVERAGE_DEFINE(netdev_set_policing);
79COVERAGE_DEFINE(netdev_arp_lookup);
80COVERAGE_DEFINE(netdev_get_ifindex);
81COVERAGE_DEFINE(netdev_get_hwaddr);
82COVERAGE_DEFINE(netdev_set_hwaddr);
ab985a77
BP
83COVERAGE_DEFINE(netdev_get_ethtool);
84COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 85
8b61709d
BP
86\f
87/* These were introduced in Linux 2.6.14, so they might be missing if we have
88 * old headers. */
89#ifndef ADVERTISED_Pause
90#define ADVERTISED_Pause (1 << 13)
91#endif
92#ifndef ADVERTISED_Asym_Pause
93#define ADVERTISED_Asym_Pause (1 << 14)
94#endif
95
e47bd51a
JP
96/* These were introduced in Linux 2.6.24, so they might be missing if we
97 * have old headers. */
98#ifndef ETHTOOL_GFLAGS
99#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
100#endif
101#ifndef ETHTOOL_SFLAGS
102#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
103#endif
104
c1c9c9c4
BP
105/* This was introduced in Linux 2.6.25, so it might be missing if we have old
106 * headers. */
107#ifndef TC_RTAB_SIZE
108#define TC_RTAB_SIZE 1024
109#endif
110
b73c8518
SH
111/* Linux 2.6.21 introduced struct tpacket_auxdata.
112 * Linux 2.6.27 added the tp_vlan_tci member.
113 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
114 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
115 * TP_STATUS_VLAN_TPID_VALID.
116 *
117 * With all this churn it's easiest to unconditionally define a replacement
118 * structure that has everything we want.
119 */
55bc98d6
BP
120#ifndef PACKET_AUXDATA
121#define PACKET_AUXDATA 8
122#endif
b73c8518
SH
123#ifndef TP_STATUS_VLAN_VALID
124#define TP_STATUS_VLAN_VALID (1 << 4)
125#endif
126#ifndef TP_STATUS_VLAN_TPID_VALID
127#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
128#endif
/* Local replacement for struct tpacket_auxdata.  The macro below renames the
 * tag to rpl_tpacket_auxdata, so this definition is the one used by the rest
 * of the file regardless of what the system headers provide. */
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;         /* TP_STATUS_* bits. */
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;       /* Checked by auxdata_has_vlan_tci(). */
    uint16_t tp_vlan_tpid;      /* Used if TP_STATUS_VLAN_TPID_VALID is set. */
};
140
/* Bits for the 'cache_valid' member of struct netdev_linux.  Each bit says
 * that the corresponding cached member currently holds valid data. */
enum {
    VALID_IFINDEX          = 0x001,
    VALID_ETHERADDR        = 0x002,
    VALID_IN4              = 0x004,
    VALID_IN6              = 0x008,
    VALID_MTU              = 0x010,
    VALID_POLICING         = 0x020,
    VALID_VPORT_STAT_ERROR = 0x040,
    VALID_DRVINFO          = 0x080,
    VALID_FEATURES         = 0x100,
};
c1c9c9c4
BP
152\f
153/* Traffic control. */
154
155/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
156 * network device.
157 *
158 * Each TC implementation subclasses this with whatever additional data it
159 * needs. */
c1c9c9c4
BP
160struct tc {
161 const struct tc_ops *ops;
93b13be8
BP
162 struct hmap queues; /* Contains "struct tc_queue"s.
163 * Read by generic TC layer.
164 * Written only by TC implementation. */
165};
c1c9c9c4 166
559eb230
BP
167#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
168
93b13be8
BP
169/* One traffic control queue.
170 *
171 * Each TC implementation subclasses this with whatever additional data it
172 * needs. */
173struct tc_queue {
174 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
175 unsigned int queue_id; /* OpenFlow queue ID. */
6dc34a0d 176 long long int created; /* Time queue was created, in msecs. */
c1c9c9c4
BP
177};
178
179/* A particular kind of traffic control. Each implementation generally maps to
180 * one particular Linux qdisc class.
181 *
182 * The functions below return 0 if successful or a positive errno value on
183 * failure, except where otherwise noted. All of them must be provided, except
184 * where otherwise noted. */
185struct tc_ops {
186 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
187 * This is null for tc_ops_default and tc_ops_other, for which there are no
188 * appropriate values. */
189 const char *linux_name;
190
191 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
192 const char *ovs_name;
193
194 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
195 * queues. The queues are numbered 0 through n_queues - 1. */
196 unsigned int n_queues;
197
198 /* Called to install this TC class on 'netdev'. The implementation should
199 * make the Netlink calls required to set up 'netdev' with the right qdisc
200 * and configure it according to 'details'. The implementation may assume
201 * that the current qdisc is the default; that is, there is no need for it
202 * to delete the current qdisc before installing itself.
203 *
204 * The contents of 'details' should be documented as valid for 'ovs_name'
205 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
206 * (which is built as ovs-vswitchd.conf.db(8)).
207 *
208 * This function must return 0 if and only if it sets 'netdev->tc' to an
209 * initialized 'struct tc'.
210 *
211 * (This function is null for tc_ops_other, which cannot be installed. For
212 * other TC classes it should always be nonnull.) */
79f1cbe9 213 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
214
215 /* Called when the netdev code determines (through a Netlink query) that
216 * this TC class's qdisc is installed on 'netdev', but we didn't install
217 * it ourselves and so don't know any of the details.
218 *
219 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
220 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
221 * implementation should parse the other attributes of 'nlmsg' as
222 * necessary to determine its configuration. If necessary it should also
223 * use Netlink queries to determine the configuration of queues on
224 * 'netdev'.
225 *
226 * This function must return 0 if and only if it sets 'netdev->tc' to an
227 * initialized 'struct tc'. */
228 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
229
230 /* Destroys the data structures allocated by the implementation as part of
231 * 'tc'. (This includes destroying 'tc->queues' by calling
232 * tc_destroy(tc).)
233 *
234 * The implementation should not need to perform any Netlink calls. If
235 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
236 * (But it may not be desirable.)
237 *
238 * This function may be null if 'tc' is trivial. */
239 void (*tc_destroy)(struct tc *tc);
240
241 /* Retrieves details of 'netdev->tc' configuration into 'details'.
242 *
243 * The implementation should not need to perform any Netlink calls, because
244 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
245 * cached the configuration.
246 *
247 * The contents of 'details' should be documented as valid for 'ovs_name'
248 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
249 * (which is built as ovs-vswitchd.conf.db(8)).
250 *
251 * This function may be null if 'tc' is not configurable.
252 */
79f1cbe9 253 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
254
255 /* Reconfigures 'netdev->tc' according to 'details', performing any
256 * required Netlink calls to complete the reconfiguration.
257 *
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
260 * (which is built as ovs-vswitchd.conf.db(8)).
261 *
262 * This function may be null if 'tc' is not configurable.
263 */
79f1cbe9 264 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 265
93b13be8
BP
266 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
267 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
268 *
269 * The contents of 'details' should be documented as valid for 'ovs_name'
270 * in the "other_config" column in the "Queue" table in
271 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
272 *
273 * The implementation should not need to perform any Netlink calls, because
274 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
275 * cached the queue configuration.
276 *
277 * This function may be null if 'tc' does not have queues ('n_queues' is
278 * 0). */
93b13be8 279 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 280 struct smap *details);
c1c9c9c4
BP
281
282 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
283 * 'details', performing any required Netlink calls to complete the
284 * reconfiguration. The caller ensures that 'queue_id' is less than
285 * 'n_queues'.
286 *
287 * The contents of 'details' should be documented as valid for 'ovs_name'
288 * in the "other_config" column in the "Queue" table in
289 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
290 *
291 * This function may be null if 'tc' does not have queues or its queues are
292 * not configurable. */
293 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 294 const struct smap *details);
c1c9c9c4 295
93b13be8
BP
296 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
297 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
298 *
299 * This function may be null if 'tc' does not have queues or its queues
300 * cannot be deleted. */
93b13be8 301 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 302
93b13be8
BP
303 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
304 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
305 *
306 * On success, initializes '*stats'.
307 *
308 * This function may be null if 'tc' does not have queues or if it cannot
309 * report queue statistics. */
93b13be8
BP
310 int (*class_get_stats)(const struct netdev *netdev,
311 const struct tc_queue *queue,
c1c9c9c4
BP
312 struct netdev_queue_stats *stats);
313
314 /* Extracts queue stats from 'nlmsg', which is a response to a
315 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
316 *
317 * This function may be null if 'tc' does not have queues or if it cannot
318 * report queue statistics. */
319 int (*class_dump_stats)(const struct netdev *netdev,
320 const struct ofpbuf *nlmsg,
321 netdev_dump_queue_stats_cb *cb, void *aux);
322};
323
324static void
325tc_init(struct tc *tc, const struct tc_ops *ops)
326{
327 tc->ops = ops;
93b13be8 328 hmap_init(&tc->queues);
c1c9c9c4
BP
329}
330
331static void
332tc_destroy(struct tc *tc)
333{
93b13be8 334 hmap_destroy(&tc->queues);
c1c9c9c4
BP
335}
336
337static const struct tc_ops tc_ops_htb;
a339aa81 338static const struct tc_ops tc_ops_hfsc;
c1c9c9c4
BP
339static const struct tc_ops tc_ops_default;
340static const struct tc_ops tc_ops_other;
341
559eb230 342static const struct tc_ops *const tcs[] = {
c1c9c9c4 343 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 344 &tc_ops_hfsc, /* Hierarchical fair service curve. */
c1c9c9c4
BP
345 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
346 &tc_ops_other, /* Some other qdisc. */
347 NULL
348};
149f577a 349
c1c9c9c4
BP
350static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
351static unsigned int tc_get_major(unsigned int handle);
352static unsigned int tc_get_minor(unsigned int handle);
353
354static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
355static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
356static unsigned int tc_buffer_per_jiffy(unsigned int rate);
357
358static struct tcmsg *tc_make_request(const struct netdev *, int type,
359 unsigned int flags, struct ofpbuf *);
360static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
f8500004
JP
361static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
362static int tc_add_policer(struct netdev *netdev, int kbits_rate,
363 int kbits_burst);
c1c9c9c4
BP
364
365static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
366 struct nlattr **options);
367static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
368 struct nlattr **options,
369 struct netdev_queue_stats *);
370static int tc_query_class(const struct netdev *,
371 unsigned int handle, unsigned int parent,
372 struct ofpbuf **replyp);
373static int tc_delete_class(const struct netdev *, unsigned int handle);
374
375static int tc_del_qdisc(struct netdev *netdev);
376static int tc_query_qdisc(const struct netdev *netdev);
377
378static int tc_calc_cell_log(unsigned int mtu);
379static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
380static void tc_put_rtab(struct ofpbuf *, uint16_t type,
381 const struct tc_ratespec *rate);
382static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
383\f
b5d57fc8
BP
384struct netdev_linux {
385 struct netdev up;
149f577a 386
86383816
BP
387 /* Protects all members below. */
388 struct ovs_mutex mutex;
389
149f577a 390 unsigned int cache_valid;
8b61709d 391
1670c579
EJ
392 bool miimon; /* Link status of last poll. */
393 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
394 struct timer miimon_timer;
395
8722022c
BP
396 /* The following are figured out "on demand" only. They are only valid
397 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d
BP
398 int ifindex;
399 uint8_t etheraddr[ETH_ADDR_LEN];
f1acd62b 400 struct in_addr address, netmask;
8b61709d
BP
401 struct in6_addr in6;
402 int mtu;
059e5f4f 403 unsigned int ifi_flags;
65c3058c 404 long long int carrier_resets;
80a86fbe
BP
405 uint32_t kbits_rate; /* Policing data. */
406 uint32_t kbits_burst;
bba1e6f3
PS
407 int vport_stats_error; /* Cached error code from vport_get_stats().
408 0 or an errno value. */
90a6637d 409 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 410 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 411 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 412 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 413 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
51f87458 414
a00ca915
EJ
415 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
416 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
417 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
90a6637d 418
4f925bd3 419 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 420 struct tc *tc;
149f577a 421
d0d08f8a
BP
422 /* For devices of class netdev_tap_class only. */
423 int tap_fd;
8b61709d
BP
424};
425
f7791740
PS
426struct netdev_rxq_linux {
427 struct netdev_rxq up;
796223f5 428 bool is_tap;
5b7448ed 429 int fd;
149f577a 430};
8b61709d 431
8b61709d
BP
432/* This is set pretty low because we probably won't learn anything from the
433 * additional log messages. */
434static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
435
19c8e9c1
JS
436/* Polling miimon status for all ports causes performance degradation when
437 * handling a large number of ports. If there are no devices using miimon, then
812c272c
JR
438 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
439 *
440 * Readers do not depend on this variable synchronizing with the related
441 * changes in the device miimon status, so we can use atomic_count. */
442static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
19c8e9c1 443
259e0b1a 444static void netdev_linux_run(void);
6f643e49 445
0b0544d7 446static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 447 int cmd, const char *cmd_name);
f1acd62b
BP
448static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
449 int cmd, const char *cmd_name);
b5d57fc8 450static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 451static int set_flags(const char *, unsigned int flags);
4f9f3f21
BP
452static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
453 enum netdev_flags on, enum netdev_flags *old_flagsp)
454 OVS_REQUIRES(netdev->mutex);
8b61709d
BP
455static int do_get_ifindex(const char *netdev_name);
456static int get_ifindex(const struct netdev *, int *ifindexp);
457static int do_set_addr(struct netdev *netdev,
458 int ioctl_nr, const char *ioctl_name,
459 struct in_addr addr);
460static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
44445cac 461static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
35eef899 462static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
488d734d 463static int af_packet_sock(void);
19c8e9c1 464static bool netdev_linux_miimon_enabled(void);
1670c579
EJ
465static void netdev_linux_miimon_run(void);
466static void netdev_linux_miimon_wait(void);
df1e5a3b 467static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
8b61709d 468
15b3596a
JG
469static bool
470is_netdev_linux_class(const struct netdev_class *netdev_class)
471{
259e0b1a 472 return netdev_class->run == netdev_linux_run;
15b3596a
JG
473}
474
796223f5
BP
475static bool
476is_tap_netdev(const struct netdev *netdev)
477{
b5d57fc8 478 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577
JP
479}
480
8b61709d
BP
481static struct netdev_linux *
482netdev_linux_cast(const struct netdev *netdev)
483{
b5d57fc8 484 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
15b3596a 485
180c6d0b 486 return CONTAINER_OF(netdev, struct netdev_linux, up);
8b61709d 487}
796223f5 488
f7791740
PS
489static struct netdev_rxq_linux *
490netdev_rxq_linux_cast(const struct netdev_rxq *rx)
796223f5 491{
9dc63482 492 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
f7791740 493 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
796223f5 494}
ff4ed3c9 495\f
cee87338 496static void netdev_linux_update(struct netdev_linux *netdev,
86383816
BP
497 const struct rtnetlink_link_change *)
498 OVS_REQUIRES(netdev->mutex);
cee87338 499static void netdev_linux_changed(struct netdev_linux *netdev,
86383816
BP
500 unsigned int ifi_flags, unsigned int mask)
501 OVS_REQUIRES(netdev->mutex);
cee87338
BP
502
503/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
504 * if no such socket could be created. */
505static struct nl_sock *
506netdev_linux_notify_sock(void)
507{
508 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
509 static struct nl_sock *sock;
510
511 if (ovsthread_once_start(&once)) {
512 int error;
513
514 error = nl_sock_create(NETLINK_ROUTE, &sock);
515 if (!error) {
516 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
517 if (error) {
518 nl_sock_destroy(sock);
519 sock = NULL;
520 }
521 }
522 ovsthread_once_done(&once);
523 }
524
525 return sock;
526}
527
19c8e9c1
JS
528static bool
529netdev_linux_miimon_enabled(void)
530{
812c272c 531 return atomic_count_get(&miimon_cnt) > 0;
19c8e9c1
JS
532}
533
8b61709d
BP
534static void
535netdev_linux_run(void)
536{
cee87338
BP
537 struct nl_sock *sock;
538 int error;
539
19c8e9c1
JS
540 if (netdev_linux_miimon_enabled()) {
541 netdev_linux_miimon_run();
542 }
cee87338
BP
543
544 sock = netdev_linux_notify_sock();
545 if (!sock) {
546 return;
547 }
548
549 do {
550 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
551 uint64_t buf_stub[4096 / 8];
552 struct ofpbuf buf;
553
554 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
555 error = nl_sock_recv(sock, &buf, false);
556 if (!error) {
557 struct rtnetlink_link_change change;
558
559 if (rtnetlink_link_parse(&buf, &change)) {
560 struct netdev *netdev_ = netdev_from_name(change.ifname);
561 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
562 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
563
564 ovs_mutex_lock(&netdev->mutex);
cee87338 565 netdev_linux_update(netdev, &change);
86383816 566 ovs_mutex_unlock(&netdev->mutex);
cee87338 567 }
38e0065b 568 netdev_close(netdev_);
cee87338
BP
569 }
570 } else if (error == ENOBUFS) {
571 struct shash device_shash;
572 struct shash_node *node;
573
574 nl_sock_drain(sock);
575
576 shash_init(&device_shash);
577 netdev_get_devices(&netdev_linux_class, &device_shash);
578 SHASH_FOR_EACH (node, &device_shash) {
579 struct netdev *netdev_ = node->data;
580 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
581 unsigned int flags;
582
86383816 583 ovs_mutex_lock(&netdev->mutex);
cee87338
BP
584 get_flags(netdev_, &flags);
585 netdev_linux_changed(netdev, flags, 0);
86383816
BP
586 ovs_mutex_unlock(&netdev->mutex);
587
cee87338
BP
588 netdev_close(netdev_);
589 }
590 shash_destroy(&device_shash);
591 } else if (error != EAGAIN) {
592 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
593 ovs_strerror(error));
594 }
595 ofpbuf_uninit(&buf);
596 } while (!error);
8b61709d
BP
597}
598
/* Arranges for the poll loop to wake up when netdev_linux_run() has work to
 * do: lets miimon register its wait if any device uses it, and waits for the
 * notification socket (if there is one) to become readable. */
static void
netdev_linux_wait(void)
{
    struct nl_sock *notify_sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    }

    notify_sock = netdev_linux_notify_sock();
    if (!notify_sock) {
        return;
    }
    nl_sock_wait(notify_sock, POLLIN);
}
612
ac4d3bcb 613static void
b5d57fc8
BP
614netdev_linux_changed(struct netdev_linux *dev,
615 unsigned int ifi_flags, unsigned int mask)
86383816 616 OVS_REQUIRES(dev->mutex)
ac4d3bcb 617{
3e912ffc 618 netdev_change_seq_changed(&dev->up);
8aa77183
BP
619
620 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
621 dev->carrier_resets++;
622 }
623 dev->ifi_flags = ifi_flags;
624
4f925bd3
PS
625 dev->cache_valid &= mask;
626}
627
628static void
b5d57fc8
BP
629netdev_linux_update(struct netdev_linux *dev,
630 const struct rtnetlink_link_change *change)
86383816 631 OVS_REQUIRES(dev->mutex)
4f925bd3
PS
632{
633 if (change->nlmsg_type == RTM_NEWLINK) {
634 /* Keep drv-info */
b5d57fc8 635 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
90a6637d 636
c7b1b0a5 637 /* Update netdev from rtnl-change msg. */
90a6637d
PS
638 if (change->mtu) {
639 dev->mtu = change->mtu;
640 dev->cache_valid |= VALID_MTU;
641 dev->netdev_mtu_error = 0;
642 }
643
44445cac
PS
644 if (!eth_addr_is_zero(change->addr)) {
645 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
646 dev->cache_valid |= VALID_ETHERADDR;
647 dev->ether_addr_error = 0;
648 }
649
c7b1b0a5
PS
650 dev->ifindex = change->ifi_index;
651 dev->cache_valid |= VALID_IFINDEX;
652 dev->get_ifindex_error = 0;
653
4f925bd3 654 } else {
b5d57fc8 655 netdev_linux_changed(dev, change->ifi_flags, 0);
4f925bd3 656 }
ac4d3bcb
EJ
657}
658
9dc63482
BP
659static struct netdev *
660netdev_linux_alloc(void)
661{
662 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
663 return &netdev->up;
664}
665
cee87338 666static void
9dc63482
BP
667netdev_linux_common_construct(struct netdev_linux *netdev)
668{
834d6caf 669 ovs_mutex_init(&netdev->mutex);
9dc63482
BP
670}
671
1f6e0fbd
BP
672/* Creates system and internal devices. */
673static int
9dc63482 674netdev_linux_construct(struct netdev *netdev_)
1f6e0fbd 675{
9dc63482 676 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1f6e0fbd
BP
677 int error;
678
cee87338 679 netdev_linux_common_construct(netdev);
1f6e0fbd 680
b5d57fc8
BP
681 error = get_flags(&netdev->up, &netdev->ifi_flags);
682 if (error == ENODEV) {
9dc63482 683 if (netdev->up.netdev_class != &netdev_internal_class) {
b5d57fc8 684 /* The device does not exist, so don't allow it to be opened. */
b5d57fc8
BP
685 return ENODEV;
686 } else {
687 /* "Internal" netdevs have to be created as netdev objects before
688 * they exist in the kernel, because creating them in the kernel
689 * happens by passing a netdev object to dpif_port_add().
690 * Therefore, ignore the error. */
691 }
692 }
46415c90 693
a740f0de
JG
694 return 0;
695}
696
5b7448ed
JG
697/* For most types of netdevs we open the device for each call of
698 * netdev_open(). However, this is not the case with tap devices,
699 * since it is only possible to open the device once. In this
700 * situation we share a single file descriptor, and consequently
701 * buffers, across all readers. Therefore once data is read it will
702 * be unavailable to other reads for tap devices. */
a740f0de 703static int
9dc63482 704netdev_linux_construct_tap(struct netdev *netdev_)
a740f0de 705{
9dc63482 706 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a740f0de 707 static const char tap_dev[] = "/dev/net/tun";
9dc63482 708 const char *name = netdev_->name;
a740f0de
JG
709 struct ifreq ifr;
710 int error;
711
cee87338 712 netdev_linux_common_construct(netdev);
1f6e0fbd 713
6c88d577 714 /* Open tap device. */
d0d08f8a
BP
715 netdev->tap_fd = open(tap_dev, O_RDWR);
716 if (netdev->tap_fd < 0) {
6c88d577 717 error = errno;
10a89ef0 718 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
cee87338 719 return error;
6c88d577
JP
720 }
721
722 /* Create tap device. */
723 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 724 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
d0d08f8a 725 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
6c88d577 726 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 727 ovs_strerror(errno));
6c88d577 728 error = errno;
f61d8d29 729 goto error_close;
6c88d577
JP
730 }
731
732 /* Make non-blocking. */
d0d08f8a 733 error = set_nonblocking(netdev->tap_fd);
a740f0de 734 if (error) {
f61d8d29 735 goto error_close;
a740f0de
JG
736 }
737
738 return 0;
739
f61d8d29 740error_close:
d0d08f8a 741 close(netdev->tap_fd);
a740f0de
JG
742 return error;
743}
744
6c88d577 745static void
9dc63482 746netdev_linux_destruct(struct netdev *netdev_)
6c88d577 747{
b5d57fc8 748 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6c88d577 749
b5d57fc8
BP
750 if (netdev->tc && netdev->tc->ops->tc_destroy) {
751 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4
BP
752 }
753
d0d08f8a
BP
754 if (netdev_get_class(netdev_) == &netdev_tap_class
755 && netdev->tap_fd >= 0)
756 {
757 close(netdev->tap_fd);
6c88d577 758 }
86383816 759
19c8e9c1 760 if (netdev->miimon_interval > 0) {
812c272c 761 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
762 }
763
86383816 764 ovs_mutex_destroy(&netdev->mutex);
6c88d577
JP
765}
766
9dc63482
BP
/* Frees the struct netdev_linux that contains 'netdev_'. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
773
f7791740
PS
774static struct netdev_rxq *
775netdev_linux_rxq_alloc(void)
9dc63482 776{
f7791740 777 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
9dc63482
BP
778 return &rx->up;
779}
780
7b6b0ef4 781static int
f7791740 782netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
7b6b0ef4 783{
f7791740 784 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
9dc63482 785 struct netdev *netdev_ = rx->up.netdev;
7b6b0ef4 786 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7b6b0ef4 787 int error;
7b6b0ef4 788
86383816 789 ovs_mutex_lock(&netdev->mutex);
9dc63482
BP
790 rx->is_tap = is_tap_netdev(netdev_);
791 if (rx->is_tap) {
792 rx->fd = netdev->tap_fd;
796223f5
BP
793 } else {
794 struct sockaddr_ll sll;
b73c8518 795 int ifindex, val;
32383c3b 796 /* Result of tcpdump -dd inbound */
259e0b1a 797 static const struct sock_filter filt[] = {
32383c3b
MM
798 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
799 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
800 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
801 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
802 };
259e0b1a
BP
803 static const struct sock_fprog fprog = {
804 ARRAY_SIZE(filt), (struct sock_filter *) filt
805 };
7b6b0ef4 806
796223f5 807 /* Create file descriptor. */
9dc63482
BP
808 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
809 if (rx->fd < 0) {
796223f5 810 error = errno;
10a89ef0 811 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
812 goto error;
813 }
33d82a56 814
b73c8518
SH
815 val = 1;
816 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
817 error = errno;
818 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
819 netdev_get_name(netdev_), ovs_strerror(error));
820 goto error;
821 }
822
796223f5 823 /* Set non-blocking mode. */
9dc63482 824 error = set_nonblocking(rx->fd);
796223f5
BP
825 if (error) {
826 goto error;
827 }
7b6b0ef4 828
796223f5 829 /* Get ethernet device index. */
180c6d0b 830 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
831 if (error) {
832 goto error;
833 }
7b6b0ef4 834
796223f5
BP
835 /* Bind to specific ethernet device. */
836 memset(&sll, 0, sizeof sll);
837 sll.sll_family = AF_PACKET;
838 sll.sll_ifindex = ifindex;
b73c8518 839 sll.sll_protocol = htons(ETH_P_ALL);
9dc63482 840 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
796223f5
BP
841 error = errno;
842 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 843 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
844 goto error;
845 }
32383c3b
MM
846
847 /* Filter for only inbound packets. */
9dc63482 848 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
32383c3b
MM
849 sizeof fprog);
850 if (error) {
851 error = errno;
259e0b1a 852 VLOG_ERR("%s: failed to attach filter (%s)",
10a89ef0 853 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
854 goto error;
855 }
7b6b0ef4 856 }
86383816 857 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4 858
7b6b0ef4
BP
859 return 0;
860
861error:
9dc63482
BP
862 if (rx->fd >= 0) {
863 close(rx->fd);
7b6b0ef4 864 }
86383816 865 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4
BP
866 return error;
867}
868
796223f5 869static void
f7791740 870netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
8b61709d 871{
f7791740 872 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
8b61709d 873
796223f5
BP
874 if (!rx->is_tap) {
875 close(rx->fd);
8b61709d 876 }
9dc63482
BP
877}
878
/* Frees the memory that backs 'rxq_'. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
8b61709d 886
b73c8518
SH
887static ovs_be16
888auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
889{
890 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
891 return htons(aux->tp_vlan_tpid);
892 } else {
893 return htons(ETH_TYPE_VLAN);
894 }
895}
896
897static bool
898auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
899{
900 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
901}
902
796223f5 903static int
f7791740 904netdev_linux_rxq_recv_sock(int fd, struct ofpbuf *buffer)
796223f5 905{
b73c8518 906 size_t size;
796223f5 907 ssize_t retval;
b73c8518
SH
908 struct iovec iov;
909 struct cmsghdr *cmsg;
910 union {
911 struct cmsghdr cmsg;
912 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
913 } cmsg_buffer;
914 struct msghdr msgh;
915
916 /* Reserve headroom for a single VLAN tag */
917 ofpbuf_reserve(buffer, VLAN_HEADER_LEN);
918 size = ofpbuf_tailroom(buffer);
919
1f317cb5 920 iov.iov_base = ofpbuf_data(buffer);
b73c8518
SH
921 iov.iov_len = size;
922 msgh.msg_name = NULL;
923 msgh.msg_namelen = 0;
924 msgh.msg_iov = &iov;
925 msgh.msg_iovlen = 1;
926 msgh.msg_control = &cmsg_buffer;
927 msgh.msg_controllen = sizeof cmsg_buffer;
928 msgh.msg_flags = 0;
8e8cddf7 929
796223f5 930 do {
b73c8518 931 retval = recvmsg(fd, &msgh, MSG_TRUNC);
796223f5
BP
932 } while (retval < 0 && errno == EINTR);
933
bfd3367b 934 if (retval < 0) {
b73c8518
SH
935 return errno;
936 } else if (retval > size) {
937 return EMSGSIZE;
938 }
939
1f317cb5 940 ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
b73c8518
SH
941
942 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
943 const struct tpacket_auxdata *aux;
944
945 if (cmsg->cmsg_level != SOL_PACKET
946 || cmsg->cmsg_type != PACKET_AUXDATA
947 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
948 continue;
8b61709d 949 }
b73c8518
SH
950
951 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
952 if (auxdata_has_vlan_tci(aux)) {
953 if (retval < ETH_HEADER_LEN) {
954 return EINVAL;
955 }
956
957 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
958 htons(aux->tp_vlan_tci));
959 break;
960 }
961 }
962
963 return 0;
964}
965
/* Reads one packet from descriptor 'fd' into the tailroom of 'buffer' using
 * read(), retrying when interrupted by a signal.
 *
 * Returns 0 on success, otherwise a positive errno value (EMSGSIZE if the
 * result is larger than the available tailroom). */
static int
netdev_linux_rxq_recv_tap(int fd, struct ofpbuf *buffer)
{
    size_t room = ofpbuf_tailroom(buffer);
    ssize_t n_read;

    for (;;) {
        n_read = read(fd, ofpbuf_data(buffer), room);
        if (n_read >= 0) {
            break;
        }
        if (errno != EINTR) {
            return errno;
        }
    }

    if (n_read > room) {
        return EMSGSIZE;
    }

    ofpbuf_set_size(buffer, ofpbuf_size(buffer) + n_read);
    return 0;
}
985
986static int
91088554
DDP
987netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dpif_packet **packets,
988 int *c)
b73c8518 989{
f7791740 990 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
df1e5a3b 991 struct netdev *netdev = rx->up.netdev;
91088554 992 struct dpif_packet *packet;
df1e5a3b
PS
993 struct ofpbuf *buffer;
994 ssize_t retval;
995 int mtu;
996
997 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
998 mtu = ETH_PAYLOAD_MAX;
999 }
1000
91088554
DDP
1001 packet = dpif_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1002 DP_NETDEV_HEADROOM);
1003 buffer = &packet->ofpbuf;
b73c8518
SH
1004
1005 retval = (rx->is_tap
f7791740
PS
1006 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1007 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
df1e5a3b
PS
1008
1009 if (retval) {
1010 if (retval != EAGAIN && retval != EMSGSIZE) {
1011 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
f7791740 1012 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
df1e5a3b 1013 }
f4fd623c 1014 dpif_packet_delete(packet);
df1e5a3b
PS
1015 } else {
1016 dp_packet_pad(buffer);
61a2647e 1017 dpif_packet_set_dp_hash(packet, 0);
91088554 1018 packets[0] = packet;
df1e5a3b 1019 *c = 1;
b73c8518
SH
1020 }
1021
1022 return retval;
8b61709d
BP
1023}
1024
8b61709d 1025static void
f7791740 1026netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
8b61709d 1027{
f7791740 1028 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1029 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
1030}
1031
8b61709d 1032static int
f7791740 1033netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
8b61709d 1034{
f7791740 1035 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1036 if (rx->is_tap) {
8b61709d 1037 struct ifreq ifr;
f7791740 1038 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
259e0b1a 1039 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
8b61709d
BP
1040 if (error) {
1041 return error;
1042 }
796223f5 1043 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
1044 return 0;
1045 } else {
796223f5 1046 return drain_rcvbuf(rx->fd);
8b61709d
BP
1047 }
1048}
1049
1050/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1051 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1052 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1053 * the packet is too big or too small to transmit on the device.
1054 *
1055 * The caller retains ownership of 'buffer' in all cases.
1056 *
1057 * The kernel maintains a packet transmission queue, so the caller is not
1058 * expected to do additional queuing of packets. */
1059static int
f00fa8cb
AW
1060netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1061 struct dpif_packet **pkts, int cnt, bool may_steal)
8b61709d 1062{
f4fd623c
DDP
1063 int i;
1064 int error = 0;
40d26f04 1065
f4fd623c
DDP
1066 /* 'i' is incremented only if there's no error */
1067 for (i = 0; i < cnt;) {
1068 const void *data = ofpbuf_data(&pkts[i]->ofpbuf);
1069 size_t size = ofpbuf_size(&pkts[i]->ofpbuf);
f23347ea 1070 ssize_t retval;
8b61709d 1071
796223f5 1072 if (!is_tap_netdev(netdev_)) {
f23347ea
BP
1073 /* Use our AF_PACKET socket to send to this device. */
1074 struct sockaddr_ll sll;
1075 struct msghdr msg;
1076 struct iovec iov;
1077 int ifindex;
488d734d
BP
1078 int sock;
1079
1080 sock = af_packet_sock();
1081 if (sock < 0) {
c4c7a3d7 1082 return -sock;
488d734d 1083 }
f23347ea 1084
86383816
BP
1085 ifindex = netdev_get_ifindex(netdev_);
1086 if (ifindex < 0) {
1087 return -ifindex;
f23347ea 1088 }
8b61709d 1089
f23347ea
BP
1090 /* We don't bother setting most fields in sockaddr_ll because the
1091 * kernel ignores them for SOCK_RAW. */
1092 memset(&sll, 0, sizeof sll);
1093 sll.sll_family = AF_PACKET;
1094 sll.sll_ifindex = ifindex;
76c308b5 1095
ebc56baa 1096 iov.iov_base = CONST_CAST(void *, data);
f23347ea 1097 iov.iov_len = size;
76c308b5 1098
f23347ea
BP
1099 msg.msg_name = &sll;
1100 msg.msg_namelen = sizeof sll;
1101 msg.msg_iov = &iov;
1102 msg.msg_iovlen = 1;
1103 msg.msg_control = NULL;
1104 msg.msg_controllen = 0;
1105 msg.msg_flags = 0;
1106
488d734d 1107 retval = sendmsg(sock, &msg, 0);
f23347ea 1108 } else {
796223f5
BP
1109 /* Use the tap fd to send to this device. This is essential for
1110 * tap devices, because packets sent to a tap device with an
1111 * AF_PACKET socket will loop back to be *received* again on the
32383c3b
MM
1112 * tap device. This doesn't occur on other interface types
1113 * because we attach a socket filter to the rx socket. */
b5d57fc8 1114 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796223f5 1115
d0d08f8a 1116 retval = write(netdev->tap_fd, data, size);
f23347ea 1117 }
76c308b5 1118
8b61709d
BP
1119 if (retval < 0) {
1120 /* The Linux AF_PACKET implementation never blocks waiting for room
1121 * for packets, instead returning ENOBUFS. Translate this into
1122 * EAGAIN for the caller. */
f4fd623c
DDP
1123 error = errno == ENOBUFS ? EAGAIN : errno;
1124 if (error == EINTR) {
1125 /* continue without incrementing 'i', i.e. retry this packet */
8b61709d 1126 continue;
8b61709d 1127 }
f4fd623c 1128 break;
8b61709d 1129 } else if (retval != size) {
f4fd623c
DDP
1130 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1131 " of %"PRIuSIZE") on %s", retval, size,
1132 netdev_get_name(netdev_));
1133 error = EMSGSIZE;
1134 break;
1135 }
1136
1137 /* Process the next packet in the batch */
1138 i++;
1139 }
1140
1141 if (may_steal) {
1142 for (i = 0; i < cnt; i++) {
1143 dpif_packet_delete(pkts[i]);
8b61709d
BP
1144 }
1145 }
f4fd623c
DDP
1146
1147 if (error && error != EAGAIN) {
1148 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1149 netdev_get_name(netdev_), ovs_strerror(error));
1150 }
1151
1152 return error;
1153
8b61709d
BP
1154}
1155
1156/* Registers with the poll loop to wake up from the next call to poll_block()
1157 * when the packet transmission queue has sufficient room to transmit a packet
1158 * with netdev_send().
1159 *
1160 * The kernel maintains a packet transmission queue, so the client is not
1161 * expected to do additional queuing of packets. Thus, this function is
1162 * unlikely to ever be used. It is included for completeness. */
1163static void
f00fa8cb 1164netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
8b61709d 1165{
796223f5 1166 if (is_tap_netdev(netdev)) {
8b61709d
BP
1167 /* TAP device always accepts packets.*/
1168 poll_immediate_wake();
1169 }
1170}
1171
1172/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1173 * otherwise a positive errno value. */
1174static int
1175netdev_linux_set_etheraddr(struct netdev *netdev_,
1176 const uint8_t mac[ETH_ADDR_LEN])
1177{
b5d57fc8 1178 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4f9f3f21 1179 enum netdev_flags old_flags = 0;
eb395f2e
BP
1180 int error;
1181
86383816
BP
1182 ovs_mutex_lock(&netdev->mutex);
1183
b5d57fc8 1184 if (netdev->cache_valid & VALID_ETHERADDR) {
86383816
BP
1185 error = netdev->ether_addr_error;
1186 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1187 goto exit;
44445cac 1188 }
b5d57fc8 1189 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
1190 }
1191
7eb1bd81 1192 /* Tap devices must be brought down before setting the address. */
796223f5 1193 if (is_tap_netdev(netdev_)) {
4f9f3f21 1194 update_flags(netdev, NETDEV_UP, 0, &old_flags);
7eb1bd81 1195 }
44445cac
PS
1196 error = set_etheraddr(netdev_get_name(netdev_), mac);
1197 if (!error || error == ENODEV) {
b5d57fc8
BP
1198 netdev->ether_addr_error = error;
1199 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1200 if (!error) {
b5d57fc8 1201 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e 1202 }
8b61709d 1203 }
44445cac 1204
4f9f3f21
BP
1205 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1206 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1207 }
7eb1bd81 1208
86383816
BP
1209exit:
1210 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
1211 return error;
1212}
1213
44445cac 1214/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d
BP
1215static int
1216netdev_linux_get_etheraddr(const struct netdev *netdev_,
1217 uint8_t mac[ETH_ADDR_LEN])
1218{
b5d57fc8 1219 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1220 int error;
44445cac 1221
86383816 1222 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1223 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
86383816
BP
1224 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1225 netdev->etheraddr);
b5d57fc8 1226 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1227 }
44445cac 1228
86383816
BP
1229 error = netdev->ether_addr_error;
1230 if (!error) {
b5d57fc8 1231 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
44445cac 1232 }
86383816 1233 ovs_mutex_unlock(&netdev->mutex);
44445cac 1234
86383816 1235 return error;
8b61709d
BP
1236}
1237
8b61709d 1238static int
73371c09 1239netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
8b61709d 1240{
86383816
BP
1241 int error;
1242
b5d57fc8 1243 if (!(netdev->cache_valid & VALID_MTU)) {
8b61709d 1244 struct ifreq ifr;
90a6637d 1245
86383816 1246 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
73371c09 1247 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
b5d57fc8
BP
1248 netdev->mtu = ifr.ifr_mtu;
1249 netdev->cache_valid |= VALID_MTU;
8b61709d 1250 }
90a6637d 1251
86383816
BP
1252 error = netdev->netdev_mtu_error;
1253 if (!error) {
b5d57fc8 1254 *mtup = netdev->mtu;
90a6637d 1255 }
73371c09
BP
1256
1257 return error;
1258}
1259
1260/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1261 * in bytes, not including the hardware header; thus, this is typically 1500
1262 * bytes for Ethernet devices. */
1263static int
1264netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1265{
1266 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1267 int error;
1268
1269 ovs_mutex_lock(&netdev->mutex);
1270 error = netdev_linux_get_mtu__(netdev, mtup);
86383816
BP
1271 ovs_mutex_unlock(&netdev->mutex);
1272
1273 return error;
8b61709d
BP
1274}
1275
9b020780
PS
1276/* Sets the maximum size of transmitted (MTU) for given device using linux
1277 * networking ioctl interface.
1278 */
1279static int
1280netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1281{
b5d57fc8 1282 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1283 struct ifreq ifr;
1284 int error;
1285
86383816 1286 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1287 if (netdev->cache_valid & VALID_MTU) {
86383816
BP
1288 error = netdev->netdev_mtu_error;
1289 if (error || netdev->mtu == mtu) {
1290 goto exit;
90a6637d 1291 }
b5d57fc8 1292 netdev->cache_valid &= ~VALID_MTU;
153e5481 1293 }
9b020780 1294 ifr.ifr_mtu = mtu;
259e0b1a
BP
1295 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1296 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1297 if (!error || error == ENODEV) {
b5d57fc8
BP
1298 netdev->netdev_mtu_error = error;
1299 netdev->mtu = ifr.ifr_mtu;
1300 netdev->cache_valid |= VALID_MTU;
9b020780 1301 }
86383816
BP
1302exit:
1303 ovs_mutex_unlock(&netdev->mutex);
90a6637d 1304 return error;
9b020780
PS
1305}
1306
9ab3d9a3
BP
1307/* Returns the ifindex of 'netdev', if successful, as a positive number.
1308 * On failure, returns a negative errno value. */
1309static int
86383816 1310netdev_linux_get_ifindex(const struct netdev *netdev_)
9ab3d9a3 1311{
86383816 1312 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9ab3d9a3
BP
1313 int ifindex, error;
1314
86383816
BP
1315 ovs_mutex_lock(&netdev->mutex);
1316 error = get_ifindex(netdev_, &ifindex);
1317 ovs_mutex_unlock(&netdev->mutex);
1318
9ab3d9a3
BP
1319 return error ? -error : ifindex;
1320}
1321
8b61709d
BP
1322static int
1323netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1324{
b5d57fc8 1325 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1326
86383816 1327 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
1328 if (netdev->miimon_interval > 0) {
1329 *carrier = netdev->miimon;
3a183124 1330 } else {
b5d57fc8 1331 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1332 }
86383816 1333 ovs_mutex_unlock(&netdev->mutex);
8b61709d 1334
3a183124 1335 return 0;
8b61709d
BP
1336}
1337
65c3058c 1338static long long int
86383816 1339netdev_linux_get_carrier_resets(const struct netdev *netdev_)
65c3058c 1340{
86383816
BP
1341 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1342 long long int carrier_resets;
1343
1344 ovs_mutex_lock(&netdev->mutex);
1345 carrier_resets = netdev->carrier_resets;
1346 ovs_mutex_unlock(&netdev->mutex);
1347
1348 return carrier_resets;
65c3058c
EJ
1349}
1350
63331829 1351static int
1670c579
EJ
1352netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1353 struct mii_ioctl_data *data)
63331829 1354{
63331829 1355 struct ifreq ifr;
782e6111 1356 int error;
63331829 1357
63331829 1358 memset(&ifr, 0, sizeof ifr);
782e6111 1359 memcpy(&ifr.ifr_data, data, sizeof *data);
259e0b1a 1360 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1361 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1362
782e6111
EJ
1363 return error;
1364}
1365
1366static int
1670c579 1367netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1368{
782e6111
EJ
1369 struct mii_ioctl_data data;
1370 int error;
63331829 1371
782e6111
EJ
1372 *miimon = false;
1373
1374 memset(&data, 0, sizeof data);
1670c579 1375 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1376 if (!error) {
1377 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1378 data.reg_num = MII_BMSR;
1670c579 1379 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1380 &data);
63331829
EJ
1381
1382 if (!error) {
782e6111 1383 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1384 } else {
1385 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1386 }
1387 } else {
1388 struct ethtool_cmd ecmd;
63331829
EJ
1389
1390 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1391 name);
1392
ab985a77 1393 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1394 memset(&ecmd, 0, sizeof ecmd);
1395 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1396 "ETHTOOL_GLINK");
1397 if (!error) {
782e6111
EJ
1398 struct ethtool_value eval;
1399
1400 memcpy(&eval, &ecmd, sizeof eval);
1401 *miimon = !!eval.data;
63331829
EJ
1402 } else {
1403 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1404 }
1405 }
1406
1407 return error;
1408}
1409
1670c579
EJ
1410static int
1411netdev_linux_set_miimon_interval(struct netdev *netdev_,
1412 long long int interval)
1413{
b5d57fc8 1414 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579 1415
86383816 1416 ovs_mutex_lock(&netdev->mutex);
1670c579 1417 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8 1418 if (netdev->miimon_interval != interval) {
19c8e9c1 1419 if (interval && !netdev->miimon_interval) {
812c272c 1420 atomic_count_inc(&miimon_cnt);
19c8e9c1 1421 } else if (!interval && netdev->miimon_interval) {
812c272c 1422 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
1423 }
1424
b5d57fc8
BP
1425 netdev->miimon_interval = interval;
1426 timer_set_expired(&netdev->miimon_timer);
1670c579 1427 }
86383816 1428 ovs_mutex_unlock(&netdev->mutex);
1670c579
EJ
1429
1430 return 0;
1431}
1432
1433static void
1434netdev_linux_miimon_run(void)
1435{
1436 struct shash device_shash;
1437 struct shash_node *node;
1438
1439 shash_init(&device_shash);
b5d57fc8 1440 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1441 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1442 struct netdev *netdev = node->data;
1443 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
1444 bool miimon;
1445
86383816
BP
1446 ovs_mutex_lock(&dev->mutex);
1447 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1448 netdev_linux_get_miimon(dev->up.name, &miimon);
1449 if (miimon != dev->miimon) {
1450 dev->miimon = miimon;
1451 netdev_linux_changed(dev, dev->ifi_flags, 0);
1452 }
1670c579 1453
86383816 1454 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1670c579 1455 }
86383816 1456 ovs_mutex_unlock(&dev->mutex);
2f980d74 1457 netdev_close(netdev);
1670c579
EJ
1458 }
1459
1460 shash_destroy(&device_shash);
1461}
1462
1463static void
1464netdev_linux_miimon_wait(void)
1465{
1466 struct shash device_shash;
1467 struct shash_node *node;
1468
1469 shash_init(&device_shash);
b5d57fc8 1470 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1471 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1472 struct netdev *netdev = node->data;
1473 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579 1474
86383816 1475 ovs_mutex_lock(&dev->mutex);
1670c579
EJ
1476 if (dev->miimon_interval > 0) {
1477 timer_wait(&dev->miimon_timer);
1478 }
86383816 1479 ovs_mutex_unlock(&dev->mutex);
2f980d74 1480 netdev_close(netdev);
1670c579
EJ
1481 }
1482 shash_destroy(&device_shash);
1483}
1484
92df599c
JG
/* Exchanges the values stored in '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t first = *a;
    const uint64_t second = *b;

    *a = second;
    *b = first;
}
1492
c060c4cf
EJ
1493/* Copies 'src' into 'dst', performing format conversion in the process.
1494 *
1495 * 'src' is allowed to be misaligned. */
1496static void
1497netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1498 const struct ovs_vport_stats *src)
1499{
6a54dedc
BP
1500 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1501 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1502 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1503 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1504 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1505 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1506 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1507 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
c060c4cf
EJ
1508 dst->multicast = 0;
1509 dst->collisions = 0;
1510 dst->rx_length_errors = 0;
1511 dst->rx_over_errors = 0;
1512 dst->rx_crc_errors = 0;
1513 dst->rx_frame_errors = 0;
1514 dst->rx_fifo_errors = 0;
1515 dst->rx_missed_errors = 0;
1516 dst->tx_aborted_errors = 0;
1517 dst->tx_carrier_errors = 0;
1518 dst->tx_fifo_errors = 0;
1519 dst->tx_heartbeat_errors = 0;
1520 dst->tx_window_errors = 0;
1521}
1522
1523static int
1524get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1525{
1526 struct dpif_linux_vport reply;
1527 struct ofpbuf *buf;
1528 int error;
1529
1530 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1531 if (error) {
1532 return error;
1533 } else if (!reply.stats) {
1534 ofpbuf_delete(buf);
1535 return EOPNOTSUPP;
1536 }
1537
1538 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1539
1540 ofpbuf_delete(buf);
1541
1542 return 0;
1543}
1544
f613a0d7
PS
1545static void
1546get_stats_via_vport(const struct netdev *netdev_,
1547 struct netdev_stats *stats)
8b61709d 1548{
b5d57fc8 1549 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1550
b5d57fc8
BP
1551 if (!netdev->vport_stats_error ||
1552 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1553 int error;
7fbef77a 1554
c060c4cf 1555 error = get_stats_via_vport__(netdev_, stats);
bcb1f5a1 1556 if (error && error != ENOENT) {
a57a8488 1557 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
1558 "(%s)",
1559 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 1560 }
b5d57fc8
BP
1561 netdev->vport_stats_error = error;
1562 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1563 }
f613a0d7 1564}
8b61709d 1565
f613a0d7
PS
1566/* Retrieves current device stats for 'netdev-linux'. */
1567static int
1568netdev_linux_get_stats(const struct netdev *netdev_,
1569 struct netdev_stats *stats)
1570{
b5d57fc8 1571 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1572 struct netdev_stats dev_stats;
1573 int error;
1574
86383816 1575 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1576 get_stats_via_vport(netdev_, stats);
35eef899 1577 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1578 if (error) {
86383816
BP
1579 if (!netdev->vport_stats_error) {
1580 error = 0;
f613a0d7 1581 }
86383816 1582 } else if (netdev->vport_stats_error) {
04c881eb 1583 /* stats not available from OVS then use netdev stats. */
f613a0d7
PS
1584 *stats = dev_stats;
1585 } else {
04c881eb
AZ
1586 /* Use kernel netdev's packet and byte counts since vport's counters
1587 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1588 * enabled. */
1589 stats->rx_packets = dev_stats.rx_packets;
1590 stats->rx_bytes = dev_stats.rx_bytes;
1591 stats->tx_packets = dev_stats.tx_packets;
1592 stats->tx_bytes = dev_stats.tx_bytes;
1593
f613a0d7
PS
1594 stats->rx_errors += dev_stats.rx_errors;
1595 stats->tx_errors += dev_stats.tx_errors;
1596 stats->rx_dropped += dev_stats.rx_dropped;
1597 stats->tx_dropped += dev_stats.tx_dropped;
1598 stats->multicast += dev_stats.multicast;
1599 stats->collisions += dev_stats.collisions;
1600 stats->rx_length_errors += dev_stats.rx_length_errors;
1601 stats->rx_over_errors += dev_stats.rx_over_errors;
1602 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1603 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1604 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1605 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1606 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1607 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1608 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1609 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1610 stats->tx_window_errors += dev_stats.tx_window_errors;
1611 }
86383816
BP
1612 ovs_mutex_unlock(&netdev->mutex);
1613
1614 return error;
f613a0d7
PS
1615}
1616
1617/* Retrieves current device stats for 'netdev-tap' netdev or
1618 * netdev-internal. */
1619static int
15aee116 1620netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
f613a0d7 1621{
b5d57fc8 1622 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1623 struct netdev_stats dev_stats;
1624 int error;
1625
86383816 1626 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1627 get_stats_via_vport(netdev_, stats);
35eef899 1628 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1629 if (error) {
86383816
BP
1630 if (!netdev->vport_stats_error) {
1631 error = 0;
8b61709d 1632 }
86383816
BP
1633 } else if (netdev->vport_stats_error) {
1634 /* Transmit and receive stats will appear to be swapped relative to the
1635 * other ports since we are the one sending the data, not a remote
1636 * computer. For consistency, we swap them back here. This does not
1637 * apply if we are getting stats from the vport layer because it always
1638 * tracks stats from the perspective of the switch. */
fe6b0e03 1639
f613a0d7 1640 *stats = dev_stats;
92df599c
JG
1641 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1642 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1643 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1644 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1645 stats->rx_length_errors = 0;
1646 stats->rx_over_errors = 0;
1647 stats->rx_crc_errors = 0;
1648 stats->rx_frame_errors = 0;
1649 stats->rx_fifo_errors = 0;
1650 stats->rx_missed_errors = 0;
1651 stats->tx_aborted_errors = 0;
1652 stats->tx_carrier_errors = 0;
1653 stats->tx_fifo_errors = 0;
1654 stats->tx_heartbeat_errors = 0;
1655 stats->tx_window_errors = 0;
f613a0d7 1656 } else {
04c881eb
AZ
1657 /* Use kernel netdev's packet and byte counts since vport counters
1658 * do not reflect packet counts on the wire when GSO, TSO or GRO
1659 * are enabled. */
1660 stats->rx_packets = dev_stats.tx_packets;
1661 stats->rx_bytes = dev_stats.tx_bytes;
1662 stats->tx_packets = dev_stats.rx_packets;
1663 stats->tx_bytes = dev_stats.rx_bytes;
1664
f613a0d7
PS
1665 stats->rx_dropped += dev_stats.tx_dropped;
1666 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1667
f613a0d7
PS
1668 stats->rx_errors += dev_stats.tx_errors;
1669 stats->tx_errors += dev_stats.rx_errors;
1670
1671 stats->multicast += dev_stats.multicast;
1672 stats->collisions += dev_stats.collisions;
1673 }
86383816
BP
1674 ovs_mutex_unlock(&netdev->mutex);
1675
1676 return error;
8b61709d
BP
1677}
1678
bba1e6f3
PS
1679static int
1680netdev_internal_get_stats(const struct netdev *netdev_,
1681 struct netdev_stats *stats)
1682{
b5d57fc8 1683 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1684 int error;
bba1e6f3 1685
86383816 1686 ovs_mutex_lock(&netdev->mutex);
bba1e6f3 1687 get_stats_via_vport(netdev_, stats);
86383816
BP
1688 error = netdev->vport_stats_error;
1689 ovs_mutex_unlock(&netdev->mutex);
1690
1691 return error;
bba1e6f3
PS
1692}
1693
51f87458 1694static void
b5d57fc8 1695netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
1696{
1697 struct ethtool_cmd ecmd;
6c038611 1698 uint32_t speed;
8b61709d
BP
1699 int error;
1700
b5d57fc8 1701 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
1702 return;
1703 }
1704
ab985a77 1705 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1706 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 1707 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
1708 ETHTOOL_GSET, "ETHTOOL_GSET");
1709 if (error) {
51f87458 1710 goto out;
8b61709d
BP
1711 }
1712
1713 /* Supported features. */
b5d57fc8 1714 netdev->supported = 0;
8b61709d 1715 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 1716 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
1717 }
1718 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 1719 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
1720 }
1721 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 1722 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
1723 }
1724 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 1725 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
1726 }
1727 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 1728 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d
BP
1729 }
1730 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
b5d57fc8 1731 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d
BP
1732 }
1733 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
b5d57fc8 1734 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d
BP
1735 }
1736 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 1737 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
1738 }
1739 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 1740 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
1741 }
1742 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 1743 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
1744 }
1745 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 1746 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
1747 }
1748 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 1749 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1750 }
1751
1752 /* Advertised features. */
b5d57fc8 1753 netdev->advertised = 0;
8b61709d 1754 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 1755 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
1756 }
1757 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 1758 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
1759 }
1760 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 1761 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
1762 }
1763 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 1764 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
1765 }
1766 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 1767 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d
BP
1768 }
1769 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
b5d57fc8 1770 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d
BP
1771 }
1772 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
b5d57fc8 1773 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d
BP
1774 }
1775 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 1776 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
1777 }
1778 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 1779 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
1780 }
1781 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 1782 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
1783 }
1784 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 1785 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
1786 }
1787 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 1788 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1789 }
1790
1791 /* Current settings. */
2a529ead 1792 speed = ecmd.speed;
6c038611 1793 if (speed == SPEED_10) {
b5d57fc8 1794 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 1795 } else if (speed == SPEED_100) {
b5d57fc8 1796 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 1797 } else if (speed == SPEED_1000) {
b5d57fc8 1798 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 1799 } else if (speed == SPEED_10000) {
b5d57fc8 1800 netdev->current = NETDEV_F_10GB_FD;
6c038611 1801 } else if (speed == 40000) {
b5d57fc8 1802 netdev->current = NETDEV_F_40GB_FD;
6c038611 1803 } else if (speed == 100000) {
b5d57fc8 1804 netdev->current = NETDEV_F_100GB_FD;
6c038611 1805 } else if (speed == 1000000) {
b5d57fc8 1806 netdev->current = NETDEV_F_1TB_FD;
8b61709d 1807 } else {
b5d57fc8 1808 netdev->current = 0;
8b61709d
BP
1809 }
1810
1811 if (ecmd.port == PORT_TP) {
b5d57fc8 1812 netdev->current |= NETDEV_F_COPPER;
8b61709d 1813 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 1814 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
1815 }
1816
1817 if (ecmd.autoneg) {
b5d57fc8 1818 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
1819 }
1820
51f87458 1821out:
b5d57fc8
BP
1822 netdev->cache_valid |= VALID_FEATURES;
1823 netdev->get_features_error = error;
51f87458
PS
1824}
1825
887ed8b2
BP
1826/* Stores the features supported by 'netdev' into of '*current', '*advertised',
1827 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1828 * Returns 0 if successful, otherwise a positive errno value. */
51f87458
PS
1829static int
1830netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
1831 enum netdev_features *current,
1832 enum netdev_features *advertised,
1833 enum netdev_features *supported,
1834 enum netdev_features *peer)
51f87458 1835{
b5d57fc8 1836 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1837 int error;
51f87458 1838
86383816 1839 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1840 netdev_linux_read_features(netdev);
b5d57fc8
BP
1841 if (!netdev->get_features_error) {
1842 *current = netdev->current;
1843 *advertised = netdev->advertised;
1844 *supported = netdev->supported;
887ed8b2 1845 *peer = 0; /* XXX */
51f87458 1846 }
86383816
BP
1847 error = netdev->get_features_error;
1848 ovs_mutex_unlock(&netdev->mutex);
1849
1850 return error;
8b61709d
BP
1851}
1852
1853/* Set the features advertised by 'netdev' to 'advertise'. */
1854static int
86383816 1855netdev_linux_set_advertisements(struct netdev *netdev_,
6c038611 1856 enum netdev_features advertise)
8b61709d 1857{
86383816 1858 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
1859 struct ethtool_cmd ecmd;
1860 int error;
1861
86383816
BP
1862 ovs_mutex_lock(&netdev->mutex);
1863
ab985a77 1864 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1865 memset(&ecmd, 0, sizeof ecmd);
86383816 1866 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
8b61709d
BP
1867 ETHTOOL_GSET, "ETHTOOL_GSET");
1868 if (error) {
86383816 1869 goto exit;
8b61709d
BP
1870 }
1871
1872 ecmd.advertising = 0;
6c038611 1873 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
1874 ecmd.advertising |= ADVERTISED_10baseT_Half;
1875 }
6c038611 1876 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
1877 ecmd.advertising |= ADVERTISED_10baseT_Full;
1878 }
6c038611 1879 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
1880 ecmd.advertising |= ADVERTISED_100baseT_Half;
1881 }
6c038611 1882 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
1883 ecmd.advertising |= ADVERTISED_100baseT_Full;
1884 }
6c038611 1885 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
1886 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1887 }
6c038611 1888 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
1889 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1890 }
6c038611 1891 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
1892 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1893 }
6c038611 1894 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
1895 ecmd.advertising |= ADVERTISED_TP;
1896 }
6c038611 1897 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
1898 ecmd.advertising |= ADVERTISED_FIBRE;
1899 }
6c038611 1900 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
1901 ecmd.advertising |= ADVERTISED_Autoneg;
1902 }
6c038611 1903 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
1904 ecmd.advertising |= ADVERTISED_Pause;
1905 }
6c038611 1906 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
1907 ecmd.advertising |= ADVERTISED_Asym_Pause;
1908 }
ab985a77 1909 COVERAGE_INC(netdev_set_ethtool);
86383816
BP
1910 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1911 ETHTOOL_SSET, "ETHTOOL_SSET");
1912
1913exit:
1914 ovs_mutex_unlock(&netdev->mutex);
1915 return error;
8b61709d
BP
1916}
1917
f8500004
JP
1918/* Attempts to set input rate limiting (policing) policy. Returns 0 if
1919 * successful, otherwise a positive errno value. */
8b61709d 1920static int
b5d57fc8 1921netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
1922 uint32_t kbits_rate, uint32_t kbits_burst)
1923{
b5d57fc8
BP
1924 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1925 const char *netdev_name = netdev_get_name(netdev_);
f8500004 1926 int error;
8b61709d 1927
80a86fbe
BP
1928 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1929 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1930 : kbits_burst); /* Stick with user-specified value. */
1931
86383816 1932 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1933 if (netdev->cache_valid & VALID_POLICING) {
86383816
BP
1934 error = netdev->netdev_policing_error;
1935 if (error || (netdev->kbits_rate == kbits_rate &&
1936 netdev->kbits_burst == kbits_burst)) {
c9f71668 1937 /* Assume that settings haven't changed since we last set them. */
86383816 1938 goto out;
c9f71668 1939 }
b5d57fc8 1940 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
1941 }
1942
ac8c3412 1943 COVERAGE_INC(netdev_set_policing);
f8500004 1944 /* Remove any existing ingress qdisc. */
b5d57fc8 1945 error = tc_add_del_ingress_qdisc(netdev_, false);
f8500004
JP
1946 if (error) {
1947 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 1948 netdev_name, ovs_strerror(error));
c9f71668 1949 goto out;
f8500004
JP
1950 }
1951
8b61709d 1952 if (kbits_rate) {
b5d57fc8 1953 error = tc_add_del_ingress_qdisc(netdev_, true);
f8500004
JP
1954 if (error) {
1955 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 1956 netdev_name, ovs_strerror(error));
c9f71668 1957 goto out;
8b61709d
BP
1958 }
1959
b5d57fc8 1960 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
1961 if (error){
1962 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 1963 netdev_name, ovs_strerror(error));
c9f71668 1964 goto out;
8b61709d 1965 }
8b61709d
BP
1966 }
1967
b5d57fc8
BP
1968 netdev->kbits_rate = kbits_rate;
1969 netdev->kbits_burst = kbits_burst;
f8500004 1970
c9f71668
PS
1971out:
1972 if (!error || error == ENODEV) {
b5d57fc8
BP
1973 netdev->netdev_policing_error = error;
1974 netdev->cache_valid |= VALID_POLICING;
c9f71668 1975 }
86383816 1976 ovs_mutex_unlock(&netdev->mutex);
c9f71668 1977 return error;
8b61709d
BP
1978}
1979
c1c9c9c4
BP
1980static int
1981netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 1982 struct sset *types)
c1c9c9c4 1983{
559eb230 1984 const struct tc_ops *const *opsp;
c1c9c9c4
BP
1985
1986 for (opsp = tcs; *opsp != NULL; opsp++) {
1987 const struct tc_ops *ops = *opsp;
1988 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 1989 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
1990 }
1991 }
1992 return 0;
1993}
1994
1995static const struct tc_ops *
1996tc_lookup_ovs_name(const char *name)
1997{
559eb230 1998 const struct tc_ops *const *opsp;
c1c9c9c4
BP
1999
2000 for (opsp = tcs; *opsp != NULL; opsp++) {
2001 const struct tc_ops *ops = *opsp;
2002 if (!strcmp(name, ops->ovs_name)) {
2003 return ops;
2004 }
2005 }
2006 return NULL;
2007}
2008
2009static const struct tc_ops *
2010tc_lookup_linux_name(const char *name)
2011{
559eb230 2012 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2013
2014 for (opsp = tcs; *opsp != NULL; opsp++) {
2015 const struct tc_ops *ops = *opsp;
2016 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2017 return ops;
2018 }
2019 }
2020 return NULL;
2021}
2022
93b13be8 2023static struct tc_queue *
b5d57fc8 2024tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
2025 size_t hash)
2026{
b5d57fc8 2027 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
2028 struct tc_queue *queue;
2029
b5d57fc8 2030 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
2031 if (queue->queue_id == queue_id) {
2032 return queue;
2033 }
2034 }
2035 return NULL;
2036}
2037
/* Returns the queue numbered 'queue_id' in 'netdev''s traffic control state,
 * or NULL if there is none.  Queues are hashed by hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
2043
c1c9c9c4
BP
2044static int
2045netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2046 const char *type,
2047 struct netdev_qos_capabilities *caps)
2048{
2049 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2050 if (!ops) {
2051 return EOPNOTSUPP;
2052 }
2053 caps->n_queues = ops->n_queues;
2054 return 0;
2055}
2056
2057static int
b5d57fc8 2058netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 2059 const char **typep, struct smap *details)
c1c9c9c4 2060{
b5d57fc8 2061 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2062 int error;
2063
86383816 2064 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2065 error = tc_query_qdisc(netdev_);
86383816
BP
2066 if (!error) {
2067 *typep = netdev->tc->ops->ovs_name;
2068 error = (netdev->tc->ops->qdisc_get
2069 ? netdev->tc->ops->qdisc_get(netdev_, details)
2070 : 0);
c1c9c9c4 2071 }
86383816 2072 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2073
86383816 2074 return error;
c1c9c9c4
BP
2075}
2076
2077static int
b5d57fc8 2078netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 2079 const char *type, const struct smap *details)
c1c9c9c4 2080{
b5d57fc8 2081 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2082 const struct tc_ops *new_ops;
2083 int error;
2084
2085 new_ops = tc_lookup_ovs_name(type);
2086 if (!new_ops || !new_ops->tc_install) {
2087 return EOPNOTSUPP;
2088 }
2089
86383816 2090 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2091 error = tc_query_qdisc(netdev_);
c1c9c9c4 2092 if (error) {
86383816 2093 goto exit;
c1c9c9c4
BP
2094 }
2095
b5d57fc8 2096 if (new_ops == netdev->tc->ops) {
86383816 2097 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
2098 } else {
2099 /* Delete existing qdisc. */
b5d57fc8 2100 error = tc_del_qdisc(netdev_);
c1c9c9c4 2101 if (error) {
86383816 2102 goto exit;
c1c9c9c4 2103 }
b5d57fc8 2104 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
2105
2106 /* Install new qdisc. */
b5d57fc8
BP
2107 error = new_ops->tc_install(netdev_, details);
2108 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4 2109 }
86383816
BP
2110
2111exit:
2112 ovs_mutex_unlock(&netdev->mutex);
2113 return error;
c1c9c9c4
BP
2114}
2115
2116static int
b5d57fc8 2117netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 2118 unsigned int queue_id, struct smap *details)
c1c9c9c4 2119{
b5d57fc8 2120 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2121 int error;
2122
86383816 2123 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2124 error = tc_query_qdisc(netdev_);
86383816 2125 if (!error) {
b5d57fc8 2126 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
86383816 2127 error = (queue
b5d57fc8 2128 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 2129 : ENOENT);
c1c9c9c4 2130 }
86383816
BP
2131 ovs_mutex_unlock(&netdev->mutex);
2132
2133 return error;
c1c9c9c4
BP
2134}
2135
2136static int
b5d57fc8 2137netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 2138 unsigned int queue_id, const struct smap *details)
c1c9c9c4 2139{
b5d57fc8 2140 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2141 int error;
2142
86383816 2143 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2144 error = tc_query_qdisc(netdev_);
86383816
BP
2145 if (!error) {
2146 error = (queue_id < netdev->tc->ops->n_queues
2147 && netdev->tc->ops->class_set
2148 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2149 : EINVAL);
c1c9c9c4 2150 }
86383816 2151 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2152
86383816 2153 return error;
c1c9c9c4
BP
2154}
2155
2156static int
b5d57fc8 2157netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 2158{
b5d57fc8 2159 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2160 int error;
2161
86383816 2162 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2163 error = tc_query_qdisc(netdev_);
86383816
BP
2164 if (!error) {
2165 if (netdev->tc->ops->class_delete) {
2166 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2167 error = (queue
2168 ? netdev->tc->ops->class_delete(netdev_, queue)
2169 : ENOENT);
2170 } else {
2171 error = EINVAL;
2172 }
c1c9c9c4 2173 }
86383816
BP
2174 ovs_mutex_unlock(&netdev->mutex);
2175
2176 return error;
c1c9c9c4
BP
2177}
2178
2179static int
b5d57fc8 2180netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2181 unsigned int queue_id,
2182 struct netdev_queue_stats *stats)
2183{
b5d57fc8 2184 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2185 int error;
2186
86383816 2187 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2188 error = tc_query_qdisc(netdev_);
86383816
BP
2189 if (!error) {
2190 if (netdev->tc->ops->class_get_stats) {
2191 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2192 if (queue) {
2193 stats->created = queue->created;
2194 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2195 stats);
2196 } else {
2197 error = ENOENT;
2198 }
2199 } else {
2200 error = EOPNOTSUPP;
6dc34a0d 2201 }
c1c9c9c4 2202 }
86383816
BP
2203 ovs_mutex_unlock(&netdev->mutex);
2204
2205 return error;
c1c9c9c4
BP
2206}
2207
d57695d7
JS
/* State of an in-progress Netlink dump of traffic-control classes: the dump
 * itself ('dump', started by start_queue_dump()) plus the buffer ('buf') that
 * is handed to nl_dump_next() while iterating.  Released by
 * finish_queue_dump(). */
2208struct queue_dump_state {
2209 struct nl_dump dump;
2210 struct ofpbuf buf;
2211};
2212
23a98ffe 2213static bool
d57695d7 2214start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
c1c9c9c4
BP
2215{
2216 struct ofpbuf request;
2217 struct tcmsg *tcmsg;
2218
2219 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2220 if (!tcmsg) {
2221 return false;
2222 }
3c4de644 2223 tcmsg->tcm_parent = 0;
d57695d7 2224 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
c1c9c9c4 2225 ofpbuf_uninit(&request);
d57695d7
JS
2226
2227 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
23a98ffe 2228 return true;
c1c9c9c4
BP
2229}
2230
d57695d7
JS
2231static int
2232finish_queue_dump(struct queue_dump_state *state)
2233{
2234 ofpbuf_uninit(&state->buf);
2235 return nl_dump_done(&state->dump);
2236}
2237
89454bf4
BP
/* Iteration state shared by netdev_linux_queue_dump_start(), _next(), and
 * _done(): 'queues' is a heap-allocated snapshot of 'n_queues' queue IDs
 * taken when the dump starts, and 'cur_queue' is the index of the next
 * snapshot entry to examine. */
2238struct netdev_linux_queue_state {
2239 unsigned int *queues;
2240 size_t cur_queue;
2241 size_t n_queues;
2242};
2243
c1c9c9c4 2244static int
89454bf4 2245netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
c1c9c9c4 2246{
89454bf4 2247 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2248 int error;
2249
86383816 2250 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2251 error = tc_query_qdisc(netdev_);
86383816
BP
2252 if (!error) {
2253 if (netdev->tc->ops->class_get) {
89454bf4
BP
2254 struct netdev_linux_queue_state *state;
2255 struct tc_queue *queue;
2256 size_t i;
2257
2258 *statep = state = xmalloc(sizeof *state);
2259 state->n_queues = hmap_count(&netdev->tc->queues);
2260 state->cur_queue = 0;
2261 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2262
2263 i = 0;
2264 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2265 state->queues[i++] = queue->queue_id;
86383816 2266 }
c1c9c9c4 2267 } else {
86383816 2268 error = EOPNOTSUPP;
c1c9c9c4
BP
2269 }
2270 }
86383816 2271 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2272
86383816 2273 return error;
c1c9c9c4
BP
2274}
2275
89454bf4
BP
2276static int
2277netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2278 unsigned int *queue_idp, struct smap *details)
2279{
2280 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2281 struct netdev_linux_queue_state *state = state_;
2282 int error = EOF;
2283
2284 ovs_mutex_lock(&netdev->mutex);
2285 while (state->cur_queue < state->n_queues) {
2286 unsigned int queue_id = state->queues[state->cur_queue++];
2287 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2288
2289 if (queue) {
2290 *queue_idp = queue_id;
2291 error = netdev->tc->ops->class_get(netdev_, queue, details);
2292 break;
2293 }
2294 }
2295 ovs_mutex_unlock(&netdev->mutex);
2296
2297 return error;
2298}
2299
2300static int
2301netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2302 void *state_)
2303{
2304 struct netdev_linux_queue_state *state = state_;
2305
2306 free(state->queues);
2307 free(state);
2308 return 0;
2309}
2310
c1c9c9c4 2311static int
b5d57fc8 2312netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2313 netdev_dump_queue_stats_cb *cb, void *aux)
2314{
b5d57fc8 2315 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2316 int error;
2317
86383816 2318 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2319 error = tc_query_qdisc(netdev_);
86383816 2320 if (!error) {
d57695d7 2321 struct queue_dump_state state;
c1c9c9c4 2322
86383816
BP
2323 if (!netdev->tc->ops->class_dump_stats) {
2324 error = EOPNOTSUPP;
d57695d7 2325 } else if (!start_queue_dump(netdev_, &state)) {
86383816
BP
2326 error = ENODEV;
2327 } else {
2328 struct ofpbuf msg;
2329 int retval;
2330
d57695d7 2331 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
86383816
BP
2332 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2333 cb, aux);
2334 if (retval) {
2335 error = retval;
2336 }
2337 }
2338
d57695d7 2339 retval = finish_queue_dump(&state);
86383816
BP
2340 if (retval) {
2341 error = retval;
2342 }
c1c9c9c4
BP
2343 }
2344 }
86383816 2345 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2346
86383816 2347 return error;
c1c9c9c4
BP
2348}
2349
8b61709d 2350static int
f1acd62b
BP
2351netdev_linux_get_in4(const struct netdev *netdev_,
2352 struct in_addr *address, struct in_addr *netmask)
8b61709d 2353{
b5d57fc8 2354 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2355 int error;
149f577a 2356
86383816 2357 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2358 if (!(netdev->cache_valid & VALID_IN4)) {
b5d57fc8 2359 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
8b61709d 2360 SIOCGIFADDR, "SIOCGIFADDR");
86383816
BP
2361 if (!error) {
2362 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2363 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2364 if (!error) {
2365 netdev->cache_valid |= VALID_IN4;
2366 }
8b61709d 2367 }
86383816
BP
2368 } else {
2369 error = 0;
2370 }
8b61709d 2371
86383816
BP
2372 if (!error) {
2373 if (netdev->address.s_addr != INADDR_ANY) {
2374 *address = netdev->address;
2375 *netmask = netdev->netmask;
2376 } else {
2377 error = EADDRNOTAVAIL;
f1acd62b 2378 }
8b61709d 2379 }
86383816
BP
2380 ovs_mutex_unlock(&netdev->mutex);
2381
2382 return error;
8b61709d
BP
2383}
2384
8b61709d 2385static int
f1acd62b
BP
2386netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2387 struct in_addr netmask)
8b61709d 2388{
b5d57fc8 2389 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2390 int error;
2391
86383816 2392 ovs_mutex_lock(&netdev->mutex);
f1acd62b 2393 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2394 if (!error) {
b5d57fc8
BP
2395 netdev->cache_valid |= VALID_IN4;
2396 netdev->address = address;
2397 netdev->netmask = netmask;
f1acd62b 2398 if (address.s_addr != INADDR_ANY) {
8b61709d 2399 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2400 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2401 }
2402 }
86383816
BP
2403 ovs_mutex_unlock(&netdev->mutex);
2404
8b61709d
BP
2405 return error;
2406}
2407
/* Parses 'line', which is expected to be one line of /proc/net/if_inet6:
 * sixteen two-digit hex bytes of address, four hex fields that are skipped,
 * and an interface name of at most 16 characters.
 *
 * On a successful parse stores the address bytes in '*in6' and the
 * null-terminated interface name in 'ifname'.  Returns the result of
 * ovs_scan(), i.e. true only if the whole format matched. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *b = in6->s6_addr;
    bool matched;

#define X8 "%2"SCNx8
    matched = ovs_scan(line,
                       " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                       "%*x %*x %*x %*x %16s\n",
                       b + 0, b + 1, b + 2, b + 3,
                       b + 4, b + 5, b + 6, b + 7,
                       b + 8, b + 9, b + 10, b + 11,
                       b + 12, b + 13, b + 14, b + 15,
                       ifname);
    return matched;
}
2423
2424/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2425 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2426static int
2427netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2428{
b5d57fc8 2429 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
2430
2431 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2432 if (!(netdev->cache_valid & VALID_IN6)) {
8b61709d
BP
2433 FILE *file;
2434 char line[128];
2435
b5d57fc8 2436 netdev->in6 = in6addr_any;
8b61709d
BP
2437
2438 file = fopen("/proc/net/if_inet6", "r");
2439 if (file != NULL) {
2440 const char *name = netdev_get_name(netdev_);
2441 while (fgets(line, sizeof line, file)) {
2a022368 2442 struct in6_addr in6_tmp;
8b61709d 2443 char ifname[16 + 1];
2a022368 2444 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
2445 && !strcmp(name, ifname))
2446 {
b5d57fc8 2447 netdev->in6 = in6_tmp;
8b61709d
BP
2448 break;
2449 }
2450 }
2451 fclose(file);
2452 }
b5d57fc8 2453 netdev->cache_valid |= VALID_IN6;
8b61709d 2454 }
b5d57fc8 2455 *in6 = netdev->in6;
86383816
BP
2456 ovs_mutex_unlock(&netdev->mutex);
2457
8b61709d
BP
2458 return 0;
2459}
2460
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  All
 * bytes of '*sa' not covered by the address are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(sa, 0, sizeof *sa);

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memcpy(sa, &in4, sizeof in4);
}
2473
2474static int
2475do_set_addr(struct netdev *netdev,
2476 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2477{
2478 struct ifreq ifr;
149f577a 2479
259e0b1a
BP
2480 make_in4_sockaddr(&ifr.ifr_addr, addr);
2481 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2482 ioctl_name);
8b61709d
BP
2483}
2484
2485/* Adds 'router' as a default IP gateway. */
2486static int
67a4917b 2487netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2488{
2489 struct in_addr any = { INADDR_ANY };
2490 struct rtentry rt;
2491 int error;
2492
2493 memset(&rt, 0, sizeof rt);
2494 make_in4_sockaddr(&rt.rt_dst, any);
2495 make_in4_sockaddr(&rt.rt_gateway, router);
2496 make_in4_sockaddr(&rt.rt_genmask, any);
2497 rt.rt_flags = RTF_UP | RTF_GATEWAY;
259e0b1a 2498 error = af_inet_ioctl(SIOCADDRT, &rt);
8b61709d 2499 if (error) {
10a89ef0 2500 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
2501 }
2502 return error;
2503}
2504
f1acd62b
BP
2505static int
2506netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2507 char **netdev_name)
2508{
2509 static const char fn[] = "/proc/net/route";
2510 FILE *stream;
2511 char line[256];
2512 int ln;
2513
2514 *netdev_name = NULL;
2515 stream = fopen(fn, "r");
2516 if (stream == NULL) {
10a89ef0 2517 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
2518 return errno;
2519 }
2520
2521 ln = 0;
2522 while (fgets(line, sizeof line, stream)) {
2523 if (++ln >= 2) {
2524 char iface[17];
dbba996b 2525 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2526 int refcnt, metric, mtu;
2527 unsigned int flags, use, window, irtt;
2528
c2c28dfd
BP
2529 if (!ovs_scan(line,
2530 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2531 " %d %u %u\n",
2532 iface, &dest, &gateway, &flags, &refcnt,
2533 &use, &metric, &mask, &mtu, &window, &irtt)) {
d295e8e9 2534 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2535 fn, ln, line);
2536 continue;
2537 }
2538 if (!(flags & RTF_UP)) {
2539 /* Skip routes that aren't up. */
2540 continue;
2541 }
2542
2543 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2544 * network byte order, so we don't need need any endian
f1acd62b
BP
2545 * conversions here. */
2546 if ((dest & mask) == (host->s_addr & mask)) {
2547 if (!gateway) {
2548 /* The host is directly reachable. */
2549 next_hop->s_addr = 0;
2550 } else {
2551 /* To reach the host, we must go through a gateway. */
2552 next_hop->s_addr = gateway;
2553 }
2554 *netdev_name = xstrdup(iface);
2555 fclose(stream);
2556 return 0;
2557 }
2558 }
2559 }
2560
2561 fclose(stream);
2562 return ENXIO;
2563}
2564
e210037e 2565static int
b5d57fc8 2566netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 2567{
b5d57fc8 2568 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
2569 int error = 0;
2570
86383816 2571 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
2572 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2573 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
2574
2575 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
2576 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2577 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
2578 cmd,
2579 ETHTOOL_GDRVINFO,
2580 "ETHTOOL_GDRVINFO");
2581 if (!error) {
b5d57fc8 2582 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
2583 }
2584 }
e210037e 2585
e210037e 2586 if (!error) {
b5d57fc8
BP
2587 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2588 smap_add(smap, "driver_version", netdev->drvinfo.version);
2589 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 2590 }
86383816
BP
2591 ovs_mutex_unlock(&netdev->mutex);
2592
e210037e
AE
2593 return error;
2594}
2595
4f925bd3 2596static int
275707c3
EJ
2597netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2598 struct smap *smap)
4f925bd3 2599{
79f1cbe9 2600 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2601 return 0;
2602}
2603
8b61709d
BP
2604/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2605 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2606 * returns 0. Otherwise, it returns a positive errno value; in particular,
2607 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2608static int
2609netdev_linux_arp_lookup(const struct netdev *netdev,
dbba996b 2610 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
8b61709d
BP
2611{
2612 struct arpreq r;
c100e025 2613 struct sockaddr_in sin;
8b61709d
BP
2614 int retval;
2615
2616 memset(&r, 0, sizeof r);
f2cc621b 2617 memset(&sin, 0, sizeof sin);
c100e025
BP
2618 sin.sin_family = AF_INET;
2619 sin.sin_addr.s_addr = ip;
2620 sin.sin_port = 0;
2621 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2622 r.arp_ha.sa_family = ARPHRD_ETHER;
2623 r.arp_flags = 0;
71d7c22f 2624 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d 2625 COVERAGE_INC(netdev_arp_lookup);
259e0b1a 2626 retval = af_inet_ioctl(SIOCGARP, &r);
8b61709d
BP
2627 if (!retval) {
2628 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2629 } else if (retval != ENXIO) {
2630 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
2631 netdev_get_name(netdev), IP_ARGS(ip),
2632 ovs_strerror(retval));
8b61709d
BP
2633 }
2634 return retval;
2635}
2636
2637static int
2638nd_to_iff_flags(enum netdev_flags nd)
2639{
2640 int iff = 0;
2641 if (nd & NETDEV_UP) {
2642 iff |= IFF_UP;
2643 }
2644 if (nd & NETDEV_PROMISC) {
2645 iff |= IFF_PROMISC;
2646 }
7ba19d41
AC
2647 if (nd & NETDEV_LOOPBACK) {
2648 iff |= IFF_LOOPBACK;
2649 }
8b61709d
BP
2650 return iff;
2651}
2652
2653static int
2654iff_to_nd_flags(int iff)
2655{
2656 enum netdev_flags nd = 0;
2657 if (iff & IFF_UP) {
2658 nd |= NETDEV_UP;
2659 }
2660 if (iff & IFF_PROMISC) {
2661 nd |= NETDEV_PROMISC;
2662 }
7ba19d41
AC
2663 if (iff & IFF_LOOPBACK) {
2664 nd |= NETDEV_LOOPBACK;
2665 }
8b61709d
BP
2666 return nd;
2667}
2668
2669static int
4f9f3f21
BP
2670update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2671 enum netdev_flags on, enum netdev_flags *old_flagsp)
2672 OVS_REQUIRES(netdev->mutex)
8b61709d
BP
2673{
2674 int old_flags, new_flags;
c37d4da4
EJ
2675 int error = 0;
2676
b5d57fc8 2677 old_flags = netdev->ifi_flags;
c37d4da4
EJ
2678 *old_flagsp = iff_to_nd_flags(old_flags);
2679 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2680 if (new_flags != old_flags) {
4f9f3f21
BP
2681 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2682 get_flags(&netdev->up, &netdev->ifi_flags);
8b61709d 2683 }
4f9f3f21
BP
2684
2685 return error;
2686}
2687
2688static int
2689netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2690 enum netdev_flags on, enum netdev_flags *old_flagsp)
2691{
2692 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2693 int error;
2694
2695 ovs_mutex_lock(&netdev->mutex);
2696 error = update_flags(netdev, off, on, old_flagsp);
86383816
BP
2697 ovs_mutex_unlock(&netdev->mutex);
2698
8b61709d
BP
2699 return error;
2700}
2701
/* Expands to a positional initializer for a 'struct netdev_class' shared by
 * the Linux netdev classes defined below.  NAME is the class's type string;
 * CONSTRUCT, GET_STATS, GET_FEATURES, and GET_STATUS are the callbacks that
 * differ between classes.  Every other slot is the same for all of them.
 *
 * The entries are positional, so their order must match the member order of
 * 'struct netdev_class'. */
2f9dd77f 2702#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
51f87458 2703 GET_FEATURES, GET_STATUS) \
c3827f61
BP
2704{ \
2705 NAME, \
2706 \
259e0b1a 2707 NULL, \
c3827f61
BP
2708 netdev_linux_run, \
2709 netdev_linux_wait, \
2710 \
9dc63482
BP
2711 netdev_linux_alloc, \
2712 CONSTRUCT, \
2713 netdev_linux_destruct, \
2714 netdev_linux_dealloc, \
de5cdb90 2715 NULL, /* get_config */ \
6d9e6eb4 2716 NULL, /* set_config */ \
f431bf7d 2717 NULL, /* get_tunnel_config */ \
7dec44fe 2718 NULL, /* get_numa_id */ \
5496878c 2719 NULL, /* set_multiq */ \
c3827f61 2720 \
c3827f61
BP
2721 netdev_linux_send, \
2722 netdev_linux_send_wait, \
2723 \
2724 netdev_linux_set_etheraddr, \
2725 netdev_linux_get_etheraddr, \
2726 netdev_linux_get_mtu, \
9b020780 2727 netdev_linux_set_mtu, \
c3827f61
BP
2728 netdev_linux_get_ifindex, \
2729 netdev_linux_get_carrier, \
65c3058c 2730 netdev_linux_get_carrier_resets, \
1670c579 2731 netdev_linux_set_miimon_interval, \
f613a0d7 2732 GET_STATS, \
c3827f61 2733 \
51f87458 2734 GET_FEATURES, \
c3827f61 2735 netdev_linux_set_advertisements, \
c3827f61
BP
2736 \
2737 netdev_linux_set_policing, \
2738 netdev_linux_get_qos_types, \
2739 netdev_linux_get_qos_capabilities, \
2740 netdev_linux_get_qos, \
2741 netdev_linux_set_qos, \
2742 netdev_linux_get_queue, \
2743 netdev_linux_set_queue, \
2744 netdev_linux_delete_queue, \
2745 netdev_linux_get_queue_stats, \
89454bf4
BP
2746 netdev_linux_queue_dump_start, \
2747 netdev_linux_queue_dump_next, \
2748 netdev_linux_queue_dump_done, \
c3827f61
BP
2749 netdev_linux_dump_queue_stats, \
2750 \
2751 netdev_linux_get_in4, \
2752 netdev_linux_set_in4, \
2753 netdev_linux_get_in6, \
2754 netdev_linux_add_router, \
2755 netdev_linux_get_next_hop, \
4f925bd3 2756 GET_STATUS, \
c3827f61
BP
2757 netdev_linux_arp_lookup, \
2758 \
2759 netdev_linux_update_flags, \
2760 \
f7791740
PS
2761 netdev_linux_rxq_alloc, \
2762 netdev_linux_rxq_construct, \
2763 netdev_linux_rxq_destruct, \
2764 netdev_linux_rxq_dealloc, \
2765 netdev_linux_rxq_recv, \
2766 netdev_linux_rxq_wait, \
2767 netdev_linux_rxq_drain, \
c3827f61
BP
2768}
2769
/* "system" class: uses the generic constructor, netdev_linux_get_stats(),
 * and the ethtool-based feature and status callbacks. */
2770const struct netdev_class netdev_linux_class =
2771 NETDEV_LINUX_CLASS(
2772 "system",
9dc63482 2773 netdev_linux_construct,
f613a0d7 2774 netdev_linux_get_stats,
51f87458 2775 netdev_linux_get_features,
275707c3 2776 netdev_linux_get_status);
c3827f61
BP
2777
/* "tap" class: same as "system" except for its own constructor,
 * netdev_linux_construct_tap(), and statistics callback,
 * netdev_tap_get_stats(). */
2778const struct netdev_class netdev_tap_class =
2779 NETDEV_LINUX_CLASS(
2780 "tap",
9dc63482 2781 netdev_linux_construct_tap,
bba1e6f3 2782 netdev_tap_get_stats,
51f87458 2783 netdev_linux_get_features,
275707c3 2784 netdev_linux_get_status);
c3827f61
BP
2785
/* "internal" class: uses the generic constructor with
 * netdev_internal_get_stats() and netdev_internal_get_status(), and provides
 * no get_features callback. */
2786const struct netdev_class netdev_internal_class =
2787 NETDEV_LINUX_CLASS(
2788 "internal",
9dc63482 2789 netdev_linux_construct,
bba1e6f3 2790 netdev_internal_get_stats,
51f87458 2791 NULL, /* get_features */
275707c3 2792 netdev_internal_get_status);
8b61709d 2793\f
c1c9c9c4 2794/* HTB traffic control class. */
559843ed 2795
/* Number of queues supported by the HTB class (the 'n_queues' entry of
 * tc_ops_htb).  Class minor numbers 1...HTB_N_QUEUES map to queue IDs
 * 0...HTB_N_QUEUES - 1. */
#define HTB_N_QUEUES 0xf000
8b61709d 2797
c1c9c9c4
BP
/* Per-netdev HTB state.  'tc' is embedded so that the containing struct htb
 * can be recovered from netdev->tc with CONTAINER_OF (see htb_get__()). */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};
8b61709d 2802
/* One HTB class, i.e. one queue.  'tc_queue' is embedded so that the
 * containing struct htb_class can be recovered from a struct tc_queue pointer
 * with CONTAINER_OF (see htb_class_cast__()). */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
8b61709d 2810
c1c9c9c4 2811static struct htb *
b5d57fc8 2812htb_get__(const struct netdev *netdev_)
c1c9c9c4 2813{
b5d57fc8
BP
2814 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2815 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
2816}
2817
24045e35 2818static void
b5d57fc8 2819htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 2820{
b5d57fc8 2821 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2822 struct htb *htb;
2823
2824 htb = xmalloc(sizeof *htb);
2825 tc_init(&htb->tc, &tc_ops_htb);
2826 htb->max_rate = max_rate;
2827
b5d57fc8 2828 netdev->tc = &htb->tc;
c1c9c9c4
BP
2829}
2830
2831/* Create an HTB qdisc.
2832 *
a339aa81 2833 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
2834static int
2835htb_setup_qdisc__(struct netdev *netdev)
2836{
2837 size_t opt_offset;
2838 struct tc_htb_glob opt;
2839 struct ofpbuf request;
2840 struct tcmsg *tcmsg;
2841
2842 tc_del_qdisc(netdev);
2843
2844 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2845 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
2846 if (!tcmsg) {
2847 return ENODEV;
2848 }
c1c9c9c4
BP
2849 tcmsg->tcm_handle = tc_make_handle(1, 0);
2850 tcmsg->tcm_parent = TC_H_ROOT;
2851
2852 nl_msg_put_string(&request, TCA_KIND, "htb");
2853
2854 memset(&opt, 0, sizeof opt);
2855 opt.rate2quantum = 10;
2856 opt.version = 3;
4ecf12d5 2857 opt.defcls = 1;
c1c9c9c4
BP
2858
2859 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2860 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2861 nl_msg_end_nested(&request, opt_offset);
2862
2863 return tc_transact(&request, NULL);
2864}
2865
2866/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2867 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2868static int
2869htb_setup_class__(struct netdev *netdev, unsigned int handle,
2870 unsigned int parent, struct htb_class *class)
2871{
2872 size_t opt_offset;
2873 struct tc_htb_opt opt;
2874 struct ofpbuf request;
2875 struct tcmsg *tcmsg;
2876 int error;
2877 int mtu;
2878
73371c09 2879 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 2880 if (error) {
f915f1a8
BP
2881 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2882 netdev_get_name(netdev));
9b020780 2883 return error;
f915f1a8 2884 }
c1c9c9c4
BP
2885
2886 memset(&opt, 0, sizeof opt);
2887 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2888 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2889 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2890 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2891 opt.prio = class->priority;
2892
2893 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
2894 if (!tcmsg) {
2895 return ENODEV;
2896 }
c1c9c9c4
BP
2897 tcmsg->tcm_handle = handle;
2898 tcmsg->tcm_parent = parent;
2899
2900 nl_msg_put_string(&request, TCA_KIND, "htb");
2901 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2902 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2903 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2904 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2905 nl_msg_end_nested(&request, opt_offset);
2906
2907 error = tc_transact(&request, NULL);
2908 if (error) {
2909 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2910 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2911 netdev_get_name(netdev),
2912 tc_get_major(handle), tc_get_minor(handle),
2913 tc_get_major(parent), tc_get_minor(parent),
2914 class->min_rate, class->max_rate,
10a89ef0 2915 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
2916 }
2917 return error;
2918}
2919
2920/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2921 * description of them into 'details'. The description complies with the
2922 * specification given in the vswitch database documentation for linux-htb
2923 * queue details. */
2924static int
2925htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2926{
2927 static const struct nl_policy tca_htb_policy[] = {
2928 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2929 .min_len = sizeof(struct tc_htb_opt) },
2930 };
2931
2932 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2933 const struct tc_htb_opt *htb;
2934
2935 if (!nl_parse_nested(nl_options, tca_htb_policy,
2936 attrs, ARRAY_SIZE(tca_htb_policy))) {
2937 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2938 return EPROTO;
2939 }
2940
2941 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2942 class->min_rate = htb->rate.rate;
2943 class->max_rate = htb->ceil.rate;
2944 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2945 class->priority = htb->prio;
2946 return 0;
2947}
2948
2949static int
2950htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2951 struct htb_class *options,
2952 struct netdev_queue_stats *stats)
2953{
2954 struct nlattr *nl_options;
2955 unsigned int handle;
2956 int error;
2957
2958 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2959 if (!error && queue_id) {
17ee3c1f
BP
2960 unsigned int major = tc_get_major(handle);
2961 unsigned int minor = tc_get_minor(handle);
2962 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2963 *queue_id = minor - 1;
c1c9c9c4
BP
2964 } else {
2965 error = EPROTO;
2966 }
2967 }
2968 if (!error && options) {
2969 error = htb_parse_tca_options__(nl_options, options);
2970 }
2971 return error;
2972}
2973
2974static void
73371c09 2975htb_parse_qdisc_details__(struct netdev *netdev_,
79f1cbe9 2976 const struct smap *details, struct htb_class *hc)
c1c9c9c4 2977{
73371c09 2978 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2979 const char *max_rate_s;
2980
79f1cbe9 2981 max_rate_s = smap_get(details, "max-rate");
c1c9c9c4
BP
2982 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2983 if (!hc->max_rate) {
a00ca915 2984 enum netdev_features current;
c1c9c9c4 2985
73371c09
BP
2986 netdev_linux_read_features(netdev);
2987 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 2988 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
2989 }
2990 hc->min_rate = hc->max_rate;
2991 hc->burst = 0;
2992 hc->priority = 0;
2993}
2994
2995static int
2996htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 2997 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
2998{
2999 const struct htb *htb = htb_get__(netdev);
79f1cbe9
EJ
3000 const char *min_rate_s = smap_get(details, "min-rate");
3001 const char *max_rate_s = smap_get(details, "max-rate");
3002 const char *burst_s = smap_get(details, "burst");
3003 const char *priority_s = smap_get(details, "priority");
9b020780 3004 int mtu, error;
c1c9c9c4 3005
73371c09 3006 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3007 if (error) {
f915f1a8
BP
3008 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3009 netdev_get_name(netdev));
9b020780 3010 return error;
f915f1a8
BP
3011 }
3012
4f104611
EJ
3013 /* HTB requires at least an mtu sized min-rate to send any traffic even
3014 * on uncongested links. */
c45ab5e9 3015 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4f104611 3016 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
3017 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3018
3019 /* max-rate */
3020 hc->max_rate = (max_rate_s
3021 ? strtoull(max_rate_s, NULL, 10) / 8
3022 : htb->max_rate);
3023 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3024 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3025
3026 /* burst
3027 *
3028 * According to hints in the documentation that I've read, it is important
3029 * that 'burst' be at least as big as the largest frame that might be
3030 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3031 * but having it a bit too small is a problem. Since netdev_get_mtu()
3032 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3033 * the MTU. We actually add 64, instead of 14, as a guard against
3034 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
3035 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3036 hc->burst = MAX(hc->burst, mtu + 64);
3037
3038 /* priority */
3039 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
3040
3041 return 0;
3042}
3043
/* Queries class 'handle' (under 'parent') on 'netdev' and parses the reply.
 * The class parameters go into '*options' and the statistics into '*stats';
 * either may be null.
 *
 * Returns 0 on success, otherwise a positive errno value from the query or
 * from parsing the reply. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *answer;
    int err;

    err = tc_query_class(netdev, handle, parent, &answer);
    if (err) {
        return err;
    }

    err = htb_parse_tcmsg__(answer, NULL, options, stats);
    ofpbuf_delete(answer);
    return err;
}
3059
3060static int
79f1cbe9 3061htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3062{
3063 int error;
3064
3065 error = htb_setup_qdisc__(netdev);
3066 if (!error) {
3067 struct htb_class hc;
3068
3069 htb_parse_qdisc_details__(netdev, details, &hc);
3070 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3071 tc_make_handle(1, 0), &hc);
3072 if (!error) {
3073 htb_install__(netdev, hc.max_rate);
3074 }
3075 }
3076 return error;
3077}
3078
93b13be8
BP
3079static struct htb_class *
3080htb_class_cast__(const struct tc_queue *queue)
3081{
3082 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3083}
3084
c1c9c9c4
BP
3085static void
3086htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3087 const struct htb_class *hc)
3088{
3089 struct htb *htb = htb_get__(netdev);
93b13be8
BP
3090 size_t hash = hash_int(queue_id, 0);
3091 struct tc_queue *queue;
c1c9c9c4
BP
3092 struct htb_class *hcp;
3093
93b13be8
BP
3094 queue = tc_find_queue__(netdev, queue_id, hash);
3095 if (queue) {
3096 hcp = htb_class_cast__(queue);
3097 } else {
c1c9c9c4 3098 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
3099 queue = &hcp->tc_queue;
3100 queue->queue_id = queue_id;
6dc34a0d 3101 queue->created = time_msec();
93b13be8 3102 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 3103 }
93b13be8
BP
3104
3105 hcp->min_rate = hc->min_rate;
3106 hcp->max_rate = hc->max_rate;
3107 hcp->burst = hc->burst;
3108 hcp->priority = hc->priority;
c1c9c9c4
BP
3109}
3110
3111static int
3112htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3113{
c1c9c9c4 3114 struct ofpbuf msg;
d57695d7 3115 struct queue_dump_state state;
c1c9c9c4 3116 struct htb_class hc;
c1c9c9c4
BP
3117
3118 /* Get qdisc options. */
3119 hc.max_rate = 0;
3120 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3121 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
3122
3123 /* Get queues. */
d57695d7 3124 if (!start_queue_dump(netdev, &state)) {
23a98ffe
BP
3125 return ENODEV;
3126 }
d57695d7 3127 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
c1c9c9c4
BP
3128 unsigned int queue_id;
3129
3130 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3131 htb_update_queue__(netdev, queue_id, &hc);
3132 }
3133 }
d57695d7 3134 finish_queue_dump(&state);
c1c9c9c4
BP
3135
3136 return 0;
3137}
3138
3139static void
3140htb_tc_destroy(struct tc *tc)
3141{
3142 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 3143 struct htb_class *hc, *next;
c1c9c9c4 3144
4e8e4213 3145 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 3146 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
3147 free(hc);
3148 }
3149 tc_destroy(tc);
3150 free(htb);
3151}
3152
3153static int
79f1cbe9 3154htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
3155{
3156 const struct htb *htb = htb_get__(netdev);
79f1cbe9 3157 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
3158 return 0;
3159}
3160
3161static int
79f1cbe9 3162htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3163{
3164 struct htb_class hc;
3165 int error;
3166
3167 htb_parse_qdisc_details__(netdev, details, &hc);
3168 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3169 tc_make_handle(1, 0), &hc);
3170 if (!error) {
3171 htb_get__(netdev)->max_rate = hc.max_rate;
3172 }
3173 return error;
3174}
3175
3176static int
93b13be8 3177htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 3178 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 3179{
93b13be8 3180 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3181
79f1cbe9 3182 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 3183 if (hc->min_rate != hc->max_rate) {
79f1cbe9 3184 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 3185 }
79f1cbe9 3186 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 3187 if (hc->priority) {
79f1cbe9 3188 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
3189 }
3190 return 0;
3191}
3192
3193static int
3194htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 3195 const struct smap *details)
c1c9c9c4
BP
3196{
3197 struct htb_class hc;
3198 int error;
3199
3200 error = htb_parse_class_details__(netdev, details, &hc);
3201 if (error) {
3202 return error;
3203 }
3204
17ee3c1f 3205 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
3206 tc_make_handle(1, 0xfffe), &hc);
3207 if (error) {
3208 return error;
3209 }
3210
3211 htb_update_queue__(netdev, queue_id, &hc);
3212 return 0;
3213}
3214
3215static int
93b13be8 3216htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 3217{
93b13be8 3218 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3219 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
3220 int error;
3221
93b13be8 3222 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 3223 if (!error) {
93b13be8 3224 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 3225 free(hc);
c1c9c9c4
BP
3226 }
3227 return error;
3228}
3229
3230static int
93b13be8 3231htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
3232 struct netdev_queue_stats *stats)
3233{
93b13be8 3234 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
3235 tc_make_handle(1, 0xfffe), NULL, stats);
3236}
3237
3238static int
3239htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3240 const struct ofpbuf *nlmsg,
3241 netdev_dump_queue_stats_cb *cb, void *aux)
3242{
3243 struct netdev_queue_stats stats;
17ee3c1f 3244 unsigned int handle, major, minor;
c1c9c9c4
BP
3245 int error;
3246
3247 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3248 if (error) {
3249 return error;
3250 }
3251
17ee3c1f
BP
3252 major = tc_get_major(handle);
3253 minor = tc_get_minor(handle);
3254 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 3255 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
3256 }
3257 return 0;
3258}
3259
/* Operations table that ties the "linux-htb" QoS type to the kernel "htb"
 * qdisc.  Entries are positional; each one is labeled with the slot it
 * fills. */
static const struct tc_ops tc_ops_htb = {
    "htb",                      /* linux_name */
    "linux-htb",                /* ovs_name */
    HTB_N_QUEUES,               /* n_queues */
    htb_tc_install,             /* tc_install */
    htb_tc_load,                /* tc_load */
    htb_tc_destroy,             /* tc_destroy */
    htb_qdisc_get,              /* qdisc_get */
    htb_qdisc_set,              /* qdisc_set */
    htb_class_get,              /* class_get */
    htb_class_set,              /* class_set */
    htb_class_delete,           /* class_delete */
    htb_class_get_stats,        /* class_get_stats */
    htb_class_dump_stats        /* class_dump_stats */
};
3275\f
a339aa81
EJ
3276/* "linux-hfsc" traffic control class. */
3277
/* Number of queues supported by the HFSC class (the 'n_queues' entry of
 * tc_ops_hfsc).  Class minor numbers 1...HFSC_N_QUEUES map to queue IDs
 * 0...HFSC_N_QUEUES - 1. */
#define HFSC_N_QUEUES 0xf000
3279
/* Per-netdev HFSC state.  'tc' is embedded so that the containing struct hfsc
 * can be recovered from netdev->tc with CONTAINER_OF (see hfsc_get__()). */
struct hfsc {
    struct tc tc;
    uint32_t max_rate;          /* Reported as 8 * max_rate by
                                 * hfsc_qdisc_get(), i.e. held in bytes/s. */
};
3284
/* One HFSC class, i.e. one queue.  'tc_queue' is embedded so that the
 * containing struct hfsc_class can be recovered from a struct tc_queue pointer
 * with CONTAINER_OF (see hfsc_class_cast__()).  Both rates are reported
 * multiplied by 8 by hfsc_class_get(), i.e. they are held in bytes/s. */
struct hfsc_class {
    struct tc_queue tc_queue;
    uint32_t min_rate;
    uint32_t max_rate;
};
3290
3291static struct hfsc *
b5d57fc8 3292hfsc_get__(const struct netdev *netdev_)
a339aa81 3293{
b5d57fc8
BP
3294 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3295 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
3296}
3297
3298static struct hfsc_class *
3299hfsc_class_cast__(const struct tc_queue *queue)
3300{
3301 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3302}
3303
24045e35 3304static void
b5d57fc8 3305hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 3306{
b5d57fc8 3307 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
3308 struct hfsc *hfsc;
3309
a339aa81
EJ
3310 hfsc = xmalloc(sizeof *hfsc);
3311 tc_init(&hfsc->tc, &tc_ops_hfsc);
3312 hfsc->max_rate = max_rate;
b5d57fc8 3313 netdev->tc = &hfsc->tc;
a339aa81
EJ
3314}
3315
3316static void
3317hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3318 const struct hfsc_class *hc)
3319{
3320 size_t hash;
3321 struct hfsc *hfsc;
3322 struct hfsc_class *hcp;
3323 struct tc_queue *queue;
3324
3325 hfsc = hfsc_get__(netdev);
3326 hash = hash_int(queue_id, 0);
3327
3328 queue = tc_find_queue__(netdev, queue_id, hash);
3329 if (queue) {
3330 hcp = hfsc_class_cast__(queue);
3331 } else {
3332 hcp = xmalloc(sizeof *hcp);
3333 queue = &hcp->tc_queue;
3334 queue->queue_id = queue_id;
6dc34a0d 3335 queue->created = time_msec();
a339aa81
EJ
3336 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3337 }
3338
3339 hcp->min_rate = hc->min_rate;
3340 hcp->max_rate = hc->max_rate;
3341}
3342
3343static int
3344hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3345{
3346 const struct tc_service_curve *rsc, *fsc, *usc;
3347 static const struct nl_policy tca_hfsc_policy[] = {
3348 [TCA_HFSC_RSC] = {
3349 .type = NL_A_UNSPEC,
3350 .optional = false,
3351 .min_len = sizeof(struct tc_service_curve),
3352 },
3353 [TCA_HFSC_FSC] = {
3354 .type = NL_A_UNSPEC,
3355 .optional = false,
3356 .min_len = sizeof(struct tc_service_curve),
3357 },
3358 [TCA_HFSC_USC] = {
3359 .type = NL_A_UNSPEC,
3360 .optional = false,
3361 .min_len = sizeof(struct tc_service_curve),
3362 },
3363 };
3364 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3365
3366 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3367 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3368 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3369 return EPROTO;
3370 }
3371
3372 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3373 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3374 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3375
3376 if (rsc->m1 != 0 || rsc->d != 0 ||
3377 fsc->m1 != 0 || fsc->d != 0 ||
3378 usc->m1 != 0 || usc->d != 0) {
3379 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3380 "Non-linear service curves are not supported.");
3381 return EPROTO;
3382 }
3383
3384 if (rsc->m2 != fsc->m2) {
3385 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3386 "Real-time service curves are not supported ");
3387 return EPROTO;
3388 }
3389
3390 if (rsc->m2 > usc->m2) {
3391 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3392 "Min-rate service curve is greater than "
3393 "the max-rate service curve.");
3394 return EPROTO;
3395 }
3396
3397 class->min_rate = fsc->m2;
3398 class->max_rate = usc->m2;
3399 return 0;
3400}
3401
3402static int
3403hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3404 struct hfsc_class *options,
3405 struct netdev_queue_stats *stats)
3406{
3407 int error;
3408 unsigned int handle;
3409 struct nlattr *nl_options;
3410
3411 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3412 if (error) {
3413 return error;
3414 }
3415
3416 if (queue_id) {
3417 unsigned int major, minor;
3418
3419 major = tc_get_major(handle);
3420 minor = tc_get_minor(handle);
3421 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3422 *queue_id = minor - 1;
3423 } else {
3424 return EPROTO;
3425 }
3426 }
3427
3428 if (options) {
3429 error = hfsc_parse_tca_options__(nl_options, options);
3430 }
3431
3432 return error;
3433}
3434
/* Queries class 'handle' (under 'parent') on 'netdev' and parses the reply.
 * The class parameters go into '*options' and the statistics into '*stats';
 * either may be null.
 *
 * Returns 0 on success, otherwise a positive errno value from the query or
 * from parsing the reply. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *answer;
    int err;

    err = tc_query_class(netdev, handle, parent, &answer);
    if (!err) {
        err = hfsc_parse_tcmsg__(answer, NULL, options, stats);
        ofpbuf_delete(answer);
    }
    return err;
}
3452
3453static void
73371c09 3454hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
a339aa81
EJ
3455 struct hfsc_class *class)
3456{
73371c09 3457 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
3458 uint32_t max_rate;
3459 const char *max_rate_s;
3460
79f1cbe9 3461 max_rate_s = smap_get(details, "max-rate");
a339aa81
EJ
3462 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3463
3464 if (!max_rate) {
a00ca915 3465 enum netdev_features current;
a339aa81 3466
73371c09
BP
3467 netdev_linux_read_features(netdev);
3468 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 3469 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
3470 }
3471
3472 class->min_rate = max_rate;
3473 class->max_rate = max_rate;
3474}
3475
3476static int
3477hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 3478 const struct smap *details,
a339aa81
EJ
3479 struct hfsc_class * class)
3480{
3481 const struct hfsc *hfsc;
3482 uint32_t min_rate, max_rate;
3483 const char *min_rate_s, *max_rate_s;
3484
3485 hfsc = hfsc_get__(netdev);
79f1cbe9
EJ
3486 min_rate_s = smap_get(details, "min-rate");
3487 max_rate_s = smap_get(details, "max-rate");
a339aa81 3488
c45ab5e9 3489 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
79398bad 3490 min_rate = MAX(min_rate, 1);
a339aa81
EJ
3491 min_rate = MIN(min_rate, hfsc->max_rate);
3492
3493 max_rate = (max_rate_s
3494 ? strtoull(max_rate_s, NULL, 10) / 8
3495 : hfsc->max_rate);
3496 max_rate = MAX(max_rate, min_rate);
3497 max_rate = MIN(max_rate, hfsc->max_rate);
3498
3499 class->min_rate = min_rate;
3500 class->max_rate = max_rate;
3501
3502 return 0;
3503}
3504
3505/* Create an HFSC qdisc.
3506 *
3507 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3508static int
3509hfsc_setup_qdisc__(struct netdev * netdev)
3510{
3511 struct tcmsg *tcmsg;
3512 struct ofpbuf request;
3513 struct tc_hfsc_qopt opt;
3514
3515 tc_del_qdisc(netdev);
3516
3517 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3518 NLM_F_EXCL | NLM_F_CREATE, &request);
3519
3520 if (!tcmsg) {
3521 return ENODEV;
3522 }
3523
3524 tcmsg->tcm_handle = tc_make_handle(1, 0);
3525 tcmsg->tcm_parent = TC_H_ROOT;
3526
3527 memset(&opt, 0, sizeof opt);
3528 opt.defcls = 1;
3529
3530 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3531 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3532
3533 return tc_transact(&request, NULL);
3534}
3535
3536/* Create an HFSC class.
3537 *
3538 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3539 * sc rate <min_rate> ul rate <max_rate>" */
3540static int
3541hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3542 unsigned int parent, struct hfsc_class *class)
3543{
3544 int error;
3545 size_t opt_offset;
3546 struct tcmsg *tcmsg;
3547 struct ofpbuf request;
3548 struct tc_service_curve min, max;
3549
3550 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3551
3552 if (!tcmsg) {
3553 return ENODEV;
3554 }
3555
3556 tcmsg->tcm_handle = handle;
3557 tcmsg->tcm_parent = parent;
3558
3559 min.m1 = 0;
3560 min.d = 0;
3561 min.m2 = class->min_rate;
3562
3563 max.m1 = 0;
3564 max.d = 0;
3565 max.m2 = class->max_rate;
3566
3567 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3568 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3569 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3570 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3571 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3572 nl_msg_end_nested(&request, opt_offset);
3573
3574 error = tc_transact(&request, NULL);
3575 if (error) {
3576 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3577 "min-rate %ubps, max-rate %ubps (%s)",
3578 netdev_get_name(netdev),
3579 tc_get_major(handle), tc_get_minor(handle),
3580 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 3581 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
3582 }
3583
3584 return error;
3585}
3586
3587static int
79f1cbe9 3588hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
3589{
3590 int error;
3591 struct hfsc_class class;
3592
3593 error = hfsc_setup_qdisc__(netdev);
3594
3595 if (error) {
3596 return error;
3597 }
3598
3599 hfsc_parse_qdisc_details__(netdev, details, &class);
3600 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3601 tc_make_handle(1, 0), &class);
3602
3603 if (error) {
3604 return error;
3605 }
3606
3607 hfsc_install__(netdev, class.max_rate);
3608 return 0;
3609}
3610
3611static int
3612hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3613{
3614 struct ofpbuf msg;
d57695d7 3615 struct queue_dump_state state;
a339aa81
EJ
3616 struct hfsc_class hc;
3617
3618 hc.max_rate = 0;
3619 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3620 hfsc_install__(netdev, hc.max_rate);
a339aa81 3621
d57695d7 3622 if (!start_queue_dump(netdev, &state)) {
a339aa81
EJ
3623 return ENODEV;
3624 }
3625
d57695d7 3626 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
a339aa81
EJ
3627 unsigned int queue_id;
3628
3629 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3630 hfsc_update_queue__(netdev, queue_id, &hc);
3631 }
3632 }
3633
d57695d7 3634 finish_queue_dump(&state);
a339aa81
EJ
3635 return 0;
3636}
3637
3638static void
3639hfsc_tc_destroy(struct tc *tc)
3640{
3641 struct hfsc *hfsc;
3642 struct hfsc_class *hc, *next;
3643
3644 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3645
3646 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3647 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3648 free(hc);
3649 }
3650
3651 tc_destroy(tc);
3652 free(hfsc);
3653}
3654
3655static int
79f1cbe9 3656hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
3657{
3658 const struct hfsc *hfsc;
3659 hfsc = hfsc_get__(netdev);
79f1cbe9 3660 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
3661 return 0;
3662}
3663
3664static int
79f1cbe9 3665hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
3666{
3667 int error;
3668 struct hfsc_class class;
3669
3670 hfsc_parse_qdisc_details__(netdev, details, &class);
3671 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3672 tc_make_handle(1, 0), &class);
3673
3674 if (!error) {
3675 hfsc_get__(netdev)->max_rate = class.max_rate;
3676 }
3677
3678 return error;
3679}
3680
3681static int
3682hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 3683 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
3684{
3685 const struct hfsc_class *hc;
3686
3687 hc = hfsc_class_cast__(queue);
79f1cbe9 3688 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 3689 if (hc->min_rate != hc->max_rate) {
79f1cbe9 3690 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
3691 }
3692 return 0;
3693}
3694
3695static int
3696hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 3697 const struct smap *details)
a339aa81
EJ
3698{
3699 int error;
3700 struct hfsc_class class;
3701
3702 error = hfsc_parse_class_details__(netdev, details, &class);
3703 if (error) {
3704 return error;
3705 }
3706
3707 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3708 tc_make_handle(1, 0xfffe), &class);
3709 if (error) {
3710 return error;
3711 }
3712
3713 hfsc_update_queue__(netdev, queue_id, &class);
3714 return 0;
3715}
3716
3717static int
3718hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3719{
3720 int error;
3721 struct hfsc *hfsc;
3722 struct hfsc_class *hc;
3723
3724 hc = hfsc_class_cast__(queue);
3725 hfsc = hfsc_get__(netdev);
3726
3727 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3728 if (!error) {
3729 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3730 free(hc);
3731 }
3732 return error;
3733}
3734
3735static int
3736hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3737 struct netdev_queue_stats *stats)
3738{
3739 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3740 tc_make_handle(1, 0xfffe), NULL, stats);
3741}
3742
3743static int
3744hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3745 const struct ofpbuf *nlmsg,
3746 netdev_dump_queue_stats_cb *cb, void *aux)
3747{
3748 struct netdev_queue_stats stats;
3749 unsigned int handle, major, minor;
3750 int error;
3751
3752 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3753 if (error) {
3754 return error;
3755 }
3756
3757 major = tc_get_major(handle);
3758 minor = tc_get_minor(handle);
3759 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3760 (*cb)(minor - 1, &stats, aux);
3761 }
3762 return 0;
3763}
3764
/* Operations table that ties the "linux-hfsc" QoS type to the kernel "hfsc"
 * qdisc.  Entries are positional; each one is labeled with the slot it
 * fills. */
static const struct tc_ops tc_ops_hfsc = {
    "hfsc",                     /* linux_name */
    "linux-hfsc",               /* ovs_name */
    HFSC_N_QUEUES,              /* n_queues */
    hfsc_tc_install,            /* tc_install */
    hfsc_tc_load,               /* tc_load */
    hfsc_tc_destroy,            /* tc_destroy */
    hfsc_qdisc_get,             /* qdisc_get */
    hfsc_qdisc_set,             /* qdisc_set */
    hfsc_class_get,             /* class_get */
    hfsc_class_set,             /* class_set */
    hfsc_class_delete,          /* class_delete */
    hfsc_class_get_stats,       /* class_get_stats */
    hfsc_class_dump_stats       /* class_dump_stats */
};
3780\f
c1c9c9c4
BP
3781/* "linux-default" traffic control class.
3782 *
3783 * This class represents the default, unnamed Linux qdisc. It corresponds to
3784 * the "" (empty string) QoS type in the OVS database. */
3785
3786static void
b5d57fc8 3787default_install__(struct netdev *netdev_)
c1c9c9c4 3788{
b5d57fc8 3789 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 3790 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 3791
559eb230
BP
3792 /* Nothing but a tc class implementation is allowed to write to a tc. This
3793 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 3794 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
3795}
3796
3797static int
3798default_tc_install(struct netdev *netdev,
79f1cbe9 3799 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
3800{
3801 default_install__(netdev);
3802 return 0;
3803}
3804
3805static int
3806default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3807{
3808 default_install__(netdev);
3809 return 0;
3810}
3811
/* Operations table for the "" (empty string) QoS type.  It has no kernel qdisc
 * name and no queues; only installing and loading are implemented. */
static const struct tc_ops tc_ops_default = {
    NULL,                       /* linux_name */
    "",                         /* ovs_name */
    0,                          /* n_queues */
    default_tc_install,         /* tc_install */
    default_tc_load,            /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
3827\f
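/* "linux-other" traffic control class.
 *
 * This class implements nothing but tc_load: it has no kernel qdisc name, no
 * queues, and cannot be installed or configured. */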
3831
3832static int
b5d57fc8 3833other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 3834{
b5d57fc8 3835 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 3836 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 3837
559eb230
BP
3838 /* Nothing but a tc class implementation is allowed to write to a tc. This
3839 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 3840 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
3841 return 0;
3842}
3843
3844static const struct tc_ops tc_ops_other = {
3845 NULL, /* linux_name */
3846 "linux-other", /* ovs_name */
3847 0, /* n_queues */
3848 NULL, /* tc_install */
3849 other_tc_load,
3850 NULL, /* tc_destroy */
3851 NULL, /* qdisc_get */
3852 NULL, /* qdisc_set */
3853 NULL, /* class_get */
3854 NULL, /* class_set */
3855 NULL, /* class_delete */
3856 NULL, /* class_get_stats */
3857 NULL /* class_dump_stats */
3858};
3859\f
3860/* Traffic control. */
3861
/* Kernel "tc" ticks per second.  Set by read_psched(). */
static double ticks_per_s;

/* Kernel "jiffies" per second, used to size buffers: in general a kernel
 * qdisc must be able to buffer one jiffy's worth of data.  Set by
 * read_psched().
 *
 * The value falls into one of two categories:
 *
 *   - A small number, roughly 100 to 1024, that is the kernel's real timer
 *     tick rate.  The qdisc then really does need room for that much data.
 *
 *   - An absurdly large number, meaning that the kernel has finely granular
 *     timers and no extra buffer room has to be fudged in.  This case needs
 *     no special handling: 'buffer_hz' is only ever used as a divisor, so
 *     practically any dividend yields 0, and the small nonzero quotients that
 *     very large dividends produce have no real effect.
 */
static unsigned int buffer_hz;
3883
/* Combines 'major' and 'minor' into the tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
3890
/* Extracts the major number, i.e. the upper 16 bits, from tc 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3897
/* Extracts the minor number, i.e. the lower 16 bits, from tc 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3904
3905static struct tcmsg *
3906tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3907 struct ofpbuf *request)
3908{
3909 struct tcmsg *tcmsg;
3910 int ifindex;
3911 int error;
3912
3913 error = get_ifindex(netdev, &ifindex);
3914 if (error) {
3915 return NULL;
3916 }
3917
3918 ofpbuf_init(request, 512);
3919 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3920 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3921 tcmsg->tcm_family = AF_UNSPEC;
3922 tcmsg->tcm_ifindex = ifindex;
3923 /* Caller should fill in tcmsg->tcm_handle. */
3924 /* Caller should fill in tcmsg->tcm_parent. */
3925
3926 return tcmsg;
3927}
3928
3929static int
3930tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3931{
a88b4e04 3932 int error = nl_transact(NETLINK_ROUTE, request, replyp);
c1c9c9c4
BP
3933 ofpbuf_uninit(request);
3934 return error;
3935}
3936
f8500004
JP
3937/* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3938 * policing configuration.
3939 *
3940 * This function is equivalent to running the following when 'add' is true:
3941 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3942 *
3943 * This function is equivalent to running the following when 'add' is false:
3944 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3945 *
3946 * The configuration and stats may be seen with the following command:
3947 * /sbin/tc -s qdisc show dev <devname>
3948 *
3949 * Returns 0 if successful, otherwise a positive errno value.
3950 */
3951static int
3952tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3953{
3954 struct ofpbuf request;
3955 struct tcmsg *tcmsg;
3956 int error;
3957 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3958 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3959
3960 tcmsg = tc_make_request(netdev, type, flags, &request);
3961 if (!tcmsg) {
3962 return ENODEV;
3963 }
3964 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3965 tcmsg->tcm_parent = TC_H_INGRESS;
3966 nl_msg_put_string(&request, TCA_KIND, "ingress");
3967 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3968
3969 error = tc_transact(&request, NULL);
3970 if (error) {
3971 /* If we're deleting the qdisc, don't worry about some of the
3972 * error conditions. */
3973 if (!add && (error == ENOENT || error == EINVAL)) {
3974 return 0;
3975 }
3976 return error;
3977 }
3978
3979 return 0;
3980}
3981
3982/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3983 * of 'kbits_burst'.
3984 *
3985 * This function is equivalent to running:
3986 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3987 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3988 * mtu 65535 drop
3989 *
3990 * The configuration and stats may be seen with the following command:
3991 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3992 *
3993 * Returns 0 if successful, otherwise a positive errno value.
3994 */
3995static int
3996tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3997{
3998 struct tc_police tc_police;
3999 struct ofpbuf request;
4000 struct tcmsg *tcmsg;
4001 size_t basic_offset;
4002 size_t police_offset;
4003 int error;
4004 int mtu = 65535;
4005
4006 memset(&tc_police, 0, sizeof tc_police);
4007 tc_police.action = TC_POLICE_SHOT;
4008 tc_police.mtu = mtu;
1aca400c 4009 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
f8500004
JP
4010 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
4011 kbits_burst * 1024);
4012
4013 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4014 NLM_F_EXCL | NLM_F_CREATE, &request);
4015 if (!tcmsg) {
4016 return ENODEV;
4017 }
4018 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4019 tcmsg->tcm_info = tc_make_handle(49,
4020 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4021
4022 nl_msg_put_string(&request, TCA_KIND, "basic");
4023 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4024 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4025 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4026 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4027 nl_msg_end_nested(&request, police_offset);
4028 nl_msg_end_nested(&request, basic_offset);
4029
4030 error = tc_transact(&request, NULL);
4031 if (error) {
4032 return error;
4033 }
4034
4035 return 0;
4036}
4037
c1c9c9c4
BP
4038static void
4039read_psched(void)
4040{
4041 /* The values in psched are not individually very meaningful, but they are
4042 * important. The tables below show some values seen in the wild.
4043 *
4044 * Some notes:
4045 *
4046 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4047 * (Before that, there are hints that it was 1000000000.)
4048 *
4049 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4050 * above.
4051 *
4052 * /proc/net/psched
4053 * -----------------------------------
4054 * [1] 000c8000 000f4240 000f4240 00000064
4055 * [2] 000003e8 00000400 000f4240 3b9aca00
4056 * [3] 000003e8 00000400 000f4240 3b9aca00
4057 * [4] 000003e8 00000400 000f4240 00000064
4058 * [5] 000003e8 00000040 000f4240 3b9aca00
4059 * [6] 000003e8 00000040 000f4240 000000f9
4060 *
4061 * a b c d ticks_per_s buffer_hz
4062 * ------- --------- ---------- ------------- ----------- -------------
4063 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4064 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4065 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4066 * [4] 1,000 1,024 1,000,000 100 976,562 100
4067 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4068 * [6] 1,000 64 1,000,000 249 15,625,000 249
4069 *
4070 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4071 * [2] 2.6.26-1-686-bigmem from Debian lenny
4072 * [3] 2.6.26-2-sparc64 from Debian lenny
4073 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4074 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4075 * [6] 2.6.34 from kernel.org on KVM
4076 */
23882115 4077 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
4078 static const char fn[] = "/proc/net/psched";
4079 unsigned int a, b, c, d;
4080 FILE *stream;
4081
23882115
BP
4082 if (!ovsthread_once_start(&once)) {
4083 return;
4084 }
4085
c1c9c9c4
BP
4086 ticks_per_s = 1.0;
4087 buffer_hz = 100;
4088
4089 stream = fopen(fn, "r");
4090 if (!stream) {
10a89ef0 4091 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 4092 goto exit;
c1c9c9c4
BP
4093 }
4094
4095 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4096 VLOG_WARN("%s: read failed", fn);
4097 fclose(stream);
23882115 4098 goto exit;
c1c9c9c4
BP
4099 }
4100 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4101 fclose(stream);
4102
4103 if (!a || !c) {
4104 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 4105 goto exit;
c1c9c9c4
BP
4106 }
4107
4108 ticks_per_s = (double) a * c / b;
4109 if (c == 1000000) {
4110 buffer_hz = d;
4111 } else {
4112 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4113 fn, a, b, c, d);
4114 }
4115 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
4116
4117exit:
4118 ovsthread_once_done(&once);
c1c9c9c4
BP
4119}
4120
4121/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4122 * rate of 'rate' bytes per second. */
4123static unsigned int
4124tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4125{
23882115 4126 read_psched();
c1c9c9c4
BP
4127 return (rate * ticks) / ticks_per_s;
4128}
4129
4130/* Returns the number of ticks that it would take to transmit 'size' bytes at a
4131 * rate of 'rate' bytes per second. */
4132static unsigned int
4133tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4134{
23882115 4135 read_psched();
015c93a4 4136 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
4137}
4138
4139/* Returns the number of bytes that need to be reserved for qdisc buffering at
4140 * a transmission rate of 'rate' bytes per second. */
4141static unsigned int
4142tc_buffer_per_jiffy(unsigned int rate)
4143{
23882115 4144 read_psched();
c1c9c9c4
BP
4145 return rate / buffer_hz;
4146}
4147
4148/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4149 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4150 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4151 * stores NULL into it if it is absent.
4152 *
4153 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4154 * 'msg'.
4155 *
4156 * Returns 0 if successful, otherwise a positive errno value. */
4157static int
4158tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4159 struct nlattr **options)
4160{
4161 static const struct nl_policy tca_policy[] = {
4162 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4163 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4164 };
4165 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4166
4167 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4168 tca_policy, ta, ARRAY_SIZE(ta))) {
4169 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4170 goto error;
4171 }
4172
4173 if (kind) {
4174 *kind = nl_attr_get_string(ta[TCA_KIND]);
4175 }
4176
4177 if (options) {
4178 *options = ta[TCA_OPTIONS];
4179 }
4180
4181 return 0;
4182
4183error:
4184 if (kind) {
4185 *kind = NULL;
4186 }
4187 if (options) {
4188 *options = NULL;
4189 }
4190 return EPROTO;
4191}
4192
4193/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4194 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4195 * into '*options', and its queue statistics into '*stats'. Any of the output
4196 * arguments may be null.
4197 *
4198 * Returns 0 if successful, otherwise a positive errno value. */
4199static int
4200tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4201 struct nlattr **options, struct netdev_queue_stats *stats)
4202{
4203 static const struct nl_policy tca_policy[] = {
4204 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4205 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4206 };
4207 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4208
4209 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4210 tca_policy, ta, ARRAY_SIZE(ta))) {
4211 VLOG_WARN_RL(&rl, "failed to parse class message");
4212 goto error;
4213 }
4214
4215 if (handlep) {
4216 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4217 *handlep = tc->tcm_handle;
4218 }
4219
4220 if (options) {
4221 *options = ta[TCA_OPTIONS];
4222 }
4223
4224 if (stats) {
4225 const struct gnet_stats_queue *gsq;
4226 struct gnet_stats_basic gsb;
4227
4228 static const struct nl_policy stats_policy[] = {
4229 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4230 .min_len = sizeof gsb },
4231 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4232 .min_len = sizeof *gsq },
4233 };
4234 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4235
4236 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4237 sa, ARRAY_SIZE(sa))) {
4238 VLOG_WARN_RL(&rl, "failed to parse class stats");
4239 goto error;
4240 }
4241
4242 /* Alignment issues screw up the length of struct gnet_stats_basic on
4243 * some arch/bitsize combinations. Newer versions of Linux have a
4244 * struct gnet_stats_basic_packed, but we can't depend on that. The
4245 * easiest thing to do is just to make a copy. */
4246 memset(&gsb, 0, sizeof gsb);
4247 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4248 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4249 stats->tx_bytes = gsb.bytes;
4250 stats->tx_packets = gsb.packets;
4251
4252 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4253 stats->tx_errors = gsq->drops;
4254 }
4255
4256 return 0;
4257
4258error:
4259 if (options) {
4260 *options = NULL;
4261 }
4262 if (stats) {
4263 memset(stats, 0, sizeof *stats);
4264 }
4265 return EPROTO;
4266}
4267
4268/* Queries the kernel for class with identifier 'handle' and parent 'parent'
4269 * on 'netdev'. */
4270static int
4271tc_query_class(const struct netdev *netdev,
4272 unsigned int handle, unsigned int parent,
4273 struct ofpbuf **replyp)
4274{
4275 struct ofpbuf request;
4276 struct tcmsg *tcmsg;
4277 int error;
4278
4279 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
4280 if (!tcmsg) {
4281 return ENODEV;
4282 }
c1c9c9c4
BP
4283 tcmsg->tcm_handle = handle;
4284 tcmsg->tcm_parent = parent;
4285
4286 error = tc_transact(&request, replyp);
4287 if (error) {
4288 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4289 netdev_get_name(netdev),
4290 tc_get_major(handle), tc_get_minor(handle),
4291 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 4292 ovs_strerror(error));
c1c9c9c4
BP
4293 }
4294 return error;
4295}
4296
4297/* Equivalent to "tc class del dev <name> handle <handle>". */
4298static int
4299tc_delete_class(const struct netdev *netdev, unsigned int handle)
4300{
4301 struct ofpbuf request;
4302 struct tcmsg *tcmsg;
4303 int error;
4304
4305 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
4306 if (!tcmsg) {
4307 return ENODEV;
4308 }
c1c9c9c4
BP
4309 tcmsg->tcm_handle = handle;
4310 tcmsg->tcm_parent = 0;
4311
4312 error = tc_transact(&request, NULL);
4313 if (error) {
4314 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4315 netdev_get_name(netdev),
4316 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 4317 ovs_strerror(error));
c1c9c9c4
BP
4318 }
4319 return error;
4320}
4321
4322/* Equivalent to "tc qdisc del dev <name> root". */
4323static int
b5d57fc8 4324tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 4325{
b5d57fc8 4326 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
4327 struct ofpbuf request;
4328 struct tcmsg *tcmsg;
4329 int error;
4330
b5d57fc8 4331 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
4332 if (!tcmsg) {
4333 return ENODEV;
4334 }
c1c9c9c4
BP
4335 tcmsg->tcm_handle = tc_make_handle(1, 0);
4336 tcmsg->tcm_parent = TC_H_ROOT;
4337
4338 error = tc_transact(&request, NULL);
4339 if (error == EINVAL) {
4340 /* EINVAL probably means that the default qdisc was in use, in which
4341 * case we've accomplished our purpose. */
4342 error = 0;
4343 }
b5d57fc8
BP
4344 if (!error && netdev->tc) {
4345 if (netdev->tc->ops->tc_destroy) {
4346 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 4347 }
b5d57fc8 4348 netdev->tc = NULL;
c1c9c9c4
BP
4349 }
4350 return error;
4351}
4352
4353/* If 'netdev''s qdisc type and parameters are not yet known, queries the
4354 * kernel to determine what they are. Returns 0 if successful, otherwise a
4355 * positive errno value. */
4356static int
b5d57fc8 4357tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 4358{
b5d57fc8 4359 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
4360 struct ofpbuf request, *qdisc;
4361 const struct tc_ops *ops;
4362 struct tcmsg *tcmsg;
4363 int load_error;
4364 int error;
4365
b5d57fc8 4366 if (netdev->tc) {
c1c9c9c4
BP
4367 return 0;
4368 }
4369
4370 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4371 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4372 * 2.6.35 without that fix backported to it.
4373 *
4374 * To avoid the OOPS, we must not make a request that would attempt to dump
4375 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4376 * few others. There are a few ways that I can see to do this, but most of
4377 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4378 * technique chosen here is to assume that any non-default qdisc that we
4379 * create will have a class with handle 1:0. The built-in qdiscs only have
4380 * a class with handle 0:0.
4381 *
4382 * We could check for Linux 2.6.35+ and use a more straightforward method
4383 * there. */
b5d57fc8 4384 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
4385 if (!tcmsg) {
4386 return ENODEV;
4387 }
c1c9c9c4
BP
4388 tcmsg->tcm_handle = tc_make_handle(1, 0);
4389 tcmsg->tcm_parent = 0;
4390
4391 /* Figure out what tc class to instantiate. */
4392 error = tc_transact(&request, &qdisc);
4393 if (!error) {
4394 const char *kind;
4395
4396 error = tc_parse_qdisc(qdisc, &kind, NULL);
4397 if (error) {
4398 ops = &tc_ops_other;
4399 } else {
4400 ops = tc_lookup_linux_name(kind);
4401 if (!ops) {
4402 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4403 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4404
4405 ops = &tc_ops_other;
4406 }
4407 }
4408 } else if (error == ENOENT) {
4409 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4410 * other entity that doesn't have a handle 1:0. We will assume
4411 * that it's the system default qdisc. */
4412 ops = &tc_ops_default;
4413 error = 0;
4414 } else {
4415 /* Who knows? Maybe the device got deleted. */
4416 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 4417 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
4418 ops = &tc_ops_other;
4419 }
4420
4421 /* Instantiate it. */
b5d57fc8
BP
4422 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4423 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
4424 ofpbuf_delete(qdisc);
4425
4426 return error ? error : load_error;
4427}
4428
4429/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4430 approximate the time to transmit packets of various lengths. For an MTU of
4431 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4432 represents two possible packet lengths; for a MTU of 513 through 1024, four
4433 possible lengths; and so on.
4434
4435 Returns, for the specified 'mtu', the number of bits that packet lengths
4436 need to be shifted right to fit within such a 256-entry table. */
4437static int
4438tc_calc_cell_log(unsigned int mtu)
4439{
4440 int cell_log;
4441
4442 if (!mtu) {
4443 mtu = ETH_PAYLOAD_MAX;
4444 }
4445 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4446
4447 for (cell_log = 0; mtu >= 256; cell_log++) {
4448 mtu >>= 1;
4449 }
4450
4451 return cell_log;
4452}
4453
4454/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4455 * of 'mtu'. */
4456static void
4457tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4458{
4459 memset(rate, 0, sizeof *rate);
4460 rate->cell_log = tc_calc_cell_log(mtu);
4461 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4462 /* rate->cell_align = 0; */ /* distro headers. */
4463 rate->mpu = ETH_TOTAL_MIN;
4464 rate->rate = Bps;
4465}
4466
4467/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4468 * attribute of the specified "type".
4469 *
4470 * See tc_calc_cell_log() above for a description of "rtab"s. */
4471static void
4472tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4473{
4474 uint32_t *rtab;
4475 unsigned int i;
4476
4477 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4478 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4479 unsigned packet_size = (i + 1) << rate->cell_log;
4480 if (packet_size < rate->mpu) {
4481 packet_size = rate->mpu;
4482 }
4483 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4484 }
4485}
4486
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and the burst size
 * 'burst_bytes' that the user asked for (0 is fine if none was requested).
 *
 * The burst actually used is never less than one jiffy's worth of data at
 * 'Bps' plus 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst;

    min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
d3980822 4497\f
aaf2fb1a
BP
4498/* Linux-only functions declared in netdev-linux.h */
4499
4500/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4501 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4502int
4503netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4504 const char *flag_name, bool enable)
4505{
4506 const char *netdev_name = netdev_get_name(netdev);
4507 struct ethtool_value evalue;
4508 uint32_t new_flags;
4509 int error;
4510
ab985a77 4511 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
4512 memset(&evalue, 0, sizeof evalue);
4513 error = netdev_linux_do_ethtool(netdev_name,
4514 (struct ethtool_cmd *)&evalue,
4515 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4516 if (error) {
4517 return error;
4518 }
4519
ab985a77 4520 COVERAGE_INC(netdev_set_ethtool);
aaf2fb1a
BP
4521 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4522 error = netdev_linux_do_ethtool(netdev_name,
4523 (struct ethtool_cmd *)&evalue,
4524 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4525 if (error) {
4526 return error;
4527 }
4528
ab985a77 4529 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
4530 memset(&evalue, 0, sizeof evalue);
4531 error = netdev_linux_do_ethtool(netdev_name,
4532 (struct ethtool_cmd *)&evalue,
4533 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4534 if (error) {
4535 return error;
4536 }
4537
4538 if (new_flags != evalue.data) {
4539 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4540 "device %s failed", enable ? "enable" : "disable",
4541 flag_name, netdev_name);
4542 return EOPNOTSUPP;
4543 }
4544
4545 return 0;
4546}
4547\f
4548/* Utility functions. */
4549
d3980822 4550/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 4551static void
d3980822
BP
4552netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4553 const struct rtnl_link_stats *src)
4554{
f613a0d7
PS
4555 dst->rx_packets = src->rx_packets;
4556 dst->tx_packets = src->tx_packets;
4557 dst->rx_bytes = src->rx_bytes;
4558 dst->tx_bytes = src->tx_bytes;
4559 dst->rx_errors = src->rx_errors;
4560 dst->tx_errors = src->tx_errors;
4561 dst->rx_dropped = src->rx_dropped;
4562 dst->tx_dropped = src->tx_dropped;
4563 dst->multicast = src->multicast;
4564 dst->collisions = src->collisions;
4565 dst->rx_length_errors = src->rx_length_errors;
4566 dst->rx_over_errors = src->rx_over_errors;
4567 dst->rx_crc_errors = src->rx_crc_errors;
4568 dst->rx_frame_errors = src->rx_frame_errors;
4569 dst->rx_fifo_errors = src->rx_fifo_errors;
4570 dst->rx_missed_errors = src->rx_missed_errors;
4571 dst->tx_aborted_errors = src->tx_aborted_errors;
4572 dst->tx_carrier_errors = src->tx_carrier_errors;
4573 dst->tx_fifo_errors = src->tx_fifo_errors;
4574 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4575 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
4576}
4577
c1c9c9c4 4578static int
35eef899 4579get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
c1c9c9c4 4580{
c1c9c9c4
BP
4581 struct ofpbuf request;
4582 struct ofpbuf *reply;
c1c9c9c4
BP
4583 int error;
4584
4585 ofpbuf_init(&request, 0);
13a24df8
BP
4586 nl_msg_put_nlmsghdr(&request,
4587 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
4588 RTM_GETLINK, NLM_F_REQUEST);
4589 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
4590 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
a88b4e04 4591 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
4592 ofpbuf_uninit(&request);
4593 if (error) {
4594 return error;
4595 }
4596
13a24df8
BP
4597 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
4598 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
4599 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
4600 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
4601 error = 0;
4602 } else {
4603 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4604 error = EPROTO;
4605 }
4606 } else {
4607 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
4608 error = EPROTO;
c1c9c9c4 4609 }
8b61709d 4610
8b61709d 4611
576e26d7 4612 ofpbuf_delete(reply);
35eef899 4613 return error;
8b61709d 4614}
c1c9c9c4 4615
3a183124 4616static int
b5d57fc8 4617get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
4618{
4619 struct ifreq ifr;
4620 int error;
4621
755be9ea 4622 *flags = 0;
259e0b1a 4623 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
755be9ea
EJ
4624 if (!error) {
4625 *flags = ifr.ifr_flags;
4626 }
8b61709d
BP
4627 return error;
4628}
4629
4630static int
4b609110 4631set_flags(const char *name, unsigned int flags)
8b61709d
BP
4632{
4633 struct ifreq ifr;
4634
4635 ifr.ifr_flags = flags;
259e0b1a 4636 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
4637}
4638
4639static int
4640do_get_ifindex(const char *netdev_name)
4641{
4642 struct ifreq ifr;
259e0b1a 4643 int error;
8b61709d 4644
71d7c22f 4645 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 4646 COVERAGE_INC(netdev_get_ifindex);
259e0b1a
BP
4647
4648 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4649 if (error) {
8b61709d 4650 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
259e0b1a
BP
4651 netdev_name, ovs_strerror(error));
4652 return -error;
8b61709d
BP
4653 }
4654 return ifr.ifr_ifindex;
4655}
4656
4657static int
4658get_ifindex(const struct netdev *netdev_, int *ifindexp)
4659{
b5d57fc8 4660 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 4661
b5d57fc8 4662 if (!(netdev->cache_valid & VALID_IFINDEX)) {
8b61709d 4663 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 4664
8b61709d 4665 if (ifindex < 0) {
b5d57fc8
BP
4666 netdev->get_ifindex_error = -ifindex;
4667 netdev->ifindex = 0;
c7b1b0a5 4668 } else {
b5d57fc8
BP
4669 netdev->get_ifindex_error = 0;
4670 netdev->ifindex = ifindex;
8b61709d 4671 }
b5d57fc8 4672 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 4673 }
c7b1b0a5 4674
b5d57fc8
BP
4675 *ifindexp = netdev->ifindex;
4676 return netdev->get_ifindex_error;
8b61709d
BP
4677}
4678
4679static int
4680get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4681{
4682 struct ifreq ifr;
4683 int hwaddr_family;
259e0b1a 4684 int error;
8b61709d
BP
4685
4686 memset(&ifr, 0, sizeof ifr);
71d7c22f 4687 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 4688 COVERAGE_INC(netdev_get_hwaddr);
259e0b1a
BP
4689 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4690 if (error) {
78857dfb
BP
4691 /* ENODEV probably means that a vif disappeared asynchronously and
4692 * hasn't been removed from the database yet, so reduce the log level
4693 * to INFO for that case. */
259e0b1a 4694 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
78857dfb 4695 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
259e0b1a
BP
4696 netdev_name, ovs_strerror(error));
4697 return error;
8b61709d
BP
4698 }
4699 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4700 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4701 VLOG_WARN("%s device has unknown hardware address family %d",
4702 netdev_name, hwaddr_family);
4703 }
4704 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4705 return 0;
4706}
4707
4708static int
44445cac 4709set_etheraddr(const char *netdev_name,
8b61709d
BP
4710 const uint8_t mac[ETH_ADDR_LEN])
4711{
4712 struct ifreq ifr;
259e0b1a 4713 int error;
8b61709d
BP
4714
4715 memset(&ifr, 0, sizeof ifr);
71d7c22f 4716 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 4717 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
8b61709d
BP
4718 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4719 COVERAGE_INC(netdev_set_hwaddr);
259e0b1a
BP
4720 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4721 if (error) {
8b61709d 4722 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
259e0b1a 4723 netdev_name, ovs_strerror(error));
8b61709d 4724 }
259e0b1a 4725 return error;
8b61709d
BP
4726}
4727
4728static int
0b0544d7 4729netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
4730 int cmd, const char *cmd_name)
4731{
4732 struct ifreq ifr;
259e0b1a 4733 int error;
8b61709d
BP
4734
4735 memset(&ifr, 0, sizeof ifr);
71d7c22f 4736 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
4737 ifr.ifr_data = (caddr_t) ecmd;
4738
4739 ecmd->cmd = cmd;
259e0b1a
BP
4740 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4741 if (error) {
4742 if (error != EOPNOTSUPP) {
8b61709d 4743 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
259e0b1a 4744 "failed: %s", cmd_name, name, ovs_strerror(error));
8b61709d
BP
4745 } else {
4746 /* The device doesn't support this operation. That's pretty
4747 * common, so there's no point in logging anything. */
4748 }
8b61709d 4749 }
259e0b1a 4750 return error;
8b61709d 4751}
f1acd62b
BP
4752
4753static int
4754netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4755 int cmd, const char *cmd_name)
4756{
4757 struct ifreq ifr;
4758 int error;
4759
4760 ifr.ifr_addr.sa_family = AF_INET;
259e0b1a 4761 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b 4762 if (!error) {
db5a1019
AW
4763 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4764 &ifr.ifr_addr);
f1acd62b
BP
4765 *ip = sin->sin_addr;
4766 }
4767 return error;
4768}
488d734d
BP
4769
4770/* Returns an AF_PACKET raw socket or a negative errno value. */
4771static int
4772af_packet_sock(void)
4773{
23882115
BP
4774 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4775 static int sock;
488d734d 4776
23882115 4777 if (ovsthread_once_start(&once)) {
488d734d
BP
4778 sock = socket(AF_PACKET, SOCK_RAW, 0);
4779 if (sock >= 0) {
8450059e
BP
4780 int error = set_nonblocking(sock);
4781 if (error) {
4782 close(sock);
4783 sock = -error;
4784 }
488d734d
BP
4785 } else {
4786 sock = -errno;
10a89ef0
BP
4787 VLOG_ERR("failed to create packet socket: %s",
4788 ovs_strerror(errno));
488d734d 4789 }
23882115 4790 ovsthread_once_done(&once);
488d734d
BP
4791 }
4792
4793 return sock;
4794}