]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
linux: Assume it is local if no API is available.
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
48c6733c 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d 22#include <fcntl.h>
b2befd5b
BP
23#include <sys/types.h>
24#include <netinet/in.h>
55bc98d6 25#include <arpa/inet.h>
8b61709d 26#include <inttypes.h>
32383c3b 27#include <linux/filter.h>
c1c9c9c4 28#include <linux/gen_stats.h>
bb7d0e22 29#include <linux/if_ether.h>
8b61709d
BP
30#include <linux/if_tun.h>
31#include <linux/types.h>
32#include <linux/ethtool.h>
63331829 33#include <linux/mii.h>
ef3767f5 34#include <linux/rtnetlink.h>
8b61709d 35#include <linux/sockios.h>
8b61709d
BP
36#include <sys/ioctl.h>
37#include <sys/socket.h>
ac3e3aaa 38#include <sys/utsname.h>
55bc98d6 39#include <netpacket/packet.h>
8b61709d
BP
40#include <net/if.h>
41#include <net/if_arp.h>
55bc98d6 42#include <net/if_packet.h>
8b61709d 43#include <net/route.h>
e9e28be3 44#include <poll.h>
8b61709d
BP
45#include <stdlib.h>
46#include <string.h>
47#include <unistd.h>
e9e28be3
BP
48
49#include "coverage.h"
e14deea0 50#include "dp-packet.h"
93451a0a 51#include "dpif-netlink.h"
df1e5a3b 52#include "dpif-netdev.h"
3e8a2ad1 53#include "openvswitch/dynamic-string.h"
8b61709d 54#include "fatal-signal.h"
93b13be8 55#include "hash.h"
ee89ea7b 56#include "openvswitch/hmap.h"
8b61709d 57#include "netdev-provider.h"
18ebd48c 58#include "netdev-tc-offloads.h"
7fbef77a 59#include "netdev-vport.h"
45c8d3a1 60#include "netlink-notifier.h"
2fe27d5a 61#include "netlink-socket.h"
c060c4cf 62#include "netlink.h"
bfda5239 63#include "netnsid.h"
64c96779 64#include "openvswitch/ofpbuf.h"
8b61709d 65#include "openflow/openflow.h"
19c8e9c1 66#include "ovs-atomic.h"
8b61709d 67#include "packets.h"
fd016ae3 68#include "openvswitch/poll-loop.h"
7e9dcc0f 69#include "rtnetlink.h"
ee89ea7b 70#include "openvswitch/shash.h"
c060c4cf 71#include "socket-util.h"
19993ef3 72#include "sset.h"
c1c5c723 73#include "tc.h"
1670c579 74#include "timer.h"
c060c4cf 75#include "unaligned.h"
e6211adc 76#include "openvswitch/vlog.h"
ee89ea7b 77#include "util.h"
5136ce49 78
/* Registers this file as the "netdev_linux" logging module. */
VLOG_DEFINE_THIS_MODULE(netdev_linux);

/* Coverage counters defined by this file, one per COVERAGE_DEFINE. */
COVERAGE_DEFINE(netdev_set_policing);
COVERAGE_DEFINE(netdev_arp_lookup);
COVERAGE_DEFINE(netdev_get_ifindex);
COVERAGE_DEFINE(netdev_get_hwaddr);
COVERAGE_DEFINE(netdev_set_hwaddr);
COVERAGE_DEFINE(netdev_get_ethtool);
COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 88
8b61709d 89\f
756819dd
FL
/* Fallback for headers that do not provide IFLA_IF_NETNSID.
 *
 * NOTE(review): in the kernel's <linux/if_link.h> enumeration this attribute
 * is believed to be 46 (0x2e), not 0x45 -- confirm against the kernel headers
 * before relying on this fallback value. */
#ifndef IFLA_IF_NETNSID
#define IFLA_IF_NETNSID 0x45
#endif

/* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause                (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause           (1 << 14)
#endif

/* These were introduced in Linux 2.6.24, so they might be missing if we
 * have old headers. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS       0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS       0x00000026 /* Set flags bitmap(ethtool_value) */
#endif

/* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif
116
b73c8518
SH
/* Linux 2.6.21 introduced struct tpacket_auxdata.
 * Linux 2.6.27 added the tp_vlan_tci member.
 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
 * TP_STATUS_VLAN_TPID_VALID.
 *
 * With all this churn it's easiest to unconditionally define a replacement
 * structure that has everything we want.
 */
#ifndef PACKET_AUXDATA
#define PACKET_AUXDATA                  8
#endif
#ifndef TP_STATUS_VLAN_VALID
#define TP_STATUS_VLAN_VALID            (1 << 4)
#endif
#ifndef TP_STATUS_VLAN_TPID_VALID
#define TP_STATUS_VLAN_TPID_VALID       (1 << 6)
#endif

/* Redirect every later use of 'tpacket_auxdata' to the replacement type, so
 * that it cannot clash with a definition from the system headers. */
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;
    uint16_t tp_vlan_tpid;
};
146
0c615356
SH
/* Linux 2.6.27 introduced ethtool_cmd_speed
 *
 * To avoid revisiting problems reported with using configure to detect
 * compatibility (see report at
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed

/* Returns the link speed stored in 'ep', combining 'ep->speed' (low half) and
 * 'ep->speed_hi' (high half) into one 32-bit value.
 *
 * Both halves are widened to uint32_t before combining.  Assuming 'speed_hi'
 * is a 16-bit field (as in <linux/ethtool.h>), it would otherwise promote to
 * 'int', and shifting a value >= 0x8000 left by 16 would overflow 'int',
 * which is undefined behavior. */
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    return (uint32_t) ep->speed | ((uint32_t) ep->speed_hi << 16);
}
158
67bed84c
SH
/* Linux 2.6.30 introduced supported and advertised flags for
 * 1G base KX, and 10G base KX4, KR and R. */
#ifndef SUPPORTED_1000baseKX_Full
#define SUPPORTED_1000baseKX_Full      (1 << 17)
#define SUPPORTED_10000baseKX4_Full    (1 << 18)
#define SUPPORTED_10000baseKR_Full     (1 << 19)
#define SUPPORTED_10000baseR_FEC       (1 << 20)
#define ADVERTISED_1000baseKX_Full     (1 << 17)
#define ADVERTISED_10000baseKX4_Full   (1 << 18)
#define ADVERTISED_10000baseKR_Full    (1 << 19)
#define ADVERTISED_10000baseR_FEC      (1 << 20)
#endif

/* Linux 3.5 introduced supported and advertised flags for
 * 40G base KR4, CR4, SR4 and LR4. */
#ifndef SUPPORTED_40000baseKR4_Full
#define SUPPORTED_40000baseKR4_Full    (1 << 23)
#define SUPPORTED_40000baseCR4_Full    (1 << 24)
#define SUPPORTED_40000baseSR4_Full    (1 << 25)
#define SUPPORTED_40000baseLR4_Full    (1 << 26)
#define ADVERTISED_40000baseKR4_Full   (1 << 23)
#define ADVERTISED_40000baseCR4_Full   (1 << 24)
#define ADVERTISED_40000baseSR4_Full   (1 << 25)
#define ADVERTISED_40000baseLR4_Full   (1 << 26)
#endif
184
fa373af4
BP
/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
 *
 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
 * 2.6.32-431.29.2.el6.x86_64 (see report at
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
 * Maybe if_link.h is not self-contained on those kernels.  It is easiest to
 * unconditionally define a replacement. */
#ifndef IFLA_STATS64
#define IFLA_STATS64 23
#endif

/* Redirect every later use of 'rtnl_link_stats64' to the replacement type
 * below, which consists of 23 consecutive 64-bit counters. */
#define rtnl_link_stats64 rpl_rtnl_link_stats64
struct rtnl_link_stats64 {
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
337c9b99 224
/* Bits for a 'cache_valid' bitmap.  Each bit is a distinct power of two so
 * that the values can be OR'ed together and masked independently. */
enum {
    VALID_IFINDEX           = 1 << 0,
    VALID_ETHERADDR         = 1 << 1,
    VALID_IN                = 1 << 2,
    VALID_MTU               = 1 << 3,
    VALID_POLICING          = 1 << 4,
    VALID_VPORT_STAT_ERROR  = 1 << 5,
    VALID_DRVINFO           = 1 << 6,
    VALID_FEATURES          = 1 << 7,
};
c1c9c9c4
BP
235\f
236/* Traffic control. */
237
238/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
239 * network device.
240 *
241 * Each TC implementation subclasses this with whatever additional data it
242 * needs. */
c1c9c9c4
BP
243struct tc {
244 const struct tc_ops *ops;
93b13be8
BP
245 struct hmap queues; /* Contains "struct tc_queue"s.
246 * Read by generic TC layer.
247 * Written only by TC implementation. */
248};
c1c9c9c4 249
559eb230
BP
/* Static initializer for 'struct tc' object TC using implementation OPS; the
 * 'queues' hmap is initialized in place. */
#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
251
93b13be8
BP
252/* One traffic control queue.
253 *
254 * Each TC implementation subclasses this with whatever additional data it
255 * needs. */
256struct tc_queue {
257 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
258 unsigned int queue_id; /* OpenFlow queue ID. */
6dc34a0d 259 long long int created; /* Time queue was created, in msecs. */
c1c9c9c4
BP
260};
261
262/* A particular kind of traffic control. Each implementation generally maps to
263 * one particular Linux qdisc class.
264 *
265 * The functions below return 0 if successful or a positive errno value on
266 * failure, except where otherwise noted. All of them must be provided, except
267 * where otherwise noted. */
268struct tc_ops {
269 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
270 * This is null for tc_ops_default and tc_ops_other, for which there are no
271 * appropriate values. */
272 const char *linux_name;
273
274 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
275 const char *ovs_name;
276
277 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
278 * queues. The queues are numbered 0 through n_queues - 1. */
279 unsigned int n_queues;
280
281 /* Called to install this TC class on 'netdev'. The implementation should
282 * make the Netlink calls required to set up 'netdev' with the right qdisc
283 * and configure it according to 'details'. The implementation may assume
284 * that the current qdisc is the default; that is, there is no need for it
285 * to delete the current qdisc before installing itself.
286 *
287 * The contents of 'details' should be documented as valid for 'ovs_name'
288 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
289 * (which is built as ovs-vswitchd.conf.db(8)).
290 *
291 * This function must return 0 if and only if it sets 'netdev->tc' to an
292 * initialized 'struct tc'.
293 *
294 * (This function is null for tc_ops_other, which cannot be installed. For
295 * other TC classes it should always be nonnull.) */
79f1cbe9 296 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
297
298 /* Called when the netdev code determines (through a Netlink query) that
299 * this TC class's qdisc is installed on 'netdev', but we didn't install
300 * it ourselves and so don't know any of the details.
301 *
302 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
303 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
304 * implementation should parse the other attributes of 'nlmsg' as
305 * necessary to determine its configuration. If necessary it should also
306 * use Netlink queries to determine the configuration of queues on
307 * 'netdev'.
308 *
309 * This function must return 0 if and only if it sets 'netdev->tc' to an
310 * initialized 'struct tc'. */
311 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
312
313 /* Destroys the data structures allocated by the implementation as part of
314 * 'tc'. (This includes destroying 'tc->queues' by calling
315 * tc_destroy(tc).
316 *
317 * The implementation should not need to perform any Netlink calls. If
318 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
319 * (But it may not be desirable.)
320 *
321 * This function may be null if 'tc' is trivial. */
322 void (*tc_destroy)(struct tc *tc);
323
324 /* Retrieves details of 'netdev->tc' configuration into 'details'.
325 *
326 * The implementation should not need to perform any Netlink calls, because
327 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
328 * cached the configuration.
329 *
330 * The contents of 'details' should be documented as valid for 'ovs_name'
331 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
332 * (which is built as ovs-vswitchd.conf.db(8)).
333 *
334 * This function may be null if 'tc' is not configurable.
335 */
79f1cbe9 336 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
337
338 /* Reconfigures 'netdev->tc' according to 'details', performing any
339 * required Netlink calls to complete the reconfiguration.
340 *
341 * The contents of 'details' should be documented as valid for 'ovs_name'
342 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
343 * (which is built as ovs-vswitchd.conf.db(8)).
344 *
345 * This function may be null if 'tc' is not configurable.
346 */
79f1cbe9 347 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 348
93b13be8
BP
349 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
350 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
351 *
352 * The contents of 'details' should be documented as valid for 'ovs_name'
353 * in the "other_config" column in the "Queue" table in
354 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
355 *
356 * The implementation should not need to perform any Netlink calls, because
357 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
358 * cached the queue configuration.
359 *
360 * This function may be null if 'tc' does not have queues ('n_queues' is
361 * 0). */
93b13be8 362 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 363 struct smap *details);
c1c9c9c4
BP
364
365 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
366 * 'details', perfoming any required Netlink calls to complete the
367 * reconfiguration. The caller ensures that 'queue_id' is less than
368 * 'n_queues'.
369 *
370 * The contents of 'details' should be documented as valid for 'ovs_name'
371 * in the "other_config" column in the "Queue" table in
372 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
373 *
374 * This function may be null if 'tc' does not have queues or its queues are
375 * not configurable. */
376 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 377 const struct smap *details);
c1c9c9c4 378
93b13be8
BP
379 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
380 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
381 *
382 * This function may be null if 'tc' does not have queues or its queues
383 * cannot be deleted. */
93b13be8 384 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 385
93b13be8
BP
386 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
387 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
388 *
389 * On success, initializes '*stats'.
390 *
391 * This function may be null if 'tc' does not have queues or if it cannot
392 * report queue statistics. */
93b13be8
BP
393 int (*class_get_stats)(const struct netdev *netdev,
394 const struct tc_queue *queue,
c1c9c9c4
BP
395 struct netdev_queue_stats *stats);
396
397 /* Extracts queue stats from 'nlmsg', which is a response to a
398 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
399 *
400 * This function may be null if 'tc' does not have queues or if it cannot
401 * report queue statistics. */
402 int (*class_dump_stats)(const struct netdev *netdev,
403 const struct ofpbuf *nlmsg,
404 netdev_dump_queue_stats_cb *cb, void *aux);
405};
406
407static void
408tc_init(struct tc *tc, const struct tc_ops *ops)
409{
410 tc->ops = ops;
93b13be8 411 hmap_init(&tc->queues);
c1c9c9c4
BP
412}
413
414static void
415tc_destroy(struct tc *tc)
416{
93b13be8 417 hmap_destroy(&tc->queues);
c1c9c9c4
BP
418}
419
420static const struct tc_ops tc_ops_htb;
a339aa81 421static const struct tc_ops tc_ops_hfsc;
677d9158
JV
422static const struct tc_ops tc_ops_codel;
423static const struct tc_ops tc_ops_fqcodel;
424static const struct tc_ops tc_ops_sfq;
c1c9c9c4 425static const struct tc_ops tc_ops_default;
6cf888b8 426static const struct tc_ops tc_ops_noop;
c1c9c9c4
BP
427static const struct tc_ops tc_ops_other;
428
559eb230 429static const struct tc_ops *const tcs[] = {
c1c9c9c4 430 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 431 &tc_ops_hfsc, /* Hierarchical fair service curve. */
677d9158
JV
432 &tc_ops_codel, /* Controlled delay */
433 &tc_ops_fqcodel, /* Fair queue controlled delay */
434 &tc_ops_sfq, /* Stochastic fair queueing */
6cf888b8 435 &tc_ops_noop, /* Non operating qos type. */
c1c9c9c4
BP
436 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
437 &tc_ops_other, /* Some other qdisc. */
438 NULL
439};
149f577a 440
c1c9c9c4
BP
/* Prototypes for static traffic-control helpers, declared here ahead of
 * their use. */
static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
static unsigned int tc_buffer_per_jiffy(unsigned int rate);

static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
                                                  int type,
                                                  unsigned int flags,
                                                  struct ofpbuf *);
static int tc_add_policer(struct netdev *,
                          uint32_t kbits_rate, uint32_t kbits_burst);

static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
                          struct nlattr **options);
static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
                          struct nlattr **options,
                          struct netdev_queue_stats *);
static int tc_query_class(const struct netdev *,
                          unsigned int handle, unsigned int parent,
                          struct ofpbuf **replyp);
static int tc_delete_class(const struct netdev *, unsigned int handle);

static int tc_del_qdisc(struct netdev *netdev);
static int tc_query_qdisc(const struct netdev *netdev);

static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static void tc_put_rtab(struct ofpbuf *, uint16_t type,
                        const struct tc_ratespec *rate);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
470\f
b5d57fc8
BP
471struct netdev_linux {
472 struct netdev up;
149f577a 473
86383816
BP
474 /* Protects all members below. */
475 struct ovs_mutex mutex;
476
149f577a 477 unsigned int cache_valid;
8b61709d 478
1670c579
EJ
479 bool miimon; /* Link status of last poll. */
480 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
481 struct timer miimon_timer;
482
bfda5239 483 int netnsid; /* Network namespace ID. */
8722022c
BP
484 /* The following are figured out "on demand" only. They are only valid
485 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d 486 int ifindex;
74ff3298 487 struct eth_addr etheraddr;
8b61709d 488 int mtu;
059e5f4f 489 unsigned int ifi_flags;
65c3058c 490 long long int carrier_resets;
80a86fbe
BP
491 uint32_t kbits_rate; /* Policing data. */
492 uint32_t kbits_burst;
bba1e6f3
PS
493 int vport_stats_error; /* Cached error code from vport_get_stats().
494 0 or an errno value. */
90a6637d 495 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 496 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 497 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 498 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 499 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
51f87458 500
a00ca915
EJ
501 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
502 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
503 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
90a6637d 504
4f925bd3 505 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 506 struct tc *tc;
149f577a 507
d0d08f8a
BP
508 /* For devices of class netdev_tap_class only. */
509 int tap_fd;
22dcb534
FL
510 bool present; /* If the device is present in the namespace */
511 uint64_t tx_dropped; /* tap device can drop if the iface is down */
8b61709d
BP
512};
513
f7791740
PS
514struct netdev_rxq_linux {
515 struct netdev_rxq up;
796223f5 516 bool is_tap;
5b7448ed 517 int fd;
149f577a 518};
8b61709d 519
8b61709d
BP
520/* This is set pretty low because we probably won't learn anything from the
521 * additional log messages. */
522static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
523
19c8e9c1
JS
524/* Polling miimon status for all ports causes performance degradation when
525 * handling a large number of ports. If there are no devices using miimon, then
812c272c
JR
526 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
527 *
528 * Readers do not depend on this variable synchronizing with the related
529 * changes in the device miimon status, so we can use atomic_count. */
530static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
19c8e9c1 531
1c33f0c3 532static void netdev_linux_run(const struct netdev_class *);
6f643e49 533
0b0544d7 534static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 535 int cmd, const char *cmd_name);
b5d57fc8 536static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 537static int set_flags(const char *, unsigned int flags);
4f9f3f21
BP
538static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
539 enum netdev_flags on, enum netdev_flags *old_flagsp)
540 OVS_REQUIRES(netdev->mutex);
8b61709d
BP
541static int get_ifindex(const struct netdev *, int *ifindexp);
542static int do_set_addr(struct netdev *netdev,
543 int ioctl_nr, const char *ioctl_name,
544 struct in_addr addr);
74ff3298
JR
545static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
546static int set_etheraddr(const char *netdev_name, const struct eth_addr);
35eef899 547static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
488d734d 548static int af_packet_sock(void);
19c8e9c1 549static bool netdev_linux_miimon_enabled(void);
1670c579
EJ
550static void netdev_linux_miimon_run(void);
551static void netdev_linux_miimon_wait(void);
df1e5a3b 552static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
8b61709d 553
15b3596a
JG
554static bool
555is_netdev_linux_class(const struct netdev_class *netdev_class)
556{
259e0b1a 557 return netdev_class->run == netdev_linux_run;
15b3596a
JG
558}
559
796223f5
BP
560static bool
561is_tap_netdev(const struct netdev *netdev)
562{
b5d57fc8 563 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577
JP
564}
565
8b61709d
BP
566static struct netdev_linux *
567netdev_linux_cast(const struct netdev *netdev)
568{
b5d57fc8 569 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
15b3596a 570
180c6d0b 571 return CONTAINER_OF(netdev, struct netdev_linux, up);
8b61709d 572}
796223f5 573
f7791740
PS
574static struct netdev_rxq_linux *
575netdev_rxq_linux_cast(const struct netdev_rxq *rx)
796223f5 576{
9dc63482 577 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
f7791740 578 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
796223f5 579}
ff4ed3c9 580\f
bfda5239
FL
581static int
582netdev_linux_netnsid_update__(struct netdev_linux *netdev)
583{
584 struct dpif_netlink_vport reply;
585 struct ofpbuf *buf;
586 int error;
587
588 error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
589 if (error) {
629e1476
FL
590 if (error == ENOENT) {
591 /* Assume it is local if there is no API (e.g. if the openvswitch
592 * kernel module is not loaded). */
593 netnsid_set_local(&netdev->netnsid);
594 } else {
595 netnsid_unset(&netdev->netnsid);
596 }
bfda5239
FL
597 return error;
598 }
599
600 netnsid_set(&netdev->netnsid, reply.netnsid);
601 ofpbuf_delete(buf);
602 return 0;
603}
604
605static int
606netdev_linux_netnsid_update(struct netdev_linux *netdev)
607{
608 if (netnsid_is_unset(netdev->netnsid)) {
3dbcbfe4
FL
609 if (netdev_get_class(&netdev->up) == &netdev_tap_class) {
610 netnsid_set_local(&netdev->netnsid);
611 } else {
612 return netdev_linux_netnsid_update__(netdev);
613 }
bfda5239
FL
614 }
615
616 return 0;
617}
618
619static bool
620netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
621{
622 netdev_linux_netnsid_update(netdev);
623 return netnsid_eq(netdev->netnsid, nsid);
624}
625
756819dd
FL
626static bool
627netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
628{
629 netdev_linux_netnsid_update(netdev);
630 return netnsid_is_remote(netdev->netnsid);
631}
632
633static int netdev_linux_update_via_netlink(struct netdev_linux *);
bfda5239 634static void netdev_linux_update(struct netdev_linux *netdev, int,
7e9dcc0f 635 const struct rtnetlink_change *)
86383816 636 OVS_REQUIRES(netdev->mutex);
cee87338 637static void netdev_linux_changed(struct netdev_linux *netdev,
86383816
BP
638 unsigned int ifi_flags, unsigned int mask)
639 OVS_REQUIRES(netdev->mutex);
cee87338 640
d6384a3a
AW
641/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
642 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
cee87338
BP
643 * if no such socket could be created. */
644static struct nl_sock *
645netdev_linux_notify_sock(void)
646{
647 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
648 static struct nl_sock *sock;
989d7135
PS
649 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
650 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
cee87338
BP
651
652 if (ovsthread_once_start(&once)) {
653 int error;
654
655 error = nl_sock_create(NETLINK_ROUTE, &sock);
656 if (!error) {
d6384a3a
AW
657 size_t i;
658
659 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
660 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
661 if (error) {
662 nl_sock_destroy(sock);
663 sock = NULL;
664 break;
665 }
cee87338
BP
666 }
667 }
cf114a7f 668 nl_sock_listen_all_nsid(sock, true);
cee87338
BP
669 ovsthread_once_done(&once);
670 }
671
672 return sock;
673}
674
19c8e9c1
JS
675static bool
676netdev_linux_miimon_enabled(void)
677{
812c272c 678 return atomic_count_get(&miimon_cnt) > 0;
19c8e9c1
JS
679}
680
8b61709d 681static void
1c33f0c3 682netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 683{
cee87338
BP
684 struct nl_sock *sock;
685 int error;
686
19c8e9c1
JS
687 if (netdev_linux_miimon_enabled()) {
688 netdev_linux_miimon_run();
689 }
cee87338
BP
690
691 sock = netdev_linux_notify_sock();
692 if (!sock) {
693 return;
694 }
695
696 do {
cee87338 697 uint64_t buf_stub[4096 / 8];
bfda5239 698 int nsid;
cee87338
BP
699 struct ofpbuf buf;
700
701 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
bfda5239 702 error = nl_sock_recv(sock, &buf, &nsid, false);
cee87338 703 if (!error) {
7e9dcc0f 704 struct rtnetlink_change change;
cee87338 705
7e9dcc0f 706 if (rtnetlink_parse(&buf, &change)) {
989d7135
PS
707 struct netdev *netdev_ = NULL;
708 char dev_name[IFNAMSIZ];
709
710 if (!change.ifname) {
711 change.ifname = if_indextoname(change.if_index, dev_name);
712 }
713
714 if (change.ifname) {
715 netdev_ = netdev_from_name(change.ifname);
716 }
cee87338
BP
717 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
718 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
719
720 ovs_mutex_lock(&netdev->mutex);
bfda5239 721 netdev_linux_update(netdev, nsid, &change);
86383816 722 ovs_mutex_unlock(&netdev->mutex);
cee87338 723 }
38e0065b 724 netdev_close(netdev_);
cee87338
BP
725 }
726 } else if (error == ENOBUFS) {
727 struct shash device_shash;
728 struct shash_node *node;
729
730 nl_sock_drain(sock);
731
732 shash_init(&device_shash);
733 netdev_get_devices(&netdev_linux_class, &device_shash);
734 SHASH_FOR_EACH (node, &device_shash) {
735 struct netdev *netdev_ = node->data;
736 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
737 unsigned int flags;
738
86383816 739 ovs_mutex_lock(&netdev->mutex);
cee87338
BP
740 get_flags(netdev_, &flags);
741 netdev_linux_changed(netdev, flags, 0);
86383816
BP
742 ovs_mutex_unlock(&netdev->mutex);
743
cee87338
BP
744 netdev_close(netdev_);
745 }
746 shash_destroy(&device_shash);
747 } else if (error != EAGAIN) {
7ed58d4a
JP
748 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
749 VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
cee87338
BP
750 ovs_strerror(error));
751 }
752 ofpbuf_uninit(&buf);
753 } while (!error);
8b61709d
BP
754}
755
756static void
1c33f0c3 757netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 758{
cee87338
BP
759 struct nl_sock *sock;
760
19c8e9c1
JS
761 if (netdev_linux_miimon_enabled()) {
762 netdev_linux_miimon_wait();
763 }
cee87338
BP
764 sock = netdev_linux_notify_sock();
765 if (sock) {
766 nl_sock_wait(sock, POLLIN);
767 }
8b61709d
BP
768}
769
ac4d3bcb 770static void
b5d57fc8
BP
771netdev_linux_changed(struct netdev_linux *dev,
772 unsigned int ifi_flags, unsigned int mask)
86383816 773 OVS_REQUIRES(dev->mutex)
ac4d3bcb 774{
3e912ffc 775 netdev_change_seq_changed(&dev->up);
8aa77183
BP
776
777 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
778 dev->carrier_resets++;
779 }
780 dev->ifi_flags = ifi_flags;
781
4f925bd3 782 dev->cache_valid &= mask;
6b6e1329 783 if (!(mask & VALID_IN)) {
a8704b50
PS
784 netdev_get_addrs_list_flush();
785 }
4f925bd3
PS
786}
787
788static void
bfda5239
FL
789netdev_linux_update__(struct netdev_linux *dev,
790 const struct rtnetlink_change *change)
86383816 791 OVS_REQUIRES(dev->mutex)
4f925bd3 792{
bfda5239 793 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
d6384a3a 794 if (change->nlmsg_type == RTM_NEWLINK) {
6b6e1329 795 /* Keep drv-info, and ip addresses. */
d6384a3a 796 netdev_linux_changed(dev, change->ifi_flags,
6b6e1329 797 VALID_DRVINFO | VALID_IN);
d6384a3a
AW
798
799 /* Update netdev from rtnl-change msg. */
800 if (change->mtu) {
801 dev->mtu = change->mtu;
802 dev->cache_valid |= VALID_MTU;
803 dev->netdev_mtu_error = 0;
804 }
90a6637d 805
74ff3298
JR
806 if (!eth_addr_is_zero(change->mac)) {
807 dev->etheraddr = change->mac;
d6384a3a
AW
808 dev->cache_valid |= VALID_ETHERADDR;
809 dev->ether_addr_error = 0;
e8e1a409
TZ
810
811 /* The mac addr has been changed, report it now. */
812 rtnetlink_report_link();
d6384a3a 813 }
44445cac 814
d6384a3a
AW
815 dev->ifindex = change->if_index;
816 dev->cache_valid |= VALID_IFINDEX;
817 dev->get_ifindex_error = 0;
22dcb534 818 dev->present = true;
d6384a3a 819 } else {
bfda5239 820 /* FIXME */
d6384a3a 821 netdev_linux_changed(dev, change->ifi_flags, 0);
22dcb534 822 dev->present = false;
bfda5239 823 netnsid_unset(&dev->netnsid);
d6384a3a
AW
824 }
825 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
826 /* Invalidates in4, in6. */
6b6e1329 827 netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
4f925bd3 828 } else {
d6384a3a 829 OVS_NOT_REACHED();
4f925bd3 830 }
ac4d3bcb
EJ
831}
832
bfda5239
FL
833static void
834netdev_linux_update(struct netdev_linux *dev, int nsid,
835 const struct rtnetlink_change *change)
836 OVS_REQUIRES(dev->mutex)
837{
838 if (netdev_linux_netnsid_is_eq(dev, nsid)) {
839 netdev_linux_update__(dev, change);
840 }
841}
842
9dc63482
BP
843static struct netdev *
844netdev_linux_alloc(void)
845{
846 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
847 return &netdev->up;
848}
849
48c6733c
WT
850static int
851netdev_linux_common_construct(struct netdev *netdev_)
9dc63482 852{
48c6733c
WT
853 /* Prevent any attempt to create (or open) a network device named "default"
854 * or "all". These device names are effectively reserved on Linux because
855 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
856 * itself this wouldn't call for any special treatment, but in practice if
857 * a program tries to create devices with these names, it causes the kernel
858 * to fire a "new device" notification event even though creation failed,
859 * and in turn that causes OVS to wake up and try to create them again,
860 * which ends up as a 100% CPU loop. */
861 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
862 const char *name = netdev_->name;
863 if (!strcmp(name, "default") || !strcmp(name, "all")) {
7ed58d4a
JP
864 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
865 VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
48c6733c
WT
866 name);
867 return EINVAL;
868 }
869
bfda5239
FL
870 /* The device could be in the same network namespace or in another one. */
871 netnsid_unset(&netdev->netnsid);
834d6caf 872 ovs_mutex_init(&netdev->mutex);
48c6733c 873 return 0;
9dc63482
BP
874}
875
1f6e0fbd
BP
876/* Creates system and internal devices. */
877static int
9dc63482 878netdev_linux_construct(struct netdev *netdev_)
1f6e0fbd 879{
9dc63482 880 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
48c6733c
WT
881 int error = netdev_linux_common_construct(netdev_);
882 if (error) {
883 return error;
884 }
1f6e0fbd 885
b5d57fc8
BP
886 error = get_flags(&netdev->up, &netdev->ifi_flags);
887 if (error == ENODEV) {
9dc63482 888 if (netdev->up.netdev_class != &netdev_internal_class) {
b5d57fc8 889 /* The device does not exist, so don't allow it to be opened. */
b5d57fc8
BP
890 return ENODEV;
891 } else {
892 /* "Internal" netdevs have to be created as netdev objects before
893 * they exist in the kernel, because creating them in the kernel
894 * happens by passing a netdev object to dpif_port_add().
895 * Therefore, ignore the error. */
896 }
897 }
46415c90 898
a740f0de
JG
899 return 0;
900}
901
5b7448ed
JG
902/* For most types of netdevs we open the device for each call of
903 * netdev_open(). However, this is not the case with tap devices,
904 * since it is only possible to open the device once. In this
905 * situation we share a single file descriptor, and consequently
906 * buffers, across all readers. Therefore once data is read it will
907 * be unavailable to other reads for tap devices. */
a740f0de 908static int
9dc63482 909netdev_linux_construct_tap(struct netdev *netdev_)
a740f0de 910{
9dc63482 911 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a740f0de 912 static const char tap_dev[] = "/dev/net/tun";
9dc63482 913 const char *name = netdev_->name;
a740f0de 914 struct ifreq ifr;
a740f0de 915
48c6733c
WT
916 int error = netdev_linux_common_construct(netdev_);
917 if (error) {
918 return error;
919 }
1f6e0fbd 920
6c88d577 921 /* Open tap device. */
d0d08f8a
BP
922 netdev->tap_fd = open(tap_dev, O_RDWR);
923 if (netdev->tap_fd < 0) {
6c88d577 924 error = errno;
10a89ef0 925 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
cee87338 926 return error;
6c88d577
JP
927 }
928
929 /* Create tap device. */
61b9d078 930 get_flags(&netdev->up, &netdev->ifi_flags);
6c88d577 931 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 932 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
d0d08f8a 933 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
6c88d577 934 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 935 ovs_strerror(errno));
6c88d577 936 error = errno;
f61d8d29 937 goto error_close;
6c88d577
JP
938 }
939
940 /* Make non-blocking. */
d0d08f8a 941 error = set_nonblocking(netdev->tap_fd);
a740f0de 942 if (error) {
f61d8d29 943 goto error_close;
a740f0de
JG
944 }
945
0f28164b
FL
946 if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
947 VLOG_WARN("%s: creating tap device failed (persist): %s", name,
948 ovs_strerror(errno));
949 error = errno;
950 goto error_close;
951 }
952
a740f0de
JG
953 return 0;
954
f61d8d29 955error_close:
d0d08f8a 956 close(netdev->tap_fd);
a740f0de
JG
957 return error;
958}
959
6c88d577 960static void
9dc63482 961netdev_linux_destruct(struct netdev *netdev_)
6c88d577 962{
b5d57fc8 963 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6c88d577 964
b5d57fc8
BP
965 if (netdev->tc && netdev->tc->ops->tc_destroy) {
966 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4
BP
967 }
968
d0d08f8a
BP
969 if (netdev_get_class(netdev_) == &netdev_tap_class
970 && netdev->tap_fd >= 0)
971 {
0f28164b 972 ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
d0d08f8a 973 close(netdev->tap_fd);
6c88d577 974 }
86383816 975
19c8e9c1 976 if (netdev->miimon_interval > 0) {
812c272c 977 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
978 }
979
86383816 980 ovs_mutex_destroy(&netdev->mutex);
6c88d577
JP
981}
982
/* Frees the 'struct netdev_linux' that contains 'netdev_'. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
989
f7791740
PS
990static struct netdev_rxq *
991netdev_linux_rxq_alloc(void)
9dc63482 992{
f7791740 993 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
9dc63482
BP
994 return &rx->up;
995}
996
7b6b0ef4 997static int
f7791740 998netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
7b6b0ef4 999{
f7791740 1000 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
9dc63482 1001 struct netdev *netdev_ = rx->up.netdev;
7b6b0ef4 1002 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7b6b0ef4 1003 int error;
7b6b0ef4 1004
86383816 1005 ovs_mutex_lock(&netdev->mutex);
9dc63482
BP
1006 rx->is_tap = is_tap_netdev(netdev_);
1007 if (rx->is_tap) {
1008 rx->fd = netdev->tap_fd;
796223f5
BP
1009 } else {
1010 struct sockaddr_ll sll;
b73c8518 1011 int ifindex, val;
32383c3b 1012 /* Result of tcpdump -dd inbound */
259e0b1a 1013 static const struct sock_filter filt[] = {
32383c3b
MM
1014 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
1015 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
1016 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
1017 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
1018 };
259e0b1a
BP
1019 static const struct sock_fprog fprog = {
1020 ARRAY_SIZE(filt), (struct sock_filter *) filt
1021 };
7b6b0ef4 1022
796223f5 1023 /* Create file descriptor. */
9dc63482
BP
1024 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
1025 if (rx->fd < 0) {
796223f5 1026 error = errno;
10a89ef0 1027 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
1028 goto error;
1029 }
33d82a56 1030
b73c8518
SH
1031 val = 1;
1032 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
1033 error = errno;
1034 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
1035 netdev_get_name(netdev_), ovs_strerror(error));
1036 goto error;
1037 }
1038
796223f5 1039 /* Set non-blocking mode. */
9dc63482 1040 error = set_nonblocking(rx->fd);
796223f5
BP
1041 if (error) {
1042 goto error;
1043 }
7b6b0ef4 1044
796223f5 1045 /* Get ethernet device index. */
180c6d0b 1046 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
1047 if (error) {
1048 goto error;
1049 }
7b6b0ef4 1050
796223f5
BP
1051 /* Bind to specific ethernet device. */
1052 memset(&sll, 0, sizeof sll);
1053 sll.sll_family = AF_PACKET;
1054 sll.sll_ifindex = ifindex;
b73c8518 1055 sll.sll_protocol = htons(ETH_P_ALL);
9dc63482 1056 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
796223f5
BP
1057 error = errno;
1058 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 1059 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
1060 goto error;
1061 }
32383c3b
MM
1062
1063 /* Filter for only inbound packets. */
9dc63482 1064 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
32383c3b
MM
1065 sizeof fprog);
1066 if (error) {
1067 error = errno;
259e0b1a 1068 VLOG_ERR("%s: failed to attach filter (%s)",
10a89ef0 1069 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
1070 goto error;
1071 }
7b6b0ef4 1072 }
86383816 1073 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4 1074
7b6b0ef4
BP
1075 return 0;
1076
1077error:
9dc63482
BP
1078 if (rx->fd >= 0) {
1079 close(rx->fd);
7b6b0ef4 1080 }
86383816 1081 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4
BP
1082 return error;
1083}
1084
796223f5 1085static void
f7791740 1086netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
8b61709d 1087{
f7791740 1088 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
8b61709d 1089
796223f5
BP
1090 if (!rx->is_tap) {
1091 close(rx->fd);
8b61709d 1092 }
9dc63482
BP
1093}
1094
/* Frees the 'struct netdev_rxq_linux' that contains 'rxq_'. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
8b61709d 1102
b73c8518 1103static ovs_be16
1ebdc7eb 1104auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
b73c8518
SH
1105{
1106 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1107 return htons(aux->tp_vlan_tpid);
1ebdc7eb
EG
1108 } else if (double_tagged) {
1109 return htons(ETH_TYPE_VLAN_8021AD);
b73c8518 1110 } else {
1ebdc7eb 1111 return htons(ETH_TYPE_VLAN_8021Q);
b73c8518
SH
1112 }
1113}
1114
/* Returns true if 'aux' carries a VLAN TCI: either the TCI is nonzero or the
 * status flags include TP_STATUS_VLAN_VALID. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    if (aux->tp_vlan_tci) {
        return true;
    }

    return (aux->tp_status & TP_STATUS_VLAN_VALID) != 0;
}
1120
796223f5 1121static int
cf62fa4c 1122netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
796223f5 1123{
b73c8518 1124 size_t size;
796223f5 1125 ssize_t retval;
b73c8518
SH
1126 struct iovec iov;
1127 struct cmsghdr *cmsg;
1128 union {
1129 struct cmsghdr cmsg;
1130 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1131 } cmsg_buffer;
1132 struct msghdr msgh;
1133
1134 /* Reserve headroom for a single VLAN tag */
cf62fa4c
PS
1135 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1136 size = dp_packet_tailroom(buffer);
b73c8518 1137
cf62fa4c 1138 iov.iov_base = dp_packet_data(buffer);
b73c8518
SH
1139 iov.iov_len = size;
1140 msgh.msg_name = NULL;
1141 msgh.msg_namelen = 0;
1142 msgh.msg_iov = &iov;
1143 msgh.msg_iovlen = 1;
1144 msgh.msg_control = &cmsg_buffer;
1145 msgh.msg_controllen = sizeof cmsg_buffer;
1146 msgh.msg_flags = 0;
8e8cddf7 1147
796223f5 1148 do {
b73c8518 1149 retval = recvmsg(fd, &msgh, MSG_TRUNC);
796223f5
BP
1150 } while (retval < 0 && errno == EINTR);
1151
bfd3367b 1152 if (retval < 0) {
b73c8518
SH
1153 return errno;
1154 } else if (retval > size) {
1155 return EMSGSIZE;
1156 }
1157
cf62fa4c 1158 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
b73c8518
SH
1159
1160 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1161 const struct tpacket_auxdata *aux;
1162
1163 if (cmsg->cmsg_level != SOL_PACKET
1164 || cmsg->cmsg_type != PACKET_AUXDATA
1165 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1166 continue;
8b61709d 1167 }
b73c8518
SH
1168
1169 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1170 if (auxdata_has_vlan_tci(aux)) {
1ebdc7eb
EG
1171 struct eth_header *eth;
1172 bool double_tagged;
1173
b73c8518
SH
1174 if (retval < ETH_HEADER_LEN) {
1175 return EINVAL;
1176 }
1177
1ebdc7eb
EG
1178 eth = dp_packet_data(buffer);
1179 double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);
1180
1181 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
b73c8518
SH
1182 htons(aux->tp_vlan_tci));
1183 break;
1184 }
1185 }
1186
1187 return 0;
1188}
1189
/* Reads one packet from the tap descriptor 'fd' into the tailroom of
 * 'buffer', retrying when read() is interrupted by a signal.
 *
 * Returns 0 on success, otherwise the errno reported by read(). */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t room = dp_packet_tailroom(buffer);
    ssize_t n_read;

    for (;;) {
        n_read = read(fd, dp_packet_data(buffer), room);
        if (n_read >= 0 || errno != EINTR) {
            break;
        }
    }

    if (n_read < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n_read);
    return 0;
}
1207
1208static int
8492adc2
JS
1209netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
1210 int *qfill)
b73c8518 1211{
f7791740 1212 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
df1e5a3b 1213 struct netdev *netdev = rx->up.netdev;
cf62fa4c 1214 struct dp_packet *buffer;
df1e5a3b
PS
1215 ssize_t retval;
1216 int mtu;
1217
1218 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1219 mtu = ETH_PAYLOAD_MAX;
1220 }
1221
2482b0b0 1222 /* Assume Ethernet port. No need to set packet_type. */
cf62fa4c 1223 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
91088554 1224 DP_NETDEV_HEADROOM);
b73c8518 1225 retval = (rx->is_tap
f7791740
PS
1226 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1227 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
df1e5a3b
PS
1228
1229 if (retval) {
1230 if (retval != EAGAIN && retval != EMSGSIZE) {
1231 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
7c6401ca 1232 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
df1e5a3b 1233 }
cf62fa4c 1234 dp_packet_delete(buffer);
df1e5a3b 1235 } else {
72c84bc2 1236 dp_packet_batch_init_packet(batch, buffer);
b73c8518
SH
1237 }
1238
8492adc2
JS
1239 if (qfill) {
1240 *qfill = -ENOTSUP;
1241 }
1242
b73c8518 1243 return retval;
8b61709d
BP
1244}
1245
8b61709d 1246static void
f7791740 1247netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
8b61709d 1248{
f7791740 1249 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1250 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
1251}
1252
8b61709d 1253static int
f7791740 1254netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
8b61709d 1255{
f7791740 1256 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1257 if (rx->is_tap) {
8b61709d 1258 struct ifreq ifr;
f7791740 1259 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
259e0b1a 1260 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
8b61709d
BP
1261 if (error) {
1262 return error;
1263 }
796223f5 1264 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
1265 return 0;
1266 } else {
796223f5 1267 return drain_rcvbuf(rx->fd);
8b61709d
BP
1268 }
1269}
1270
d19cf8bb
ZG
1271static int
1272netdev_linux_sock_batch_send(int sock, int ifindex,
1273 struct dp_packet_batch *batch)
1274{
e0a00cee 1275 const size_t size = dp_packet_batch_size(batch);
d19cf8bb
ZG
1276 /* We don't bother setting most fields in sockaddr_ll because the
1277 * kernel ignores them for SOCK_RAW. */
1278 struct sockaddr_ll sll = { .sll_family = AF_PACKET,
1279 .sll_ifindex = ifindex };
1280
e0a00cee
BB
1281 struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
1282 struct iovec *iov = xmalloc(sizeof(*iov) * size);
d19cf8bb 1283
e0a00cee 1284 struct dp_packet *packet;
e883448e 1285 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
d19cf8bb 1286 iov[i].iov_base = dp_packet_data(packet);
ad8b0b4f 1287 iov[i].iov_len = dp_packet_size(packet);
d19cf8bb
ZG
1288 mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
1289 .msg_namelen = sizeof sll,
1290 .msg_iov = &iov[i],
1291 .msg_iovlen = 1 };
1292 }
1293
1294 int error = 0;
e0a00cee 1295 for (uint32_t ofs = 0; ofs < size; ) {
d19cf8bb
ZG
1296 ssize_t retval;
1297 do {
e0a00cee 1298 retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
d19cf8bb
ZG
1299 error = retval < 0 ? errno : 0;
1300 } while (error == EINTR);
1301 if (error) {
1302 break;
1303 }
1304 ofs += retval;
1305 }
1306
1307 free(mmsg);
1308 free(iov);
1309 return error;
1310}
1311
1312/* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1313 * essential, because packets sent to a tap device with an AF_PACKET socket
1314 * will loop back to be *received* again on the tap device. This doesn't occur
1315 * on other interface types because we attach a socket filter to the rx
1316 * socket. */
1317static int
1318netdev_linux_tap_batch_send(struct netdev *netdev_,
1319 struct dp_packet_batch *batch)
1320{
1321 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
13708b21 1322 struct dp_packet *packet;
22dcb534
FL
1323
1324 /* The Linux tap driver returns EIO if the device is not up,
1325 * so if the device is not up, don't waste time sending it.
1326 * However, if the device is in another network namespace
1327 * then OVS can't retrieve the state. In that case, send the
1328 * packets anyway. */
1329 if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
1330 netdev->tx_dropped += dp_packet_batch_size(batch);
1331 return 0;
1332 }
1333
e883448e 1334 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
ad8b0b4f 1335 size_t size = dp_packet_size(packet);
d19cf8bb
ZG
1336 ssize_t retval;
1337 int error;
1338
1339 do {
1340 retval = write(netdev->tap_fd, dp_packet_data(packet), size);
1341 error = retval < 0 ? errno : 0;
1342 } while (error == EINTR);
1343
1344 if (error) {
1345 /* The Linux tap driver returns EIO if the device is not up. From
1346 * the OVS side this is not an error, so we ignore it; otherwise,
1347 * return the erro. */
1348 if (error != EIO) {
1349 return error;
1350 }
1351 } else if (retval != size) {
1352 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
1353 "bytes of %"PRIuSIZE") on %s",
1354 retval, size, netdev_get_name(netdev_));
1355 return EMSGSIZE;
1356 }
1357 }
1358 return 0;
1359}
1360
1361/* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
8b61709d
BP
1362 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1363 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1364 * the packet is too big or too small to transmit on the device.
1365 *
8b61709d
BP
1366 * The kernel maintains a packet transmission queue, so the caller is not
1367 * expected to do additional queuing of packets. */
1368static int
f00fa8cb 1369netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
b30896c9 1370 struct dp_packet_batch *batch,
324c8374 1371 bool concurrent_txq OVS_UNUSED)
8b61709d 1372{
f4fd623c 1373 int error = 0;
0a62ae2c
ZG
1374 int sock = 0;
1375
0a62ae2c 1376 if (!is_tap_netdev(netdev_)) {
e0e2410d
FL
1377 if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_))) {
1378 error = EOPNOTSUPP;
1379 goto free_batch;
1380 }
1381
0a62ae2c
ZG
1382 sock = af_packet_sock();
1383 if (sock < 0) {
1384 error = -sock;
1385 goto free_batch;
1386 }
1387
1388 int ifindex = netdev_get_ifindex(netdev_);
1389 if (ifindex < 0) {
1390 error = -ifindex;
1391 goto free_batch;
1392 }
1393
d19cf8bb
ZG
1394 error = netdev_linux_sock_batch_send(sock, ifindex, batch);
1395 } else {
1396 error = netdev_linux_tap_batch_send(netdev_, batch);
0a62ae2c 1397 }
d19cf8bb
ZG
1398 if (error) {
1399 if (error == ENOBUFS) {
1400 /* The Linux AF_PACKET implementation never blocks waiting
1401 * for room for packets, instead returning ENOBUFS.
1402 * Translate this into EAGAIN for the caller. */
1403 error = EAGAIN;
f23347ea 1404 } else {
f4fd623c
DDP
1405 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1406 netdev_get_name(netdev_), ovs_strerror(error));
d19cf8bb 1407 }
f4fd623c
DDP
1408 }
1409
0a62ae2c 1410free_batch:
b30896c9 1411 dp_packet_delete_batch(batch, true);
f4fd623c 1412 return error;
8b61709d
BP
1413}
1414
1415/* Registers with the poll loop to wake up from the next call to poll_block()
1416 * when the packet transmission queue has sufficient room to transmit a packet
1417 * with netdev_send().
1418 *
1419 * The kernel maintains a packet transmission queue, so the client is not
1420 * expected to do additional queuing of packets. Thus, this function is
1421 * unlikely to ever be used. It is included for completeness. */
1422static void
f00fa8cb 1423netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
8b61709d 1424{
796223f5 1425 if (is_tap_netdev(netdev)) {
8b61709d
BP
1426 /* TAP device always accepts packets.*/
1427 poll_immediate_wake();
1428 }
1429}
1430
1431/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1432 * otherwise a positive errno value. */
1433static int
74ff3298 1434netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
8b61709d 1435{
b5d57fc8 1436 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4f9f3f21 1437 enum netdev_flags old_flags = 0;
eb395f2e
BP
1438 int error;
1439
86383816 1440 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
1441 if (netdev_linux_netnsid_is_remote(netdev)) {
1442 error = EOPNOTSUPP;
1443 goto exit;
1444 }
86383816 1445
b5d57fc8 1446 if (netdev->cache_valid & VALID_ETHERADDR) {
86383816
BP
1447 error = netdev->ether_addr_error;
1448 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1449 goto exit;
44445cac 1450 }
b5d57fc8 1451 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
1452 }
1453
7eb1bd81 1454 /* Tap devices must be brought down before setting the address. */
796223f5 1455 if (is_tap_netdev(netdev_)) {
4f9f3f21 1456 update_flags(netdev, NETDEV_UP, 0, &old_flags);
7eb1bd81 1457 }
44445cac
PS
1458 error = set_etheraddr(netdev_get_name(netdev_), mac);
1459 if (!error || error == ENODEV) {
b5d57fc8
BP
1460 netdev->ether_addr_error = error;
1461 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1462 if (!error) {
74ff3298 1463 netdev->etheraddr = mac;
eb395f2e 1464 }
8b61709d 1465 }
44445cac 1466
4f9f3f21
BP
1467 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1468 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1469 }
7eb1bd81 1470
86383816
BP
1471exit:
1472 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
1473 return error;
1474}
1475
44445cac 1476/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d 1477static int
74ff3298 1478netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
8b61709d 1479{
b5d57fc8 1480 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1481 int error;
44445cac 1482
86383816 1483 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1484 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
756819dd
FL
1485 netdev_linux_update_via_netlink(netdev);
1486 }
1487
1488 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1489 /* Fall back to ioctl if netlink fails */
86383816 1490 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
74ff3298 1491 &netdev->etheraddr);
b5d57fc8 1492 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1493 }
44445cac 1494
86383816
BP
1495 error = netdev->ether_addr_error;
1496 if (!error) {
74ff3298 1497 *mac = netdev->etheraddr;
44445cac 1498 }
86383816 1499 ovs_mutex_unlock(&netdev->mutex);
44445cac 1500
86383816 1501 return error;
8b61709d
BP
1502}
1503
8b61709d 1504static int
73371c09 1505netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
8b61709d 1506{
86383816
BP
1507 int error;
1508
b5d57fc8 1509 if (!(netdev->cache_valid & VALID_MTU)) {
756819dd
FL
1510 netdev_linux_update_via_netlink(netdev);
1511 }
1512
1513 if (!(netdev->cache_valid & VALID_MTU)) {
1514 /* Fall back to ioctl if netlink fails */
8b61709d 1515 struct ifreq ifr;
90a6637d 1516
86383816 1517 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
73371c09 1518 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
b5d57fc8
BP
1519 netdev->mtu = ifr.ifr_mtu;
1520 netdev->cache_valid |= VALID_MTU;
8b61709d 1521 }
90a6637d 1522
86383816
BP
1523 error = netdev->netdev_mtu_error;
1524 if (!error) {
b5d57fc8 1525 *mtup = netdev->mtu;
90a6637d 1526 }
73371c09
BP
1527
1528 return error;
1529}
1530
1531/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1532 * in bytes, not including the hardware header; thus, this is typically 1500
1533 * bytes for Ethernet devices. */
1534static int
1535netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1536{
1537 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1538 int error;
1539
1540 ovs_mutex_lock(&netdev->mutex);
1541 error = netdev_linux_get_mtu__(netdev, mtup);
86383816
BP
1542 ovs_mutex_unlock(&netdev->mutex);
1543
1544 return error;
8b61709d
BP
1545}
1546
9b020780
PS
1547/* Sets the maximum size of transmitted (MTU) for given device using linux
1548 * networking ioctl interface.
1549 */
1550static int
4124cb12 1551netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
9b020780 1552{
b5d57fc8 1553 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1554 struct ifreq ifr;
1555 int error;
1556
86383816 1557 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
1558 if (netdev_linux_netnsid_is_remote(netdev)) {
1559 error = EOPNOTSUPP;
1560 goto exit;
1561 }
1562
b5d57fc8 1563 if (netdev->cache_valid & VALID_MTU) {
86383816
BP
1564 error = netdev->netdev_mtu_error;
1565 if (error || netdev->mtu == mtu) {
1566 goto exit;
90a6637d 1567 }
b5d57fc8 1568 netdev->cache_valid &= ~VALID_MTU;
153e5481 1569 }
9b020780 1570 ifr.ifr_mtu = mtu;
259e0b1a
BP
1571 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1572 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1573 if (!error || error == ENODEV) {
b5d57fc8
BP
1574 netdev->netdev_mtu_error = error;
1575 netdev->mtu = ifr.ifr_mtu;
1576 netdev->cache_valid |= VALID_MTU;
9b020780 1577 }
86383816
BP
1578exit:
1579 ovs_mutex_unlock(&netdev->mutex);
90a6637d 1580 return error;
9b020780
PS
1581}
1582
9ab3d9a3
BP
1583/* Returns the ifindex of 'netdev', if successful, as a positive number.
1584 * On failure, returns a negative errno value. */
1585static int
86383816 1586netdev_linux_get_ifindex(const struct netdev *netdev_)
9ab3d9a3 1587{
86383816 1588 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9ab3d9a3
BP
1589 int ifindex, error;
1590
86383816 1591 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
1592 if (netdev_linux_netnsid_is_remote(netdev)) {
1593 error = EOPNOTSUPP;
1594 goto exit;
1595 }
86383816 1596 error = get_ifindex(netdev_, &ifindex);
86383816 1597
e0e2410d
FL
1598exit:
1599 ovs_mutex_unlock(&netdev->mutex);
9ab3d9a3
BP
1600 return error ? -error : ifindex;
1601}
1602
8b61709d
BP
1603static int
1604netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1605{
b5d57fc8 1606 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1607
86383816 1608 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
1609 if (netdev->miimon_interval > 0) {
1610 *carrier = netdev->miimon;
3a183124 1611 } else {
b5d57fc8 1612 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1613 }
86383816 1614 ovs_mutex_unlock(&netdev->mutex);
8b61709d 1615
3a183124 1616 return 0;
8b61709d
BP
1617}
1618
65c3058c 1619static long long int
86383816 1620netdev_linux_get_carrier_resets(const struct netdev *netdev_)
65c3058c 1621{
86383816
BP
1622 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1623 long long int carrier_resets;
1624
1625 ovs_mutex_lock(&netdev->mutex);
1626 carrier_resets = netdev->carrier_resets;
1627 ovs_mutex_unlock(&netdev->mutex);
1628
1629 return carrier_resets;
65c3058c
EJ
1630}
1631
63331829 1632static int
1670c579
EJ
1633netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1634 struct mii_ioctl_data *data)
63331829 1635{
63331829 1636 struct ifreq ifr;
782e6111 1637 int error;
63331829 1638
63331829 1639 memset(&ifr, 0, sizeof ifr);
782e6111 1640 memcpy(&ifr.ifr_data, data, sizeof *data);
259e0b1a 1641 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1642 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1643
782e6111
EJ
1644 return error;
1645}
1646
1647static int
1670c579 1648netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1649{
782e6111
EJ
1650 struct mii_ioctl_data data;
1651 int error;
63331829 1652
782e6111
EJ
1653 *miimon = false;
1654
1655 memset(&data, 0, sizeof data);
1670c579 1656 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1657 if (!error) {
1658 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1659 data.reg_num = MII_BMSR;
1670c579 1660 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1661 &data);
63331829
EJ
1662
1663 if (!error) {
782e6111 1664 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829 1665 }
9120cfc0
DH
1666 }
1667 if (error) {
63331829 1668 struct ethtool_cmd ecmd;
63331829
EJ
1669
1670 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1671 name);
1672
ab985a77 1673 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1674 memset(&ecmd, 0, sizeof ecmd);
1675 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1676 "ETHTOOL_GLINK");
1677 if (!error) {
782e6111
EJ
1678 struct ethtool_value eval;
1679
1680 memcpy(&eval, &ecmd, sizeof eval);
1681 *miimon = !!eval.data;
63331829
EJ
1682 } else {
1683 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1684 }
1685 }
1686
1687 return error;
1688}
1689
1670c579
EJ
1690static int
1691netdev_linux_set_miimon_interval(struct netdev *netdev_,
1692 long long int interval)
1693{
b5d57fc8 1694 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579 1695
86383816 1696 ovs_mutex_lock(&netdev->mutex);
1670c579 1697 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8 1698 if (netdev->miimon_interval != interval) {
19c8e9c1 1699 if (interval && !netdev->miimon_interval) {
812c272c 1700 atomic_count_inc(&miimon_cnt);
19c8e9c1 1701 } else if (!interval && netdev->miimon_interval) {
812c272c 1702 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
1703 }
1704
b5d57fc8
BP
1705 netdev->miimon_interval = interval;
1706 timer_set_expired(&netdev->miimon_timer);
1670c579 1707 }
86383816 1708 ovs_mutex_unlock(&netdev->mutex);
1670c579
EJ
1709
1710 return 0;
1711}
1712
1713static void
1714netdev_linux_miimon_run(void)
1715{
1716 struct shash device_shash;
1717 struct shash_node *node;
1718
1719 shash_init(&device_shash);
b5d57fc8 1720 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1721 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1722 struct netdev *netdev = node->data;
1723 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
1724 bool miimon;
1725
86383816
BP
1726 ovs_mutex_lock(&dev->mutex);
1727 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1728 netdev_linux_get_miimon(dev->up.name, &miimon);
1729 if (miimon != dev->miimon) {
1730 dev->miimon = miimon;
1731 netdev_linux_changed(dev, dev->ifi_flags, 0);
1732 }
1670c579 1733
86383816 1734 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1670c579 1735 }
86383816 1736 ovs_mutex_unlock(&dev->mutex);
2f980d74 1737 netdev_close(netdev);
1670c579
EJ
1738 }
1739
1740 shash_destroy(&device_shash);
1741}
1742
1743static void
1744netdev_linux_miimon_wait(void)
1745{
1746 struct shash device_shash;
1747 struct shash_node *node;
1748
1749 shash_init(&device_shash);
b5d57fc8 1750 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1751 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1752 struct netdev *netdev = node->data;
1753 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579 1754
86383816 1755 ovs_mutex_lock(&dev->mutex);
1670c579
EJ
1756 if (dev->miimon_interval > 0) {
1757 timer_wait(&dev->miimon_timer);
1758 }
86383816 1759 ovs_mutex_unlock(&dev->mutex);
2f980d74 1760 netdev_close(netdev);
1670c579
EJ
1761 }
1762 shash_destroy(&device_shash);
1763}
1764
/* Exchanges the 64-bit values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t first = *a;
    const uint64_t second = *b;

    *a = second;
    *b = first;
}
1772
c060c4cf
EJ
1773/* Copies 'src' into 'dst', performing format conversion in the process.
1774 *
1775 * 'src' is allowed to be misaligned. */
1776static void
1777netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1778 const struct ovs_vport_stats *src)
1779{
6a54dedc
BP
1780 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1781 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1782 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1783 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1784 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1785 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1786 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1787 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
c060c4cf
EJ
1788 dst->multicast = 0;
1789 dst->collisions = 0;
1790 dst->rx_length_errors = 0;
1791 dst->rx_over_errors = 0;
1792 dst->rx_crc_errors = 0;
1793 dst->rx_frame_errors = 0;
1794 dst->rx_fifo_errors = 0;
1795 dst->rx_missed_errors = 0;
1796 dst->tx_aborted_errors = 0;
1797 dst->tx_carrier_errors = 0;
1798 dst->tx_fifo_errors = 0;
1799 dst->tx_heartbeat_errors = 0;
1800 dst->tx_window_errors = 0;
1801}
1802
1803static int
1804get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1805{
93451a0a 1806 struct dpif_netlink_vport reply;
c060c4cf
EJ
1807 struct ofpbuf *buf;
1808 int error;
1809
93451a0a 1810 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
c060c4cf
EJ
1811 if (error) {
1812 return error;
1813 } else if (!reply.stats) {
1814 ofpbuf_delete(buf);
1815 return EOPNOTSUPP;
1816 }
1817
1818 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1819
1820 ofpbuf_delete(buf);
1821
1822 return 0;
1823}
1824
f613a0d7
PS
1825static void
1826get_stats_via_vport(const struct netdev *netdev_,
1827 struct netdev_stats *stats)
8b61709d 1828{
b5d57fc8 1829 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1830
b5d57fc8
BP
1831 if (!netdev->vport_stats_error ||
1832 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1833 int error;
7fbef77a 1834
c060c4cf 1835 error = get_stats_via_vport__(netdev_, stats);
bb13fe5e 1836 if (error && error != ENOENT && error != ENODEV) {
a57a8488 1837 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
1838 "(%s)",
1839 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 1840 }
b5d57fc8
BP
1841 netdev->vport_stats_error = error;
1842 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1843 }
f613a0d7 1844}
8b61709d 1845
f613a0d7
PS
1846/* Retrieves current device stats for 'netdev-linux'. */
1847static int
1848netdev_linux_get_stats(const struct netdev *netdev_,
1849 struct netdev_stats *stats)
1850{
b5d57fc8 1851 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1852 struct netdev_stats dev_stats;
1853 int error;
1854
86383816 1855 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1856 get_stats_via_vport(netdev_, stats);
35eef899 1857 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1858 if (error) {
86383816
BP
1859 if (!netdev->vport_stats_error) {
1860 error = 0;
f613a0d7 1861 }
86383816 1862 } else if (netdev->vport_stats_error) {
04c881eb 1863 /* stats not available from OVS then use netdev stats. */
f613a0d7
PS
1864 *stats = dev_stats;
1865 } else {
04c881eb
AZ
1866 /* Use kernel netdev's packet and byte counts since vport's counters
1867 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1868 * enabled. */
1869 stats->rx_packets = dev_stats.rx_packets;
1870 stats->rx_bytes = dev_stats.rx_bytes;
1871 stats->tx_packets = dev_stats.tx_packets;
1872 stats->tx_bytes = dev_stats.tx_bytes;
1873
f613a0d7
PS
1874 stats->rx_errors += dev_stats.rx_errors;
1875 stats->tx_errors += dev_stats.tx_errors;
1876 stats->rx_dropped += dev_stats.rx_dropped;
1877 stats->tx_dropped += dev_stats.tx_dropped;
1878 stats->multicast += dev_stats.multicast;
1879 stats->collisions += dev_stats.collisions;
1880 stats->rx_length_errors += dev_stats.rx_length_errors;
1881 stats->rx_over_errors += dev_stats.rx_over_errors;
1882 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1883 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1884 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1885 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1886 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1887 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1888 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1889 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1890 stats->tx_window_errors += dev_stats.tx_window_errors;
1891 }
86383816
BP
1892 ovs_mutex_unlock(&netdev->mutex);
1893
1894 return error;
f613a0d7
PS
1895}
1896
1897/* Retrieves current device stats for 'netdev-tap' netdev or
1898 * netdev-internal. */
1899static int
15aee116 1900netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
f613a0d7 1901{
b5d57fc8 1902 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1903 struct netdev_stats dev_stats;
1904 int error;
1905
86383816 1906 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1907 get_stats_via_vport(netdev_, stats);
35eef899 1908 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1909 if (error) {
86383816
BP
1910 if (!netdev->vport_stats_error) {
1911 error = 0;
8b61709d 1912 }
86383816
BP
1913 } else if (netdev->vport_stats_error) {
1914 /* Transmit and receive stats will appear to be swapped relative to the
1915 * other ports since we are the one sending the data, not a remote
1916 * computer. For consistency, we swap them back here. This does not
1917 * apply if we are getting stats from the vport layer because it always
1918 * tracks stats from the perspective of the switch. */
fe6b0e03 1919
f613a0d7 1920 *stats = dev_stats;
92df599c
JG
1921 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1922 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1923 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1924 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1925 stats->rx_length_errors = 0;
1926 stats->rx_over_errors = 0;
1927 stats->rx_crc_errors = 0;
1928 stats->rx_frame_errors = 0;
1929 stats->rx_fifo_errors = 0;
1930 stats->rx_missed_errors = 0;
1931 stats->tx_aborted_errors = 0;
1932 stats->tx_carrier_errors = 0;
1933 stats->tx_fifo_errors = 0;
1934 stats->tx_heartbeat_errors = 0;
1935 stats->tx_window_errors = 0;
f613a0d7 1936 } else {
04c881eb
AZ
1937 /* Use kernel netdev's packet and byte counts since vport counters
1938 * do not reflect packet counts on the wire when GSO, TSO or GRO
1939 * are enabled. */
1940 stats->rx_packets = dev_stats.tx_packets;
1941 stats->rx_bytes = dev_stats.tx_bytes;
1942 stats->tx_packets = dev_stats.rx_packets;
1943 stats->tx_bytes = dev_stats.rx_bytes;
1944
f613a0d7
PS
1945 stats->rx_dropped += dev_stats.tx_dropped;
1946 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1947
f613a0d7
PS
1948 stats->rx_errors += dev_stats.tx_errors;
1949 stats->tx_errors += dev_stats.rx_errors;
1950
1951 stats->multicast += dev_stats.multicast;
1952 stats->collisions += dev_stats.collisions;
1953 }
22dcb534 1954 stats->tx_dropped += netdev->tx_dropped;
86383816
BP
1955 ovs_mutex_unlock(&netdev->mutex);
1956
1957 return error;
8b61709d
BP
1958}
1959
bba1e6f3
PS
1960static int
1961netdev_internal_get_stats(const struct netdev *netdev_,
1962 struct netdev_stats *stats)
1963{
b5d57fc8 1964 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1965 int error;
bba1e6f3 1966
86383816 1967 ovs_mutex_lock(&netdev->mutex);
bba1e6f3 1968 get_stats_via_vport(netdev_, stats);
86383816
BP
1969 error = netdev->vport_stats_error;
1970 ovs_mutex_unlock(&netdev->mutex);
1971
1972 return error;
bba1e6f3
PS
1973}
1974
51f87458 1975static void
b5d57fc8 1976netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
1977{
1978 struct ethtool_cmd ecmd;
6c038611 1979 uint32_t speed;
8b61709d
BP
1980 int error;
1981
b5d57fc8 1982 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
1983 return;
1984 }
1985
ab985a77 1986 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1987 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 1988 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
1989 ETHTOOL_GSET, "ETHTOOL_GSET");
1990 if (error) {
51f87458 1991 goto out;
8b61709d
BP
1992 }
1993
1994 /* Supported features. */
b5d57fc8 1995 netdev->supported = 0;
8b61709d 1996 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 1997 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
1998 }
1999 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 2000 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
2001 }
2002 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 2003 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
2004 }
2005 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 2006 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
2007 }
2008 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 2009 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d 2010 }
67bed84c
SH
2011 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
2012 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
b5d57fc8 2013 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d 2014 }
67bed84c
SH
2015 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
2016 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
2017 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
2018 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
b5d57fc8 2019 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d 2020 }
67bed84c
SH
2021 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
2022 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
2023 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
2024 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
2025 netdev->supported |= NETDEV_F_40GB_FD;
2026 }
8b61709d 2027 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 2028 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
2029 }
2030 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 2031 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
2032 }
2033 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 2034 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
2035 }
2036 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 2037 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
2038 }
2039 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 2040 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
2041 }
2042
2043 /* Advertised features. */
b5d57fc8 2044 netdev->advertised = 0;
8b61709d 2045 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 2046 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
2047 }
2048 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 2049 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
2050 }
2051 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 2052 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
2053 }
2054 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 2055 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
2056 }
2057 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 2058 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d 2059 }
67bed84c
SH
2060 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
2061 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
b5d57fc8 2062 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d 2063 }
67bed84c
SH
2064 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
2065 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
2066 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
2067 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
b5d57fc8 2068 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d 2069 }
67bed84c
SH
2070 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
2071 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
2072 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
2073 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
2074 netdev->advertised |= NETDEV_F_40GB_FD;
2075 }
8b61709d 2076 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 2077 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
2078 }
2079 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 2080 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
2081 }
2082 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 2083 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
2084 }
2085 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 2086 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
2087 }
2088 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 2089 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
2090 }
2091
2092 /* Current settings. */
0c615356 2093 speed = ethtool_cmd_speed(&ecmd);
6c038611 2094 if (speed == SPEED_10) {
b5d57fc8 2095 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 2096 } else if (speed == SPEED_100) {
b5d57fc8 2097 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 2098 } else if (speed == SPEED_1000) {
b5d57fc8 2099 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 2100 } else if (speed == SPEED_10000) {
b5d57fc8 2101 netdev->current = NETDEV_F_10GB_FD;
6c038611 2102 } else if (speed == 40000) {
b5d57fc8 2103 netdev->current = NETDEV_F_40GB_FD;
6c038611 2104 } else if (speed == 100000) {
b5d57fc8 2105 netdev->current = NETDEV_F_100GB_FD;
6c038611 2106 } else if (speed == 1000000) {
b5d57fc8 2107 netdev->current = NETDEV_F_1TB_FD;
8b61709d 2108 } else {
b5d57fc8 2109 netdev->current = 0;
8b61709d
BP
2110 }
2111
2112 if (ecmd.port == PORT_TP) {
b5d57fc8 2113 netdev->current |= NETDEV_F_COPPER;
8b61709d 2114 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 2115 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
2116 }
2117
2118 if (ecmd.autoneg) {
b5d57fc8 2119 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
2120 }
2121
51f87458 2122out:
b5d57fc8
BP
2123 netdev->cache_valid |= VALID_FEATURES;
2124 netdev->get_features_error = error;
51f87458
PS
2125}
2126
887ed8b2
BP
2127/* Stores the features supported by 'netdev' into of '*current', '*advertised',
2128 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2129 * Returns 0 if successful, otherwise a positive errno value. */
51f87458
PS
2130static int
2131netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
2132 enum netdev_features *current,
2133 enum netdev_features *advertised,
2134 enum netdev_features *supported,
2135 enum netdev_features *peer)
51f87458 2136{
b5d57fc8 2137 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2138 int error;
51f87458 2139
86383816 2140 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2141 if (netdev_linux_netnsid_is_remote(netdev)) {
2142 error = EOPNOTSUPP;
2143 goto exit;
2144 }
2145
b5d57fc8 2146 netdev_linux_read_features(netdev);
b5d57fc8
BP
2147 if (!netdev->get_features_error) {
2148 *current = netdev->current;
2149 *advertised = netdev->advertised;
2150 *supported = netdev->supported;
887ed8b2 2151 *peer = 0; /* XXX */
51f87458 2152 }
86383816 2153 error = netdev->get_features_error;
86383816 2154
e0e2410d
FL
2155exit:
2156 ovs_mutex_unlock(&netdev->mutex);
86383816 2157 return error;
8b61709d
BP
2158}
2159
2160/* Set the features advertised by 'netdev' to 'advertise'. */
2161static int
86383816 2162netdev_linux_set_advertisements(struct netdev *netdev_,
6c038611 2163 enum netdev_features advertise)
8b61709d 2164{
86383816 2165 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2166 struct ethtool_cmd ecmd;
2167 int error;
2168
86383816
BP
2169 ovs_mutex_lock(&netdev->mutex);
2170
ab985a77 2171 COVERAGE_INC(netdev_get_ethtool);
e0e2410d
FL
2172
2173 if (netdev_linux_netnsid_is_remote(netdev)) {
2174 error = EOPNOTSUPP;
2175 goto exit;
2176 }
2177
8b61709d 2178 memset(&ecmd, 0, sizeof ecmd);
86383816 2179 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
8b61709d
BP
2180 ETHTOOL_GSET, "ETHTOOL_GSET");
2181 if (error) {
86383816 2182 goto exit;
8b61709d
BP
2183 }
2184
2185 ecmd.advertising = 0;
6c038611 2186 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
2187 ecmd.advertising |= ADVERTISED_10baseT_Half;
2188 }
6c038611 2189 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
2190 ecmd.advertising |= ADVERTISED_10baseT_Full;
2191 }
6c038611 2192 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
2193 ecmd.advertising |= ADVERTISED_100baseT_Half;
2194 }
6c038611 2195 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
2196 ecmd.advertising |= ADVERTISED_100baseT_Full;
2197 }
6c038611 2198 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
2199 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2200 }
6c038611 2201 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
2202 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2203 }
6c038611 2204 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
2205 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2206 }
6c038611 2207 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
2208 ecmd.advertising |= ADVERTISED_TP;
2209 }
6c038611 2210 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
2211 ecmd.advertising |= ADVERTISED_FIBRE;
2212 }
6c038611 2213 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
2214 ecmd.advertising |= ADVERTISED_Autoneg;
2215 }
6c038611 2216 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
2217 ecmd.advertising |= ADVERTISED_Pause;
2218 }
6c038611 2219 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
2220 ecmd.advertising |= ADVERTISED_Asym_Pause;
2221 }
ab985a77 2222 COVERAGE_INC(netdev_set_ethtool);
86383816
BP
2223 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2224 ETHTOOL_SSET, "ETHTOOL_SSET");
2225
2226exit:
2227 ovs_mutex_unlock(&netdev->mutex);
2228 return error;
8b61709d
BP
2229}
2230
f8500004
JP
2231/* Attempts to set input rate limiting (policing) policy. Returns 0 if
2232 * successful, otherwise a positive errno value. */
8b61709d 2233static int
b5d57fc8 2234netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
2235 uint32_t kbits_rate, uint32_t kbits_burst)
2236{
b5d57fc8
BP
2237 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2238 const char *netdev_name = netdev_get_name(netdev_);
7874bdff 2239 int ifindex;
f8500004 2240 int error;
8b61709d 2241
d5ae4a60
PB
2242 if (netdev_is_flow_api_enabled()) {
2243 if (kbits_rate) {
2244 VLOG_WARN_RL(&rl, "%s: policing with offload isn't supported",
2245 netdev_name);
2246 }
2247 return EOPNOTSUPP;
2248 }
2249
80a86fbe 2250 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
79abacc8 2251 : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
80a86fbe
BP
2252 : kbits_burst); /* Stick with user-specified value. */
2253
86383816 2254 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2255 if (netdev_linux_netnsid_is_remote(netdev)) {
2256 error = EOPNOTSUPP;
2257 goto out;
2258 }
2259
b5d57fc8 2260 if (netdev->cache_valid & VALID_POLICING) {
86383816
BP
2261 error = netdev->netdev_policing_error;
2262 if (error || (netdev->kbits_rate == kbits_rate &&
2263 netdev->kbits_burst == kbits_burst)) {
c9f71668 2264 /* Assume that settings haven't changed since we last set them. */
86383816 2265 goto out;
c9f71668 2266 }
b5d57fc8 2267 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
2268 }
2269
7874bdff
RD
2270 error = get_ifindex(netdev_, &ifindex);
2271 if (error) {
2272 goto out;
2273 }
2274
ac8c3412 2275 COVERAGE_INC(netdev_set_policing);
f8500004 2276 /* Remove any existing ingress qdisc. */
7874bdff 2277 error = tc_add_del_ingress_qdisc(ifindex, false);
f8500004
JP
2278 if (error) {
2279 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 2280 netdev_name, ovs_strerror(error));
c9f71668 2281 goto out;
f8500004
JP
2282 }
2283
8b61709d 2284 if (kbits_rate) {
7874bdff 2285 error = tc_add_del_ingress_qdisc(ifindex, true);
f8500004
JP
2286 if (error) {
2287 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 2288 netdev_name, ovs_strerror(error));
c9f71668 2289 goto out;
8b61709d
BP
2290 }
2291
b5d57fc8 2292 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
2293 if (error){
2294 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 2295 netdev_name, ovs_strerror(error));
c9f71668 2296 goto out;
8b61709d 2297 }
8b61709d
BP
2298 }
2299
b5d57fc8
BP
2300 netdev->kbits_rate = kbits_rate;
2301 netdev->kbits_burst = kbits_burst;
f8500004 2302
c9f71668
PS
2303out:
2304 if (!error || error == ENODEV) {
b5d57fc8
BP
2305 netdev->netdev_policing_error = error;
2306 netdev->cache_valid |= VALID_POLICING;
c9f71668 2307 }
86383816 2308 ovs_mutex_unlock(&netdev->mutex);
c9f71668 2309 return error;
8b61709d
BP
2310}
2311
c1c9c9c4
BP
2312static int
2313netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 2314 struct sset *types)
c1c9c9c4 2315{
559eb230 2316 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2317 for (opsp = tcs; *opsp != NULL; opsp++) {
2318 const struct tc_ops *ops = *opsp;
2319 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 2320 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
2321 }
2322 }
2323 return 0;
2324}
2325
2326static const struct tc_ops *
2327tc_lookup_ovs_name(const char *name)
2328{
559eb230 2329 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2330
2331 for (opsp = tcs; *opsp != NULL; opsp++) {
2332 const struct tc_ops *ops = *opsp;
2333 if (!strcmp(name, ops->ovs_name)) {
2334 return ops;
2335 }
2336 }
2337 return NULL;
2338}
2339
2340static const struct tc_ops *
2341tc_lookup_linux_name(const char *name)
2342{
559eb230 2343 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2344
2345 for (opsp = tcs; *opsp != NULL; opsp++) {
2346 const struct tc_ops *ops = *opsp;
2347 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2348 return ops;
2349 }
2350 }
2351 return NULL;
2352}
2353
93b13be8 2354static struct tc_queue *
b5d57fc8 2355tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
2356 size_t hash)
2357{
b5d57fc8 2358 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
2359 struct tc_queue *queue;
2360
b5d57fc8 2361 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
2362 if (queue->queue_id == queue_id) {
2363 return queue;
2364 }
2365 }
2366 return NULL;
2367}
2368
/* Returns the queue of 'netdev' whose ID is 'queue_id', or NULL if there is
 * none.  The hash is hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
2374
c1c9c9c4
BP
2375static int
2376netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2377 const char *type,
2378 struct netdev_qos_capabilities *caps)
2379{
2380 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2381 if (!ops) {
2382 return EOPNOTSUPP;
2383 }
2384 caps->n_queues = ops->n_queues;
2385 return 0;
2386}
2387
2388static int
b5d57fc8 2389netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 2390 const char **typep, struct smap *details)
c1c9c9c4 2391{
b5d57fc8 2392 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2393 int error;
2394
86383816 2395 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2396 if (netdev_linux_netnsid_is_remote(netdev)) {
2397 error = EOPNOTSUPP;
2398 goto exit;
2399 }
2400
b5d57fc8 2401 error = tc_query_qdisc(netdev_);
86383816
BP
2402 if (!error) {
2403 *typep = netdev->tc->ops->ovs_name;
2404 error = (netdev->tc->ops->qdisc_get
2405 ? netdev->tc->ops->qdisc_get(netdev_, details)
2406 : 0);
c1c9c9c4
BP
2407 }
2408
e0e2410d
FL
2409exit:
2410 ovs_mutex_unlock(&netdev->mutex);
86383816 2411 return error;
c1c9c9c4
BP
2412}
2413
2414static int
b5d57fc8 2415netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 2416 const char *type, const struct smap *details)
c1c9c9c4 2417{
b5d57fc8 2418 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2419 const struct tc_ops *new_ops;
2420 int error;
2421
2422 new_ops = tc_lookup_ovs_name(type);
2423 if (!new_ops || !new_ops->tc_install) {
2424 return EOPNOTSUPP;
2425 }
2426
6cf888b8
BS
2427 if (new_ops == &tc_ops_noop) {
2428 return new_ops->tc_install(netdev_, details);
2429 }
2430
86383816 2431 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2432 if (netdev_linux_netnsid_is_remote(netdev)) {
2433 error = EOPNOTSUPP;
2434 goto exit;
2435 }
2436
b5d57fc8 2437 error = tc_query_qdisc(netdev_);
c1c9c9c4 2438 if (error) {
86383816 2439 goto exit;
c1c9c9c4
BP
2440 }
2441
b5d57fc8 2442 if (new_ops == netdev->tc->ops) {
86383816 2443 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
2444 } else {
2445 /* Delete existing qdisc. */
b5d57fc8 2446 error = tc_del_qdisc(netdev_);
c1c9c9c4 2447 if (error) {
86383816 2448 goto exit;
c1c9c9c4 2449 }
b5d57fc8 2450 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
2451
2452 /* Install new qdisc. */
b5d57fc8
BP
2453 error = new_ops->tc_install(netdev_, details);
2454 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4 2455 }
86383816
BP
2456
2457exit:
2458 ovs_mutex_unlock(&netdev->mutex);
2459 return error;
c1c9c9c4
BP
2460}
2461
2462static int
b5d57fc8 2463netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 2464 unsigned int queue_id, struct smap *details)
c1c9c9c4 2465{
b5d57fc8 2466 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2467 int error;
2468
86383816 2469 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2470 if (netdev_linux_netnsid_is_remote(netdev)) {
2471 error = EOPNOTSUPP;
2472 goto exit;
2473 }
2474
b5d57fc8 2475 error = tc_query_qdisc(netdev_);
86383816 2476 if (!error) {
b5d57fc8 2477 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
86383816 2478 error = (queue
b5d57fc8 2479 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 2480 : ENOENT);
c1c9c9c4 2481 }
86383816 2482
e0e2410d
FL
2483exit:
2484 ovs_mutex_unlock(&netdev->mutex);
86383816 2485 return error;
c1c9c9c4
BP
2486}
2487
2488static int
b5d57fc8 2489netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 2490 unsigned int queue_id, const struct smap *details)
c1c9c9c4 2491{
b5d57fc8 2492 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2493 int error;
2494
86383816 2495 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2496 if (netdev_linux_netnsid_is_remote(netdev)) {
2497 error = EOPNOTSUPP;
2498 goto exit;
2499 }
2500
b5d57fc8 2501 error = tc_query_qdisc(netdev_);
86383816
BP
2502 if (!error) {
2503 error = (queue_id < netdev->tc->ops->n_queues
2504 && netdev->tc->ops->class_set
2505 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2506 : EINVAL);
c1c9c9c4
BP
2507 }
2508
e0e2410d
FL
2509exit:
2510 ovs_mutex_unlock(&netdev->mutex);
86383816 2511 return error;
c1c9c9c4
BP
2512}
2513
2514static int
b5d57fc8 2515netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 2516{
b5d57fc8 2517 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2518 int error;
2519
86383816 2520 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2521 if (netdev_linux_netnsid_is_remote(netdev)) {
2522 error = EOPNOTSUPP;
2523 goto exit;
2524 }
2525
b5d57fc8 2526 error = tc_query_qdisc(netdev_);
86383816
BP
2527 if (!error) {
2528 if (netdev->tc->ops->class_delete) {
2529 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2530 error = (queue
2531 ? netdev->tc->ops->class_delete(netdev_, queue)
2532 : ENOENT);
2533 } else {
2534 error = EINVAL;
2535 }
c1c9c9c4 2536 }
86383816 2537
e0e2410d
FL
2538exit:
2539 ovs_mutex_unlock(&netdev->mutex);
86383816 2540 return error;
c1c9c9c4
BP
2541}
2542
2543static int
b5d57fc8 2544netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2545 unsigned int queue_id,
2546 struct netdev_queue_stats *stats)
2547{
b5d57fc8 2548 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2549 int error;
2550
86383816 2551 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2552 if (netdev_linux_netnsid_is_remote(netdev)) {
2553 error = EOPNOTSUPP;
2554 goto exit;
2555 }
2556
b5d57fc8 2557 error = tc_query_qdisc(netdev_);
86383816
BP
2558 if (!error) {
2559 if (netdev->tc->ops->class_get_stats) {
2560 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2561 if (queue) {
2562 stats->created = queue->created;
2563 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2564 stats);
2565 } else {
2566 error = ENOENT;
2567 }
2568 } else {
2569 error = EOPNOTSUPP;
6dc34a0d 2570 }
c1c9c9c4 2571 }
86383816 2572
e0e2410d
FL
2573exit:
2574 ovs_mutex_unlock(&netdev->mutex);
86383816 2575 return error;
c1c9c9c4
BP
2576}
2577
d57695d7
JS
2578struct queue_dump_state {
2579 struct nl_dump dump;
2580 struct ofpbuf buf;
2581};
2582
23a98ffe 2583static bool
d57695d7 2584start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
c1c9c9c4
BP
2585{
2586 struct ofpbuf request;
2587 struct tcmsg *tcmsg;
2588
7874bdff 2589 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2590 if (!tcmsg) {
2591 return false;
2592 }
3c4de644 2593 tcmsg->tcm_parent = 0;
d57695d7 2594 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
c1c9c9c4 2595 ofpbuf_uninit(&request);
d57695d7
JS
2596
2597 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
23a98ffe 2598 return true;
c1c9c9c4
BP
2599}
2600
d57695d7
JS
2601static int
2602finish_queue_dump(struct queue_dump_state *state)
2603{
2604 ofpbuf_uninit(&state->buf);
2605 return nl_dump_done(&state->dump);
2606}
2607
89454bf4
BP
/* Iteration state shared by netdev_linux_queue_dump_start(),
 * netdev_linux_queue_dump_next() and netdev_linux_queue_dump_done(). */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Queue IDs captured when the dump began. */
    size_t cur_queue;           /* Index in 'queues' of the next ID to try. */
    size_t n_queues;            /* Number of elements in 'queues'. */
};
2613
c1c9c9c4 2614static int
89454bf4 2615netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
c1c9c9c4 2616{
e0e2410d 2617 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2618 int error;
2619
86383816 2620 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2621 if (netdev_linux_netnsid_is_remote(netdev)) {
2622 error = EOPNOTSUPP;
2623 goto exit;
2624 }
2625
b5d57fc8 2626 error = tc_query_qdisc(netdev_);
86383816
BP
2627 if (!error) {
2628 if (netdev->tc->ops->class_get) {
89454bf4
BP
2629 struct netdev_linux_queue_state *state;
2630 struct tc_queue *queue;
2631 size_t i;
2632
2633 *statep = state = xmalloc(sizeof *state);
2634 state->n_queues = hmap_count(&netdev->tc->queues);
2635 state->cur_queue = 0;
2636 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2637
2638 i = 0;
2639 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2640 state->queues[i++] = queue->queue_id;
86383816 2641 }
c1c9c9c4 2642 } else {
86383816 2643 error = EOPNOTSUPP;
c1c9c9c4
BP
2644 }
2645 }
c1c9c9c4 2646
e0e2410d
FL
2647exit:
2648 ovs_mutex_unlock(&netdev->mutex);
86383816 2649 return error;
c1c9c9c4
BP
2650}
2651
89454bf4
BP
2652static int
2653netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2654 unsigned int *queue_idp, struct smap *details)
2655{
e0e2410d 2656 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
89454bf4
BP
2657 struct netdev_linux_queue_state *state = state_;
2658 int error = EOF;
2659
2660 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2661 if (netdev_linux_netnsid_is_remote(netdev)) {
2662 error = EOPNOTSUPP;
2663 goto exit;
2664 }
2665
89454bf4
BP
2666 while (state->cur_queue < state->n_queues) {
2667 unsigned int queue_id = state->queues[state->cur_queue++];
2668 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2669
2670 if (queue) {
2671 *queue_idp = queue_id;
2672 error = netdev->tc->ops->class_get(netdev_, queue, details);
2673 break;
2674 }
2675 }
89454bf4 2676
e0e2410d
FL
2677exit:
2678 ovs_mutex_unlock(&netdev->mutex);
89454bf4
BP
2679 return error;
2680}
2681
2682static int
2683netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2684 void *state_)
2685{
2686 struct netdev_linux_queue_state *state = state_;
2687
2688 free(state->queues);
2689 free(state);
2690 return 0;
2691}
2692
c1c9c9c4 2693static int
b5d57fc8 2694netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2695 netdev_dump_queue_stats_cb *cb, void *aux)
2696{
b5d57fc8 2697 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2698 int error;
2699
86383816 2700 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2701 if (netdev_linux_netnsid_is_remote(netdev)) {
2702 error = EOPNOTSUPP;
2703 goto exit;
2704 }
2705
b5d57fc8 2706 error = tc_query_qdisc(netdev_);
86383816 2707 if (!error) {
d57695d7 2708 struct queue_dump_state state;
c1c9c9c4 2709
86383816
BP
2710 if (!netdev->tc->ops->class_dump_stats) {
2711 error = EOPNOTSUPP;
d57695d7 2712 } else if (!start_queue_dump(netdev_, &state)) {
86383816
BP
2713 error = ENODEV;
2714 } else {
2715 struct ofpbuf msg;
2716 int retval;
2717
d57695d7 2718 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
86383816
BP
2719 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2720 cb, aux);
2721 if (retval) {
2722 error = retval;
2723 }
2724 }
2725
d57695d7 2726 retval = finish_queue_dump(&state);
86383816
BP
2727 if (retval) {
2728 error = retval;
2729 }
c1c9c9c4
BP
2730 }
2731 }
2732
e0e2410d
FL
2733exit:
2734 ovs_mutex_unlock(&netdev->mutex);
86383816 2735 return error;
c1c9c9c4
BP
2736}
2737
8b61709d 2738static int
f1acd62b
BP
2739netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2740 struct in_addr netmask)
8b61709d 2741{
b5d57fc8 2742 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2743 int error;
2744
86383816 2745 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2746 if (netdev_linux_netnsid_is_remote(netdev)) {
2747 error = EOPNOTSUPP;
2748 goto exit;
2749 }
2750
f1acd62b 2751 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2752 if (!error) {
f1acd62b 2753 if (address.s_addr != INADDR_ANY) {
8b61709d 2754 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2755 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2756 }
2757 }
49af9a3d 2758
e0e2410d 2759exit:
86383816 2760 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
2761 return error;
2762}
2763
7df6932e
AW
2764/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2765 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2766 * error. */
8b61709d 2767static int
a8704b50
PS
2768netdev_linux_get_addr_list(const struct netdev *netdev_,
2769 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
8b61709d 2770{
b5d57fc8 2771 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7df6932e 2772 int error;
86383816
BP
2773
2774 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2775 if (netdev_linux_netnsid_is_remote(netdev)) {
2776 error = EOPNOTSUPP;
2777 goto exit;
2778 }
2779
a8704b50 2780 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
86383816 2781
e0e2410d
FL
2782exit:
2783 ovs_mutex_unlock(&netdev->mutex);
7df6932e 2784 return error;
8b61709d
BP
2785}
2786
/* Fills '*sa' with an AF_INET socket address for 'addr' and port 0.  Both the
 * temporary sockaddr_in and '*sa' are zeroed first so no stale bytes remain. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2799
2800static int
2801do_set_addr(struct netdev *netdev,
2802 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2803{
2804 struct ifreq ifr;
149f577a 2805
259e0b1a
BP
2806 make_in4_sockaddr(&ifr.ifr_addr, addr);
2807 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2808 ioctl_name);
8b61709d
BP
2809}
2810
2811/* Adds 'router' as a default IP gateway. */
2812static int
67a4917b 2813netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2814{
2815 struct in_addr any = { INADDR_ANY };
2816 struct rtentry rt;
2817 int error;
2818
2819 memset(&rt, 0, sizeof rt);
2820 make_in4_sockaddr(&rt.rt_dst, any);
2821 make_in4_sockaddr(&rt.rt_gateway, router);
2822 make_in4_sockaddr(&rt.rt_genmask, any);
2823 rt.rt_flags = RTF_UP | RTF_GATEWAY;
259e0b1a 2824 error = af_inet_ioctl(SIOCADDRT, &rt);
8b61709d 2825 if (error) {
10a89ef0 2826 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
2827 }
2828 return error;
2829}
2830
f1acd62b
BP
2831static int
2832netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2833 char **netdev_name)
2834{
2835 static const char fn[] = "/proc/net/route";
2836 FILE *stream;
2837 char line[256];
2838 int ln;
2839
2840 *netdev_name = NULL;
2841 stream = fopen(fn, "r");
2842 if (stream == NULL) {
10a89ef0 2843 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
2844 return errno;
2845 }
2846
2847 ln = 0;
2848 while (fgets(line, sizeof line, stream)) {
2849 if (++ln >= 2) {
2850 char iface[17];
dbba996b 2851 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2852 int refcnt, metric, mtu;
2853 unsigned int flags, use, window, irtt;
2854
c2c28dfd
BP
2855 if (!ovs_scan(line,
2856 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2857 " %d %u %u\n",
2858 iface, &dest, &gateway, &flags, &refcnt,
2859 &use, &metric, &mask, &mtu, &window, &irtt)) {
d295e8e9 2860 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2861 fn, ln, line);
2862 continue;
2863 }
2864 if (!(flags & RTF_UP)) {
2865 /* Skip routes that aren't up. */
2866 continue;
2867 }
2868
2869 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2870 * network byte order, so we don't need need any endian
f1acd62b
BP
2871 * conversions here. */
2872 if ((dest & mask) == (host->s_addr & mask)) {
2873 if (!gateway) {
2874 /* The host is directly reachable. */
2875 next_hop->s_addr = 0;
2876 } else {
2877 /* To reach the host, we must go through a gateway. */
2878 next_hop->s_addr = gateway;
2879 }
2880 *netdev_name = xstrdup(iface);
2881 fclose(stream);
2882 return 0;
2883 }
2884 }
2885 }
2886
2887 fclose(stream);
2888 return ENXIO;
2889}
2890
e210037e 2891static int
b5d57fc8 2892netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 2893{
b5d57fc8 2894 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
2895 int error = 0;
2896
86383816 2897 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
2898 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2899 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
2900
2901 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
2902 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2903 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
2904 cmd,
2905 ETHTOOL_GDRVINFO,
2906 "ETHTOOL_GDRVINFO");
2907 if (!error) {
b5d57fc8 2908 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
2909 }
2910 }
e210037e 2911
e210037e 2912 if (!error) {
b5d57fc8
BP
2913 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2914 smap_add(smap, "driver_version", netdev->drvinfo.version);
2915 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 2916 }
86383816
BP
2917 ovs_mutex_unlock(&netdev->mutex);
2918
e210037e
AE
2919 return error;
2920}
2921
4f925bd3 2922static int
275707c3
EJ
2923netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2924 struct smap *smap)
4f925bd3 2925{
79f1cbe9 2926 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2927 return 0;
2928}
2929
8b61709d
BP
2930/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2931 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2932 * returns 0. Otherwise, it returns a positive errno value; in particular,
2933 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2934static int
2935netdev_linux_arp_lookup(const struct netdev *netdev,
74ff3298 2936 ovs_be32 ip, struct eth_addr *mac)
8b61709d
BP
2937{
2938 struct arpreq r;
c100e025 2939 struct sockaddr_in sin;
8b61709d
BP
2940 int retval;
2941
2942 memset(&r, 0, sizeof r);
f2cc621b 2943 memset(&sin, 0, sizeof sin);
c100e025
BP
2944 sin.sin_family = AF_INET;
2945 sin.sin_addr.s_addr = ip;
2946 sin.sin_port = 0;
2947 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2948 r.arp_ha.sa_family = ARPHRD_ETHER;
2949 r.arp_flags = 0;
71d7c22f 2950 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d 2951 COVERAGE_INC(netdev_arp_lookup);
259e0b1a 2952 retval = af_inet_ioctl(SIOCGARP, &r);
8b61709d
BP
2953 if (!retval) {
2954 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2955 } else if (retval != ENXIO) {
2956 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
2957 netdev_get_name(netdev), IP_ARGS(ip),
2958 ovs_strerror(retval));
8b61709d
BP
2959 }
2960 return retval;
2961}
2962
2963static int
2964nd_to_iff_flags(enum netdev_flags nd)
2965{
2966 int iff = 0;
2967 if (nd & NETDEV_UP) {
2968 iff |= IFF_UP;
2969 }
2970 if (nd & NETDEV_PROMISC) {
2971 iff |= IFF_PROMISC;
2972 }
7ba19d41
AC
2973 if (nd & NETDEV_LOOPBACK) {
2974 iff |= IFF_LOOPBACK;
2975 }
8b61709d
BP
2976 return iff;
2977}
2978
2979static int
2980iff_to_nd_flags(int iff)
2981{
2982 enum netdev_flags nd = 0;
2983 if (iff & IFF_UP) {
2984 nd |= NETDEV_UP;
2985 }
2986 if (iff & IFF_PROMISC) {
2987 nd |= NETDEV_PROMISC;
2988 }
7ba19d41
AC
2989 if (iff & IFF_LOOPBACK) {
2990 nd |= NETDEV_LOOPBACK;
2991 }
8b61709d
BP
2992 return nd;
2993}
2994
2995static int
4f9f3f21
BP
2996update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2997 enum netdev_flags on, enum netdev_flags *old_flagsp)
2998 OVS_REQUIRES(netdev->mutex)
8b61709d
BP
2999{
3000 int old_flags, new_flags;
c37d4da4
EJ
3001 int error = 0;
3002
b5d57fc8 3003 old_flags = netdev->ifi_flags;
c37d4da4
EJ
3004 *old_flagsp = iff_to_nd_flags(old_flags);
3005 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
3006 if (new_flags != old_flags) {
4f9f3f21
BP
3007 error = set_flags(netdev_get_name(&netdev->up), new_flags);
3008 get_flags(&netdev->up, &netdev->ifi_flags);
8b61709d 3009 }
4f9f3f21
BP
3010
3011 return error;
3012}
3013
3014static int
3015netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
3016 enum netdev_flags on, enum netdev_flags *old_flagsp)
3017{
3018 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
756819dd 3019 int error = 0;
4f9f3f21
BP
3020
3021 ovs_mutex_lock(&netdev->mutex);
756819dd
FL
3022 if (on || off) {
3023 /* Changing flags over netlink isn't support yet. */
e0e2410d
FL
3024 if (netdev_linux_netnsid_is_remote(netdev)) {
3025 error = EOPNOTSUPP;
3026 goto exit;
3027 }
756819dd
FL
3028 error = update_flags(netdev, off, on, old_flagsp);
3029 } else {
3030 /* Try reading flags over netlink, or fall back to ioctl. */
3031 if (!netdev_linux_update_via_netlink(netdev)) {
3032 *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
3033 } else {
3034 error = update_flags(netdev, off, on, old_flagsp);
3035 }
3036 }
e0e2410d
FL
3037
3038exit:
86383816 3039 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
3040 return error;
3041}
3042
/* Expands to a positional initializer for a 'struct netdev_class'.
 *
 * NAME, CONSTRUCT, GET_STATS, GET_FEATURES, GET_STATUS and FLOW_OFFLOAD_API
 * are the only slots that vary between the classes built from this macro;
 * every other slot is shared.  The initializer is positional, so the order of
 * the entries below must not be changed. */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
                           GET_FEATURES, GET_STATUS,            \
                           FLOW_OFFLOAD_API)                    \
{                                                               \
    NAME,                                                       \
    false,                      /* is_pmd */                    \
                                                                \
    NULL,                                                       \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    netdev_linux_alloc,                                         \
    CONSTRUCT,                                                  \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
    NULL,                       /* build header */              \
    NULL,                       /* push header */               \
    NULL,                       /* pop header */                \
    NULL,                       /* get_numa_id */               \
    NULL,                       /* set_tx_multiq */             \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    NULL,                                                       \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
    NULL,                       /* get_pt_mode */               \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_addr_list,                                 \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
    NULL,                       /* reconfigure */               \
                                                                \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \
                                                                \
    FLOW_OFFLOAD_API                                            \
}
3119
3120const struct netdev_class netdev_linux_class =
3121 NETDEV_LINUX_CLASS(
3122 "system",
9dc63482 3123 netdev_linux_construct,
f613a0d7 3124 netdev_linux_get_stats,
51f87458 3125 netdev_linux_get_features,
18ebd48c
PB
3126 netdev_linux_get_status,
3127 LINUX_FLOW_OFFLOAD_API);
c3827f61
BP
3128
3129const struct netdev_class netdev_tap_class =
3130 NETDEV_LINUX_CLASS(
3131 "tap",
9dc63482 3132 netdev_linux_construct_tap,
bba1e6f3 3133 netdev_tap_get_stats,
51f87458 3134 netdev_linux_get_features,
18ebd48c
PB
3135 netdev_linux_get_status,
3136 NO_OFFLOAD_API);
c3827f61
BP
3137
3138const struct netdev_class netdev_internal_class =
3139 NETDEV_LINUX_CLASS(
3140 "internal",
9dc63482 3141 netdev_linux_construct,
bba1e6f3 3142 netdev_internal_get_stats,
51f87458 3143 NULL, /* get_features */
18ebd48c
PB
3144 netdev_internal_get_status,
3145 NO_OFFLOAD_API);
8b61709d 3146\f
677d9158
JV
3147
3148#define CODEL_N_QUEUES 0x0000
3149
2f4298ce
BP
3150/* In sufficiently new kernel headers these are defined as enums in
3151 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3152 * kernels. (This overrides any enum definition in the header file but that's
3153 * harmless.) */
3154#define TCA_CODEL_TARGET 1
3155#define TCA_CODEL_LIMIT 2
3156#define TCA_CODEL_INTERVAL 3
3157
677d9158
JV
3158struct codel {
3159 struct tc tc;
3160 uint32_t target;
3161 uint32_t limit;
3162 uint32_t interval;
3163};
3164
3165static struct codel *
3166codel_get__(const struct netdev *netdev_)
3167{
3168 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3169 return CONTAINER_OF(netdev->tc, struct codel, tc);
3170}
3171
3172static void
3173codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3174 uint32_t interval)
3175{
3176 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3177 struct codel *codel;
3178
3179 codel = xmalloc(sizeof *codel);
3180 tc_init(&codel->tc, &tc_ops_codel);
3181 codel->target = target;
3182 codel->limit = limit;
3183 codel->interval = interval;
3184
3185 netdev->tc = &codel->tc;
3186}
3187
3188static int
3189codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3190 uint32_t interval)
3191{
3192 size_t opt_offset;
3193 struct ofpbuf request;
3194 struct tcmsg *tcmsg;
3195 uint32_t otarget, olimit, ointerval;
3196 int error;
3197
3198 tc_del_qdisc(netdev);
3199
7874bdff
RD
3200 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3201 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3202 if (!tcmsg) {
3203 return ENODEV;
3204 }
3205 tcmsg->tcm_handle = tc_make_handle(1, 0);
3206 tcmsg->tcm_parent = TC_H_ROOT;
3207
3208 otarget = target ? target : 5000;
3209 olimit = limit ? limit : 10240;
3210 ointerval = interval ? interval : 100000;
3211
3212 nl_msg_put_string(&request, TCA_KIND, "codel");
3213 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3214 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
3215 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
3216 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
3217 nl_msg_end_nested(&request, opt_offset);
3218
3219 error = tc_transact(&request, NULL);
3220 if (error) {
3221 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3222 "target %u, limit %u, interval %u error %d(%s)",
3223 netdev_get_name(netdev),
3224 otarget, olimit, ointerval,
3225 error, ovs_strerror(error));
3226 }
3227 return error;
3228}
3229
3230static void
3231codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3232 const struct smap *details, struct codel *codel)
3233{
13c1637f
BP
3234 codel->target = smap_get_ullong(details, "target", 0);
3235 codel->limit = smap_get_ullong(details, "limit", 0);
3236 codel->interval = smap_get_ullong(details, "interval", 0);
677d9158
JV
3237
3238 if (!codel->target) {
3239 codel->target = 5000;
3240 }
3241 if (!codel->limit) {
3242 codel->limit = 10240;
3243 }
3244 if (!codel->interval) {
3245 codel->interval = 100000;
3246 }
3247}
3248
3249static int
3250codel_tc_install(struct netdev *netdev, const struct smap *details)
3251{
3252 int error;
3253 struct codel codel;
3254
3255 codel_parse_qdisc_details__(netdev, details, &codel);
3256 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3257 codel.interval);
3258 if (!error) {
3259 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3260 }
3261 return error;
3262}
3263
3264static int
3265codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3266{
3267 static const struct nl_policy tca_codel_policy[] = {
3268 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3269 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3270 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3271 };
3272
3273 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3274
3275 if (!nl_parse_nested(nl_options, tca_codel_policy,
3276 attrs, ARRAY_SIZE(tca_codel_policy))) {
3277 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3278 return EPROTO;
3279 }
3280
3281 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3282 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3283 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
3284 return 0;
3285}
3286
3287static int
3288codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3289{
3290 struct nlattr *nlattr;
3291 const char * kind;
3292 int error;
3293 struct codel codel;
3294
3295 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3296 if (error != 0) {
3297 return error;
3298 }
3299
3300 error = codel_parse_tca_options__(nlattr, &codel);
3301 if (error != 0) {
3302 return error;
3303 }
3304
3305 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3306 return 0;
3307}
3308
3309
3310static void
3311codel_tc_destroy(struct tc *tc)
3312{
3313 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3314 tc_destroy(tc);
3315 free(codel);
3316}
3317
3318static int
3319codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3320{
3321 const struct codel *codel = codel_get__(netdev);
3322 smap_add_format(details, "target", "%u", codel->target);
3323 smap_add_format(details, "limit", "%u", codel->limit);
3324 smap_add_format(details, "interval", "%u", codel->interval);
3325 return 0;
3326}
3327
3328static int
3329codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3330{
3331 struct codel codel;
3332
3333 codel_parse_qdisc_details__(netdev, details, &codel);
3334 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3335 codel_get__(netdev)->target = codel.target;
3336 codel_get__(netdev)->limit = codel.limit;
3337 codel_get__(netdev)->interval = codel.interval;
3338 return 0;
3339}
3340
3341static const struct tc_ops tc_ops_codel = {
3342 "codel", /* linux_name */
3343 "linux-codel", /* ovs_name */
3344 CODEL_N_QUEUES, /* n_queues */
3345 codel_tc_install,
3346 codel_tc_load,
3347 codel_tc_destroy,
3348 codel_qdisc_get,
3349 codel_qdisc_set,
3350 NULL,
3351 NULL,
3352 NULL,
3353 NULL,
3354 NULL
3355};
3356\f
3357/* FQ-CoDel traffic control class. */
3358
3359#define FQCODEL_N_QUEUES 0x0000
3360
2f4298ce
BP
3361/* In sufficiently new kernel headers these are defined as enums in
3362 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3363 * kernels. (This overrides any enum definition in the header file but that's
3364 * harmless.) */
3365#define TCA_FQ_CODEL_TARGET 1
3366#define TCA_FQ_CODEL_LIMIT 2
3367#define TCA_FQ_CODEL_INTERVAL 3
3368#define TCA_FQ_CODEL_ECN 4
3369#define TCA_FQ_CODEL_FLOWS 5
3370#define TCA_FQ_CODEL_QUANTUM 6
3371
677d9158
JV
3372struct fqcodel {
3373 struct tc tc;
3374 uint32_t target;
3375 uint32_t limit;
3376 uint32_t interval;
3377 uint32_t flows;
3378 uint32_t quantum;
3379};
3380
3381static struct fqcodel *
3382fqcodel_get__(const struct netdev *netdev_)
3383{
3384 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3385 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3386}
3387
3388static void
3389fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3390 uint32_t interval, uint32_t flows, uint32_t quantum)
3391{
3392 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3393 struct fqcodel *fqcodel;
3394
3395 fqcodel = xmalloc(sizeof *fqcodel);
3396 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3397 fqcodel->target = target;
3398 fqcodel->limit = limit;
3399 fqcodel->interval = interval;
3400 fqcodel->flows = flows;
3401 fqcodel->quantum = quantum;
3402
3403 netdev->tc = &fqcodel->tc;
3404}
3405
3406static int
3407fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3408 uint32_t interval, uint32_t flows, uint32_t quantum)
3409{
3410 size_t opt_offset;
3411 struct ofpbuf request;
3412 struct tcmsg *tcmsg;
3413 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3414 int error;
3415
3416 tc_del_qdisc(netdev);
3417
7874bdff
RD
3418 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3419 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3420 if (!tcmsg) {
3421 return ENODEV;
3422 }
3423 tcmsg->tcm_handle = tc_make_handle(1, 0);
3424 tcmsg->tcm_parent = TC_H_ROOT;
3425
3426 otarget = target ? target : 5000;
3427 olimit = limit ? limit : 10240;
3428 ointerval = interval ? interval : 100000;
3429 oflows = flows ? flows : 1024;
3430 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3431 not mtu */
3432
3433 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3434 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3435 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3436 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3437 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3438 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3439 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3440 nl_msg_end_nested(&request, opt_offset);
3441
3442 error = tc_transact(&request, NULL);
3443 if (error) {
3444 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3445 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3446 netdev_get_name(netdev),
3447 otarget, olimit, ointerval, oflows, oquantum,
3448 error, ovs_strerror(error));
3449 }
3450 return error;
3451}
3452
3453static void
3454fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3455 const struct smap *details, struct fqcodel *fqcodel)
3456{
13c1637f
BP
3457 fqcodel->target = smap_get_ullong(details, "target", 0);
3458 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3459 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3460 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3461 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3462
677d9158
JV
3463 if (!fqcodel->target) {
3464 fqcodel->target = 5000;
3465 }
3466 if (!fqcodel->limit) {
3467 fqcodel->limit = 10240;
3468 }
3469 if (!fqcodel->interval) {
3470 fqcodel->interval = 1000000;
3471 }
3472 if (!fqcodel->flows) {
3473 fqcodel->flows = 1024;
3474 }
3475 if (!fqcodel->quantum) {
3476 fqcodel->quantum = 1514;
3477 }
3478}
3479
3480static int
3481fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3482{
3483 int error;
3484 struct fqcodel fqcodel;
3485
3486 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3487 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3488 fqcodel.interval, fqcodel.flows,
3489 fqcodel.quantum);
3490 if (!error) {
3491 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3492 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3493 }
3494 return error;
3495}
3496
3497static int
3498fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3499{
3500 static const struct nl_policy tca_fqcodel_policy[] = {
3501 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3502 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3503 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3504 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3505 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3506 };
3507
3508 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3509
3510 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3511 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3512 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3513 return EPROTO;
3514 }
3515
3516 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3517 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3518 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3519 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3520 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3521 return 0;
3522}
3523
3524static int
3525fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3526{
3527 struct nlattr *nlattr;
3528 const char * kind;
3529 int error;
3530 struct fqcodel fqcodel;
3531
3532 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3533 if (error != 0) {
3534 return error;
3535 }
3536
3537 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3538 if (error != 0) {
3539 return error;
3540 }
3541
3542 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3543 fqcodel.flows, fqcodel.quantum);
3544 return 0;
3545}
3546
3547static void
3548fqcodel_tc_destroy(struct tc *tc)
3549{
3550 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3551 tc_destroy(tc);
3552 free(fqcodel);
3553}
3554
3555static int
3556fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3557{
3558 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3559 smap_add_format(details, "target", "%u", fqcodel->target);
3560 smap_add_format(details, "limit", "%u", fqcodel->limit);
3561 smap_add_format(details, "interval", "%u", fqcodel->interval);
3562 smap_add_format(details, "flows", "%u", fqcodel->flows);
3563 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3564 return 0;
3565}
3566
3567static int
3568fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3569{
3570 struct fqcodel fqcodel;
3571
3572 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3573 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3574 fqcodel.flows, fqcodel.quantum);
3575 fqcodel_get__(netdev)->target = fqcodel.target;
3576 fqcodel_get__(netdev)->limit = fqcodel.limit;
3577 fqcodel_get__(netdev)->interval = fqcodel.interval;
3578 fqcodel_get__(netdev)->flows = fqcodel.flows;
3579 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3580 return 0;
3581}
3582
3583static const struct tc_ops tc_ops_fqcodel = {
3584 "fq_codel", /* linux_name */
3585 "linux-fq_codel", /* ovs_name */
3586 FQCODEL_N_QUEUES, /* n_queues */
3587 fqcodel_tc_install,
3588 fqcodel_tc_load,
3589 fqcodel_tc_destroy,
3590 fqcodel_qdisc_get,
3591 fqcodel_qdisc_set,
3592 NULL,
3593 NULL,
3594 NULL,
3595 NULL,
3596 NULL
3597};
3598\f
3599/* SFQ traffic control class. */
3600
3601#define SFQ_N_QUEUES 0x0000
3602
3603struct sfq {
3604 struct tc tc;
3605 uint32_t quantum;
3606 uint32_t perturb;
3607};
3608
3609static struct sfq *
3610sfq_get__(const struct netdev *netdev_)
3611{
3612 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3613 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3614}
3615
3616static void
3617sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3618{
3619 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3620 struct sfq *sfq;
3621
3622 sfq = xmalloc(sizeof *sfq);
3623 tc_init(&sfq->tc, &tc_ops_sfq);
3624 sfq->perturb = perturb;
3625 sfq->quantum = quantum;
3626
3627 netdev->tc = &sfq->tc;
3628}
3629
3630static int
3631sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3632{
3633 struct tc_sfq_qopt opt;
3634 struct ofpbuf request;
3635 struct tcmsg *tcmsg;
3636 int mtu;
3637 int mtu_error, error;
3638 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3639
3640 tc_del_qdisc(netdev);
3641
7874bdff
RD
3642 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3643 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3644 if (!tcmsg) {
3645 return ENODEV;
3646 }
3647 tcmsg->tcm_handle = tc_make_handle(1, 0);
3648 tcmsg->tcm_parent = TC_H_ROOT;
3649
3650 memset(&opt, 0, sizeof opt);
3651 if (!quantum) {
3652 if (!mtu_error) {
3653 opt.quantum = mtu; /* if we cannot find mtu, use default */
3654 }
3655 } else {
3656 opt.quantum = quantum;
3657 }
3658
3659 if (!perturb) {
3660 opt.perturb_period = 10;
3661 } else {
3662 opt.perturb_period = perturb;
3663 }
3664
3665 nl_msg_put_string(&request, TCA_KIND, "sfq");
3666 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3667
3668 error = tc_transact(&request, NULL);
3669 if (error) {
3670 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3671 "quantum %u, perturb %u error %d(%s)",
3672 netdev_get_name(netdev),
3673 opt.quantum, opt.perturb_period,
3674 error, ovs_strerror(error));
3675 }
3676 return error;
3677}
3678
3679static void
3680sfq_parse_qdisc_details__(struct netdev *netdev,
3681 const struct smap *details, struct sfq *sfq)
3682{
13c1637f
BP
3683 sfq->perturb = smap_get_ullong(details, "perturb", 0);
3684 sfq->quantum = smap_get_ullong(details, "quantum", 0);
677d9158 3685
677d9158
JV
3686 if (!sfq->perturb) {
3687 sfq->perturb = 10;
3688 }
3689
3690 if (!sfq->quantum) {
13c1637f
BP
3691 int mtu;
3692 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
677d9158
JV
3693 sfq->quantum = mtu;
3694 } else {
3695 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3696 "device without mtu");
677d9158
JV
3697 }
3698 }
3699}
3700
3701static int
3702sfq_tc_install(struct netdev *netdev, const struct smap *details)
3703{
3704 int error;
3705 struct sfq sfq;
3706
3707 sfq_parse_qdisc_details__(netdev, details, &sfq);
3708 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3709 if (!error) {
3710 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3711 }
3712 return error;
3713}
3714
/* tc_load hook for SFQ: parses the qdisc message 'nlmsg', reads the kernel's
 * 'struct tc_sfq_qopt' from its options attribute and installs the matching
 * state on 'netdev'.  Returns 0 on success, otherwise the tc_parse_qdisc()
 * error, in which case nothing is installed.
 *
 * sfq_install__() takes the quantum first and the perturbation period second.
 * The two were previously passed the other way round, so a loaded qdisc
 * reported its quantum as "perturb" and vice versa. */
static int
sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
{
    const struct tc_sfq_qopt *sfq;
    struct nlattr *nlattr;
    const char * kind;
    int error;

    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
    if (error == 0) {
        sfq = nl_attr_get(nlattr);
        sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
        return 0;
    }

    return error;
}
3732
3733static void
3734sfq_tc_destroy(struct tc *tc)
3735{
3736 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3737 tc_destroy(tc);
3738 free(sfq);
3739}
3740
3741static int
3742sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3743{
3744 const struct sfq *sfq = sfq_get__(netdev);
3745 smap_add_format(details, "quantum", "%u", sfq->quantum);
3746 smap_add_format(details, "perturb", "%u", sfq->perturb);
3747 return 0;
3748}
3749
3750static int
3751sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3752{
3753 struct sfq sfq;
3754
3755 sfq_parse_qdisc_details__(netdev, details, &sfq);
3756 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3757 sfq_get__(netdev)->quantum = sfq.quantum;
3758 sfq_get__(netdev)->perturb = sfq.perturb;
3759 return 0;
3760}
3761
3762static const struct tc_ops tc_ops_sfq = {
3763 "sfq", /* linux_name */
3764 "linux-sfq", /* ovs_name */
3765 SFQ_N_QUEUES, /* n_queues */
3766 sfq_tc_install,
3767 sfq_tc_load,
3768 sfq_tc_destroy,
3769 sfq_qdisc_get,
3770 sfq_qdisc_set,
3771 NULL,
3772 NULL,
3773 NULL,
3774 NULL,
3775 NULL
3776};
3777\f
c1c9c9c4 3778/* HTB traffic control class. */
559843ed 3779
c1c9c9c4 3780#define HTB_N_QUEUES 0xf000
4f631ccd 3781#define HTB_RATE2QUANTUM 10
8b61709d 3782
c1c9c9c4
BP
3783struct htb {
3784 struct tc tc;
3785 unsigned int max_rate; /* In bytes/s. */
3786};
8b61709d 3787
/* Parameters of one HTB class.  'tc_queue' is embedded so that
 * CONTAINER_OF() can recover the struct htb_class from a struct tc_queue
 * pointer. */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
8b61709d 3795
c1c9c9c4 3796static struct htb *
b5d57fc8 3797htb_get__(const struct netdev *netdev_)
c1c9c9c4 3798{
b5d57fc8
BP
3799 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3800 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
3801}
3802
24045e35 3803static void
b5d57fc8 3804htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 3805{
b5d57fc8 3806 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3807 struct htb *htb;
3808
3809 htb = xmalloc(sizeof *htb);
3810 tc_init(&htb->tc, &tc_ops_htb);
3811 htb->max_rate = max_rate;
3812
b5d57fc8 3813 netdev->tc = &htb->tc;
c1c9c9c4
BP
3814}
3815
3816/* Create an HTB qdisc.
3817 *
a339aa81 3818 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
3819static int
3820htb_setup_qdisc__(struct netdev *netdev)
3821{
3822 size_t opt_offset;
3823 struct tc_htb_glob opt;
3824 struct ofpbuf request;
3825 struct tcmsg *tcmsg;
3826
3827 tc_del_qdisc(netdev);
3828
7874bdff
RD
3829 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3830 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
3831 if (!tcmsg) {
3832 return ENODEV;
3833 }
c1c9c9c4
BP
3834 tcmsg->tcm_handle = tc_make_handle(1, 0);
3835 tcmsg->tcm_parent = TC_H_ROOT;
3836
3837 nl_msg_put_string(&request, TCA_KIND, "htb");
3838
3839 memset(&opt, 0, sizeof opt);
4f631ccd 3840 opt.rate2quantum = HTB_RATE2QUANTUM;
c1c9c9c4 3841 opt.version = 3;
4ecf12d5 3842 opt.defcls = 1;
c1c9c9c4
BP
3843
3844 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3845 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3846 nl_msg_end_nested(&request, opt_offset);
3847
3848 return tc_transact(&request, NULL);
3849}
3850
3851/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3852 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3853static int
3854htb_setup_class__(struct netdev *netdev, unsigned int handle,
3855 unsigned int parent, struct htb_class *class)
3856{
3857 size_t opt_offset;
3858 struct tc_htb_opt opt;
3859 struct ofpbuf request;
3860 struct tcmsg *tcmsg;
3861 int error;
3862 int mtu;
3863
73371c09 3864 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3865 if (error) {
f915f1a8
BP
3866 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3867 netdev_get_name(netdev));
9b020780 3868 return error;
f915f1a8 3869 }
c1c9c9c4
BP
3870
3871 memset(&opt, 0, sizeof opt);
3872 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3873 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
4f631ccd
AW
3874 /* Makes sure the quantum is at least MTU. Setting quantum will
3875 * make htb ignore the r2q for this class. */
3876 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3877 opt.quantum = mtu;
3878 }
c1c9c9c4
BP
3879 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3880 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3881 opt.prio = class->priority;
3882
7874bdff
RD
3883 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
3884 &request);
23a98ffe
BP
3885 if (!tcmsg) {
3886 return ENODEV;
3887 }
c1c9c9c4
BP
3888 tcmsg->tcm_handle = handle;
3889 tcmsg->tcm_parent = parent;
3890
3891 nl_msg_put_string(&request, TCA_KIND, "htb");
3892 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3893 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3894 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3895 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3896 nl_msg_end_nested(&request, opt_offset);
3897
3898 error = tc_transact(&request, NULL);
3899 if (error) {
3900 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3901 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3902 netdev_get_name(netdev),
3903 tc_get_major(handle), tc_get_minor(handle),
3904 tc_get_major(parent), tc_get_minor(parent),
3905 class->min_rate, class->max_rate,
10a89ef0 3906 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
3907 }
3908 return error;
3909}
3910
3911/* Parses Netlink attributes in 'options' for HTB parameters and stores a
3912 * description of them into 'details'. The description complies with the
3913 * specification given in the vswitch database documentation for linux-htb
3914 * queue details. */
3915static int
3916htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3917{
3918 static const struct nl_policy tca_htb_policy[] = {
3919 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3920 .min_len = sizeof(struct tc_htb_opt) },
3921 };
3922
3923 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3924 const struct tc_htb_opt *htb;
3925
3926 if (!nl_parse_nested(nl_options, tca_htb_policy,
3927 attrs, ARRAY_SIZE(tca_htb_policy))) {
3928 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3929 return EPROTO;
3930 }
3931
3932 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3933 class->min_rate = htb->rate.rate;
3934 class->max_rate = htb->ceil.rate;
3935 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3936 class->priority = htb->prio;
3937 return 0;
3938}
3939
3940static int
3941htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3942 struct htb_class *options,
3943 struct netdev_queue_stats *stats)
3944{
3945 struct nlattr *nl_options;
3946 unsigned int handle;
3947 int error;
3948
3949 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3950 if (!error && queue_id) {
17ee3c1f
BP
3951 unsigned int major = tc_get_major(handle);
3952 unsigned int minor = tc_get_minor(handle);
3953 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3954 *queue_id = minor - 1;
c1c9c9c4
BP
3955 } else {
3956 error = EPROTO;
3957 }
3958 }
3959 if (!error && options) {
3960 error = htb_parse_tca_options__(nl_options, options);
3961 }
3962 return error;
3963}
3964
3965static void
73371c09 3966htb_parse_qdisc_details__(struct netdev *netdev_,
79f1cbe9 3967 const struct smap *details, struct htb_class *hc)
c1c9c9c4 3968{
73371c09 3969 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4 3970
13c1637f 3971 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
c1c9c9c4 3972 if (!hc->max_rate) {
a00ca915 3973 enum netdev_features current;
c1c9c9c4 3974
73371c09
BP
3975 netdev_linux_read_features(netdev);
3976 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 3977 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
3978 }
3979 hc->min_rate = hc->max_rate;
3980 hc->burst = 0;
3981 hc->priority = 0;
3982}
3983
3984static int
3985htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 3986 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
3987{
3988 const struct htb *htb = htb_get__(netdev);
9b020780 3989 int mtu, error;
214117fd 3990 unsigned long long int max_rate_bit;
c1c9c9c4 3991
73371c09 3992 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3993 if (error) {
f915f1a8
BP
3994 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3995 netdev_get_name(netdev));
9b020780 3996 return error;
f915f1a8
BP
3997 }
3998
4f104611
EJ
3999 /* HTB requires at least an mtu sized min-rate to send any traffic even
4000 * on uncongested links. */
13c1637f 4001 hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4f104611 4002 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
4003 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
4004
4005 /* max-rate */
214117fd
KF
4006 max_rate_bit = smap_get_ullong(details, "max-rate", 0);
4007 hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
c1c9c9c4
BP
4008 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
4009 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
4010
4011 /* burst
4012 *
4013 * According to hints in the documentation that I've read, it is important
4014 * that 'burst' be at least as big as the largest frame that might be
4015 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
4016 * but having it a bit too small is a problem. Since netdev_get_mtu()
4017 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
4018 * the MTU. We actually add 64, instead of 14, as a guard against
4019 * additional headers get tacked on somewhere that we're not aware of. */
13c1637f 4020 hc->burst = smap_get_ullong(details, "burst", 0) / 8;
c1c9c9c4
BP
4021 hc->burst = MAX(hc->burst, mtu + 64);
4022
4023 /* priority */
13c1637f 4024 hc->priority = smap_get_ullong(details, "priority", 0);
c1c9c9c4
BP
4025
4026 return 0;
4027}
4028
4029static int
4030htb_query_class__(const struct netdev *netdev, unsigned int handle,
4031 unsigned int parent, struct htb_class *options,
4032 struct netdev_queue_stats *stats)
4033{
4034 struct ofpbuf *reply;
4035 int error;
4036
4037 error = tc_query_class(netdev, handle, parent, &reply);
4038 if (!error) {
4039 error = htb_parse_tcmsg__(reply, NULL, options, stats);
4040 ofpbuf_delete(reply);
4041 }
4042 return error;
4043}
4044
4045static int
79f1cbe9 4046htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
4047{
4048 int error;
4049
4050 error = htb_setup_qdisc__(netdev);
4051 if (!error) {
4052 struct htb_class hc;
4053
4054 htb_parse_qdisc_details__(netdev, details, &hc);
4055 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4056 tc_make_handle(1, 0), &hc);
4057 if (!error) {
4058 htb_install__(netdev, hc.max_rate);
4059 }
4060 }
4061 return error;
4062}
4063
93b13be8
BP
4064static struct htb_class *
4065htb_class_cast__(const struct tc_queue *queue)
4066{
4067 return CONTAINER_OF(queue, struct htb_class, tc_queue);
4068}
4069
c1c9c9c4
BP
4070static void
4071htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
4072 const struct htb_class *hc)
4073{
4074 struct htb *htb = htb_get__(netdev);
93b13be8
BP
4075 size_t hash = hash_int(queue_id, 0);
4076 struct tc_queue *queue;
c1c9c9c4
BP
4077 struct htb_class *hcp;
4078
93b13be8
BP
4079 queue = tc_find_queue__(netdev, queue_id, hash);
4080 if (queue) {
4081 hcp = htb_class_cast__(queue);
4082 } else {
c1c9c9c4 4083 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
4084 queue = &hcp->tc_queue;
4085 queue->queue_id = queue_id;
6dc34a0d 4086 queue->created = time_msec();
93b13be8 4087 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 4088 }
93b13be8
BP
4089
4090 hcp->min_rate = hc->min_rate;
4091 hcp->max_rate = hc->max_rate;
4092 hcp->burst = hc->burst;
4093 hcp->priority = hc->priority;
c1c9c9c4
BP
4094}
4095
4096static int
4097htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4098{
c1c9c9c4 4099 struct ofpbuf msg;
d57695d7 4100 struct queue_dump_state state;
c1c9c9c4 4101 struct htb_class hc;
c1c9c9c4
BP
4102
4103 /* Get qdisc options. */
4104 hc.max_rate = 0;
4105 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4106 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
4107
4108 /* Get queues. */
d57695d7 4109 if (!start_queue_dump(netdev, &state)) {
23a98ffe
BP
4110 return ENODEV;
4111 }
d57695d7 4112 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
c1c9c9c4
BP
4113 unsigned int queue_id;
4114
4115 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4116 htb_update_queue__(netdev, queue_id, &hc);
4117 }
4118 }
d57695d7 4119 finish_queue_dump(&state);
c1c9c9c4
BP
4120
4121 return 0;
4122}
4123
4124static void
4125htb_tc_destroy(struct tc *tc)
4126{
4127 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4ec3d7c7 4128 struct htb_class *hc;
c1c9c9c4 4129
4ec3d7c7 4130 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
c1c9c9c4
BP
4131 free(hc);
4132 }
4133 tc_destroy(tc);
4134 free(htb);
4135}
4136
4137static int
79f1cbe9 4138htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
4139{
4140 const struct htb *htb = htb_get__(netdev);
79f1cbe9 4141 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
4142 return 0;
4143}
4144
4145static int
79f1cbe9 4146htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
4147{
4148 struct htb_class hc;
4149 int error;
4150
4151 htb_parse_qdisc_details__(netdev, details, &hc);
4152 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4153 tc_make_handle(1, 0), &hc);
4154 if (!error) {
4155 htb_get__(netdev)->max_rate = hc.max_rate;
4156 }
4157 return error;
4158}
4159
4160static int
93b13be8 4161htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4162 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 4163{
93b13be8 4164 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 4165
79f1cbe9 4166 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 4167 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4168 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 4169 }
79f1cbe9 4170 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 4171 if (hc->priority) {
79f1cbe9 4172 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
4173 }
4174 return 0;
4175}
4176
4177static int
4178htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4179 const struct smap *details)
c1c9c9c4
BP
4180{
4181 struct htb_class hc;
4182 int error;
4183
4184 error = htb_parse_class_details__(netdev, details, &hc);
4185 if (error) {
4186 return error;
4187 }
4188
17ee3c1f 4189 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
4190 tc_make_handle(1, 0xfffe), &hc);
4191 if (error) {
4192 return error;
4193 }
4194
4195 htb_update_queue__(netdev, queue_id, &hc);
4196 return 0;
4197}
4198
4199static int
93b13be8 4200htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 4201{
93b13be8 4202 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 4203 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
4204 int error;
4205
93b13be8 4206 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 4207 if (!error) {
93b13be8 4208 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 4209 free(hc);
c1c9c9c4
BP
4210 }
4211 return error;
4212}
4213
4214static int
93b13be8 4215htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
4216 struct netdev_queue_stats *stats)
4217{
93b13be8 4218 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
4219 tc_make_handle(1, 0xfffe), NULL, stats);
4220}
4221
4222static int
4223htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4224 const struct ofpbuf *nlmsg,
4225 netdev_dump_queue_stats_cb *cb, void *aux)
4226{
4227 struct netdev_queue_stats stats;
17ee3c1f 4228 unsigned int handle, major, minor;
c1c9c9c4
BP
4229 int error;
4230
4231 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4232 if (error) {
4233 return error;
4234 }
4235
17ee3c1f
BP
4236 major = tc_get_major(handle);
4237 minor = tc_get_minor(handle);
4238 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 4239 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
4240 }
4241 return 0;
4242}
4243
/* Operations table for the "linux-htb" traffic control class. */
static const struct tc_ops tc_ops_htb = {
    "htb",                      /* linux_name */
    "linux-htb",                /* ovs_name */
    HTB_N_QUEUES,               /* n_queues */
    htb_tc_install,             /* tc_install */
    htb_tc_load,                /* tc_load */
    htb_tc_destroy,             /* tc_destroy */
    htb_qdisc_get,              /* qdisc_get */
    htb_qdisc_set,              /* qdisc_set */
    htb_class_get,              /* class_get */
    htb_class_set,              /* class_set */
    htb_class_delete,           /* class_delete */
    htb_class_get_stats,        /* class_get_stats */
    htb_class_dump_stats        /* class_dump_stats */
};
4259\f
a339aa81
EJ
4260/* "linux-hfsc" traffic control class. */
4261
4262#define HFSC_N_QUEUES 0xf000
4263
/* State kept for a "linux-hfsc" qdisc.  'tc' is embedded so that
 * CONTAINER_OF() can recover the struct hfsc from a struct tc pointer. */
struct hfsc {
    struct tc tc;
    uint32_t max_rate;          /* Presumably bytes/s, as for HTB; confirm. */
};
4268
/* Parameters of one HFSC class.  'tc_queue' is embedded so that
 * CONTAINER_OF() can recover the struct hfsc_class from a struct tc_queue
 * pointer.  The rates are written to and read from the 'm2' member of the
 * kernel's service curves without conversion. */
struct hfsc_class {
    struct tc_queue tc_queue;
    uint32_t min_rate;
    uint32_t max_rate;
};
4274
4275static struct hfsc *
b5d57fc8 4276hfsc_get__(const struct netdev *netdev_)
a339aa81 4277{
b5d57fc8
BP
4278 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4279 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
4280}
4281
4282static struct hfsc_class *
4283hfsc_class_cast__(const struct tc_queue *queue)
4284{
4285 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4286}
4287
24045e35 4288static void
b5d57fc8 4289hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 4290{
b5d57fc8 4291 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4292 struct hfsc *hfsc;
4293
a339aa81
EJ
4294 hfsc = xmalloc(sizeof *hfsc);
4295 tc_init(&hfsc->tc, &tc_ops_hfsc);
4296 hfsc->max_rate = max_rate;
b5d57fc8 4297 netdev->tc = &hfsc->tc;
a339aa81
EJ
4298}
4299
4300static void
4301hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4302 const struct hfsc_class *hc)
4303{
4304 size_t hash;
4305 struct hfsc *hfsc;
4306 struct hfsc_class *hcp;
4307 struct tc_queue *queue;
4308
4309 hfsc = hfsc_get__(netdev);
4310 hash = hash_int(queue_id, 0);
4311
4312 queue = tc_find_queue__(netdev, queue_id, hash);
4313 if (queue) {
4314 hcp = hfsc_class_cast__(queue);
4315 } else {
4316 hcp = xmalloc(sizeof *hcp);
4317 queue = &hcp->tc_queue;
4318 queue->queue_id = queue_id;
6dc34a0d 4319 queue->created = time_msec();
a339aa81
EJ
4320 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4321 }
4322
4323 hcp->min_rate = hc->min_rate;
4324 hcp->max_rate = hc->max_rate;
4325}
4326
4327static int
4328hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4329{
4330 const struct tc_service_curve *rsc, *fsc, *usc;
4331 static const struct nl_policy tca_hfsc_policy[] = {
4332 [TCA_HFSC_RSC] = {
4333 .type = NL_A_UNSPEC,
4334 .optional = false,
4335 .min_len = sizeof(struct tc_service_curve),
4336 },
4337 [TCA_HFSC_FSC] = {
4338 .type = NL_A_UNSPEC,
4339 .optional = false,
4340 .min_len = sizeof(struct tc_service_curve),
4341 },
4342 [TCA_HFSC_USC] = {
4343 .type = NL_A_UNSPEC,
4344 .optional = false,
4345 .min_len = sizeof(struct tc_service_curve),
4346 },
4347 };
4348 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4349
4350 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4351 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4352 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4353 return EPROTO;
4354 }
4355
4356 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4357 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4358 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4359
4360 if (rsc->m1 != 0 || rsc->d != 0 ||
4361 fsc->m1 != 0 || fsc->d != 0 ||
4362 usc->m1 != 0 || usc->d != 0) {
4363 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4364 "Non-linear service curves are not supported.");
4365 return EPROTO;
4366 }
4367
4368 if (rsc->m2 != fsc->m2) {
4369 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4370 "Real-time service curves are not supported ");
4371 return EPROTO;
4372 }
4373
4374 if (rsc->m2 > usc->m2) {
4375 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4376 "Min-rate service curve is greater than "
4377 "the max-rate service curve.");
4378 return EPROTO;
4379 }
4380
4381 class->min_rate = fsc->m2;
4382 class->max_rate = usc->m2;
4383 return 0;
4384}
4385
4386static int
4387hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4388 struct hfsc_class *options,
4389 struct netdev_queue_stats *stats)
4390{
4391 int error;
4392 unsigned int handle;
4393 struct nlattr *nl_options;
4394
4395 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4396 if (error) {
4397 return error;
4398 }
4399
4400 if (queue_id) {
4401 unsigned int major, minor;
4402
4403 major = tc_get_major(handle);
4404 minor = tc_get_minor(handle);
4405 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4406 *queue_id = minor - 1;
4407 } else {
4408 return EPROTO;
4409 }
4410 }
4411
4412 if (options) {
4413 error = hfsc_parse_tca_options__(nl_options, options);
4414 }
4415
4416 return error;
4417}
4418
4419static int
4420hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4421 unsigned int parent, struct hfsc_class *options,
4422 struct netdev_queue_stats *stats)
4423{
4424 int error;
4425 struct ofpbuf *reply;
4426
4427 error = tc_query_class(netdev, handle, parent, &reply);
4428 if (error) {
4429 return error;
4430 }
4431
4432 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4433 ofpbuf_delete(reply);
4434 return error;
4435}
4436
4437static void
73371c09 4438hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
a339aa81
EJ
4439 struct hfsc_class *class)
4440{
73371c09 4441 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81 4442
13c1637f 4443 uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
a339aa81 4444 if (!max_rate) {
a00ca915 4445 enum netdev_features current;
a339aa81 4446
73371c09
BP
4447 netdev_linux_read_features(netdev);
4448 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 4449 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
4450 }
4451
4452 class->min_rate = max_rate;
4453 class->max_rate = max_rate;
4454}
4455
4456static int
4457hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 4458 const struct smap *details,
a339aa81
EJ
4459 struct hfsc_class * class)
4460{
4461 const struct hfsc *hfsc;
4462 uint32_t min_rate, max_rate;
a339aa81
EJ
4463
4464 hfsc = hfsc_get__(netdev);
a339aa81 4465
13c1637f 4466 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
79398bad 4467 min_rate = MAX(min_rate, 1);
a339aa81
EJ
4468 min_rate = MIN(min_rate, hfsc->max_rate);
4469
13c1637f 4470 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
a339aa81
EJ
4471 max_rate = MAX(max_rate, min_rate);
4472 max_rate = MIN(max_rate, hfsc->max_rate);
4473
4474 class->min_rate = min_rate;
4475 class->max_rate = max_rate;
4476
4477 return 0;
4478}
4479
4480/* Create an HFSC qdisc.
4481 *
4482 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4483static int
4484hfsc_setup_qdisc__(struct netdev * netdev)
4485{
4486 struct tcmsg *tcmsg;
4487 struct ofpbuf request;
4488 struct tc_hfsc_qopt opt;
4489
4490 tc_del_qdisc(netdev);
4491
7874bdff
RD
4492 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4493 NLM_F_EXCL | NLM_F_CREATE, &request);
a339aa81
EJ
4494
4495 if (!tcmsg) {
4496 return ENODEV;
4497 }
4498
4499 tcmsg->tcm_handle = tc_make_handle(1, 0);
4500 tcmsg->tcm_parent = TC_H_ROOT;
4501
4502 memset(&opt, 0, sizeof opt);
4503 opt.defcls = 1;
4504
4505 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4506 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4507
4508 return tc_transact(&request, NULL);
4509}
4510
4511/* Create an HFSC class.
4512 *
4513 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4514 * sc rate <min_rate> ul rate <max_rate>" */
4515static int
4516hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4517 unsigned int parent, struct hfsc_class *class)
4518{
4519 int error;
4520 size_t opt_offset;
4521 struct tcmsg *tcmsg;
4522 struct ofpbuf request;
4523 struct tc_service_curve min, max;
4524
7874bdff
RD
4525 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
4526 &request);
a339aa81
EJ
4527
4528 if (!tcmsg) {
4529 return ENODEV;
4530 }
4531
4532 tcmsg->tcm_handle = handle;
4533 tcmsg->tcm_parent = parent;
4534
4535 min.m1 = 0;
4536 min.d = 0;
4537 min.m2 = class->min_rate;
4538
4539 max.m1 = 0;
4540 max.d = 0;
4541 max.m2 = class->max_rate;
4542
4543 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4544 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4545 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4546 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4547 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4548 nl_msg_end_nested(&request, opt_offset);
4549
4550 error = tc_transact(&request, NULL);
4551 if (error) {
4552 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4553 "min-rate %ubps, max-rate %ubps (%s)",
4554 netdev_get_name(netdev),
4555 tc_get_major(handle), tc_get_minor(handle),
4556 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 4557 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
4558 }
4559
4560 return error;
4561}
4562
4563static int
79f1cbe9 4564hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4565{
4566 int error;
4567 struct hfsc_class class;
4568
4569 error = hfsc_setup_qdisc__(netdev);
4570
4571 if (error) {
4572 return error;
4573 }
4574
4575 hfsc_parse_qdisc_details__(netdev, details, &class);
4576 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4577 tc_make_handle(1, 0), &class);
4578
4579 if (error) {
4580 return error;
4581 }
4582
4583 hfsc_install__(netdev, class.max_rate);
4584 return 0;
4585}
4586
4587static int
4588hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4589{
4590 struct ofpbuf msg;
d57695d7 4591 struct queue_dump_state state;
a339aa81
EJ
4592 struct hfsc_class hc;
4593
4594 hc.max_rate = 0;
4595 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4596 hfsc_install__(netdev, hc.max_rate);
a339aa81 4597
d57695d7 4598 if (!start_queue_dump(netdev, &state)) {
a339aa81
EJ
4599 return ENODEV;
4600 }
4601
d57695d7 4602 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
a339aa81
EJ
4603 unsigned int queue_id;
4604
4605 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4606 hfsc_update_queue__(netdev, queue_id, &hc);
4607 }
4608 }
4609
d57695d7 4610 finish_queue_dump(&state);
a339aa81
EJ
4611 return 0;
4612}
4613
4614static void
4615hfsc_tc_destroy(struct tc *tc)
4616{
4617 struct hfsc *hfsc;
4618 struct hfsc_class *hc, *next;
4619
4620 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4621
4622 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4623 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4624 free(hc);
4625 }
4626
4627 tc_destroy(tc);
4628 free(hfsc);
4629}
4630
4631static int
79f1cbe9 4632hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
4633{
4634 const struct hfsc *hfsc;
4635 hfsc = hfsc_get__(netdev);
79f1cbe9 4636 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
4637 return 0;
4638}
4639
4640static int
79f1cbe9 4641hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4642{
4643 int error;
4644 struct hfsc_class class;
4645
4646 hfsc_parse_qdisc_details__(netdev, details, &class);
4647 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4648 tc_make_handle(1, 0), &class);
4649
4650 if (!error) {
4651 hfsc_get__(netdev)->max_rate = class.max_rate;
4652 }
4653
4654 return error;
4655}
4656
4657static int
4658hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4659 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
4660{
4661 const struct hfsc_class *hc;
4662
4663 hc = hfsc_class_cast__(queue);
79f1cbe9 4664 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 4665 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4666 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
4667 }
4668 return 0;
4669}
4670
4671static int
4672hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4673 const struct smap *details)
a339aa81
EJ
4674{
4675 int error;
4676 struct hfsc_class class;
4677
4678 error = hfsc_parse_class_details__(netdev, details, &class);
4679 if (error) {
4680 return error;
4681 }
4682
4683 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4684 tc_make_handle(1, 0xfffe), &class);
4685 if (error) {
4686 return error;
4687 }
4688
4689 hfsc_update_queue__(netdev, queue_id, &class);
4690 return 0;
4691}
4692
4693static int
4694hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4695{
4696 int error;
4697 struct hfsc *hfsc;
4698 struct hfsc_class *hc;
4699
4700 hc = hfsc_class_cast__(queue);
4701 hfsc = hfsc_get__(netdev);
4702
4703 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4704 if (!error) {
4705 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4706 free(hc);
4707 }
4708 return error;
4709}
4710
4711static int
4712hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4713 struct netdev_queue_stats *stats)
4714{
4715 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4716 tc_make_handle(1, 0xfffe), NULL, stats);
4717}
4718
4719static int
4720hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4721 const struct ofpbuf *nlmsg,
4722 netdev_dump_queue_stats_cb *cb, void *aux)
4723{
4724 struct netdev_queue_stats stats;
4725 unsigned int handle, major, minor;
4726 int error;
4727
4728 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4729 if (error) {
4730 return error;
4731 }
4732
4733 major = tc_get_major(handle);
4734 minor = tc_get_minor(handle);
4735 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4736 (*cb)(minor - 1, &stats, aux);
4737 }
4738 return 0;
4739}
4740
/* Operations table for the "linux-hfsc" traffic control class. */
static const struct tc_ops tc_ops_hfsc = {
    "hfsc",                     /* linux_name */
    "linux-hfsc",               /* ovs_name */
    HFSC_N_QUEUES,              /* n_queues */
    hfsc_tc_install,            /* tc_install */
    hfsc_tc_load,               /* tc_load */
    hfsc_tc_destroy,            /* tc_destroy */
    hfsc_qdisc_get,             /* qdisc_get */
    hfsc_qdisc_set,             /* qdisc_set */
    hfsc_class_get,             /* class_get */
    hfsc_class_set,             /* class_set */
    hfsc_class_delete,          /* class_delete */
    hfsc_class_get_stats,       /* class_get_stats */
    hfsc_class_dump_stats       /* class_dump_stats */
};
4756\f
6cf888b8
BS
4757/* "linux-noop" traffic control class. */
4758
/* Points 'netdev_''s tc at a single statically allocated, const struct tc
 * shared by every caller.
 *
 * NOTE(review): the static tc is initialized with &tc_ops_default rather than
 * &tc_ops_noop, so a netdev configured through here reports the default ops
 * table.  Confirm whether that is intentional. */
static void
noop_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    netdev->tc = CONST_CAST(struct tc *, &tc);
}
/* Installs the "linux-noop" tc on 'netdev' by calling noop_install__().
 * 'details' is ignored.  Always returns 0. */
static int
noop_tc_install(struct netdev *netdev,
                const struct smap *details OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
4775
/* Loads the "linux-noop" tc on 'netdev' by calling noop_install__().
 * 'nlmsg' is ignored.  Always returns 0. */
static int
noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
4782
/* Operations table for the "linux-noop" traffic control class.  Only the
 * install and load callbacks are provided. */
static const struct tc_ops tc_ops_noop = {
    NULL,                       /* linux_name */
    "linux-noop",               /* ovs_name */
    0,                          /* n_queues */
    noop_tc_install,            /* tc_install */
    noop_tc_load,               /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4798\f
c1c9c9c4
BP
4799/* "linux-default" traffic control class.
4800 *
4801 * This class represents the default, unnamed Linux qdisc. It corresponds to
4802 * the "" (empty string) QoS type in the OVS database. */
4803
/* Points 'netdev_''s tc at a single statically allocated, const struct tc
 * that uses the "linux-default" operations table. */
static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
4814
/* Installs the "linux-default" tc on 'netdev' by calling default_install__().
 * 'details' is ignored.  Always returns 0. */
static int
default_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
4822
/* Loads the "linux-default" tc on 'netdev' by calling default_install__().
 * 'nlmsg' is ignored.  Always returns 0. */
static int
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
4829
/* Operations table for the "linux-default" traffic control class, whose OVS
 * name is the empty string.  Only the install and load callbacks are
 * provided. */
static const struct tc_ops tc_ops_default = {
    NULL,                       /* linux_name */
    "",                         /* ovs_name */
    0,                          /* n_queues */
    default_tc_install,         /* tc_install */
    default_tc_load,            /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4845\f
4846/* "linux-other" traffic control class.
4847 *
4848 * */
4849
4850static int
b5d57fc8 4851other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 4852{
b5d57fc8 4853 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4854 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 4855
559eb230
BP
4856 /* Nothing but a tc class implementation is allowed to write to a tc. This
4857 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4858 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4859 return 0;
4860}
4861
4862static const struct tc_ops tc_ops_other = {
4863 NULL, /* linux_name */
4864 "linux-other", /* ovs_name */
4865 0, /* n_queues */
4866 NULL, /* tc_install */
4867 other_tc_load,
4868 NULL, /* tc_destroy */
4869 NULL, /* qdisc_get */
4870 NULL, /* qdisc_set */
4871 NULL, /* class_get */
4872 NULL, /* class_set */
4873 NULL, /* class_delete */
4874 NULL, /* class_get_stats */
4875 NULL /* class_dump_stats */
4876};
4877\f
4878/* Traffic control. */
4879
/* Number of kernel "tc" ticks per second.  Set by read_psched(). */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes: generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  In that case we really do need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there is no need to fudge additional
 *      room for buffers.  (No extra effort is needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number
 *      comes out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
4901
7874bdff
RD
4902static struct tcmsg *
4903netdev_linux_tc_make_request(const struct netdev *netdev, int type,
4904 unsigned int flags, struct ofpbuf *request)
4905{
4906 int ifindex;
4907 int error;
4908
4909 error = get_ifindex(netdev, &ifindex);
4910 if (error) {
4911 return NULL;
4912 }
4913
4914 return tc_make_request(ifindex, type, flags, request);
4915}
4916
f8500004
JP
4917/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4918 * of 'kbits_burst'.
4919 *
4920 * This function is equivalent to running:
4921 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4922 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4923 * mtu 65535 drop
4924 *
4925 * The configuration and stats may be seen with the following command:
c7952afb 4926 * /sbin/tc -s filter show dev <devname> parent ffff:
f8500004
JP
4927 *
4928 * Returns 0 if successful, otherwise a positive errno value.
4929 */
4930static int
c7952afb
BP
4931tc_add_policer(struct netdev *netdev,
4932 uint32_t kbits_rate, uint32_t kbits_burst)
f8500004
JP
4933{
4934 struct tc_police tc_police;
4935 struct ofpbuf request;
4936 struct tcmsg *tcmsg;
4937 size_t basic_offset;
4938 size_t police_offset;
4939 int error;
4940 int mtu = 65535;
4941
4942 memset(&tc_police, 0, sizeof tc_police);
4943 tc_police.action = TC_POLICE_SHOT;
4944 tc_police.mtu = mtu;
1aca400c 4945 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
c7952afb 4946
79abacc8
MAA
4947 /* The following appears wrong in one way: In networking a kilobit is
4948 * usually 1000 bits but this uses 1024 bits.
c7952afb
BP
4949 *
4950 * However if you "fix" those problems then "tc filter show ..." shows
4951 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4952 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4953 * tc's point of view. Whatever. */
4954 tc_police.burst = tc_bytes_to_ticks(
79abacc8 4955 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
f8500004 4956
7874bdff
RD
4957 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
4958 NLM_F_EXCL | NLM_F_CREATE, &request);
f8500004
JP
4959 if (!tcmsg) {
4960 return ENODEV;
4961 }
4962 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4963 tcmsg->tcm_info = tc_make_handle(49,
4964 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4965
4966 nl_msg_put_string(&request, TCA_KIND, "basic");
4967 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4968 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4969 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4970 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4971 nl_msg_end_nested(&request, police_offset);
4972 nl_msg_end_nested(&request, basic_offset);
4973
4974 error = tc_transact(&request, NULL);
4975 if (error) {
4976 return error;
4977 }
4978
4979 return 0;
4980}
4981
c1c9c9c4
BP
4982static void
4983read_psched(void)
4984{
4985 /* The values in psched are not individually very meaningful, but they are
4986 * important. The tables below show some values seen in the wild.
4987 *
4988 * Some notes:
4989 *
4990 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4991 * (Before that, there are hints that it was 1000000000.)
4992 *
4993 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4994 * above.
4995 *
4996 * /proc/net/psched
4997 * -----------------------------------
4998 * [1] 000c8000 000f4240 000f4240 00000064
4999 * [2] 000003e8 00000400 000f4240 3b9aca00
5000 * [3] 000003e8 00000400 000f4240 3b9aca00
5001 * [4] 000003e8 00000400 000f4240 00000064
5002 * [5] 000003e8 00000040 000f4240 3b9aca00
5003 * [6] 000003e8 00000040 000f4240 000000f9
5004 *
5005 * a b c d ticks_per_s buffer_hz
5006 * ------- --------- ---------- ------------- ----------- -------------
5007 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
5008 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5009 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5010 * [4] 1,000 1,024 1,000,000 100 976,562 100
5011 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
5012 * [6] 1,000 64 1,000,000 249 15,625,000 249
5013 *
5014 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
5015 * [2] 2.6.26-1-686-bigmem from Debian lenny
5016 * [3] 2.6.26-2-sparc64 from Debian lenny
5017 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
5018 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
5019 * [6] 2.6.34 from kernel.org on KVM
5020 */
23882115 5021 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
5022 static const char fn[] = "/proc/net/psched";
5023 unsigned int a, b, c, d;
5024 FILE *stream;
5025
23882115
BP
5026 if (!ovsthread_once_start(&once)) {
5027 return;
5028 }
5029
c1c9c9c4
BP
5030 ticks_per_s = 1.0;
5031 buffer_hz = 100;
5032
5033 stream = fopen(fn, "r");
5034 if (!stream) {
10a89ef0 5035 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 5036 goto exit;
c1c9c9c4
BP
5037 }
5038
5039 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
5040 VLOG_WARN("%s: read failed", fn);
5041 fclose(stream);
23882115 5042 goto exit;
c1c9c9c4
BP
5043 }
5044 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
5045 fclose(stream);
5046
5047 if (!a || !c) {
5048 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 5049 goto exit;
c1c9c9c4
BP
5050 }
5051
5052 ticks_per_s = (double) a * c / b;
5053 if (c == 1000000) {
5054 buffer_hz = d;
5055 } else {
5056 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
5057 fn, a, b, c, d);
5058 }
5059 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
5060
5061exit:
5062 ovsthread_once_done(&once);
c1c9c9c4
BP
5063}
5064
5065/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5066 * rate of 'rate' bytes per second. */
5067static unsigned int
5068tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
5069{
23882115 5070 read_psched();
c1c9c9c4
BP
5071 return (rate * ticks) / ticks_per_s;
5072}
5073
5074/* Returns the number of ticks that it would take to transmit 'size' bytes at a
5075 * rate of 'rate' bytes per second. */
5076static unsigned int
5077tc_bytes_to_ticks(unsigned int rate, unsigned int size)
5078{
23882115 5079 read_psched();
015c93a4 5080 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
5081}
5082
5083/* Returns the number of bytes that need to be reserved for qdisc buffering at
5084 * a transmission rate of 'rate' bytes per second. */
5085static unsigned int
5086tc_buffer_per_jiffy(unsigned int rate)
5087{
23882115 5088 read_psched();
c1c9c9c4
BP
5089 return rate / buffer_hz;
5090}
5091
5092/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5093 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5094 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5095 * stores NULL into it if it is absent.
5096 *
5097 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5098 * 'msg'.
5099 *
5100 * Returns 0 if successful, otherwise a positive errno value. */
5101static int
5102tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
5103 struct nlattr **options)
5104{
5105 static const struct nl_policy tca_policy[] = {
5106 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
5107 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
5108 };
5109 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5110
5111 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5112 tca_policy, ta, ARRAY_SIZE(ta))) {
5113 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
5114 goto error;
5115 }
5116
5117 if (kind) {
5118 *kind = nl_attr_get_string(ta[TCA_KIND]);
5119 }
5120
5121 if (options) {
5122 *options = ta[TCA_OPTIONS];
5123 }
5124
5125 return 0;
5126
5127error:
5128 if (kind) {
5129 *kind = NULL;
5130 }
5131 if (options) {
5132 *options = NULL;
5133 }
5134 return EPROTO;
5135}
5136
5137/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5138 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5139 * into '*options', and its queue statistics into '*stats'. Any of the output
5140 * arguments may be null.
5141 *
5142 * Returns 0 if successful, otherwise a positive errno value. */
5143static int
5144tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
5145 struct nlattr **options, struct netdev_queue_stats *stats)
5146{
5147 static const struct nl_policy tca_policy[] = {
5148 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
5149 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
5150 };
5151 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5152
5153 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5154 tca_policy, ta, ARRAY_SIZE(ta))) {
5155 VLOG_WARN_RL(&rl, "failed to parse class message");
5156 goto error;
5157 }
5158
5159 if (handlep) {
5160 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
5161 *handlep = tc->tcm_handle;
5162 }
5163
5164 if (options) {
5165 *options = ta[TCA_OPTIONS];
5166 }
5167
5168 if (stats) {
5169 const struct gnet_stats_queue *gsq;
5170 struct gnet_stats_basic gsb;
5171
5172 static const struct nl_policy stats_policy[] = {
5173 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
5174 .min_len = sizeof gsb },
5175 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
5176 .min_len = sizeof *gsq },
5177 };
5178 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
5179
5180 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
5181 sa, ARRAY_SIZE(sa))) {
5182 VLOG_WARN_RL(&rl, "failed to parse class stats");
5183 goto error;
5184 }
5185
5186 /* Alignment issues screw up the length of struct gnet_stats_basic on
5187 * some arch/bitsize combinations. Newer versions of Linux have a
5188 * struct gnet_stats_basic_packed, but we can't depend on that. The
5189 * easiest thing to do is just to make a copy. */
5190 memset(&gsb, 0, sizeof gsb);
5191 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5192 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5193 stats->tx_bytes = gsb.bytes;
5194 stats->tx_packets = gsb.packets;
5195
5196 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5197 stats->tx_errors = gsq->drops;
5198 }
5199
5200 return 0;
5201
5202error:
5203 if (options) {
5204 *options = NULL;
5205 }
5206 if (stats) {
5207 memset(stats, 0, sizeof *stats);
5208 }
5209 return EPROTO;
5210}
5211
5212/* Queries the kernel for class with identifier 'handle' and parent 'parent'
5213 * on 'netdev'. */
5214static int
5215tc_query_class(const struct netdev *netdev,
5216 unsigned int handle, unsigned int parent,
5217 struct ofpbuf **replyp)
5218{
5219 struct ofpbuf request;
5220 struct tcmsg *tcmsg;
5221 int error;
5222
7874bdff
RD
5223 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
5224 &request);
23a98ffe
BP
5225 if (!tcmsg) {
5226 return ENODEV;
5227 }
c1c9c9c4
BP
5228 tcmsg->tcm_handle = handle;
5229 tcmsg->tcm_parent = parent;
5230
5231 error = tc_transact(&request, replyp);
5232 if (error) {
5233 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5234 netdev_get_name(netdev),
5235 tc_get_major(handle), tc_get_minor(handle),
5236 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 5237 ovs_strerror(error));
c1c9c9c4
BP
5238 }
5239 return error;
5240}
5241
5242/* Equivalent to "tc class del dev <name> handle <handle>". */
5243static int
5244tc_delete_class(const struct netdev *netdev, unsigned int handle)
5245{
5246 struct ofpbuf request;
5247 struct tcmsg *tcmsg;
5248 int error;
5249
7874bdff 5250 tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
5251 if (!tcmsg) {
5252 return ENODEV;
5253 }
c1c9c9c4
BP
5254 tcmsg->tcm_handle = handle;
5255 tcmsg->tcm_parent = 0;
5256
5257 error = tc_transact(&request, NULL);
5258 if (error) {
5259 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5260 netdev_get_name(netdev),
5261 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 5262 ovs_strerror(error));
c1c9c9c4
BP
5263 }
5264 return error;
5265}
5266
5267/* Equivalent to "tc qdisc del dev <name> root". */
5268static int
b5d57fc8 5269tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 5270{
b5d57fc8 5271 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5272 struct ofpbuf request;
5273 struct tcmsg *tcmsg;
5274 int error;
5275
7874bdff 5276 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
5277 if (!tcmsg) {
5278 return ENODEV;
5279 }
c1c9c9c4
BP
5280 tcmsg->tcm_handle = tc_make_handle(1, 0);
5281 tcmsg->tcm_parent = TC_H_ROOT;
5282
5283 error = tc_transact(&request, NULL);
5284 if (error == EINVAL) {
5285 /* EINVAL probably means that the default qdisc was in use, in which
5286 * case we've accomplished our purpose. */
5287 error = 0;
5288 }
b5d57fc8
BP
5289 if (!error && netdev->tc) {
5290 if (netdev->tc->ops->tc_destroy) {
5291 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 5292 }
b5d57fc8 5293 netdev->tc = NULL;
c1c9c9c4
BP
5294 }
5295 return error;
5296}
5297
ac3e3aaa
BP
5298static bool
5299getqdisc_is_safe(void)
5300{
5301 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5302 static bool safe = false;
5303
5304 if (ovsthread_once_start(&once)) {
5305 struct utsname utsname;
5306 int major, minor;
5307
5308 if (uname(&utsname) == -1) {
5309 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5310 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5311 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5312 } else if (major < 2 || (major == 2 && minor < 35)) {
5313 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5314 utsname.release);
5315 } else {
5316 safe = true;
5317 }
5318 ovsthread_once_done(&once);
5319 }
5320 return safe;
5321}
5322
c1c9c9c4
BP
5323/* If 'netdev''s qdisc type and parameters are not yet known, queries the
5324 * kernel to determine what they are. Returns 0 if successful, otherwise a
5325 * positive errno value. */
5326static int
b5d57fc8 5327tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 5328{
b5d57fc8 5329 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5330 struct ofpbuf request, *qdisc;
5331 const struct tc_ops *ops;
5332 struct tcmsg *tcmsg;
5333 int load_error;
5334 int error;
5335
b5d57fc8 5336 if (netdev->tc) {
c1c9c9c4
BP
5337 return 0;
5338 }
5339
5340 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5341 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5342 * 2.6.35 without that fix backported to it.
5343 *
5344 * To avoid the OOPS, we must not make a request that would attempt to dump
5345 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5346 * few others. There are a few ways that I can see to do this, but most of
5347 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5348 * technique chosen here is to assume that any non-default qdisc that we
5349 * create will have a class with handle 1:0. The built-in qdiscs only have
5350 * a class with handle 0:0.
5351 *
ac3e3aaa
BP
5352 * On Linux 2.6.35+ we use the straightforward method because it allows us
5353 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5354 * in such a case we get no response at all from the kernel (!) if a
5355 * builtin qdisc is in use (which is later caught by "!error &&
5356 * !qdisc->size"). */
7874bdff
RD
5357 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
5358 &request);
23a98ffe
BP
5359 if (!tcmsg) {
5360 return ENODEV;
5361 }
ac3e3aaa
BP
5362 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5363 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
c1c9c9c4
BP
5364
5365 /* Figure out what tc class to instantiate. */
5366 error = tc_transact(&request, &qdisc);
ac3e3aaa 5367 if (!error && qdisc->size) {
c1c9c9c4
BP
5368 const char *kind;
5369
5370 error = tc_parse_qdisc(qdisc, &kind, NULL);
5371 if (error) {
5372 ops = &tc_ops_other;
5373 } else {
5374 ops = tc_lookup_linux_name(kind);
5375 if (!ops) {
5376 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
ac3e3aaa 5377 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
c1c9c9c4
BP
5378
5379 ops = &tc_ops_other;
5380 }
5381 }
ac3e3aaa
BP
5382 } else if ((!error && !qdisc->size) || error == ENOENT) {
5383 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5384 * set up by some other entity that doesn't have a handle 1:0. We will
5385 * assume that it's the system default qdisc. */
c1c9c9c4
BP
5386 ops = &tc_ops_default;
5387 error = 0;
5388 } else {
5389 /* Who knows? Maybe the device got deleted. */
5390 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 5391 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
5392 ops = &tc_ops_other;
5393 }
5394
5395 /* Instantiate it. */
b5d57fc8
BP
5396 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5397 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
5398 ofpbuf_delete(qdisc);
5399
5400 return error ? error : load_error;
5401}
5402
5403/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5404 approximate the time to transmit packets of various lengths. For an MTU of
5405 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5406 represents two possible packet lengths; for a MTU of 513 through 1024, four
5407 possible lengths; and so on.
5408
5409 Returns, for the specified 'mtu', the number of bits that packet lengths
5410 need to be shifted right to fit within such a 256-entry table. */
5411static int
5412tc_calc_cell_log(unsigned int mtu)
5413{
5414 int cell_log;
5415
5416 if (!mtu) {
5417 mtu = ETH_PAYLOAD_MAX;
5418 }
5419 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5420
5421 for (cell_log = 0; mtu >= 256; cell_log++) {
5422 mtu >>= 1;
5423 }
5424
5425 return cell_log;
5426}
5427
5428/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5429 * of 'mtu'. */
5430static void
5431tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5432{
5433 memset(rate, 0, sizeof *rate);
5434 rate->cell_log = tc_calc_cell_log(mtu);
5435 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5436 /* rate->cell_align = 0; */ /* distro headers. */
5437 rate->mpu = ETH_TOTAL_MIN;
5438 rate->rate = Bps;
5439}
5440
5441/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5442 * attribute of the specified "type".
5443 *
5444 * See tc_calc_cell_log() above for a description of "rtab"s. */
5445static void
5446tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5447{
5448 uint32_t *rtab;
5449 unsigned int i;
5450
5451 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5452 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5453 unsigned packet_size = (i + 1) << rate->cell_log;
5454 if (packet_size < rate->mpu) {
5455 packet_size = rate->mpu;
5456 }
5457 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5458 }
5459}
5460
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t effective_burst = MAX(burst_bytes, floor_bytes);

    return tc_bytes_to_ticks(Bps, effective_burst);
}
d3980822 5471\f
aaf2fb1a
BP
5472/* Linux-only functions declared in netdev-linux.h */
5473
5474/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5475 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5476int
5477netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5478 const char *flag_name, bool enable)
5479{
5480 const char *netdev_name = netdev_get_name(netdev);
5481 struct ethtool_value evalue;
5482 uint32_t new_flags;
5483 int error;
5484
ab985a77 5485 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5486 memset(&evalue, 0, sizeof evalue);
5487 error = netdev_linux_do_ethtool(netdev_name,
5488 (struct ethtool_cmd *)&evalue,
5489 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5490 if (error) {
5491 return error;
5492 }
5493
ab985a77 5494 COVERAGE_INC(netdev_set_ethtool);
ad2dade5
AS
5495 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5496 if (new_flags == evalue.data) {
5497 return 0;
5498 }
5499 evalue.data = new_flags;
aaf2fb1a
BP
5500 error = netdev_linux_do_ethtool(netdev_name,
5501 (struct ethtool_cmd *)&evalue,
5502 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5503 if (error) {
5504 return error;
5505 }
5506
ab985a77 5507 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5508 memset(&evalue, 0, sizeof evalue);
5509 error = netdev_linux_do_ethtool(netdev_name,
5510 (struct ethtool_cmd *)&evalue,
5511 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5512 if (error) {
5513 return error;
5514 }
5515
5516 if (new_flags != evalue.data) {
5517 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5518 "device %s failed", enable ? "enable" : "disable",
5519 flag_name, netdev_name);
5520 return EOPNOTSUPP;
5521 }
5522
5523 return 0;
5524}
5525\f
5526/* Utility functions. */
5527
d3980822 5528/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 5529static void
d3980822
BP
5530netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5531 const struct rtnl_link_stats *src)
5532{
f613a0d7
PS
5533 dst->rx_packets = src->rx_packets;
5534 dst->tx_packets = src->tx_packets;
5535 dst->rx_bytes = src->rx_bytes;
5536 dst->tx_bytes = src->tx_bytes;
5537 dst->rx_errors = src->rx_errors;
5538 dst->tx_errors = src->tx_errors;
5539 dst->rx_dropped = src->rx_dropped;
5540 dst->tx_dropped = src->tx_dropped;
5541 dst->multicast = src->multicast;
5542 dst->collisions = src->collisions;
5543 dst->rx_length_errors = src->rx_length_errors;
5544 dst->rx_over_errors = src->rx_over_errors;
5545 dst->rx_crc_errors = src->rx_crc_errors;
5546 dst->rx_frame_errors = src->rx_frame_errors;
5547 dst->rx_fifo_errors = src->rx_fifo_errors;
5548 dst->rx_missed_errors = src->rx_missed_errors;
5549 dst->tx_aborted_errors = src->tx_aborted_errors;
5550 dst->tx_carrier_errors = src->tx_carrier_errors;
5551 dst->tx_fifo_errors = src->tx_fifo_errors;
5552 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5553 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
5554}
5555
337c9b99
BP
5556/* Copies 'src' into 'dst', performing format conversion in the process. */
5557static void
5558netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5559 const struct rtnl_link_stats64 *src)
5560{
5561 dst->rx_packets = src->rx_packets;
5562 dst->tx_packets = src->tx_packets;
5563 dst->rx_bytes = src->rx_bytes;
5564 dst->tx_bytes = src->tx_bytes;
5565 dst->rx_errors = src->rx_errors;
5566 dst->tx_errors = src->tx_errors;
5567 dst->rx_dropped = src->rx_dropped;
5568 dst->tx_dropped = src->tx_dropped;
5569 dst->multicast = src->multicast;
5570 dst->collisions = src->collisions;
5571 dst->rx_length_errors = src->rx_length_errors;
5572 dst->rx_over_errors = src->rx_over_errors;
5573 dst->rx_crc_errors = src->rx_crc_errors;
5574 dst->rx_frame_errors = src->rx_frame_errors;
5575 dst->rx_fifo_errors = src->rx_fifo_errors;
5576 dst->rx_missed_errors = src->rx_missed_errors;
5577 dst->tx_aborted_errors = src->tx_aborted_errors;
5578 dst->tx_carrier_errors = src->tx_carrier_errors;
5579 dst->tx_fifo_errors = src->tx_fifo_errors;
5580 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5581 dst->tx_window_errors = src->tx_window_errors;
5582}
5583
c1c9c9c4 5584static int
35eef899 5585get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
c1c9c9c4 5586{
c1c9c9c4
BP
5587 struct ofpbuf request;
5588 struct ofpbuf *reply;
c1c9c9c4
BP
5589 int error;
5590
d6e3feb5 5591 /* Filtering all counters by default */
5592 memset(stats, 0xFF, sizeof(struct netdev_stats));
5593
c1c9c9c4 5594 ofpbuf_init(&request, 0);
13a24df8
BP
5595 nl_msg_put_nlmsghdr(&request,
5596 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5597 RTM_GETLINK, NLM_F_REQUEST);
5598 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5599 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
a88b4e04 5600 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
5601 ofpbuf_uninit(&request);
5602 if (error) {
5603 return error;
5604 }
5605
13a24df8 5606 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
337c9b99
BP
5607 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5608 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5609 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
13a24df8
BP
5610 error = 0;
5611 } else {
71f21279 5612 a = nl_attr_find(reply, 0, IFLA_STATS);
337c9b99
BP
5613 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5614 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5615 error = 0;
5616 } else {
5617 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5618 error = EPROTO;
5619 }
13a24df8
BP
5620 }
5621 } else {
5622 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5623 error = EPROTO;
c1c9c9c4 5624 }
8b61709d 5625
8b61709d 5626
576e26d7 5627 ofpbuf_delete(reply);
35eef899 5628 return error;
8b61709d 5629}
c1c9c9c4 5630
3a183124 5631static int
b5d57fc8 5632get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
5633{
5634 struct ifreq ifr;
5635 int error;
5636
755be9ea 5637 *flags = 0;
259e0b1a 5638 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
755be9ea
EJ
5639 if (!error) {
5640 *flags = ifr.ifr_flags;
5641 }
8b61709d
BP
5642 return error;
5643}
5644
5645static int
4b609110 5646set_flags(const char *name, unsigned int flags)
8b61709d
BP
5647{
5648 struct ifreq ifr;
5649
5650 ifr.ifr_flags = flags;
259e0b1a 5651 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
5652}
5653
01b25786
PB
5654int
5655linux_get_ifindex(const char *netdev_name)
8b61709d
BP
5656{
5657 struct ifreq ifr;
259e0b1a 5658 int error;
8b61709d 5659
71d7c22f 5660 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5661 COVERAGE_INC(netdev_get_ifindex);
259e0b1a
BP
5662
5663 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5664 if (error) {
580e1152
RD
5665 /* ENODEV probably means that a vif disappeared asynchronously and
5666 * hasn't been removed from the database yet, so reduce the log level
5667 * to INFO for that case. */
5668 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
5669 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5670 netdev_name, ovs_strerror(error));
259e0b1a 5671 return -error;
8b61709d
BP
5672 }
5673 return ifr.ifr_ifindex;
5674}
5675
5676static int
5677get_ifindex(const struct netdev *netdev_, int *ifindexp)
5678{
b5d57fc8 5679 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 5680
b5d57fc8 5681 if (!(netdev->cache_valid & VALID_IFINDEX)) {
756819dd
FL
5682 netdev_linux_update_via_netlink(netdev);
5683 }
5684
5685 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5686 /* Fall back to ioctl if netlink fails */
01b25786 5687 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 5688
8b61709d 5689 if (ifindex < 0) {
b5d57fc8
BP
5690 netdev->get_ifindex_error = -ifindex;
5691 netdev->ifindex = 0;
c7b1b0a5 5692 } else {
b5d57fc8
BP
5693 netdev->get_ifindex_error = 0;
5694 netdev->ifindex = ifindex;
8b61709d 5695 }
b5d57fc8 5696 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 5697 }
c7b1b0a5 5698
b5d57fc8
BP
5699 *ifindexp = netdev->ifindex;
5700 return netdev->get_ifindex_error;
8b61709d
BP
5701}
5702
5703static int
756819dd
FL
5704netdev_linux_update_via_netlink(struct netdev_linux *netdev)
5705{
5706 struct ofpbuf request;
5707 struct ofpbuf *reply;
5708 struct rtnetlink_change chg;
5709 struct rtnetlink_change *change = &chg;
5710 int error;
5711
5712 ofpbuf_init(&request, 0);
5713 nl_msg_put_nlmsghdr(&request,
5714 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5715 RTM_GETLINK, NLM_F_REQUEST);
5716 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5717
5718 /* The correct identifiers for a Linux device are netnsid and ifindex,
5719 * but ifindex changes as the port is moved to another network namespace
5720 * and the interface name statically stored in ovsdb. */
5721 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
5722 if (netdev_linux_netnsid_is_remote(netdev)) {
5723 nl_msg_push_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
5724 }
5725 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5726 ofpbuf_uninit(&request);
5727 if (error) {
5728 ofpbuf_delete(reply);
5729 return error;
5730 }
5731
5732 if (rtnetlink_parse(reply, change)
5733 && change->nlmsg_type == RTM_NEWLINK) {
5734 bool changed = false;
5735 error = 0;
5736
5737 /* Update netdev from rtnl msg and increment its seq if needed. */
5738 if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
5739 netdev->carrier_resets++;
5740 changed = true;
5741 }
5742 if (change->ifi_flags != netdev->ifi_flags) {
5743 netdev->ifi_flags = change->ifi_flags;
5744 changed = true;
5745 }
5746 if (change->mtu && change->mtu != netdev->mtu) {
5747 netdev->mtu = change->mtu;
5748 netdev->cache_valid |= VALID_MTU;
5749 netdev->netdev_mtu_error = 0;
5750 changed = true;
5751 }
5752 if (!eth_addr_is_zero(change->mac)
5753 && !eth_addr_equals(change->mac, netdev->etheraddr)) {
5754 netdev->etheraddr = change->mac;
5755 netdev->cache_valid |= VALID_ETHERADDR;
5756 netdev->ether_addr_error = 0;
5757 changed = true;
5758 }
5759 if (change->if_index != netdev->ifindex) {
5760 netdev->ifindex = change->if_index;
5761 netdev->cache_valid |= VALID_IFINDEX;
5762 netdev->get_ifindex_error = 0;
5763 changed = true;
5764 }
5765 if (changed) {
5766 netdev_change_seq_changed(&netdev->up);
5767 }
5768 } else {
5769 error = EINVAL;
5770 }
5771
5772 ofpbuf_delete(reply);
5773 return error;
5774}
5775
5776static int
74ff3298 5777get_etheraddr(const char *netdev_name, struct eth_addr *ea)
8b61709d
BP
5778{
5779 struct ifreq ifr;
5780 int hwaddr_family;
259e0b1a 5781 int error;
8b61709d
BP
5782
5783 memset(&ifr, 0, sizeof ifr);
71d7c22f 5784 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5785 COVERAGE_INC(netdev_get_hwaddr);
259e0b1a
BP
5786 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5787 if (error) {
78857dfb
BP
5788 /* ENODEV probably means that a vif disappeared asynchronously and
5789 * hasn't been removed from the database yet, so reduce the log level
5790 * to INFO for that case. */
259e0b1a 5791 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
78857dfb 5792 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
259e0b1a
BP
5793 netdev_name, ovs_strerror(error));
5794 return error;
8b61709d
BP
5795 }
5796 hwaddr_family = ifr.ifr_hwaddr.sa_family;
2482b0b0
JS
5797 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
5798 hwaddr_family != ARPHRD_NONE) {
c9697f35 5799 VLOG_INFO("%s device has unknown hardware address family %d",
8b61709d 5800 netdev_name, hwaddr_family);
c9697f35 5801 return EINVAL;
8b61709d
BP
5802 }
5803 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5804 return 0;
5805}
5806
5807static int
74ff3298 5808set_etheraddr(const char *netdev_name, const struct eth_addr mac)
8b61709d
BP
5809{
5810 struct ifreq ifr;
259e0b1a 5811 int error;
8b61709d
BP
5812
5813 memset(&ifr, 0, sizeof ifr);
71d7c22f 5814 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 5815 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
74ff3298 5816 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
8b61709d 5817 COVERAGE_INC(netdev_set_hwaddr);
259e0b1a
BP
5818 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5819 if (error) {
8b61709d 5820 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
259e0b1a 5821 netdev_name, ovs_strerror(error));
8b61709d 5822 }
259e0b1a 5823 return error;
8b61709d
BP
5824}
5825
5826static int
0b0544d7 5827netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
5828 int cmd, const char *cmd_name)
5829{
5830 struct ifreq ifr;
259e0b1a 5831 int error;
8b61709d
BP
5832
5833 memset(&ifr, 0, sizeof ifr);
71d7c22f 5834 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
5835 ifr.ifr_data = (caddr_t) ecmd;
5836
5837 ecmd->cmd = cmd;
259e0b1a
BP
5838 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5839 if (error) {
5840 if (error != EOPNOTSUPP) {
8b61709d 5841 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
259e0b1a 5842 "failed: %s", cmd_name, name, ovs_strerror(error));
8b61709d
BP
5843 } else {
5844 /* The device doesn't support this operation. That's pretty
5845 * common, so there's no point in logging anything. */
5846 }
8b61709d 5847 }
259e0b1a 5848 return error;
8b61709d 5849}
f1acd62b 5850
488d734d
BP
5851/* Returns an AF_PACKET raw socket or a negative errno value. */
5852static int
5853af_packet_sock(void)
5854{
23882115
BP
5855 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5856 static int sock;
488d734d 5857
23882115 5858 if (ovsthread_once_start(&once)) {
488d734d
BP
5859 sock = socket(AF_PACKET, SOCK_RAW, 0);
5860 if (sock >= 0) {
8450059e
BP
5861 int error = set_nonblocking(sock);
5862 if (error) {
5863 close(sock);
5864 sock = -error;
5865 }
488d734d
BP
5866 } else {
5867 sock = -errno;
10a89ef0
BP
5868 VLOG_ERR("failed to create packet socket: %s",
5869 ovs_strerror(errno));
488d734d 5870 }
23882115 5871 ovsthread_once_done(&once);
488d734d
BP
5872 }
5873
5874 return sock;
5875}