]> git.proxmox.com Git - mirror_ovs.git/blame - lib/dpif.h
ipf: Avoid accessing to a freed rp.
[mirror_ovs.git] / lib / dpif.h
CommitLineData
064af421 1/*
1954e6bb 2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
064af421 3 *
a14bc59f
BP
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
064af421 7 *
a14bc59f
BP
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
064af421
BP
15 */
16
ffcb9f6e
BP
17/*
18 * dpif, the DataPath InterFace.
19 *
20 * In Open vSwitch terminology, a "datapath" is a flow-based software switch.
21 * A datapath has no intelligence of its own. Rather, it relies entirely on
22 * its client to set up flows. The datapath layer is core to the Open vSwitch
23 * software switch: one could say, without much exaggeration, that everything
24 * in ovs-vswitchd above dpif exists only to make the correct decisions
25 * interacting with dpif.
26 *
27 * Typically, the client of a datapath is the software switch module in
28 * "ovs-vswitchd", but other clients can be written. The "ovs-dpctl" utility
29 * is also a (simple) client.
30 *
31 *
32 * Overview
33 * ========
34 *
35 * The terms written in quotes below are defined in later sections.
36 *
37 * When a datapath "port" receives a packet, it extracts the headers (the
da546e07
JR
38 * "flow"). If the datapath's "flow table" contains a "flow entry" matching
39 * the packet, then it executes the "actions" in the flow entry and increments
40 * the flow's statistics. If there is no matching flow entry, the datapath
41 * instead appends the packet to an "upcall" queue.
ffcb9f6e
BP
42 *
43 *
44 * Ports
45 * =====
46 *
47 * A datapath has a set of ports that are analogous to the ports on an Ethernet
48 * switch. At the datapath level, each port has the following information
49 * associated with it:
50 *
51 * - A name, a short string that must be unique within the host. This is
52 * typically a name that would be familiar to the system administrator,
53 * e.g. "eth0" or "vif1.1", but it is otherwise arbitrary.
54 *
55 * - A 32-bit port number that must be unique within the datapath but is
56 * otherwise arbitrary. The port number is the most important identifier
57 * for a port in the datapath interface.
58 *
59 * - A type, a short string that identifies the kind of port. On a Linux
60 * host, typical types are "system" (for a network device such as eth0),
61 * "internal" (for a simulated port used to connect to the TCP/IP stack),
62 * and "gre" (for a GRE tunnel).
63 *
1954e6bb
AW
64 * - A Netlink PID for each upcall reading thread (see "Upcall Queuing and
65 * Ordering" below).
ffcb9f6e
BP
66 *
67 * The dpif interface has functions for adding and deleting ports. When a
68 * datapath implements these (e.g. as the Linux and netdev datapaths do), then
69 * Open vSwitch's ovs-vswitchd daemon can directly control what ports are used
70 * for switching. Some datapaths might not implement them, or implement them
e6fc4e2c
JP
71 * with restrictions on the types of ports that can be added or removed,
72 * on systems where port membership can only be changed by some external
73 * entity.
ffcb9f6e
BP
74 *
75 * Each datapath must have a port, sometimes called the "local port", whose
76 * name is the same as the datapath itself, with port number 0. The local port
77 * cannot be deleted.
78 *
79 * Ports are available as "struct netdev"s. To obtain a "struct netdev *" for
80 * a port named 'name' with type 'port_type', in a datapath of type
81 * 'datapath_type', call netdev_open(name, dpif_port_open_type(datapath_type,
82 * port_type)). The netdev can be used to get and set important data related to
83 * the port, such as:
84 *
85 * - MTU (netdev_get_mtu(), netdev_set_mtu()).
86 *
87 * - Ethernet address (netdev_get_etheraddr(), netdev_set_etheraddr()).
88 *
89 * - Statistics such as the number of packets and bytes transmitted and
90 * received (netdev_get_stats()).
91 *
92 * - Carrier status (netdev_get_carrier()).
93 *
94 * - Speed (netdev_get_features()).
95 *
96 * - QoS queue configuration (netdev_get_queue(), netdev_set_queue() and
97 * related functions.)
98 *
99 * - Arbitrary port-specific configuration parameters (netdev_get_config(),
100 * netdev_set_config()). An example of such a parameter is the IP
101 * endpoint for a GRE tunnel.
102 *
103 *
104 * Flow Table
105 * ==========
106 *
ee75c546 107 * The flow table is a collection of "flow entries". Each flow entry contains:
ffcb9f6e
BP
108 *
109 * - A "flow", that is, a summary of the headers in an Ethernet packet. The
ee75c546
BP
110 * flow must be unique within the flow table. Flows are fine-grained
111 * entities that include L2, L3, and L4 headers. A single TCP connection
112 * consists of two flows, one in each direction.
ffcb9f6e
BP
113 *
114 * In Open vSwitch userspace, "struct flow" is the typical way to describe
115 * a flow, but the datapath interface uses a different data format to
7c9afefd
SF
116 * allow ABI forward- and backward-compatibility. Refer to OVS_KEY_ATTR_*
117 * and "struct ovs_key_*" in include/odp-netlink.h for details.
ffcb9f6e
BP
118 * lib/odp-util.h defines several functions for working with these flows.
119 *
ee75c546
BP
120 * - A "mask" that, for each bit in the flow, specifies whether the datapath
121 * should consider the corresponding flow bit when deciding whether a
122 * given packet matches the flow entry. The original datapath design did
123 * not support wildcard matching: every flow entry was exact match. With the
124 * addition of a mask, the interface supports datapaths with a spectrum of
125 * wildcard matching capabilities, from those that only support exact
126 * matches to those that support bitwise wildcarding on the entire flow
127 * key, as well as datapaths with capabilities somewhere in between.
128 *
129 * Datapaths do not provide a way to query their wildcarding capabilities,
130 * nor is it expected that the client should attempt to probe for the
131 * details of their support. Instead, a client installs flows with masks
132 * that wildcard as many bits as acceptable. The datapath then actually
133 * wildcards as many of those bits as it can and changes the wildcard bits
134 * that it does not support into exact match bits. A datapath that can
135 * wildcard any bit, for example, would install the supplied mask, an
136 * exact-match only datapath would install an exact-match mask regardless
137 * of what mask the client supplied, and a datapath in the middle of the
138 * spectrum would selectively change some wildcard bits into exact match
139 * bits.
140 *
141 * Regardless of the requested or installed mask, the datapath retains the
142 * original flow supplied by the client. (It does not, for example, "zero
143 * out" the wildcarded bits.) This allows the client to unambiguously
144 * identify the flow entry in later flow table operations.
145 *
146 * The flow table does not have priorities; that is, all flow entries have
147 * equal priority. Detecting overlapping flow entries is expensive in
148 * general, so the datapath is not required to do it. It is primarily the
149 * client's responsibility not to install flow entries whose flow and mask
150 * combinations overlap.
ffcb9f6e
BP
151 *
152 * - A list of "actions" that tell the datapath what to do with packets
153 * within a flow. Some examples of actions are OVS_ACTION_ATTR_OUTPUT,
154 * which transmits the packet out a port, and OVS_ACTION_ATTR_SET, which
155 * modifies packet headers. Refer to OVS_ACTION_ATTR_* and "struct
837eefc7
BP
156 * ovs_action_*" in include/odp-netlink.h for details. lib/odp-util.h
157 * defines several functions for working with datapath actions.
ffcb9f6e
BP
158 *
159 * The actions list may be empty. This indicates that nothing should be
160 * done to matching packets, that is, they should be dropped.
161 *
162 * (In case you are familiar with OpenFlow, datapath actions are analogous
163 * to OpenFlow actions.)
164 *
165 * - Statistics: the number of packets and bytes that the flow has
166 * processed, the last time that the flow processed a packet, and the
167 * union of all the TCP flags in packets processed by the flow. (The
168 * latter is 0 if the flow is not a TCP flow.)
169 *
170 * The datapath's client manages the flow table, primarily in reaction to
171 * "upcalls" (see below).
172 *
173 *
174 * Upcalls
175 * =======
176 *
177 * A datapath sometimes needs to notify its client that a packet was received.
178 * The datapath mechanism to do this is called an "upcall".
179 *
180 * Upcalls are used in two situations:
181 *
182 * - When a packet is received, but there is no matching flow entry in its
183 * flow table (a flow table "miss"), this causes an upcall of type
184 * DPIF_UC_MISS. These are called "miss" upcalls.
185 *
186 * - A datapath action of type OVS_ACTION_ATTR_USERSPACE causes an upcall of
187 * type DPIF_UC_ACTION. These are called "action" upcalls.
188 *
189 * An upcall contains an entire packet. There is no attempt to, e.g., copy
190 * only as much of the packet as normally needed to make a forwarding decision.
191 * Such an optimization is doable, but experimental prototypes showed it to be
192 * of little benefit because an upcall typically contains the first packet of a
193 * flow, which is usually short (e.g. a TCP SYN). Also, the entire packet can
194 * sometimes really be needed.
195 *
196 * After a client reads a given upcall, the datapath is finished with it, that
197 * is, the datapath doesn't maintain any lingering state past that point.
198 *
199 * The latency from the time that a packet arrives at a port to the time that
200 * it is received from dpif_recv() is critical in some benchmarks. For
201 * example, if this latency is 1 ms, then a netperf TCP_CRR test, which opens
202 * and closes TCP connections one at a time as quickly as it can, cannot
203 * possibly achieve more than 500 transactions per second, since every
204 * connection consists of two flows with 1-ms latency to set up each one.
205 *
206 * To receive upcalls, a client has to enable them with dpif_recv_set(). A
1954e6bb
AW
207 * datapath should generally support being opened multiple times (e.g. so that
208 * one may run "ovs-dpctl show" or "ovs-dpctl dump-flows" while "ovs-vswitchd"
209 * is also running) but need not support more than one of these clients
210 * enabling upcalls at once.
ffcb9f6e
BP
211 *
212 *
213 * Upcall Queuing and Ordering
214 * ---------------------------
215 *
216 * The datapath's client reads upcalls one at a time by calling dpif_recv().
217 * When more than one upcall is pending, the order in which the datapath
218 * presents upcalls to its client is important. The datapath's client does not
219 * directly control this order, so the datapath implementer must take care
220 * during design.
221 *
222 * The minimal behavior, suitable for initial testing of a datapath
223 * implementation, is that all upcalls are appended to a single queue, which is
224 * delivered to the client in order.
225 *
226 * The datapath should ensure that a high rate of upcalls from one particular
227 * port cannot cause upcalls from other sources to be dropped or unreasonably
228 * delayed. Otherwise, one port conducting a port scan or otherwise initiating
229 * high-rate traffic spanning many flows could suppress other traffic.
230 * Ideally, the datapath should present upcalls from each port in a "round
231 * robin" manner, to ensure fairness.
232 *
233 * The client has no control over "miss" upcalls and no insight into the
234 * datapath's implementation, so the datapath is entirely responsible for
235 * queuing and delivering them. On the other hand, the datapath has
236 * considerable freedom of implementation. One good approach is to maintain a
237 * separate queue for each port, to prevent any given port's upcalls from
238 * interfering with other ports' upcalls. If this is impractical, then another
239 * reasonable choice is to maintain some fixed number of queues and assign each
240 * port to one of them. Ports assigned to the same queue can then interfere
241 * with each other, but not with ports assigned to different queues. Other
242 * approaches are also possible.
243 *
244 * The client has some control over "action" upcalls: it can specify a 32-bit
245 * "Netlink PID" as part of the action. This terminology comes from the Linux
246 * datapath implementation, which uses a protocol called Netlink in which a PID
247 * designates a particular socket and the upcall data is delivered to the
248 * socket's receive queue. Generically, though, a Netlink PID identifies a
249 * queue for upcalls. The basic requirements on the datapath are:
250 *
251 * - The datapath must provide a Netlink PID associated with each port. The
252 * client can retrieve the PID with dpif_port_get_pid().
253 *
254 * - The datapath must provide a "special" Netlink PID not associated with
255 * any port. dpif_port_get_pid() also provides this PID. (ovs-vswitchd
256 * uses this PID to queue special packets that must not be lost even if a
257 * port is otherwise busy, such as packets used for tunnel monitoring.)
258 *
259 * The minimal behavior of dpif_port_get_pid() and the treatment of the Netlink
260 * PID in "action" upcalls is that dpif_port_get_pid() returns a constant value
261 * and all upcalls are appended to a single queue.
262 *
1954e6bb 263 * The preferred behavior is:
ffcb9f6e
BP
264 *
265 * - Each port has a PID that identifies the queue used for "miss" upcalls
266 * on that port. (Thus, if each port has its own queue for "miss"
267 * upcalls, then each port has a different Netlink PID.)
268 *
269 * - "miss" upcalls for a given port and "action" upcalls that specify that
270 * port's Netlink PID add their upcalls to the same queue. The upcalls
271 * are delivered to the datapath's client in the order that the packets
272 * were received, regardless of whether the upcalls are "miss" or "action"
273 * upcalls.
274 *
275 * - Upcalls that specify the "special" Netlink PID are queued separately.
276 *
277 *
278 * Packet Format
279 * =============
280 *
281 * The datapath interface works with packets in a particular form. This is the
282 * form taken by packets received via upcalls (i.e. by dpif_recv()). Packets
283 * supplied to the datapath for processing (i.e. to dpif_execute()) also take
284 * this form.
285 *
286 * A VLAN tag is represented by an 802.1Q header. If the layer below the
287 * datapath interface uses another representation, then the datapath interface
288 * must perform conversion.
289 *
290 * The datapath interface requires all packets to fit within the MTU. Some
291 * operating systems internally process packets larger than MTU, with features
292 * such as TSO and UFO. When such a packet passes through the datapath
293 * interface, it must be broken into multiple MTU or smaller sized packets for
294 * presentation as upcalls. (This does not happen often, because an upcall
295 * typically contains the first packet of a flow, which is usually short.)
296 *
297 * Some operating system TCP/IP stacks maintain packets in an unchecksummed or
298 * partially checksummed state until transmission. The datapath interface
299 * requires all host-generated packets to be fully checksummed (e.g. IP and TCP
300 * checksums must be correct). On such an OS, the datapath interface must fill
301 * in these checksums.
302 *
303 * Packets passed through the datapath interface must be at least 14 bytes
304 * long, that is, they must have a complete Ethernet header. They are not
305 * required to be padded to the minimum Ethernet length.
306 *
307 *
308 * Typical Usage
309 * =============
310 *
311 * Typically, the client of a datapath begins by configuring the datapath with
312 * a set of ports. Afterward, the client runs in a loop polling for upcalls to
313 * arrive.
314 *
315 * For each upcall received, the client examines the enclosed packet and
316 * figures out what should be done with it. For example, if the client
317 * implements a MAC-learning switch, then it searches the forwarding database
318 * for the packet's destination MAC and VLAN and determines the set of ports to
319 * which it should be sent. In any case, the client composes a set of datapath
320 * actions to properly dispatch the packet and then directs the datapath to
321 * execute those actions on the packet (e.g. with dpif_execute()).
322 *
323 * Most of the time, the actions that the client executed on the packet apply
324 * to every packet with the same flow. For example, the flow includes both
325 * destination MAC and VLAN ID (and much more), so this is true for the
326 * MAC-learning switch example above. In such a case, the client can also
327 * direct the datapath to treat any further packets in the flow in the same
328 * way, using dpif_flow_put() to add a new flow entry.
329 *
330 * Other tasks the client might need to perform, in addition to reacting to
331 * upcalls, include:
332 *
333 * - Periodically polling flow statistics, perhaps to supply to its own
334 * clients.
335 *
336 * - Deleting flow entries from the datapath that haven't been used
337 * recently, to save memory.
338 *
339 * - Updating flow entries whose actions should change. For example, if a
340 * MAC learning switch learns that a MAC has moved, then it must update
341 * the actions of flow entries that sent packets to the MAC at its old
342 * location.
343 *
344 * - Adding and removing ports to achieve a new configuration.
5703b15f
BP
345 *
346 *
347 * Thread-safety
348 * =============
349 *
350 * Most of the dpif functions are fully thread-safe: they may be called from
351 * any number of threads on the same or different dpif objects. The exceptions
352 * are:
353 *
354 * - dpif_port_poll() and dpif_port_poll_wait() are conditionally
355 * thread-safe: they may be called from different threads only on
356 * different dpif objects.
357 *
d2ad7ef1
JS
358 * - dpif_flow_dump_next() is conditionally thread-safe: It may be called
359 * from different threads on the same flow dump, but each thread must pass
360 * its own 'struct dpif_flow_dump_thread' and its own 'flows' buffer.
361 *
362 * - dpif_flow_dump_destroy() is conditionally thread-safe: All threads
363 * that share the same 'struct dpif_flow_dump' must have finished using
364 * it. This function must then be called exactly once for a particular
365 * dpif_flow_dump to finish the corresponding flow dump operation.
366 *
367 * - Functions that operate on 'struct dpif_port_dump' are conditionally
368 * thread-safe with respect to those objects. That is, one may dump ports
369 * from any number of threads at once, but each thread must use its own
370 * struct dpif_port_dump.
ffcb9f6e 371 */
064af421
BP
372#ifndef DPIF_H
373#define DPIF_H 1
374
064af421
BP
375#include <stdbool.h>
376#include <stddef.h>
377#include <stdint.h>
01961bbd
DDP
378
379#include "dpdk.h"
cf62fa4c 380#include "dp-packet.h"
5dddf960 381#include "netdev.h"
9dbb9d5e 382#include "openflow/openflow.h"
0d71302e 383#include "openvswitch/ofp-meter.h"
1c1e46ed 384#include "ovs-numa.h"
758c456d 385#include "packets.h"
9dbb9d5e 386#include "util.h"
064af421 387
03292c46
JG
388#ifdef __cplusplus
389extern "C" {
390#endif
391
c228a364 392struct dpif;
623540e4
EJ
393struct dpif_class;
394struct dpif_flow;
c97fb132 395struct ds;
572b7068 396struct flow;
623540e4 397struct flow_wildcards;
cdee00fd 398struct nlattr;
d0c23a1a 399struct sset;
064af421 400
999401aa
JG
401int dp_register_provider(const struct dpif_class *);
402int dp_unregister_provider(const char *type);
8205fbc8 403void dp_disallow_provider(const char *type);
d0c23a1a 404void dp_enumerate_types(struct sset *types);
f79e673f 405const char *dpif_normalize_type(const char *);
999401aa 406
d0c23a1a 407int dp_enumerate_names(const char *type, struct sset *names);
1a6f1e2a 408void dp_parse_name(const char *datapath_name, char **name, char **type);
5792c5c6 409
1a6f1e2a
JG
410int dpif_open(const char *name, const char *type, struct dpif **);
411int dpif_create(const char *name, const char *type, struct dpif **);
412int dpif_create_and_open(const char *name, const char *type, struct dpif **);
064af421
BP
413void dpif_close(struct dpif *);
414
a36de779 415bool dpif_run(struct dpif *);
640e1b20
BP
416void dpif_wait(struct dpif *);
417
b29ba128 418const char *dpif_name(const struct dpif *);
1a6f1e2a 419const char *dpif_base_name(const struct dpif *);
c7a26215 420const char *dpif_type(const struct dpif *);
064af421 421
f87c1357
IM
422bool dpif_cleanup_required(const struct dpif *);
423
064af421
BP
424int dpif_delete(struct dpif *);
425
3b68500b 426/* Statistics for a datapath as a whole, as filled in by dpif_get_dp_stats(). */
a8d9304d 427struct dpif_dp_stats {
a8d9304d
BP
428 uint64_t n_hit; /* Number of flow table matches. */
429 uint64_t n_missed; /* Number of flow table misses. */
430 uint64_t n_lost; /* Number of misses not sent to userspace. */
431 uint64_t n_flows; /* Number of flows present. */
847108dc
AZ
432 uint64_t n_mask_hit; /* Number of mega flow masks visited for
433 flow table matches. */
1ce3fa06 434 uint32_t n_masks; /* Number of mega flow masks. */
a8d9304d
BP
435};
436int dpif_get_dp_stats(const struct dpif *, struct dpif_dp_stats *);
437
dcdcad68
PB
438int dpif_set_features(struct dpif *, uint32_t new_features);
439
c5b4b0ce
JL
440int dpif_get_n_offloaded_flows(struct dpif *dpif, uint64_t *n_flows);
441
6bc60024
BP
442\f
443/* Port operations. */
064af421 444
0aeaabc8
JP
445const char *dpif_port_open_type(const char *datapath_type,
446 const char *port_type);
4e022ec0 447int dpif_port_add(struct dpif *, struct netdev *, odp_port_t *port_nop);
97459c2f 448int dpif_port_del(struct dpif *, odp_port_t port_no, bool local_delete);
4c738a8d
BP
449
450/* A port within a datapath.
451 *
452 * 'name' and 'type' are suitable for passing to netdev_open(). */
453struct dpif_port {
454 char *name; /* Network device name, e.g. "eth0"; unique within the host. */
455 char *type; /* Network device type, e.g. "system", "internal", "gre". */
4e022ec0 456 odp_port_t port_no; /* Port number, unique within the datapath. */
4c738a8d
BP
457};
458void dpif_port_clone(struct dpif_port *, const struct dpif_port *);
459void dpif_port_destroy(struct dpif_port *);
4afba28d 460bool dpif_port_exists(const struct dpif *dpif, const char *devname);
4e022ec0 461int dpif_port_query_by_number(const struct dpif *, odp_port_t port_no,
4c738a8d 462 struct dpif_port *);
064af421 463int dpif_port_query_by_name(const struct dpif *, const char *devname,
4c738a8d 464 struct dpif_port *);
4e022ec0 465int dpif_port_get_name(struct dpif *, odp_port_t port_no,
335562c0 466 char *name, size_t name_size);
769b5034 467uint32_t dpif_port_get_pid(const struct dpif *, odp_port_t port_no);
b0ec0f27
BP
468
469struct dpif_port_dump { /* State of one port dump, driven by dpif_port_dump_start(), dpif_port_dump_next() and dpif_port_dump_done(); each thread must use its own. */
470 const struct dpif *dpif; /* NOTE(review): presumably the datapath passed to dpif_port_dump_start() -- confirm. */
471 int error; /* NOTE(review): presumably the dump's error status, reported by dpif_port_dump_done() -- confirm. */
472 void *state; /* Opaque iteration state. NOTE(review): presumably owned by the dpif implementation -- confirm. */
473};
474void dpif_port_dump_start(struct dpif_port_dump *, const struct dpif *);
4c738a8d 475bool dpif_port_dump_next(struct dpif_port_dump *, struct dpif_port *);
b0ec0f27
BP
476int dpif_port_dump_done(struct dpif_port_dump *);
477
4c738a8d 478/* Iterates through each DPIF_PORT in DPIF, using DUMP as state.
b0ec0f27
BP
479 *
480 * Arguments all have pointer type. DUMP and DPIF_PORT are evaluated on every iteration.
481 *
482 * When the loop runs to completion, the macro calls dpif_port_dump_done()
483 * itself. If you break out of the loop early, you must call it by hand. */
4c738a8d 484#define DPIF_PORT_FOR_EACH(DPIF_PORT, DUMP, DPIF) \
b0ec0f27 485 for (dpif_port_dump_start(DUMP, DPIF); \
4c738a8d 486 (dpif_port_dump_next(DUMP, DPIF_PORT) \
b0ec0f27
BP
487 ? true \
488 : (dpif_port_dump_done(DUMP), false)); \
489 )
064af421 490
e9e28be3
BP
491int dpif_port_poll(const struct dpif *, char **devnamep);
492void dpif_port_poll_wait(const struct dpif *);
6bc60024
BP
493\f
494/* Flow table operations. */
e9e28be3 495
c97fb132
BP
496struct dpif_flow_stats { /* Per-flow statistics kept by the datapath. */
497 uint64_t n_packets; /* Number of packets the flow has processed. */
498 uint64_t n_bytes; /* Number of bytes the flow has processed. */
499 long long int used; /* Last time the flow processed a packet. NOTE(review): time base/units not shown here -- confirm. */
a66733a8 500 uint16_t tcp_flags; /* Union of all TCP flags in packets processed by the flow; 0 if not a TCP flow. */
c97fb132
BP
501};
502
16441315 503/* Flow statistics that additionally break out offloaded packets and bytes. */
504struct dpif_flow_detailed_stats {
505 uint64_t n_packets; /* Total packets, including offloaded ones. */
506 uint64_t n_bytes; /* Total bytes, including offloaded ones. */
507 /* Offloaded packets; a subset of 'n_packets'. */
508 uint64_t n_offload_packets;
509 /* Offloaded bytes; a subset of 'n_bytes'. */
510 uint64_t n_offload_bytes;
511 long long int used; /* NOTE(review): presumably last-used time, as in struct dpif_flow_stats -- confirm. */
512 uint16_t tcp_flags; /* NOTE(review): presumably union of TCP flags seen, as in struct dpif_flow_stats -- confirm. */
513};
514
d63ca532 515struct dpif_flow_attrs { /* Per-flow attributes; embedded in struct dpif_flow as 'attrs'. */
342b8904
IM
516 bool offloaded; /* True if the flow is offloaded to hardware. */
517 const char *dp_layer; /* Datapath layer the flow is handled in. */
518 const char *dp_extra_info; /* Extra information provided by the datapath. */
d63ca532
GT
519};
520
a692410a
GT
521struct dpif_flow_dump_types { /* Passed to dpif_flow_dump_create(). NOTE(review): presumably selects which kinds of flows to dump -- confirm. */
522 bool ovs_flows; /* NOTE(review): presumably "include flows held by the OVS datapath" -- confirm. */
523 bool netdev_flows; /* NOTE(review): presumably "include flows held by netdevs (offloaded)" -- confirm. */
524};
525
cf62fa4c 526void dpif_flow_stats_extract(const struct flow *, const struct dp_packet *packet,
a7752d4a 527 long long int used, struct dpif_flow_stats *);
c97fb132
BP
528void dpif_flow_stats_format(const struct dpif_flow_stats *, struct ds *);
529
ba25b8f4
BP
530enum dpif_flow_put_flags { /* Bit flags for dpif_flow_put() and struct dpif_flow_put. */
531 DPIF_FP_CREATE = 1 << 0, /* Allow creating a new flow. */
532 DPIF_FP_MODIFY = 1 << 1, /* Allow modifying an existing flow. */
43f9ac0a
JR
533 DPIF_FP_ZERO_STATS = 1 << 2, /* Zero the stats of an existing flow when its actions are updated. */
534 DPIF_FP_PROBE = 1 << 3 /* Suppress error messages, if any. */
ba25b8f4
BP
535};
536
2c85851f 537bool dpif_probe_feature(struct dpif *, const char *name,
bb71c96e
AZ
538 const struct ofpbuf *key, const struct ofpbuf *actions,
539 const ovs_u128 *ufid);
064af421 540int dpif_flow_flush(struct dpif *);
ba25b8f4 541int dpif_flow_put(struct dpif *, enum dpif_flow_put_flags,
feebdea2 542 const struct nlattr *key, size_t key_len,
e6cc0bab 543 const struct nlattr *mask, size_t mask_len,
feebdea2 544 const struct nlattr *actions, size_t actions_len,
bd5131ba 545 const ovs_u128 *ufid, const unsigned pmd_id,
1c1e46ed 546 struct dpif_flow_stats *);
feebdea2
BP
547int dpif_flow_del(struct dpif *,
548 const struct nlattr *key, size_t key_len,
bd5131ba 549 const ovs_u128 *ufid, const unsigned pmd_id,
1c1e46ed 550 struct dpif_flow_stats *);
6fe09f8c 551int dpif_flow_get(struct dpif *,
feebdea2 552 const struct nlattr *key, size_t key_len,
bd5131ba 553 const ovs_u128 *ufid, const unsigned pmd_id,
6fe09f8c 554 struct ofpbuf *, struct dpif_flow *);
ac64794a
BP
555\f
556/* Flow dumping interface
557 * ======================
558 *
559 * This interface allows iteration through all of the flows currently installed
560 * in a datapath. It is somewhat complicated by two requirements:
561 *
562 * - Efficient support for dumping flows in parallel from multiple threads.
563 *
564 * - Allow callers to avoid making unnecessary copies of data returned by
565 * the interface across several flows in cases where the dpif
566 * implementation has to maintain a copy of that information anyhow.
567 * (That is, allow the client visibility into any underlying batching as
568 * part of its own batching.)
569 *
570 *
571 * Usage
572 * -----
573 *
574 * 1. Call dpif_flow_dump_create().
575 * 2. In each thread that participates in the dump (which may be just a single
576 * thread if parallelism isn't important):
577 * (a) Call dpif_flow_dump_thread_create().
578 * (b) Call dpif_flow_dump_next() repeatedly until it returns 0.
579 * (c) Call dpif_flow_dump_thread_destroy().
580 * 3. Call dpif_flow_dump_destroy().
581 *
582 * All error reporting is deferred to the call to dpif_flow_dump_destroy().
583 */
7e8b7199 584struct dpif_flow_dump *dpif_flow_dump_create(const struct dpif *, bool terse,
a692410a 585 struct dpif_flow_dump_types *);
ac64794a 586int dpif_flow_dump_destroy(struct dpif_flow_dump *);
704a1e09 587
ac64794a
BP
588struct dpif_flow_dump_thread *dpif_flow_dump_thread_create(
589 struct dpif_flow_dump *);
590void dpif_flow_dump_thread_destroy(struct dpif_flow_dump_thread *);
591
1c1e46ed
AW
592#define PMD_ID_NULL OVS_CORE_UNSPEC
593
ac64794a
BP
594/* A datapath flow as dumped by dpif_flow_dump_next(); dpif_flow_get() also takes a 'struct dpif_flow *'. */
595struct dpif_flow {
596 const struct nlattr *key; /* Flow key, as OVS_KEY_ATTR_* attrs. */
597 size_t key_len; /* 'key' length in bytes. */
598 const struct nlattr *mask; /* Flow mask, as OVS_KEY_ATTR_* attrs. */
599 size_t mask_len; /* 'mask' length in bytes. */
600 const struct nlattr *actions; /* Actions, as OVS_ACTION_ATTR_* attrs. */
601 size_t actions_len; /* 'actions' length in bytes. */
7af12bd7 602 ovs_u128 ufid; /* Unique flow identifier. */
70e5ed6f 603 bool ufid_present; /* True if 'ufid' was provided by datapath. */
bd5131ba 604 unsigned pmd_id; /* Datapath poll mode driver id. */
ac64794a 605 struct dpif_flow_stats stats; /* Flow statistics. */
d63ca532 606 struct dpif_flow_attrs attrs; /* Flow attributes. */
704a1e09 607};
ac64794a
BP
608int dpif_flow_dump_next(struct dpif_flow_dump_thread *,
609 struct dpif_flow *flows, int max_flows);
6fe09f8c
JS
610
611#define DPIF_FLOW_BUFSIZE 2048
6bc60024 612\f
6bc60024
BP
613/* Operation batching interface.
614 *
615 * Some datapaths are faster at performing N operations together than the same
616 * N operations individually, hence an interface for batching.
617 */
618
619enum dpif_op_type { /* Kinds of operation in the batching interface. */
620 DPIF_OP_FLOW_PUT = 1, /* Add or modify a flow (see struct dpif_flow_put). */
b99d3cee
BP
621 DPIF_OP_FLOW_DEL, /* Delete a flow (see struct dpif_flow_del). */
622 DPIF_OP_EXECUTE, /* Execute actions on a specified packet. */
6fe09f8c 623 DPIF_OP_FLOW_GET, /* NOTE(review): presumably fetch a flow, like dpif_flow_get() -- confirm. */
6bc60024
BP
624};
625
57924fc9
SB
626/* Values of the 'offload_type' argument to the (*operate) interface. */
627enum dpif_offload_type {
628 DPIF_OFFLOAD_AUTO, /* Offload if possible, fallback to software. */
629 DPIF_OFFLOAD_NEVER, /* Never offload to hardware. */
630 DPIF_OFFLOAD_ALWAYS, /* Always offload to hardware. */
631};
632
1a0c894a
BP
633/* Add or modify a flow.
634 *
635 * The flow is specified by the Netlink attributes with types OVS_KEY_ATTR_* in
636 * the 'key_len' bytes starting at 'key'. The associated actions are specified
637 * by the Netlink attributes with types OVS_ACTION_ATTR_* in the 'actions_len'
638 * bytes starting at 'actions'.
639 *
640 * - If the flow's key does not exist in the dpif, then the flow will be
641 * added if 'flags' includes DPIF_FP_CREATE. Otherwise the operation will
642 * fail with ENOENT.
643 *
644 * If the operation succeeds, then 'stats', if nonnull, will be zeroed.
645 *
646 * - If the flow's key does exist in the dpif, then the flow's actions will
647 * be updated if 'flags' includes DPIF_FP_MODIFY. Otherwise the operation
648 * will fail with EEXIST. If the flow's actions are updated, then its
649 * statistics will be zeroed if 'flags' includes DPIF_FP_ZERO_STATS, and
650 * left as-is otherwise.
651 *
652 * If the operation succeeds, then 'stats', if nonnull, will be set to the
653 * flow's statistics before the update.
1c1e46ed
AW
654 *
655 * - If the datapath implements multiple pmd threads, each with its own
656 * flow table, 'pmd_id' should be used to specify the particular polling
f5d317a1
DDP
657 * thread for the operation. PMD_ID_NULL means that the flow should
658 * be put on all the polling threads.
1a0c894a 659 */
6bc60024 660struct dpif_flow_put {
6bc60024
BP
661 /* Input. */
662 enum dpif_flow_put_flags flags; /* DPIF_FP_*. */
663 const struct nlattr *key; /* Flow to put. */
664 size_t key_len; /* Length of 'key' in bytes. */
e6cc0bab
AZ
665 const struct nlattr *mask; /* Mask to put. */
666 size_t mask_len; /* Length of 'mask' in bytes. */
6bc60024
BP
667 const struct nlattr *actions; /* Actions to perform on flow. */
668 size_t actions_len; /* Length of 'actions' in bytes. */
70e5ed6f 669 const ovs_u128 *ufid; /* Optional unique flow identifier. */
bd5131ba 670 unsigned pmd_id; /* Datapath poll mode driver id, or PMD_ID_NULL for all polling threads. */
6bc60024
BP
671
672 /* Output. */
673 struct dpif_flow_stats *stats; /* Optional flow statistics. */
6bc60024
BP
674};
675
1a0c894a
BP
676/* Delete a flow.
677 *
678 * The flow is specified by the Netlink attributes with types OVS_KEY_ATTR_* in
70e5ed6f
JS
679 * the 'key_len' bytes starting at 'key', or the unique identifier 'ufid'. If
680 * the flow was created using 'ufid', then 'ufid' must be specified to delete
681 * the flow. If both are specified, 'key' will be ignored for flow deletion.
682 * Succeeds with status 0 if the flow is deleted, or fails with ENOENT if the
683 * dpif does not contain such a flow.
684 *
685 * Callers should always provide the 'key' to improve dpif logging in the event
686 * of errors or unexpected behaviour.
1a0c894a 687 *
1c1e46ed
AW
688 * If the datapath has multiple polling threads, each with its own flow table,
689 * 'pmd_id' should be used to specify the particular polling thread for the
f5d317a1
DDP
690 * operation. PMD_ID_NULL means that the flow should be deleted from all the
691 * polling threads.
1c1e46ed 692 *
1a0c894a
BP
693 * If the operation succeeds, then 'stats', if nonnull, will be set to the
694 * flow's statistics before its deletion. */
b99d3cee
BP
695struct dpif_flow_del {
696 /* Input. */
697 const struct nlattr *key; /* Flow to delete. */
698 size_t key_len; /* Length of 'key' in bytes. */
64bb477f 699 const ovs_u128 *ufid; /* Unique identifier of flow to delete. */
8e1ffd75
JS
700 bool terse; /* OK to skip sending/receiving full flow
701 * info? */
bd5131ba 702 unsigned pmd_id; /* PMD thread id, or PMD_ID_NULL for all. */
b99d3cee
BP
703
704 /* Output. */
705 struct dpif_flow_stats *stats; /* If nonnull, receives the flow's
                                  * statistics before its deletion. */
706};
707
1a0c894a
BP
708/* Executes actions on a specified packet.
709 *
710 * Performs the 'actions_len' bytes of actions in 'actions' on the Ethernet
711 * frame in 'packet' and on the packet metadata in 'md'. May modify both
712 * 'packet' and 'md'.
713 *
714 * Some dpif providers do not implement every action. The Linux kernel
715 * datapath, in particular, does not implement ARP field modification. If
716 * 'needs_help' is true, the dpif layer executes in userspace all of the
717 * actions that it can, and for OVS_ACTION_ATTR_OUTPUT and
718 * OVS_ACTION_ATTR_USERSPACE actions it passes the packet through to the dpif
719 * implementation.
720 *
721 * This works even if 'actions_len' is too long for a Netlink attribute. */
6bc60024 722struct dpif_execute {
1a0c894a 723 /* Input. */
6bc60024
BP
724 const struct nlattr *actions; /* Actions to execute on packet. */
725 size_t actions_len; /* Length of 'actions' in bytes. */
1a0c894a 726 bool needs_help; /* If true, the dpif layer executes in userspace
                    * all of the actions that it can. */
43f9ac0a 727 bool probe; /* Suppress error messages. */
27130224
AZ
728 unsigned int mtu; /* Maximum transmission unit to fragment.
729 * 0 if not a fragmented packet. */
0442bfb1 730 uint64_t hash; /* NOTE(review): presumably the packet hash, going
                  * by the name -- confirm against callers. */
1cceb31b 731 const struct flow *flow; /* Flow extracted from 'packet'. */
1a0c894a
BP
732
733 /* Input, but possibly modified as a side effect of execution. */
cf62fa4c 734 struct dp_packet *packet; /* Packet to execute. */
6bc60024
BP
735};
736
6fe09f8c
JS
737/* Queries the dpif for a flow entry.
738 *
739 * The flow is specified by the Netlink attributes with types OVS_KEY_ATTR_* in
70e5ed6f
JS
740 * the 'key_len' bytes starting at 'key', or the unique identifier 'ufid'. If
741 * the flow was created using 'ufid', then 'ufid' must be specified to fetch
742 * the flow. If both are specified, 'key' will be ignored for the flow query.
743 * 'buffer' must point to an initialized buffer, with a recommended size of
744 * DPIF_FLOW_BUFSIZE bytes.
6fe09f8c 745 *
d7b55c5c
IM
746 * On success, 'flow' will be populated with the mask, actions, stats and attrs
747 * for the datapath flow corresponding to 'key'. The mask and actions may point
6fe09f8c
JS
748 * within '*buffer', or may point at RCU-protected data. Therefore, callers
749 * that wish to hold these over quiescent periods must make a copy of these
342b8904 750 * fields before quiescing.
6fe09f8c 751 *
70e5ed6f
JS
752 * Callers should always provide 'key' to improve dpif logging in the event of
753 * errors or unexpected behaviour.
754 *
1c1e46ed
AW
755 * If the datapath has multiple polling threads, each with its own flow table,
756 * 'pmd_id' should be used to specify the particular polling thread for the
f5d317a1
DDP
757 * operation. PMD_ID_NULL means that the datapath will return the first
758 * matching flow from any poll thread.
1c1e46ed 759 *
6fe09f8c
JS
760 * Succeeds with status 0 if the flow is fetched, or fails with ENOENT if no
761 * such flow exists. Other failures are indicated with a positive errno value.
762 */
763struct dpif_flow_get {
764 /* Input. */
765 const struct nlattr *key; /* Flow to get. */
766 size_t key_len; /* Length of 'key' in bytes. */
64bb477f 767 const ovs_u128 *ufid; /* Unique identifier of flow to get. */
bd5131ba 768 unsigned pmd_id; /* PMD thread id, or PMD_ID_NULL for any. */
6fe09f8c
JS
769 struct ofpbuf *buffer; /* Initialized storage for output parameters. */
770
771 /* Output. */
772 struct dpif_flow *flow; /* Resulting flow from datapath. */
773};
774
758c456d
JR
775int dpif_execute(struct dpif *, struct dpif_execute *);
776
c2b565b5 777struct dpif_op {
6bc60024 778 enum dpif_op_type type; /* Kind of operation. NOTE(review): presumably
                           * selects the valid union member -- confirm. */
c2b565b5
BP
779 int error; /* NOTE(review): presumably the per-operation result filled
              * in by dpif_operate() -- confirm. */
780 union { /* Arguments, one member per kind of operation. */
781 struct dpif_flow_put flow_put;
b99d3cee 782 struct dpif_flow_del flow_del;
c2b565b5 783 struct dpif_execute execute;
6fe09f8c 784 struct dpif_flow_get flow_get;
fa37affa 785 };
6bc60024
BP
786};
787
57924fc9
SB
788void dpif_operate(struct dpif *, struct dpif_op **ops, size_t n_ops,
789 enum dpif_offload_type);
e6530a8d 790\f
6bc60024 791/* Upcalls. */
064af421 792
82272ede
BP
793enum dpif_upcall_type {
794 DPIF_UC_MISS, /* Miss in flow table. */
df2c07f4 795 DPIF_UC_ACTION, /* OVS_ACTION_ATTR_USERSPACE action. */
982b8810 796 DPIF_N_UC_TYPES /* Count of the upcall types above; keep last. */
82272ede
BP
797};
798
01545c1a
BP
799const char *dpif_upcall_type_to_string(enum dpif_upcall_type);
800
856081f6
BP
801/* A packet passed up from the datapath to userspace.
802 *
da546e07
JR
803 * The 'packet', 'key' and 'userdata' may point into data in a buffer
804 * provided by the caller, so the buffer should be released only after the
805 * upcall processing has been finished.
806 *
807 * While being processed, the 'packet' may be reallocated, so the packet must
808 * be separately released with dp_packet_uninit().
856081f6
BP
809 */
810struct dpif_upcall {
856081f6 811 /* All types. */
165f5e46 812 struct dp_packet packet; /* Packet data. 'dp_packet' should be the first
7dc5969e
JP
813 member to avoid a hole. This is because
814 'rte_mbuf' in dp_packet is aligned at least
815 on a 64-byte boundary. */
82272ede 816 enum dpif_upcall_type type; /* Kind of upcall (DPIF_UC_*). */
856081f6
BP
817 struct nlattr *key; /* Flow key. */
818 size_t key_len; /* Length of 'key' in bytes. */
7af12bd7 819 ovs_u128 ufid; /* Unique flow identifier for 'key'. */
27130224 820 struct nlattr *mru; /* Maximum receive unit. */
0442bfb1 821 struct nlattr *hash; /* Packet hash. */
aaca4fe0 822 struct nlattr *cutlen; /* Number of bytes to shrink from the end. */
856081f6 823
82272ede 824 /* DPIF_UC_ACTION only. */
e995e3df 825 struct nlattr *userdata; /* Argument to OVS_ACTION_ATTR_USERSPACE. */
8b7ea2d4 826 struct nlattr *out_tun_key; /* Output tunnel key. */
7321bda3 827 struct nlattr *actions; /* Argument to OVS_ACTION_ATTR_USERSPACE. */
856081f6 828};
9dbb9d5e 829
e4e74c3a
AW
830/* A callback to notify higher layer of dpif about to be purged, so that
831 * higher layer could try reacting to this (e.g. grabbing all flow stats
832 * before they are gone). This function is currently implemented only by
833 * dpif-netdev.
834 *
835 * The caller needs to provide the 'aux' pointer passed down by higher
836 * layer from the dpif_register_dp_purge_cb() function and the 'pmd_id' of
837 * the polling thread.
838 */
839 typedef void dp_purge_callback(void *aux, unsigned pmd_id);
840
841void dpif_register_dp_purge_cb(struct dpif *, dp_purge_callback *, void *aux);
842
623540e4
EJ
843/* A callback to process an upcall, currently implemented only by dpif-netdev.
844 *
7af12bd7 845 * The caller provides the 'packet' and 'flow' to process, the corresponding
7a5e0ee7 846 * 'ufid' as generated by odp_flow_key_hash(), the polling thread id 'pmd_id',
1c1e46ed
AW
847 * the 'type' of the upcall, and if 'type' is DPIF_UC_ACTION then the
848 * 'userdata' attached to the action.
623540e4
EJ
849 *
850 * The callback must fill in 'actions' with the datapath actions to apply to
851 * 'packet'. 'wc' and 'put_actions' will either be both null or both nonnull.
852 * If they are nonnull, then the caller will install a flow entry to process
853 * all future packets that match 'flow' and 'wc'; the callback must store a
854 * wildcard mask suitable for that purpose into 'wc'. If the actions to store
855 * into the flow entry are the same as 'actions', then the callback may leave
856 * 'put_actions' empty; otherwise it must store the desired actions into
857 * 'put_actions'.
858 *
859 * Returns 0 if successful, ENOSPC if the flow limit has been reached and no
860 * flow should be installed, or otherwise a positive errno value. */
cf62fa4c 861typedef int upcall_callback(const struct dp_packet *packet,
623540e4 862 const struct flow *flow,
7af12bd7 863 ovs_u128 *ufid,
bd5131ba 864 unsigned pmd_id,
623540e4
EJ
865 enum dpif_upcall_type type,
866 const struct nlattr *userdata,
867 struct ofpbuf *actions,
868 struct flow_wildcards *wc,
869 struct ofpbuf *put_actions,
870 void *aux);
871
872void dpif_register_upcall_cb(struct dpif *, upcall_callback *, void *aux);
6b31e073 873
a12b3ead 874int dpif_recv_set(struct dpif *, bool enable);
1954e6bb 875int dpif_handlers_set(struct dpif *, uint32_t n_handlers);
d4f6865c 876int dpif_set_config(struct dpif *, const struct smap *cfg);
91364d18 877int dpif_port_set_config(struct dpif *, odp_port_t, const struct smap *cfg);
1954e6bb
AW
878int dpif_recv(struct dpif *, uint32_t handler_id, struct dpif_upcall *,
879 struct ofpbuf *);
1ba530f4 880void dpif_recv_purge(struct dpif *);
1954e6bb 881void dpif_recv_wait(struct dpif *, uint32_t handler_id);
6b31e073
RW
882void dpif_enable_upcall(struct dpif *);
883void dpif_disable_upcall(struct dpif *);
884
885void dpif_print_packet(struct dpif *, struct dpif_upcall *);
6bc60024 886\f
5dddf960
JR
887/* Meters. */
888void dpif_meter_get_features(const struct dpif *,
889 struct ofputil_meter_features *);
8101f03f 890int dpif_meter_set(struct dpif *, ofproto_meter_id meter_id,
5dddf960
JR
891 struct ofputil_meter_config *);
892int dpif_meter_get(const struct dpif *, ofproto_meter_id meter_id,
893 struct ofputil_meter_stats *, uint16_t n_bands);
894int dpif_meter_del(struct dpif *, ofproto_meter_id meter_id,
895 struct ofputil_meter_stats *, uint16_t n_bands);
9df65060
VDA
896
897/* Bonding. */
898
899/* Bit-mask for hashing a flow down to a bucket. */
900#define BOND_MASK 0xff
901#define BOND_BUCKETS (BOND_MASK + 1)
902
91fc374a 903int dpif_bond_add(struct dpif *, uint32_t bond_id, odp_port_t *member_map);
9df65060
VDA
904int dpif_bond_del(struct dpif *, uint32_t bond_id);
905int dpif_bond_stats_get(struct dpif *, uint32_t bond_id, uint64_t *n_bytes);
906bool dpif_supports_lb_output_action(const struct dpif *);
907
5dddf960 908\f
6bc60024 909/* Miscellaneous. */
064af421 910
53a4218d
BP
911void dpif_get_netflow_ids(const struct dpif *,
912 uint8_t *engine_type, uint8_t *engine_id);
064af421 913
aae51f53
BP
914int dpif_queue_to_priority(const struct dpif *, uint32_t queue_id,
915 uint32_t *priority);
916
f5d317a1
DDP
917int dpif_get_pmds_for_port(const struct dpif * dpif, odp_port_t port_no,
918 unsigned int **pmds, size_t *n);
919
b5cbbcf6 920char *dpif_get_dp_version(const struct dpif *);
a36de779 921bool dpif_supports_tnl_push_pop(const struct dpif *);
a13a0209 922bool dpif_supports_explicit_drop_action(const struct dpif *);
eff1e5b0
RD
923
924/* Log functions. */
925struct vlog_module;
926
927void log_flow_message(const struct dpif *dpif, int error,
928 const struct vlog_module *module,
929 const char *operation,
930 const struct nlattr *key, size_t key_len,
931 const struct nlattr *mask, size_t mask_len,
932 const ovs_u128 *ufid,
933 const struct dpif_flow_stats *stats,
934 const struct nlattr *actions, size_t actions_len);
935void log_flow_put_message(const struct dpif *,
936 const struct vlog_module *,
937 const struct dpif_flow_put *,
938 int error);
939void log_flow_del_message(const struct dpif *,
940 const struct vlog_module *,
941 const struct dpif_flow_del *,
942 int error);
943void log_execute_message(const struct dpif *,
944 const struct vlog_module *,
945 const struct dpif_execute *,
946 bool subexecute, int error);
947void log_flow_get_message(const struct dpif *,
948 const struct vlog_module *,
949 const struct dpif_flow_get *,
950 int error);
03292c46
JG
951#ifdef __cplusplus
952}
953#endif
954
064af421 955#endif /* dpif.h */