]>
Commit | Line | Data |
---|---|---|
064af421 | 1 | /* |
1954e6bb | 2 | * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc. |
064af421 | 3 | * |
a14bc59f BP |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. | |
6 | * You may obtain a copy of the License at: | |
064af421 | 7 | * |
a14bc59f BP |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
064af421 BP |
15 | */ |
16 | ||
ffcb9f6e BP |
17 | /* |
18 | * dpif, the DataPath InterFace. | |
19 | * | |
20 | * In Open vSwitch terminology, a "datapath" is a flow-based software switch. | |
21 | * A datapath has no intelligence of its own. Rather, it relies entirely on | |
22 | * its client to set up flows. The datapath layer is core to the Open vSwitch | |
23 | * software switch: one could say, without much exaggeration, that everything | |
24 | * in ovs-vswitchd above dpif exists only to make the correct decisions | |
25 | * interacting with dpif. | |
26 | * | |
27 | * Typically, the client of a datapath is the software switch module in | |
28 | * "ovs-vswitchd", but other clients can be written. The "ovs-dpctl" utility | |
29 | * is also a (simple) client. | |
30 | * | |
31 | * | |
32 | * Overview | |
33 | * ======== | |
34 | * | |
35 | * The terms written in quotes below are defined in later sections. | |
36 | * | |
37 | * When a datapath "port" receives a packet, it extracts the headers (the | |
da546e07 JR |
38 | * "flow"). If the datapath's "flow table" contains a "flow entry" matching |
39 | * the packet, then it executes the "actions" in the flow entry and increments | |
40 | * the flow's statistics. If there is no matching flow entry, the datapath | |
41 | * instead appends the packet to an "upcall" queue. | |
ffcb9f6e BP |
42 | * |
43 | * | |
44 | * Ports | |
45 | * ===== | |
46 | * | |
47 | * A datapath has a set of ports that are analogous to the ports on an Ethernet | |
48 | * switch. At the datapath level, each port has the following information | |
49 | * associated with it: | |
50 | * | |
51 | * - A name, a short string that must be unique within the host. This is | |
52 | * typically a name that would be familiar to the system administrator, | |
53 | * e.g. "eth0" or "vif1.1", but it is otherwise arbitrary. | |
54 | * | |
55 | * - A 32-bit port number that must be unique within the datapath but is | |
56 | * otherwise arbitrary. The port number is the most important identifier | |
57 | * for a port in the datapath interface. | |
58 | * | |
59 | * - A type, a short string that identifies the kind of port. On a Linux | |
60 | * host, typical types are "system" (for a network device such as eth0), | |
61 | * "internal" (for a simulated port used to connect to the TCP/IP stack), | |
62 | * and "gre" (for a GRE tunnel). | |
63 | * | |
1954e6bb AW |
64 | * - A Netlink PID for each upcall reading thread (see "Upcall Queuing and |
65 | * Ordering" below). | |
ffcb9f6e BP |
66 | * |
67 | * The dpif interface has functions for adding and deleting ports. When a | |
68 | * datapath implements these (e.g. as the Linux and netdev datapaths do), then | |
69 | * Open vSwitch's ovs-vswitchd daemon can directly control what ports are used | |
70 | * for switching. Some datapaths might not implement them, or implement them | |
71 | * with restrictions on the types of ports that can be added or removed | |
72 | * (e.g. on ESX), on systems where port membership can only be changed by some | |
73 | * external entity. | |
74 | * | |
75 | * Each datapath must have a port, sometimes called the "local port", whose | |
76 | * name is the same as the datapath itself, with port number 0. The local port | |
77 | * cannot be deleted. | |
78 | * | |
79 | * Ports are available as "struct netdev"s. To obtain a "struct netdev *" for | |
80 | * a port named 'name' with type 'port_type', in a datapath of type | |
81 | * 'datapath_type', call netdev_open(name, dpif_port_open_type(datapath_type, | |
82 | * port_type)). The netdev can be used to get and set important data related to | |
83 | * the port, such as: | |
84 | * | |
85 | * - MTU (netdev_get_mtu(), netdev_set_mtu()). | |
86 | * | |
87 | * - Ethernet address (netdev_get_etheraddr(), netdev_set_etheraddr()). | |
88 | * | |
89 | * - Statistics such as the number of packets and bytes transmitted and | |
90 | * received (netdev_get_stats()). | |
91 | * | |
92 | * - Carrier status (netdev_get_carrier()). | |
93 | * | |
94 | * - Speed (netdev_get_features()). | |
95 | * | |
96 | * - QoS queue configuration (netdev_get_queue(), netdev_set_queue() and | |
97 | * related functions). | |
98 | * | |
99 | * - Arbitrary port-specific configuration parameters (netdev_get_config(), | |
100 | * netdev_set_config()). An example of such a parameter is the IP | |
101 | * endpoint for a GRE tunnel. | |
102 | * | |
103 | * | |
104 | * Flow Table | |
105 | * ========== | |
106 | * | |
ee75c546 | 107 | * The flow table is a collection of "flow entries". Each flow entry contains: |
ffcb9f6e BP |
108 | * |
109 | * - A "flow", that is, a summary of the headers in an Ethernet packet. The | |
ee75c546 BP |
110 | * flow must be unique within the flow table. Flows are fine-grained |
111 | * entities that include L2, L3, and L4 headers. A single TCP connection | |
112 | * consists of two flows, one in each direction. | |
ffcb9f6e BP |
113 | * |
114 | * In Open vSwitch userspace, "struct flow" is the typical way to describe | |
115 | * a flow, but the datapath interface uses a different data format to | |
116 | * allow ABI forward- and backward-compatibility. datapath/README | |
117 | * describes the rationale and design. Refer to OVS_KEY_ATTR_* and | |
837eefc7 | 118 | * "struct ovs_key_*" in include/odp-netlink.h for details. |
ffcb9f6e BP |
119 | * lib/odp-util.h defines several functions for working with these flows. |
120 | * | |
ee75c546 BP |
121 | * - A "mask" that, for each bit in the flow, specifies whether the datapath |
122 | * should consider the corresponding flow bit when deciding whether a | |
123 | * given packet matches the flow entry. The original datapath design did | |
124 | * not support wildcards: every flow entry was exact match. With the | |
125 | * addition of a mask, the interface supports datapaths with a spectrum of | |
126 | * wildcard matching capabilities, from those that only support exact | |
127 | * matches to those that support bitwise wildcarding on the entire flow | |
128 | * key, as well as datapaths with capabilities somewhere in between. | |
129 | * | |
130 | * Datapaths do not provide a way to query their wildcarding capabilities, | |
131 | * nor is it expected that the client should attempt to probe for the | |
132 | * details of their support. Instead, a client installs flows with masks | |
133 | * that wildcard as many bits as acceptable. The datapath then actually | |
134 | * wildcards as many of those bits as it can and changes the wildcard bits | |
135 | * that it does not support into exact match bits. A datapath that can | |
136 | * wildcard any bit, for example, would install the supplied mask, an | |
137 | * exact-match only datapath would install an exact-match mask regardless | |
138 | * of what mask the client supplied, and a datapath in the middle of the | |
139 | * spectrum would selectively change some wildcard bits into exact match | |
140 | * bits. | |
141 | * | |
142 | * Regardless of the requested or installed mask, the datapath retains the | |
143 | * original flow supplied by the client. (It does not, for example, "zero | |
144 | * out" the wildcarded bits.) This allows the client to unambiguously | |
145 | * identify the flow entry in later flow table operations. | |
146 | * | |
147 | * The flow table does not have priorities; that is, all flow entries have | |
148 | * equal priority. Detecting overlapping flow entries is expensive in | |
149 | * general, so the datapath is not required to do it. It is primarily the | |
150 | * client's responsibility not to install flow entries whose flow and mask | |
151 | * combinations overlap. | |
ffcb9f6e BP |
152 | * |
153 | * - A list of "actions" that tell the datapath what to do with packets | |
154 | * within a flow. Some examples of actions are OVS_ACTION_ATTR_OUTPUT, | |
155 | * which transmits the packet out a port, and OVS_ACTION_ATTR_SET, which | |
156 | * modifies packet headers. Refer to OVS_ACTION_ATTR_* and "struct | |
837eefc7 BP |
157 | * ovs_action_*" in include/odp-netlink.h for details. lib/odp-util.h |
158 | * defines several functions for working with datapath actions. | |
ffcb9f6e BP |
159 | * |
160 | * The actions list may be empty. This indicates that nothing should be | |
161 | * done to matching packets, that is, they should be dropped. | |
162 | * | |
163 | * (In case you are familiar with OpenFlow, datapath actions are analogous | |
164 | * to OpenFlow actions.) | |
165 | * | |
166 | * - Statistics: the number of packets and bytes that the flow has | |
167 | * processed, the last time that the flow processed a packet, and the | |
168 | * union of all the TCP flags in packets processed by the flow. (The | |
169 | * latter is 0 if the flow is not a TCP flow.) | |
170 | * | |
171 | * The datapath's client manages the flow table, primarily in reaction to | |
172 | * "upcalls" (see below). | |
173 | * | |
174 | * | |
175 | * Upcalls | |
176 | * ======= | |
177 | * | |
178 | * A datapath sometimes needs to notify its client that a packet was received. | |
179 | * The datapath mechanism to do this is called an "upcall". | |
180 | * | |
181 | * Upcalls are used in two situations: | |
182 | * | |
183 | * - When a packet is received, but there is no matching flow entry in its | |
184 | * flow table (a flow table "miss"), this causes an upcall of type | |
185 | * DPIF_UC_MISS. These are called "miss" upcalls. | |
186 | * | |
187 | * - A datapath action of type OVS_ACTION_ATTR_USERSPACE causes an upcall of | |
188 | * type DPIF_UC_ACTION. These are called "action" upcalls. | |
189 | * | |
190 | * An upcall contains an entire packet. There is no attempt to, e.g., copy | |
191 | * only as much of the packet as normally needed to make a forwarding decision. | |
192 | * Such an optimization is doable, but experimental prototypes showed it to be | |
193 | * of little benefit because an upcall typically contains the first packet of a | |
194 | * flow, which is usually short (e.g. a TCP SYN). Also, the entire packet can | |
195 | * sometimes really be needed. | |
196 | * | |
197 | * After a client reads a given upcall, the datapath is finished with it, that | |
198 | * is, the datapath doesn't maintain any lingering state past that point. | |
199 | * | |
200 | * The latency from the time that a packet arrives at a port to the time that | |
201 | * it is received from dpif_recv() is critical in some benchmarks. For | |
202 | * example, if this latency is 1 ms, then a netperf TCP_CRR test, which opens | |
203 | * and closes TCP connections one at a time as quickly as it can, cannot | |
204 | * possibly achieve more than 500 transactions per second, since every | |
205 | * connection consists of two flows with 1-ms latency to set up each one. | |
206 | * | |
207 | * To receive upcalls, a client has to enable them with dpif_recv_set(). A | |
1954e6bb AW |
208 | * datapath should generally support being opened multiple times (e.g. so that |
209 | * one may run "ovs-dpctl show" or "ovs-dpctl dump-flows" while "ovs-vswitchd" | |
210 | * is also running) but need not support more than one of these clients | |
211 | * enabling upcalls at once. | |
ffcb9f6e BP |
212 | * |
213 | * | |
214 | * Upcall Queuing and Ordering | |
215 | * --------------------------- | |
216 | * | |
217 | * The datapath's client reads upcalls one at a time by calling dpif_recv(). | |
218 | * When more than one upcall is pending, the order in which the datapath | |
219 | * presents upcalls to its client is important. The datapath's client does not | |
220 | * directly control this order, so the datapath implementer must take care | |
221 | * during design. | |
222 | * | |
223 | * The minimal behavior, suitable for initial testing of a datapath | |
224 | * implementation, is that all upcalls are appended to a single queue, which is | |
225 | * delivered to the client in order. | |
226 | * | |
227 | * The datapath should ensure that a high rate of upcalls from one particular | |
228 | * port cannot cause upcalls from other sources to be dropped or unreasonably | |
229 | * delayed. Otherwise, one port conducting a port scan or otherwise initiating | |
230 | * high-rate traffic spanning many flows could suppress other traffic. | |
231 | * Ideally, the datapath should present upcalls from each port in a "round | |
232 | * robin" manner, to ensure fairness. | |
233 | * | |
234 | * The client has no control over "miss" upcalls and no insight into the | |
235 | * datapath's implementation, so the datapath is entirely responsible for | |
236 | * queuing and delivering them. On the other hand, the datapath has | |
237 | * considerable freedom of implementation. One good approach is to maintain a | |
238 | * separate queue for each port, to prevent any given port's upcalls from | |
239 | * interfering with other ports' upcalls. If this is impractical, then another | |
240 | * reasonable choice is to maintain some fixed number of queues and assign each | |
241 | * port to one of them. Ports assigned to the same queue can then interfere | |
242 | * with each other, but not with ports assigned to different queues. Other | |
243 | * approaches are also possible. | |
244 | * | |
245 | * The client has some control over "action" upcalls: it can specify a 32-bit | |
246 | * "Netlink PID" as part of the action. This terminology comes from the Linux | |
247 | * datapath implementation, which uses a protocol called Netlink in which a PID | |
248 | * designates a particular socket and the upcall data is delivered to the | |
249 | * socket's receive queue. Generically, though, a Netlink PID identifies a | |
250 | * queue for upcalls. The basic requirements on the datapath are: | |
251 | * | |
252 | * - The datapath must provide a Netlink PID associated with each port. The | |
253 | * client can retrieve the PID with dpif_port_get_pid(). | |
254 | * | |
255 | * - The datapath must provide a "special" Netlink PID not associated with | |
256 | * any port. dpif_port_get_pid() also provides this PID. (ovs-vswitchd | |
257 | * uses this PID to queue special packets that must not be lost even if a | |
258 | * port is otherwise busy, such as packets used for tunnel monitoring.) | |
259 | * | |
260 | * The minimal behavior of dpif_port_get_pid() and the treatment of the Netlink | |
261 | * PID in "action" upcalls is that dpif_port_get_pid() returns a constant value | |
262 | * and all upcalls are appended to a single queue. | |
263 | * | |
1954e6bb | 264 | * The preferred behavior is: |
ffcb9f6e BP |
265 | * |
266 | * - Each port has a PID that identifies the queue used for "miss" upcalls | |
267 | * on that port. (Thus, if each port has its own queue for "miss" | |
268 | * upcalls, then each port has a different Netlink PID.) | |
269 | * | |
270 | * - "miss" upcalls for a given port and "action" upcalls that specify that | |
271 | * port's Netlink PID add their upcalls to the same queue. The upcalls | |
272 | * are delivered to the datapath's client in the order that the packets | |
273 | * were received, regardless of whether the upcalls are "miss" or "action" | |
274 | * upcalls. | |
275 | * | |
276 | * - Upcalls that specify the "special" Netlink PID are queued separately. | |
277 | * | |
1954e6bb AW |
278 | * Multiple threads may want to read upcalls simultaneously from a single |
279 | * datapath. To support multiple threads well, one extends the above preferred | |
280 | * behavior: | |
281 | * | |
282 | * - Each port has multiple PIDs. The datapath distributes "miss" upcalls | |
283 | * across the PIDs, ensuring that a given flow is mapped in a stable way | |
284 | * to a single PID. | |
285 | * | |
286 | * - For "action" upcalls, the thread can specify its own Netlink PID or | |
287 | * another thread's Netlink PID for the same port for offloading purposes | |
288 | * (e.g. in a "round robin" manner). | |
289 | * | |
ffcb9f6e BP |
290 | * |
291 | * Packet Format | |
292 | * ============= | |
293 | * | |
294 | * The datapath interface works with packets in a particular form. This is the | |
295 | * form taken by packets received via upcalls (i.e. by dpif_recv()). Packets | |
296 | * supplied to the datapath for processing (i.e. to dpif_execute()) also take | |
297 | * this form. | |
298 | * | |
299 | * A VLAN tag is represented by an 802.1Q header. If the layer below the | |
300 | * datapath interface uses another representation, then the datapath interface | |
301 | * must perform conversion. | |
302 | * | |
303 | * The datapath interface requires all packets to fit within the MTU. Some | |
304 | * operating systems internally process packets larger than MTU, with features | |
305 | * such as TSO and UFO. When such a packet passes through the datapath | |
306 | * interface, it must be broken into multiple MTU or smaller sized packets for | |
307 | * presentation as upcalls. (This does not happen often, because an upcall | |
308 | * typically contains the first packet of a flow, which is usually short.) | |
309 | * | |
310 | * Some operating system TCP/IP stacks maintain packets in an unchecksummed or | |
311 | * partially checksummed state until transmission. The datapath interface | |
312 | * requires all host-generated packets to be fully checksummed (e.g. IP and TCP | |
313 | * checksums must be correct). On such an OS, the datapath interface must fill | |
314 | * in these checksums. | |
315 | * | |
316 | * Packets passed through the datapath interface must be at least 14 bytes | |
317 | * long, that is, they must have a complete Ethernet header. They are not | |
318 | * required to be padded to the minimum Ethernet length. | |
319 | * | |
320 | * | |
321 | * Typical Usage | |
322 | * ============= | |
323 | * | |
324 | * Typically, the client of a datapath begins by configuring the datapath with | |
325 | * a set of ports. Afterward, the client runs in a loop polling for upcalls to | |
326 | * arrive. | |
327 | * | |
328 | * For each upcall received, the client examines the enclosed packet and | |
329 | * figures out what should be done with it. For example, if the client | |
330 | * implements a MAC-learning switch, then it searches the forwarding database | |
331 | * for the packet's destination MAC and VLAN and determines the set of ports to | |
332 | * which it should be sent. In any case, the client composes a set of datapath | |
333 | * actions to properly dispatch the packet and then directs the datapath to | |
334 | * execute those actions on the packet (e.g. with dpif_execute()). | |
335 | * | |
336 | * Most of the time, the actions that the client executed on the packet apply | |
337 | * to every packet with the same flow. For example, the flow includes both | |
338 | * destination MAC and VLAN ID (and much more), so this is true for the | |
339 | * MAC-learning switch example above. In such a case, the client can also | |
340 | * direct the datapath to treat any further packets in the flow in the same | |
341 | * way, using dpif_flow_put() to add a new flow entry. | |
342 | * | |
343 | * Other tasks the client might need to perform, in addition to reacting to | |
344 | * upcalls, include: | |
345 | * | |
346 | * - Periodically polling flow statistics, perhaps to supply to its own | |
347 | * clients. | |
348 | * | |
349 | * - Deleting flow entries from the datapath that haven't been used | |
350 | * recently, to save memory. | |
351 | * | |
352 | * - Updating flow entries whose actions should change. For example, if a | |
353 | * MAC learning switch learns that a MAC has moved, then it must update | |
354 | * the actions of flow entries that sent packets to the MAC at its old | |
355 | * location. | |
356 | * | |
357 | * - Adding and removing ports to achieve a new configuration. | |
5703b15f BP |
358 | * |
359 | * | |
360 | * Thread-safety | |
361 | * ============= | |
362 | * | |
363 | * Most of the dpif functions are fully thread-safe: they may be called from | |
364 | * any number of threads on the same or different dpif objects. The exceptions | |
365 | * are: | |
366 | * | |
367 | * - dpif_port_poll() and dpif_port_poll_wait() are conditionally | |
368 | * thread-safe: they may be called from different threads only on | |
369 | * different dpif objects. | |
370 | * | |
d2ad7ef1 JS |
371 | * - dpif_flow_dump_next() is conditionally thread-safe: It may be called |
372 | * from different threads that share one 'struct dpif_flow_dump', but each | |
373 | * thread must pass its own 'struct dpif_flow_dump_thread' and 'flows' array. | |
374 | * | |
375 | * - dpif_flow_dump_destroy() is conditionally thread-safe: All threads that | |
376 | * share the same 'struct dpif_flow_dump' must have finished using it. | |
377 | * This function must then be called exactly once for a particular | |
378 | * dpif_flow_dump to finish the corresponding flow dump operation. | |
379 | * | |
380 | * - Functions that operate on 'struct dpif_port_dump' are conditionally | |
381 | * thread-safe with respect to those objects. That is, one may dump ports | |
382 | * from any number of threads at once, but each thread must use its own | |
383 | * struct dpif_port_dump. | |
ffcb9f6e | 384 | */ |
064af421 BP |
385 | #ifndef DPIF_H |
386 | #define DPIF_H 1 | |
387 | ||
064af421 BP |
388 | #include <stdbool.h> |
389 | #include <stddef.h> | |
390 | #include <stdint.h> | |
758c456d | 391 | #include "netdev.h" |
da546e07 | 392 | #include "ofpbuf.h" |
9dbb9d5e | 393 | #include "openflow/openflow.h" |
758c456d | 394 | #include "packets.h" |
9dbb9d5e | 395 | #include "util.h" |
064af421 | 396 | |
03292c46 JG |
397 | #ifdef __cplusplus |
398 | extern "C" { | |
399 | #endif | |
400 | ||
c228a364 | 401 | struct dpif; |
623540e4 EJ |
402 | struct dpif_class; |
403 | struct dpif_flow; | |
c97fb132 | 404 | struct ds; |
572b7068 | 405 | struct flow; |
623540e4 | 406 | struct flow_wildcards; |
cdee00fd | 407 | struct nlattr; |
d0c23a1a | 408 | struct sset; |
064af421 | 409 | |
999401aa JG |
410 | int dp_register_provider(const struct dpif_class *); |
411 | int dp_unregister_provider(const char *type); | |
579a77e0 | 412 | void dp_blacklist_provider(const char *type); |
d0c23a1a | 413 | void dp_enumerate_types(struct sset *types); |
f79e673f | 414 | const char *dpif_normalize_type(const char *); |
999401aa | 415 | |
d0c23a1a | 416 | int dp_enumerate_names(const char *type, struct sset *names); |
1a6f1e2a | 417 | void dp_parse_name(const char *datapath_name, char **name, char **type); |
5792c5c6 | 418 | |
1a6f1e2a JG |
419 | int dpif_open(const char *name, const char *type, struct dpif **); |
420 | int dpif_create(const char *name, const char *type, struct dpif **); | |
421 | int dpif_create_and_open(const char *name, const char *type, struct dpif **); | |
064af421 BP |
422 | void dpif_close(struct dpif *); |
423 | ||
640e1b20 BP |
424 | void dpif_run(struct dpif *); |
425 | void dpif_wait(struct dpif *); | |
426 | ||
b29ba128 | 427 | const char *dpif_name(const struct dpif *); |
1a6f1e2a | 428 | const char *dpif_base_name(const struct dpif *); |
c7a26215 | 429 | const char *dpif_type(const struct dpif *); |
064af421 BP |
430 | |
431 | int dpif_delete(struct dpif *); | |
432 | ||
3b68500b | 433 | /* Statistics for a dpif as a whole. */ |
a8d9304d | 434 | struct dpif_dp_stats { |
a8d9304d BP |
435 | uint64_t n_hit; /* Number of flow table matches. */ |
436 | uint64_t n_missed; /* Number of flow table misses. */ | |
437 | uint64_t n_lost; /* Number of misses not sent to userspace. */ | |
438 | uint64_t n_flows; /* Number of flows present. */ | |
847108dc AZ |
439 | uint64_t n_mask_hit; /* Number of mega flow masks visited for |
440 | flow table matches. */ | |
1ce3fa06 | 441 | uint32_t n_masks; /* Number of mega flow masks. */ |
a8d9304d BP |
442 | }; |
443 | int dpif_get_dp_stats(const struct dpif *, struct dpif_dp_stats *); | |
444 | ||
6bc60024 BP |
445 | \f |
446 | /* Port operations. */ | |
064af421 | 447 | |
0aeaabc8 JP |
448 | const char *dpif_port_open_type(const char *datapath_type, |
449 | const char *port_type); | |
4e022ec0 AW |
450 | int dpif_port_add(struct dpif *, struct netdev *, odp_port_t *port_nop); |
451 | int dpif_port_del(struct dpif *, odp_port_t port_no); | |
4c738a8d BP |
452 | |
453 | /* A port within a datapath. | |
454 | * | |
455 | * 'name' and 'type' are suitable for passing to netdev_open(). */ | |
456 | struct dpif_port { | |
457 | char *name; /* Network device name, e.g. "eth0". */ | |
458 | char *type; /* Network device type, e.g. "system". */ | |
4e022ec0 | 459 | odp_port_t port_no; /* Port number within datapath. */ |
4c738a8d BP |
460 | }; |
461 | void dpif_port_clone(struct dpif_port *, const struct dpif_port *); | |
462 | void dpif_port_destroy(struct dpif_port *); | |
4afba28d | 463 | bool dpif_port_exists(const struct dpif *dpif, const char *devname); |
4e022ec0 | 464 | int dpif_port_query_by_number(const struct dpif *, odp_port_t port_no, |
4c738a8d | 465 | struct dpif_port *); |
064af421 | 466 | int dpif_port_query_by_name(const struct dpif *, const char *devname, |
4c738a8d | 467 | struct dpif_port *); |
4e022ec0 | 468 | int dpif_port_get_name(struct dpif *, odp_port_t port_no, |
335562c0 | 469 | char *name, size_t name_size); |
1954e6bb AW |
470 | uint32_t dpif_port_get_pid(const struct dpif *, odp_port_t port_no, |
471 | uint32_t hash); | |
b0ec0f27 BP |
472 | |
473 | struct dpif_port_dump { | |
474 | const struct dpif *dpif; | |
475 | int error; | |
476 | void *state; | |
477 | }; | |
478 | void dpif_port_dump_start(struct dpif_port_dump *, const struct dpif *); | |
4c738a8d | 479 | bool dpif_port_dump_next(struct dpif_port_dump *, struct dpif_port *); |
b0ec0f27 BP |
480 | int dpif_port_dump_done(struct dpif_port_dump *); |
481 | ||
4c738a8d | 482 | /* Iterates through each DPIF_PORT in DPIF, using DUMP as state. |
b0ec0f27 BP |
483 | * |
484 | * Arguments all have pointer type. | |
485 | * | |
486 | * If you break out of the loop, then you need to free the dump structure by | |
487 | * hand using dpif_port_dump_done(). */ | |
4c738a8d | 488 | #define DPIF_PORT_FOR_EACH(DPIF_PORT, DUMP, DPIF) \ |
b0ec0f27 | 489 | for (dpif_port_dump_start(DUMP, DPIF); \ |
4c738a8d | 490 | (dpif_port_dump_next(DUMP, DPIF_PORT) \ |
b0ec0f27 BP |
491 | ? true \ |
492 | : (dpif_port_dump_done(DUMP), false)); \ | |
493 | ) | |
064af421 | 494 | |
e9e28be3 BP |
495 | int dpif_port_poll(const struct dpif *, char **devnamep); |
496 | void dpif_port_poll_wait(const struct dpif *); | |
6bc60024 BP |
497 | \f |
498 | /* Flow table operations. */ | |
e9e28be3 | 499 | |
c97fb132 BP |
500 | struct dpif_flow_stats { |
501 | uint64_t n_packets; /* Packets processed by the flow. */ | |
502 | uint64_t n_bytes; /* Bytes processed by the flow. */ | |
503 | long long int used; /* Last time the flow processed a packet. */ | |
a66733a8 | 504 | uint16_t tcp_flags; /* Union of TCP flags seen; 0 if not TCP. */ |
c97fb132 BP |
505 | }; |
506 | ||
a39edbd4 | 507 | void dpif_flow_stats_extract(const struct flow *, const struct ofpbuf *packet, |
a7752d4a | 508 | long long int used, struct dpif_flow_stats *); |
c97fb132 BP |
509 | void dpif_flow_stats_format(const struct dpif_flow_stats *, struct ds *); |
510 | ||
ba25b8f4 BP |
511 | enum dpif_flow_put_flags { |
512 | DPIF_FP_CREATE = 1 << 0, /* Allow creating a new flow. */ | |
513 | DPIF_FP_MODIFY = 1 << 1, /* Allow modifying an existing flow. */ | |
514 | DPIF_FP_ZERO_STATS = 1 << 2 /* Zero the stats of an existing flow. */ | |
515 | }; | |
516 | ||
064af421 | 517 | int dpif_flow_flush(struct dpif *); |
ba25b8f4 | 518 | int dpif_flow_put(struct dpif *, enum dpif_flow_put_flags, |
feebdea2 | 519 | const struct nlattr *key, size_t key_len, |
e6cc0bab | 520 | const struct nlattr *mask, size_t mask_len, |
feebdea2 | 521 | const struct nlattr *actions, size_t actions_len, |
c97fb132 | 522 | struct dpif_flow_stats *); |
feebdea2 BP |
523 | int dpif_flow_del(struct dpif *, |
524 | const struct nlattr *key, size_t key_len, | |
c97fb132 | 525 | struct dpif_flow_stats *); |
6fe09f8c | 526 | int dpif_flow_get(struct dpif *, |
feebdea2 | 527 | const struct nlattr *key, size_t key_len, |
6fe09f8c | 528 | struct ofpbuf *, struct dpif_flow *); |
ac64794a BP |
529 | \f |
530 | /* Flow dumping interface | |
531 | * ====================== | |
532 | * | |
533 | * This interface allows iteration through all of the flows currently installed | |
534 | * in a datapath. It is somewhat complicated by two requirements: | |
535 | * | |
536 | * - Efficient support for dumping flows in parallel from multiple threads. | |
537 | * | |
538 | * - Allow callers to avoid making unnecessary copies of data returned by | |
539 | * the interface across several flows in cases where the dpif | |
540 | * implementation has to maintain a copy of that information anyhow. | |
541 | * (That is, allow the client visibility into any underlying batching as | |
542 | * part of its own batching.) | |
543 | * | |
544 | * | |
545 | * Usage | |
546 | * ----- | |
547 | * | |
548 | * 1. Call dpif_flow_dump_create(). | |
549 | * 2. In each thread that participates in the dump (which may be just a single | |
550 | * thread if parallelism isn't important): | |
551 | * (a) Call dpif_flow_dump_thread_create(). | |
552 | * (b) Call dpif_flow_dump_next() repeatedly until it returns 0. | |
553 | * (c) Call dpif_flow_dump_thread_destroy(). | |
554 | * 3. Call dpif_flow_dump_destroy(). | |
555 | * | |
556 | * All error reporting is deferred to the call to dpif_flow_dump_destroy(). | |
557 | */ | |
558 | struct dpif_flow_dump *dpif_flow_dump_create(const struct dpif *); | |
559 | int dpif_flow_dump_destroy(struct dpif_flow_dump *); | |
704a1e09 | 560 | |
ac64794a BP |
561 | struct dpif_flow_dump_thread *dpif_flow_dump_thread_create( |
562 | struct dpif_flow_dump *); | |
563 | void dpif_flow_dump_thread_destroy(struct dpif_flow_dump_thread *); | |
564 | ||
565 | /* A datapath flow as dumped by dpif_flow_dump_next(). */ | |
566 | struct dpif_flow { | |
567 | const struct nlattr *key; /* Flow key, as OVS_KEY_ATTR_* attrs. */ | |
568 | size_t key_len; /* 'key' length in bytes. */ | |
569 | const struct nlattr *mask; /* Flow mask, as OVS_KEY_ATTR_* attrs. */ | |
570 | size_t mask_len; /* 'mask' length in bytes. */ | |
571 | const struct nlattr *actions; /* Actions, as OVS_ACTION_ATTR_* attrs. */ | |
572 | size_t actions_len; /* 'actions' length in bytes. */ | |
573 | struct dpif_flow_stats stats; /* Flow statistics. */ | |
704a1e09 | 574 | }; |
ac64794a BP |
575 | int dpif_flow_dump_next(struct dpif_flow_dump_thread *, |
576 | struct dpif_flow *flows, int max_flows); | |
6fe09f8c JS |
577 | |
578 | #define DPIF_FLOW_BUFSIZE 2048 | |
6bc60024 | 579 | \f |
6bc60024 BP |
580 | /* Operation batching interface. |
581 | * | |
582 | * Some datapaths are faster at performing N operations together than the same | |
583 | * N operations individually, hence an interface for batching. | |
584 | */ | |
585 | ||
586 | enum dpif_op_type { | |
587 | DPIF_OP_FLOW_PUT = 1, | |
b99d3cee BP |
588 | DPIF_OP_FLOW_DEL, |
589 | DPIF_OP_EXECUTE, | |
6fe09f8c | 590 | DPIF_OP_FLOW_GET, |
6bc60024 BP |
591 | }; |
592 | ||
1a0c894a BP |
593 | /* Add or modify a flow. |
594 | * | |
595 | * The flow is specified by the Netlink attributes with types OVS_KEY_ATTR_* in | |
596 | * the 'key_len' bytes starting at 'key'. The associated actions are specified | |
597 | * by the Netlink attributes with types OVS_ACTION_ATTR_* in the 'actions_len' | |
598 | * bytes starting at 'actions'. | |
599 | * | |
600 | * - If the flow's key does not exist in the dpif, then the flow will be | |
601 | * added if 'flags' includes DPIF_FP_CREATE. Otherwise the operation will | |
602 | * fail with ENOENT. | |
603 | * | |
604 | * If the operation succeeds, then 'stats', if nonnull, will be zeroed. | |
605 | * | |
606 | * - If the flow's key does exist in the dpif, then the flow's actions will | |
607 | * be updated if 'flags' includes DPIF_FP_MODIFY. Otherwise the operation | |
608 | * will fail with EEXIST. If the flow's actions are updated, then its | |
609 | * statistics will be zeroed if 'flags' includes DPIF_FP_ZERO_STATS, and | |
610 | * left as-is otherwise. | |
611 | * | |
612 | * If the operation succeeds, then 'stats', if nonnull, will be set to the | |
613 | * flow's statistics before the update. | |
614 | */ | |
6bc60024 | 615 | struct dpif_flow_put {
6bc60024 BP |
616 | /* Input. */ |
617 | enum dpif_flow_put_flags flags; /* DPIF_FP_*. */ | |
618 | const struct nlattr *key; /* Flow to put, as OVS_KEY_ATTR_* attrs. */ | |
619 | size_t key_len; /* Length of 'key' in bytes. */ | |
e6cc0bab AZ |
620 | const struct nlattr *mask; /* Mask to put. */ |
621 | size_t mask_len; /* Length of 'mask' in bytes. */ | |
6bc60024 BP |
622 | const struct nlattr *actions; /* Actions to perform on flow, as OVS_ACTION_ATTR_* attrs. */ |
623 | size_t actions_len; /* Length of 'actions' in bytes. */ | |
624 | ||
625 | /* Output. */ | |
626 | struct dpif_flow_stats *stats; /* Optional flow statistics; may be null. */ | |
6bc60024 BP |
627 | }; |
628 | ||
1a0c894a BP |
629 | /* Delete a flow. |
630 | * | |
631 | * The flow is specified by the Netlink attributes with types OVS_KEY_ATTR_* in | |
632 | * the 'key_len' bytes starting at 'key'. Succeeds with status 0 if the flow | |
633 | * is deleted, or fails with ENOENT if the dpif does not contain such a flow. | |
634 | * | |
635 | * If the operation succeeds, then 'stats', if nonnull, will be set to the | |
636 | * flow's statistics before its deletion. */ | |
b99d3cee BP |
637 | struct dpif_flow_del { |
638 | /* Input. */ | |
639 | const struct nlattr *key; /* Flow to delete, as OVS_KEY_ATTR_* attrs. */ | |
640 | size_t key_len; /* Length of 'key' in bytes. */ | |
641 | ||
642 | /* Output. */ | |
643 | struct dpif_flow_stats *stats; /* Optional; if nonnull, receives the flow's statistics before its deletion. */ | |
644 | }; | |
645 | ||
1a0c894a BP |
646 | /* Executes actions on a specified packet. |
647 | * | |
648 | * Performs the 'actions_len' bytes of actions in 'actions' on the Ethernet | |
649 | * frame in 'packet' and on the packet metadata in 'md'. May modify both | |
650 | * 'packet' and 'md'. | |
651 | * | |
652 | * Some dpif providers do not implement every action. The Linux kernel | |
653 | * datapath, in particular, does not implement ARP field modification. If | |
654 | * 'needs_help' is true, the dpif layer executes in userspace all of the | |
655 | * actions that it can, and for OVS_ACTION_ATTR_OUTPUT and | |
656 | * OVS_ACTION_ATTR_USERSPACE actions it passes the packet through to the dpif | |
657 | * implementation. | |
658 | * | |
659 | * This works even if 'actions_len' is too long for a Netlink attribute. */ | |
6bc60024 | 660 | struct dpif_execute {
1a0c894a | 661 | /* Input. */
6bc60024 BP |
662 | const struct nlattr *actions; /* Actions to execute on packet. */ |
663 | size_t actions_len; /* Length of 'actions' in bytes. */ | |
1a0c894a BP |
664 | bool needs_help; /* If true, the dpif layer executes in userspace the actions that it can. */ |
665 | ||
666 | /* Input, but possibly modified as a side effect of execution. */ | |
da546e07 | 667 | struct ofpbuf *packet; /* Packet to execute (an Ethernet frame). */
758c456d | 668 | struct pkt_metadata md; /* Packet metadata. */
6bc60024 BP |
669 | }; |
670 | ||
6fe09f8c JS |
671 | /* Queries the dpif for a flow entry. |
672 | * | |
673 | * The flow is specified by the Netlink attributes with types OVS_KEY_ATTR_* in | |
674 | * the 'key_len' bytes starting at 'key'. 'buffer' must point to an initialized | |
675 | * buffer, with a recommended size of DPIF_FLOW_BUFSIZE bytes. | |
676 | * | |
677 | * On success, 'flow' will be populated with the mask, actions and stats for | |
678 | * the datapath flow corresponding to 'key'. The mask and actions may point | |
679 | * within '*buffer', or may point at RCU-protected data. Therefore, callers | |
680 | * that wish to hold these over quiescent periods must make a copy of these | |
681 | * fields before quiescing. | |
682 | * | |
683 | * Succeeds with status 0 if the flow is fetched, or fails with ENOENT if no | |
684 | * such flow exists. Other failures are indicated with a positive errno value. | |
685 | */ | |
686 | struct dpif_flow_get { | |
687 | /* Input. */ | |
688 | const struct nlattr *key; /* Flow to get, as OVS_KEY_ATTR_* attrs. */ | |
689 | size_t key_len; /* Length of 'key' in bytes. */ | |
690 | struct ofpbuf *buffer; /* Storage for output parameters; must be initialized, DPIF_FLOW_BUFSIZE bytes recommended. */ | |
691 | ||
692 | /* Output. */ | |
693 | struct dpif_flow *flow; /* Resulting flow from datapath; its mask and actions may point into '*buffer' or at RCU-protected data. */ | |
694 | }; | |
695 | ||
758c456d JR |
696 | int dpif_execute(struct dpif *, struct dpif_execute *); /* Carries out one struct dpif_execute on the given dpif; return value is presumably 0 or a positive errno -- confirm. */ |
697 | ||
c2b565b5 | 698 | struct dpif_op { /* One operation in a batch passed to dpif_operate(). */
6bc60024 | 699 | enum dpif_op_type type; /* DPIF_OP_*; by name, each value corresponds to one member of 'u'. */
c2b565b5 BP |
700 | int error; /* Status of this operation; presumably 0 or a positive errno filled in by dpif_operate() -- confirm. */ |
701 | union { |
702 | struct dpif_flow_put flow_put; |
b99d3cee | 703 | struct dpif_flow_del flow_del;
c2b565b5 | 704 | struct dpif_execute execute;
6fe09f8c | 705 | struct dpif_flow_get flow_get;
c2b565b5 | 706 | } u; /* Operation arguments; one member per DPIF_OP_* type. */
6bc60024 BP |
707 | }; |
708 | ||
c2b565b5 | 709 | void dpif_operate(struct dpif *, struct dpif_op **ops, size_t n_ops); /* Batch entry point: takes the 'n_ops' operations pointed to by the 'ops' array. */
6bc60024 BP |
710 | \f |
711 | /* Upcalls. */ | |
064af421 | 712 | |
82272ede BP |
713 | enum dpif_upcall_type { |
714 | DPIF_UC_MISS, /* Miss in flow table. */ | |
df2c07f4 | 715 | DPIF_UC_ACTION, /* OVS_ACTION_ATTR_USERSPACE action. */
982b8810 | 716 | DPIF_N_UC_TYPES /* Equals the number of upcall types listed above. */
82272ede BP |
717 | }; |
718 | ||
01545c1a BP |
719 | const char *dpif_upcall_type_to_string(enum dpif_upcall_type); /* Presumably returns a printable name for the upcall type -- confirm. */ |
720 | ||
856081f6 BP |
721 | /* A packet passed up from the datapath to userspace. |
722 | * | |
da546e07 JR |
723 | * The 'packet', 'key' and 'userdata' may point into data in a buffer |
724 | * provided by the caller, so the buffer should be released only after the | |
725 | * upcall processing has been finished. | |
726 | * | |
727 | * While being processed, the 'packet' may be reallocated, so the packet must | |
728 | * be separately released with ofpbuf_uninit(). | |
856081f6 BP |
729 | */ |
730 | struct dpif_upcall { | |
856081f6 | 731 | /* All types. */
82272ede | 732 | enum dpif_upcall_type type; /* DPIF_UC_*. */
da546e07 | 733 | struct ofpbuf packet; /* Packet data; release separately with ofpbuf_uninit(). */
856081f6 BP |
734 | struct nlattr *key; /* Flow key. */ |
735 | size_t key_len; /* Length of 'key' in bytes. */ | |
736 | ||
82272ede | 737 | /* DPIF_UC_ACTION only. */
e995e3df | 738 | struct nlattr *userdata; /* Argument to OVS_ACTION_ATTR_USERSPACE. */
856081f6 | 739 | };
9dbb9d5e | 740 | |
623540e4 EJ |
741 | /* A callback to process an upcall, currently implemented only by dpif-netdev. |
742 | * | |
743 | * The caller provides the 'packet' and 'flow' to process, the 'type' of the | |
744 | * upcall, and if 'type' is DPIF_UC_ACTION then the 'userdata' attached to the | |
745 | * action. | |
746 | * | |
747 | * The callback must fill in 'actions' with the datapath actions to apply to | |
748 | * 'packet'. 'wc' and 'put_actions' will either be both null or both nonnull. | |
749 | * If they are nonnull, then the caller will install a flow entry to process | |
750 | * all future packets that match 'flow' and 'wc'; the callback must store a | |
751 | * wildcard mask suitable for that purpose into 'wc'. If the actions to store | |
752 | * into the flow entry are the same as 'actions', then the callback may leave | |
753 | * 'put_actions' empty; otherwise it must store the desired actions into | |
754 | * 'put_actions'. | |
755 | * | |
756 | * Returns 0 if successful, ENOSPC if the flow limit has been reached and no | |
757 | * flow should be installed, or otherwise a positive errno value. */ | |
758 | typedef int upcall_callback(const struct ofpbuf *packet, | |
759 | const struct flow *flow, | |
760 | enum dpif_upcall_type type, | |
761 | const struct nlattr *userdata, | |
762 | struct ofpbuf *actions, | |
763 | struct flow_wildcards *wc, | |
764 | struct ofpbuf *put_actions, | |
765 | void *aux); /* 'aux' is presumably the pointer given to dpif_register_upcall_cb() -- confirm. */ | |
766 | ||
767 | void dpif_register_upcall_cb(struct dpif *, upcall_callback *, void *aux); /* Takes the upcall_callback to use for the dpif, together with an 'aux' pointer. */ | |
6b31e073 | 768 |
a12b3ead | 769 | int dpif_recv_set(struct dpif *, bool enable);
1954e6bb AW |
770 | int dpif_handlers_set(struct dpif *, uint32_t n_handlers); |
771 | int dpif_recv(struct dpif *, uint32_t handler_id, struct dpif_upcall *, | |
772 | struct ofpbuf *); /* The ofpbuf is presumably the caller-provided buffer that the upcall's 'packet', 'key' and 'userdata' may point into -- confirm. */ | |
1ba530f4 | 773 | void dpif_recv_purge(struct dpif *);
1954e6bb | 774 | void dpif_recv_wait(struct dpif *, uint32_t handler_id);
6b31e073 RW |
775 | void dpif_enable_upcall(struct dpif *); |
776 | void dpif_disable_upcall(struct dpif *); | |
777 | ||
778 | void dpif_print_packet(struct dpif *, struct dpif_upcall *); | |
6bc60024 BP |
779 | \f |
780 | /* Miscellaneous. */ | |
064af421 | 781 | |
53a4218d BP |
782 | void dpif_get_netflow_ids(const struct dpif *, |
783 | uint8_t *engine_type, uint8_t *engine_id); /* Presumably stores the dpif's NetFlow engine type and ID into '*engine_type' and '*engine_id' -- confirm. */ | |
064af421 | 784 |
aae51f53 BP |
785 | int dpif_queue_to_priority(const struct dpif *, uint32_t queue_id, |
786 | uint32_t *priority); /* Presumably stores into '*priority' the priority corresponding to 'queue_id' -- confirm, including return value semantics. */ | |
787 | ||
03292c46 JG |
788 | #ifdef __cplusplus |
789 | } | |
790 | #endif | |
791 | ||
064af421 | 792 | #endif /* dpif.h */ |