2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
18 * dpif, the DataPath InterFace.
20 * In Open vSwitch terminology, a "datapath" is a flow-based software switch.
21 * A datapath has no intelligence of its own. Rather, it relies entirely on
22 * its client to set up flows. The datapath layer is core to the Open vSwitch
23 * software switch: one could say, without much exaggeration, that everything
24 * in ovs-vswitchd above dpif exists only to make the correct decisions
25 * when interacting with dpif.
27 * Typically, the client of a datapath is the software switch module in
28 * "ovs-vswitchd", but other clients can be written. The "ovs-dpctl" utility
29 * is also a (simple) client.
35 * The terms written in quotes below are defined in later sections.
37 * When a datapath "port" receives a packet, it extracts the headers (the
38 * "flow"). If the datapath's "flow table" contains a "flow entry" matching
39 * the packet, then it executes the "actions" in the flow entry and increments
40 * the flow's statistics. If there is no matching flow entry, the datapath
41 * instead appends the packet to an "upcall" queue.
47 * A datapath has a set of ports that are analogous to the ports on an Ethernet
48 * switch. At the datapath level, each port has the following information associated with it:
51 * - A name, a short string that must be unique within the host. This is
52 * typically a name that would be familiar to the system administrator,
53 * e.g. "eth0" or "vif1.1", but it is otherwise arbitrary.
55 * - A 32-bit port number that must be unique within the datapath but is
56 * otherwise arbitrary. The port number is the most important identifier
57 * for a port in the datapath interface.
59 * - A type, a short string that identifies the kind of port. On a Linux
60 * host, typical types are "system" (for a network device such as eth0),
61 * "internal" (for a simulated port used to connect to the TCP/IP stack),
62 * and "gre" (for a GRE tunnel).
64 * - A Netlink PID (see "Upcall Queuing and Ordering" below).
66 * The dpif interface has functions for adding and deleting ports. When a
67 * datapath implements these (e.g. as the Linux and netdev datapaths do), then
68 * Open vSwitch's ovs-vswitchd daemon can directly control what ports are used
69 * for switching. Some datapaths might not implement them, or implement them
70 * with restrictions on the types of ports that can be added or removed
71 * (e.g. on ESX), on systems where port membership can only be changed by some external entity.
74 * Each datapath must have a port, sometimes called the "local port", whose
75 * name is the same as the datapath itself, with port number 0. The local port cannot be deleted.
78 * Ports are available as "struct netdev"s. To obtain a "struct netdev *" for
79 * a port named 'name' with type 'port_type', in a datapath of type
80 * 'datapath_type', call netdev_open(name, dpif_port_open_type(datapath_type,
81 * port_type), &netdev). The netdev can be used to get and set important data related to the port:
84 * - MTU (netdev_get_mtu(), netdev_set_mtu()).
86 * - Ethernet address (netdev_get_etheraddr(), netdev_set_etheraddr()).
88 * - Statistics such as the number of packets and bytes transmitted and
89 * received (netdev_get_stats()).
91 * - Carrier status (netdev_get_carrier()).
93 * - Speed (netdev_get_features()).
95 * - QoS queue configuration (netdev_get_queue(), netdev_set_queue() and
98 * - Arbitrary port-specific configuration parameters (netdev_get_config(),
99 * netdev_set_config()). An example of such a parameter is the IP
100 * endpoint for a GRE tunnel.
106 * The flow table is a collection of "flow entries". Each flow entry contains:
108 * - A "flow", that is, a summary of the headers in an Ethernet packet. The
109 * flow must be unique within the flow table. Flows are fine-grained
110 * entities that include L2, L3, and L4 headers. A single TCP connection
111 * consists of two flows, one in each direction.
113 * In Open vSwitch userspace, "struct flow" is the typical way to describe
114 * a flow, but the datapath interface uses a different data format to
115 * allow ABI forward- and backward-compatibility. datapath/README
116 * describes the rationale and design. Refer to OVS_KEY_ATTR_* and
117 * "struct ovs_key_*" in include/linux/openvswitch.h for details.
118 * lib/odp-util.h defines several functions for working with these flows.
120 * - A "mask" that, for each bit in the flow, specifies whether the datapath
121 * should consider the corresponding flow bit when deciding whether a
122 * given packet matches the flow entry. The original datapath design did
123 * not support wildcard matching: every flow entry was exact match. With the
124 * addition of a mask, the interface supports datapaths with a spectrum of
125 * wildcard matching capabilities, from those that only support exact
126 * matches to those that support bitwise wildcarding on the entire flow
127 * key, as well as datapaths with capabilities somewhere in between.
129 * Datapaths do not provide a way to query their wildcarding capabilities,
130 * nor is it expected that the client should attempt to probe for the
131 * details of their support. Instead, a client installs flows with masks
132 * that wildcard as many bits as acceptable. The datapath then actually
133 * wildcards as many of those bits as it can and changes the wildcard bits
134 * that it does not support into exact match bits. A datapath that can
135 * wildcard any bit, for example, would install the supplied mask, an
136 * exact-match only datapath would install an exact-match mask regardless
137 * of what mask the client supplied, and a datapath in the middle of the
138 * spectrum would selectively change some wildcard bits into exact match bits.
141 * Regardless of the requested or installed mask, the datapath retains the
142 * original flow supplied by the client. (It does not, for example, "zero
143 * out" the wildcarded bits.) This allows the client to unambiguously
144 * identify the flow entry in later flow table operations.
146 * The flow table does not have priorities; that is, all flow entries have
147 * equal priority. Detecting overlapping flow entries is expensive in
148 * general, so the datapath is not required to do it. It is primarily the
149 * client's responsibility not to install flow entries whose flow and mask
150 * combinations overlap.
152 * - A list of "actions" that tell the datapath what to do with packets
153 * within a flow. Some examples of actions are OVS_ACTION_ATTR_OUTPUT,
154 * which transmits the packet out a port, and OVS_ACTION_ATTR_SET, which
155 * modifies packet headers. Refer to OVS_ACTION_ATTR_* and "struct
156 * ovs_action_*" in include/linux/openvswitch.h for details.
157 * lib/odp-util.h defines several functions for working with datapath actions.
160 * The actions list may be empty. This indicates that nothing should be
161 * done to matching packets, that is, they should be dropped.
163 * (In case you are familiar with OpenFlow, datapath actions are analogous
164 * to OpenFlow actions.)
166 * - Statistics: the number of packets and bytes that the flow has
167 * processed, the last time that the flow processed a packet, and the
168 * union of all the TCP flags in packets processed by the flow. (The
169 * TCP flags value is 0 if the flow is not a TCP flow.)
171 * The datapath's client manages the flow table, primarily in reaction to
172 * "upcalls" (see below).
178 * A datapath sometimes needs to notify its client that a packet was received.
179 * The datapath mechanism to do this is called an "upcall".
181 * Upcalls are used in two situations:
183 * - When a packet is received, but there is no matching flow entry in its
184 * flow table (a flow table "miss"), this causes an upcall of type
185 * DPIF_UC_MISS. These are called "miss" upcalls.
187 * - A datapath action of type OVS_ACTION_ATTR_USERSPACE causes an upcall of
188 * type DPIF_UC_ACTION. These are called "action" upcalls.
190 * An upcall contains an entire packet. There is no attempt to, e.g., copy
191 * only as much of the packet as normally needed to make a forwarding decision.
192 * Such an optimization is doable, but experimental prototypes showed it to be
193 * of little benefit because an upcall typically contains the first packet of a
194 * flow, which is usually short (e.g. a TCP SYN). Also, the entire packet can
195 * sometimes really be needed.
197 * After a client reads a given upcall, the datapath is finished with it, that
198 * is, the datapath doesn't maintain any lingering state past that point.
200 * The latency from the time that a packet arrives at a port to the time that
201 * it is received from dpif_recv() is critical in some benchmarks. For
202 * example, if this latency is 1 ms, then a netperf TCP_CRR test, which opens
203 * and closes TCP connections one at a time as quickly as it can, cannot
204 * possibly achieve more than 500 transactions per second, since every
205 * connection consists of two flows with 1-ms latency to set up each one.
207 * To receive upcalls, a client has to enable them with dpif_recv_set(). A
208 * datapath should generally support multiple clients at once (e.g. so that one
209 * may run "ovs-dpctl show" or "ovs-dpctl dump-flows" while "ovs-vswitchd" is
210 * also running) but need not support multiple clients enabling upcalls at once.
214 * Upcall Queuing and Ordering
215 * ---------------------------
217 * The datapath's client reads upcalls one at a time by calling dpif_recv().
218 * When more than one upcall is pending, the order in which the datapath
219 * presents upcalls to its client is important. The datapath's client does not
220 * directly control this order, so the datapath implementer must take care
223 * The minimal behavior, suitable for initial testing of a datapath
224 * implementation, is that all upcalls are appended to a single queue, which is
225 * delivered to the client in order.
227 * The datapath should ensure that a high rate of upcalls from one particular
228 * port cannot cause upcalls from other sources to be dropped or unreasonably
229 * delayed. Otherwise, one port conducting a port scan or otherwise initiating
230 * high-rate traffic spanning many flows could suppress other traffic.
231 * Ideally, the datapath should present upcalls from each port in a "round
232 * robin" manner, to ensure fairness.
234 * The client has no control over "miss" upcalls and no insight into the
235 * datapath's implementation, so the datapath is entirely responsible for
236 * queuing and delivering them. On the other hand, the datapath has
237 * considerable freedom of implementation. One good approach is to maintain a
238 * separate queue for each port, to prevent any given port's upcalls from
239 * interfering with other ports' upcalls. If this is impractical, then another
240 * reasonable choice is to maintain some fixed number of queues and assign each
241 * port to one of them. Ports assigned to the same queue can then interfere
242 * with each other, but not with ports assigned to different queues. Other
243 * approaches are also possible.
245 * The client has some control over "action" upcalls: it can specify a 32-bit
246 * "Netlink PID" as part of the action. This terminology comes from the Linux
247 * datapath implementation, which uses a protocol called Netlink in which a PID
248 * designates a particular socket and the upcall data is delivered to the
249 * socket's receive queue. Generically, though, a Netlink PID identifies a
250 * queue for upcalls. The basic requirements on the datapath are:
252 * - The datapath must provide a Netlink PID associated with each port. The
253 * client can retrieve the PID with dpif_port_get_pid().
255 * - The datapath must provide a "special" Netlink PID not associated with
256 * any port. dpif_port_get_pid() also provides this PID. (ovs-vswitchd
257 * uses this PID to queue special packets that must not be lost even if a
258 * port is otherwise busy, such as packets used for tunnel monitoring.)
260 * The minimal behavior of dpif_port_get_pid() and the treatment of the Netlink
261 * PID in "action" upcalls is that dpif_port_get_pid() returns a constant value
262 * and all upcalls are appended to a single queue.
264 * The ideal behavior is:
266 * - Each port has a PID that identifies the queue used for "miss" upcalls
267 * on that port. (Thus, if each port has its own queue for "miss"
268 * upcalls, then each port has a different Netlink PID.)
270 * - "miss" upcalls for a given port and "action" upcalls that specify that
271 * port's Netlink PID are added to the same queue. The upcalls
272 * are delivered to the datapath's client in the order that the packets
273 * were received, regardless of whether the upcalls are "miss" or "action" upcalls.
276 * - Upcalls that specify the "special" Netlink PID are queued separately.
282 * The datapath interface works with packets in a particular form. This is the
283 * form taken by packets received via upcalls (i.e. by dpif_recv()). Packets
284 * supplied to the datapath for processing (i.e. to dpif_execute()) also take this form.
287 * A VLAN tag is represented by an 802.1Q header. If the layer below the
288 * datapath interface uses another representation, then the datapath interface
289 * must perform conversion.
291 * The datapath interface requires all packets to fit within the MTU. Some
292 * operating systems internally process packets larger than MTU, with features
293 * such as TSO and UFO. When such a packet passes through the datapath
294 * interface, it must be broken into multiple packets of MTU size or smaller for
295 * presentation as upcalls. (This does not happen often, because an upcall
296 * typically contains the first packet of a flow, which is usually short.)
298 * Some operating system TCP/IP stacks maintain packets in an unchecksummed or
299 * partially checksummed state until transmission. The datapath interface
300 * requires all host-generated packets to be fully checksummed (e.g. IP and TCP
301 * checksums must be correct). On such an OS, the datapath interface must fill
302 * in these checksums.
304 * Packets passed through the datapath interface must be at least 14 bytes
305 * long, that is, they must have a complete Ethernet header. They are not
306 * required to be padded to the minimum Ethernet length.
312 * Typically, the client of a datapath begins by configuring the datapath with
313 * a set of ports. Afterward, the client runs in a loop polling for upcalls to arrive.
316 * For each upcall received, the client examines the enclosed packet and
317 * figures out what should be done with it. For example, if the client
318 * implements a MAC-learning switch, then it searches the forwarding database
319 * for the packet's destination MAC and VLAN and determines the set of ports to
320 * which it should be sent. In any case, the client composes a set of datapath
321 * actions to properly dispatch the packet and then directs the datapath to
322 * execute those actions on the packet (e.g. with dpif_execute()).
324 * Most of the time, the actions that the client executed on the packet apply
325 * to every packet with the same flow. For example, the flow includes both
326 * destination MAC and VLAN ID (and much more), so this is true for the
327 * MAC-learning switch example above. In such a case, the client can also
328 * direct the datapath to treat any further packets in the flow in the same
329 * way, using dpif_flow_put() to add a new flow entry.
331 * Other tasks the client might need to perform, in addition to reacting to upcalls, include:
334 * - Periodically polling flow statistics, perhaps to supply to its own
337 * - Deleting flow entries from the datapath that haven't been used
338 * recently, to save memory.
340 * - Updating flow entries whose actions should change. For example, if a
341 * MAC learning switch learns that a MAC has moved, then it must update
342 * the actions of flow entries that sent packets to the MAC at its old location.
345 * - Adding and removing ports to achieve a new configuration.
351 * Most of the dpif functions are fully thread-safe: they may be called from
352 * any number of threads on the same or different dpif objects. The exceptions are:
355 * - dpif_port_poll() and dpif_port_poll_wait() are conditionally
356 * thread-safe: they may be called from different threads only on
357 * different dpif objects.
359 * - dpif_flow_dump_next() is conditionally thread-safe: It may be called
360 * from different threads with the same 'struct dpif_flow_dump', but all
361 * other parameters must be different for each thread.
363 * - dpif_flow_dump_done() is conditionally thread-safe: All threads that
364 * share the same 'struct dpif_flow_dump' must have finished using it.
365 * This function must then be called exactly once for a particular
366 * dpif_flow_dump to finish the corresponding flow dump operation.
368 * - Functions that operate on 'struct dpif_port_dump' are conditionally
369 * thread-safe with respect to those objects. That is, one may dump ports
370 * from any number of threads at once, but each thread must use its own
371 * struct dpif_port_dump.
381 #include "openflow/openflow.h"
/* Datapath provider registration and datapath type/name enumeration.
 *
 * NOTE(review): this header shows only the signatures of these functions;
 * their exact semantics, and the meaning of the 'int' return values, are not
 * documented here -- confirm against the implementation before relying on
 * them. */
396 int dp_register_provider(const struct dpif_class
*);
397 int dp_unregister_provider(const char *type
);
398 void dp_blacklist_provider(const char *type
);
/* 'types' is a caller-supplied 'struct sset'. */
399 void dp_enumerate_types(struct sset
*types
);
400 const char *dpif_normalize_type(const char *);
/* 'names' is a caller-supplied 'struct sset'. */
402 int dp_enumerate_names(const char *type
, struct sset
*names
);
/* 'name' and 'type' are 'char **' parameters.  NOTE(review): presumably they
 * are outputs derived from 'datapath_name' and the caller owns the resulting
 * strings -- TODO confirm. */
403 void dp_parse_name(const char *datapath_name
, char **name
, char **type
);
/* Opening, creating, and closing datapath handles.
 *
 * Each of the three functions below takes a datapath 'name' and 'type' plus
 * a 'struct dpif **'.  NOTE(review): presumably the handle is passed back
 * through that last argument and the 'int' return is 0 on success or a
 * positive errno value on failure -- confirm against the implementation. */
405 int dpif_open(const char *name
, const char *type
, struct dpif
**);
406 int dpif_create(const char *name
, const char *type
, struct dpif
**);
407 int dpif_create_and_open(const char *name
, const char *type
, struct dpif
**);
408 void dpif_close(struct dpif
*);
/* NOTE(review): the roles of dpif_run() and dpif_wait() are not described in
 * this header -- confirm against the implementation. */
410 void dpif_run(struct dpif
*);
411 void dpif_wait(struct dpif
*);
/* Accessors that return a 'const char *' for the given dpif; the const
 * qualifier means callers must not modify the returned string. */
413 const char *dpif_name(const struct dpif
*);
414 const char *dpif_base_name(const struct dpif
*);
415 const char *dpif_type(const struct dpif
*);
/* NOTE(review): presumably destroys the datapath itself rather than just the
 * handle -- TODO confirm. */
417 int dpif_delete(struct dpif
*);
419 /* Statistics for a dpif as a whole. */
420 struct dpif_dp_stats
{
421 uint64_t n_hit
; /* Number of flow table matches. */
422 uint64_t n_missed
; /* Number of flow table misses. */
423 uint64_t n_lost
; /* Number of misses not sent to userspace. */
424 uint64_t n_flows
; /* Number of flows present. */
425 uint64_t n_mask_hit
; /* Number of mega flow masks visited for
426 flow table matches. */
427 uint32_t n_masks
; /* Number of mega flow masks. */
/* Statistics for a dpif as a whole, reported via the caller-supplied
 * 'struct dpif_dp_stats' (hit/miss/lost counters, flow and mask counts).
 * NOTE(review): the 'int' return is presumably 0 on success or a positive
 * errno value -- confirm. */
429 int dpif_get_dp_stats(const struct dpif
*, struct dpif_dp_stats
*);
432 /* Port operations. */
/* Yields the type string to pass to netdev_open() when opening a port of
 * type 'port_type' in a datapath of type 'datapath_type' (see the "Ports"
 * overview at the top of this file). */
434 const char *dpif_port_open_type(const char *datapath_type
,
435 const char *port_type
);
/* Adding and deleting ports.  As the overview at the top of this file notes,
 * some datapaths might not implement these, or might implement them with
 * restrictions on the types of ports that can be added or removed.
 *
 * NOTE(review): 'port_nop' is a pointer to an odp_port_t; presumably it is
 * used to request and/or report the port number -- TODO confirm. */
436 int dpif_port_add(struct dpif
*, struct netdev
*, odp_port_t
*port_nop
);
437 int dpif_port_del(struct dpif
*, odp_port_t port_no
);
439 /* A port within a datapath.
441 * 'name' and 'type' are suitable for passing to netdev_open(). */
443 char *name
; /* Network device name, e.g. "eth0". */
444 char *type
; /* Network device type, e.g. "system". */
445 odp_port_t port_no
; /* Port number within datapath. */
/* Helpers for 'struct dpif_port' values.
 *
 * dpif_port_clone() takes a non-const 'struct dpif_port *' followed by a
 * const one.  NOTE(review): presumably it copies the second into the first
 * (including the 'name' and 'type' strings) and dpif_port_destroy() releases
 * such a copy -- confirm. */
447 void dpif_port_clone(struct dpif_port
*, const struct dpif_port
*);
448 void dpif_port_destroy(struct dpif_port
*);
/* NOTE(review): presumably reports whether 'dpif' has a port whose device
 * name is 'devname' -- confirm. */
449 bool dpif_port_exists(const struct dpif
*dpif
, const char *devname
);
450 int dpif_port_query_by_number(const struct dpif
*, odp_port_t port_no
,
452 int dpif_port_query_by_name(const struct dpif
*, const char *devname
,
/* Takes a port number and a caller-supplied buffer 'name' of 'name_size'
 * bytes.  NOTE(review): presumably stores the port's name in 'name' and
 * returns 0 on success or a positive errno value -- confirm. */
454 int dpif_port_get_name(struct dpif
*, odp_port_t port_no
,
455 char *name
, size_t name_size
);
/* Retrieves the 32-bit Netlink PID associated with port 'port_no'.
 *
 * As described under "Upcall Queuing and Ordering" at the top of this file,
 * a Netlink PID identifies a queue for upcalls.  The minimal behavior is to
 * return a constant value; ideally each port's PID identifies the queue used
 * for that port's "miss" upcalls.  The overview also states that this
 * function provides the "special" PID not associated with any port; how that
 * PID is requested is not shown in this header. */
456 uint32_t dpif_port_get_pid(const struct dpif
*, odp_port_t port_no
);
458 struct dpif_port_dump
{
459 const struct dpif
*dpif
;
/* Port dumping.
 *
 * These functions are conditionally thread-safe with respect to
 * 'struct dpif_port_dump': ports may be dumped from any number of threads at
 * once, but each thread must use its own struct dpif_port_dump.
 *
 * A caller that stops iterating early must free the dump structure by hand
 * with dpif_port_dump_done(). */
463 void dpif_port_dump_start(struct dpif_port_dump
*, const struct dpif
*);
464 bool dpif_port_dump_next(struct dpif_port_dump
*, struct dpif_port
*);
465 int dpif_port_dump_done(struct dpif_port_dump
*);
467 /* Iterates through each DPIF_PORT in DPIF, using DUMP as state.
469 * Arguments all have pointer type.
471 * If you break out of the loop, then you need to free the dump structure by
472 * hand using dpif_port_dump_done(). */
473 #define DPIF_PORT_FOR_EACH(DPIF_PORT, DUMP, DPIF) \
474 for (dpif_port_dump_start(DUMP, DPIF); \
475 (dpif_port_dump_next(DUMP, DPIF_PORT) \
477 : (dpif_port_dump_done(DUMP), false)); \
/* Port polling.
 *
 * dpif_port_poll() and dpif_port_poll_wait() are conditionally thread-safe:
 * they may be called from different threads only on different dpif objects.
 *
 * NOTE(review): 'devnamep' is a 'char **'; presumably it receives the name of
 * a port that changed and the caller owns the string -- TODO confirm. */
480 int dpif_port_poll(const struct dpif
*, char **devnamep
);
481 void dpif_port_poll_wait(const struct dpif
*);
483 /* Flow table operations. */
485 struct dpif_flow_stats
{
/* Flow statistics helpers.
 *
 * dpif_flow_stats_extract() takes a const flow, a const 'packet', a 'used'
 * value, and a non-const 'struct dpif_flow_stats *'.  NOTE(review):
 * presumably it fills in the statistics for a single packet of the flow,
 * with 'used' as the last-used time -- confirm. */
492 void dpif_flow_stats_extract(const struct flow
*, const struct ofpbuf
*packet
,
493 long long int used
, struct dpif_flow_stats
*);
/* Takes const statistics and a 'struct ds *'.  NOTE(review): presumably
 * appends a textual rendering of the statistics to the 'struct ds' --
 * confirm. */
494 void dpif_flow_stats_format(const struct dpif_flow_stats
*, struct ds
*);
496 enum dpif_flow_put_flags
{
497 DPIF_FP_CREATE
= 1 << 0, /* Allow creating a new flow. */
498 DPIF_FP_MODIFY
= 1 << 1, /* Allow modifying an existing flow. */
499 DPIF_FP_ZERO_STATS
= 1 << 2 /* Zero the stats of an existing flow. */
/* NOTE(review): presumably removes every flow entry from the datapath's flow
 * table -- confirm. */
502 int dpif_flow_flush(struct dpif
*);
/* Adds a new flow entry or changes an existing one, as permitted by the
 * DPIF_FP_* flags: DPIF_FP_CREATE allows creating a new flow, DPIF_FP_MODIFY
 * allows modifying an existing flow, and DPIF_FP_ZERO_STATS zeroes the stats
 * of an existing flow.
 *
 * 'key', 'mask', and 'actions' are sequences of Netlink attributes
 * accompanied by their lengths.  NOTE(review): the lengths are presumably in
 * bytes, matching the same-named members of struct dpif_flow_put -- confirm.
 *
 * As the "Flow Table" overview at the top of this file explains, the client
 * supplies a mask that wildcards as many bits as acceptable, and the
 * datapath may turn wildcard bits it does not support into exact-match bits
 * while retaining the original flow supplied by the client.
 *
 * NOTE(review): the trailing 'struct dpif_flow_stats *' is presumably an
 * optional output, as with the 'stats' member of struct dpif_flow_put --
 * confirm whether NULL is accepted. */
503 int dpif_flow_put(struct dpif
*, enum dpif_flow_put_flags
,
504 const struct nlattr
*key
, size_t key_len
,
505 const struct nlattr
*mask
, size_t mask_len
,
506 const struct nlattr
*actions
, size_t actions_len
,
507 struct dpif_flow_stats
*);
/* Deletes the flow entry identified by 'key' ('key_len' is its length).
 * NOTE(review): the trailing stats pointer is presumably an optional
 * output -- confirm. */
508 int dpif_flow_del(struct dpif
*,
509 const struct nlattr
*key
, size_t key_len
,
510 struct dpif_flow_stats
*);
/* Looks up the flow entry identified by 'key' ('key_len' is its length).
 * NOTE(review): 'actionsp' is a 'struct ofpbuf **'; presumably it receives a
 * buffer holding the flow's actions that the caller must release -- TODO
 * confirm ownership. */
511 int dpif_flow_get(const struct dpif
*,
512 const struct nlattr
*key
, size_t key_len
,
513 struct ofpbuf
**actionsp
, struct dpif_flow_stats
*);
515 struct dpif_flow_dump
{
516 const struct dpif
*dpif
;
/* Flow dumping.
 *
 * NOTE(review): 'statep' is a 'void **'; presumably dpif_flow_dump_state_init()
 * stores per-thread dump state there for later use as the 'state' argument
 * of dpif_flow_dump_next() -- confirm. */
519 void dpif_flow_dump_state_init(const struct dpif
*, void **statep
);
520 int dpif_flow_dump_start(struct dpif_flow_dump
*, const struct dpif
*);
/* dpif_flow_dump_next() is conditionally thread-safe: it may be called from
 * different threads with the same 'struct dpif_flow_dump', but all other
 * parameters must be different for each thread.
 *
 * 'key', 'mask', 'actions' and the final stats argument are pointers to
 * const pointers, each data pointer paired with a 'size_t *' length.
 * NOTE(review): presumably these are outputs that refer to storage owned by
 * the dump -- confirm their lifetime before holding on to them. */
521 bool dpif_flow_dump_next(struct dpif_flow_dump
*, void *state
,
522 const struct nlattr
**key
, size_t *key_len
,
523 const struct nlattr
**mask
, size_t *mask_len
,
524 const struct nlattr
**actions
, size_t *actions_len
,
525 const struct dpif_flow_stats
**);
526 bool dpif_flow_dump_next_may_destroy_keys(struct dpif_flow_dump
*dump
,
/* dpif_flow_dump_done() is conditionally thread-safe: all threads that share
 * the same 'struct dpif_flow_dump' must have finished using it.  It must then
 * be called exactly once for a particular dpif_flow_dump to finish the
 * corresponding flow dump operation. */
528 int dpif_flow_dump_done(struct dpif_flow_dump
*);
/* NOTE(review): presumably releases 'state' obtained from
 * dpif_flow_dump_state_init() -- confirm. */
529 void dpif_flow_dump_state_uninit(const struct dpif
*, void *state
);
531 /* Operation batching interface.
533 * Some datapaths are faster at performing N operations together than the same
534 * N operations individually, hence an interface for batching.
538 DPIF_OP_FLOW_PUT
= 1,
543 struct dpif_flow_put
{
545 enum dpif_flow_put_flags flags
; /* DPIF_FP_*. */
546 const struct nlattr
*key
; /* Flow to put. */
547 size_t key_len
; /* Length of 'key' in bytes. */
548 const struct nlattr
*mask
; /* Mask to put. */
549 size_t mask_len
; /* Length of 'mask' in bytes. */
550 const struct nlattr
*actions
; /* Actions to perform on flow. */
551 size_t actions_len
; /* Length of 'actions' in bytes. */
554 struct dpif_flow_stats
*stats
; /* Optional flow statistics. */
557 struct dpif_flow_del
{
559 const struct nlattr
*key
; /* Flow to delete. */
560 size_t key_len
; /* Length of 'key' in bytes. */
563 struct dpif_flow_stats
*stats
; /* Optional flow statistics. */
566 struct dpif_execute
{
567 /* Raw support for execute passed along to the provider. */
568 const struct nlattr
*actions
; /* Actions to execute on packet. */
569 size_t actions_len
; /* Length of 'actions' in bytes. */
570 struct ofpbuf
*packet
; /* Packet to execute. */
571 struct pkt_metadata md
; /* Packet metadata. */
573 /* Some dpif providers do not implement every action. The Linux kernel
574 * datapath, in particular, does not implement ARP field modification.
576 * If this member is set to true, the dpif layer executes in userspace all
577 * of the actions that it can, and for OVS_ACTION_ATTR_OUTPUT and
578 * OVS_ACTION_ATTR_USERSPACE actions it passes the packet through to the
579 * dpif implementation. */
/* Directs the datapath to execute actions on a packet, both carried in the
 * 'struct dpif_execute' argument (actions, packet, and packet metadata).
 *
 * Packets supplied here take the form described under the packet format
 * overview at the top of this file (802.1Q VLAN tags, fully checksummed, at
 * least a complete 14-byte Ethernet header).
 *
 * NOTE(review): the 'int' return is presumably 0 on success or a positive
 * errno value -- confirm. */
583 int dpif_execute(struct dpif
*, struct dpif_execute
*);
586 enum dpif_op_type type
;
589 struct dpif_flow_put flow_put
;
590 struct dpif_flow_del flow_del
;
591 struct dpif_execute execute
;
/* Entry point of the operation batching interface: takes 'n_ops' pointers
 * to 'struct dpif_op' in 'ops'.  Batching exists because some datapaths are
 * faster at performing N operations together than the same N operations
 * individually.
 *
 * NOTE(review): this function returns void; how per-operation results are
 * reported is not visible in this part of the header -- confirm against
 * struct dpif_op. */
595 void dpif_operate(struct dpif
*, struct dpif_op
**ops
, size_t n_ops
);
599 enum dpif_upcall_type
{
600 DPIF_UC_MISS
, /* Miss in flow table. */
601 DPIF_UC_ACTION
, /* OVS_ACTION_ATTR_USERSPACE action. */
/* Maps an 'enum dpif_upcall_type' (e.g. DPIF_UC_MISS, DPIF_UC_ACTION) to a
 * 'const char *'.  NOTE(review): presumably a human-readable name for
 * logging; the exact strings are not shown here -- confirm. */
605 const char *dpif_upcall_type_to_string(enum dpif_upcall_type
);
607 /* A packet passed up from the datapath to userspace.
609 * The 'packet', 'key' and 'userdata' may point into data in a buffer
610 * provided by the caller, so the buffer should be released only after the
611 * upcall processing has been finished.
613 * While being processed, the 'packet' may be reallocated, so the packet must
614 * be separately released with ofpbuf_uninit().
618 enum dpif_upcall_type type
;
619 struct ofpbuf packet
; /* Packet data. */
620 struct nlattr
*key
; /* Flow key. */
621 size_t key_len
; /* Length of 'key' in bytes. */
623 /* DPIF_UC_ACTION only. */
624 struct nlattr
*userdata
; /* Argument to OVS_ACTION_ATTR_USERSPACE. */
/* Enables or disables receiving upcalls on the dpif, according to 'enable'.
 * A client has to enable upcalls with this function before it can receive
 * them.  A datapath need not support multiple clients enabling upcalls at
 * once. */
627 int dpif_recv_set(struct dpif
*, bool enable
);
/* Reads one upcall; the datapath's client reads upcalls one at a time by
 * calling this function.
 *
 * The 'packet', 'key' and 'userdata' members of the upcall may point into a
 * caller-provided buffer, so that buffer should be released only after
 * upcall processing has finished.  NOTE(review): presumably the
 * 'struct ofpbuf *' argument is that buffer -- confirm. */
628 int dpif_recv(struct dpif
*, struct dpif_upcall
*, struct ofpbuf
*);
/* NOTE(review): presumably discards any upcalls still queued for this
 * dpif -- confirm. */
629 void dpif_recv_purge(struct dpif
*);
/* NOTE(review): presumably arranges for the caller's poll loop to wake up
 * when an upcall is available from dpif_recv() -- confirm. */
630 void dpif_recv_wait(struct dpif
*);
/* Takes two 'uint8_t *' parameters, 'engine_type' and 'engine_id'.
 * NOTE(review): presumably both are outputs that receive NetFlow engine
 * identifiers for the dpif -- confirm. */
634 void dpif_get_netflow_ids(const struct dpif
*,
635 uint8_t *engine_type
, uint8_t *engine_id
);
637 int dpif_queue_to_priority(const struct dpif
*, uint32_t queue_id
,