]> git.proxmox.com Git - ovs.git/blame - lib/dpif-netdev.c
netdev-dpdk: Remove the tx queue spinlock.
[ovs.git] / lib / dpif-netdev.c
CommitLineData
72865317 1/*
ff073a71 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
72865317
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
db73f716 18#include "dpif-netdev.h"
72865317 19
72865317
BP
20#include <ctype.h>
21#include <errno.h>
22#include <fcntl.h>
23#include <inttypes.h>
72865317 24#include <netinet/in.h>
9d82ec47 25#include <sys/socket.h>
7f3adc00 26#include <net/if.h>
cdee00fd 27#include <stdint.h>
72865317
BP
28#include <stdlib.h>
29#include <string.h>
30#include <sys/ioctl.h>
31#include <sys/stat.h>
72865317
BP
32#include <unistd.h>
33
2c0ea78f 34#include "classifier.h"
59e6d833 35#include "cmap.h"
72865317 36#include "csum.h"
614c4892 37#include "dpif.h"
72865317 38#include "dpif-provider.h"
614c4892 39#include "dummy.h"
36956a7d 40#include "dynamic-string.h"
afae68b1 41#include "fat-rwlock.h"
72865317 42#include "flow.h"
9f361d6b 43#include "cmap.h"
6c3eee82 44#include "latch.h"
72865317 45#include "list.h"
8c301900 46#include "meta-flow.h"
72865317 47#include "netdev.h"
8617afff 48#include "netdev-dpdk.h"
de281153 49#include "netdev-vport.h"
cdee00fd 50#include "netlink.h"
f094af7b 51#include "odp-execute.h"
72865317
BP
52#include "odp-util.h"
53#include "ofp-print.h"
54#include "ofpbuf.h"
5a034064 55#include "ovs-numa.h"
61e7deb1 56#include "ovs-rcu.h"
91088554 57#include "packet-dpif.h"
72865317
BP
58#include "packets.h"
59#include "poll-loop.h"
26c6b6cd 60#include "random.h"
d33ed218 61#include "seq.h"
462278db 62#include "shash.h"
0cbfe35d 63#include "sset.h"
72865317 64#include "timeval.h"
74cc3969 65#include "unixctl.h"
72865317 66#include "util.h"
72865317 67#include "vlog.h"
5136ce49 68
d98e6007 69VLOG_DEFINE_THIS_MODULE(dpif_netdev);
72865317 70
2c0ea78f
GS
71/* By default, choose a priority in the middle. */
72#define NETDEV_RULE_PRIORITY 0x8000
73
8bb113da 74#define FLOW_DUMP_MAX_BATCH 50
adcf00ba
AZ
75/* Use per thread recirc_depth to prevent recirculation loop. */
76#define MAX_RECIRC_DEPTH 5
77DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
e4cfed38 78
72865317 79/* Configuration parameters. */
72865317
BP
80enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
81
8a4e3a85
BP
82/* Protects against changes to 'dp_netdevs'. */
83static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
84
85/* Contains all 'struct dp_netdev's. */
86static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
87 = SHASH_INITIALIZER(&dp_netdevs);
88
623540e4 89static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
6b31e073 90
9bbf1c3d
DDP
91/* Stores a miniflow */
92
93/* There are fields in the flow structure that we never use. Therefore we can
94 * save a few words of memory */
95#define NETDEV_KEY_BUF_SIZE_U32 (FLOW_U32S \
96 - MINI_N_INLINE \
97 - FLOW_U32_SIZE(regs) \
98 - FLOW_U32_SIZE(metadata) \
99 )
100struct netdev_flow_key {
101 struct miniflow flow;
102 uint32_t buf[NETDEV_KEY_BUF_SIZE_U32];
103};
104
105/* Exact match cache for frequently used flows
106 *
107 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
108 * search its entries for a miniflow that matches exactly the miniflow of the
109 * packet. It stores the 'cls_rule'(rule) that matches the miniflow.
110 *
111 * A cache entry holds a reference to its 'dp_netdev_flow'.
112 *
113 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
114 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
115 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
116 * value is the index of a cache entry where the miniflow could be.
117 *
118 *
119 * Thread-safety
120 * =============
121 *
122 * Each pmd_thread has its own private exact match cache.
123 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
124 */
125
126#define EM_FLOW_HASH_SHIFT 10
127#define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
128#define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
129#define EM_FLOW_HASH_SEGS 2
130
131struct emc_entry {
132 uint32_t hash;
133 struct netdev_flow_key mf;
134 struct dp_netdev_flow *flow;
135};
136
137struct emc_cache {
138 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
139};
140
141/* Iterate in the exact match cache through every entry that might contain a
142 * miniflow with hash 'HASH'. */
143#define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
144 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
145 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
146 i__ < EM_FLOW_HASH_SEGS; \
147 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
148
8a4e3a85
BP
149/* Datapath based on the network device interface from netdev.h.
150 *
151 *
152 * Thread-safety
153 * =============
154 *
155 * Some members, marked 'const', are immutable. Accessing other members
156 * requires synchronization, as noted in more detail below.
157 *
158 * Acquisition order is, from outermost to innermost:
159 *
160 * dp_netdev_mutex (global)
59e6d833 161 * port_mutex
9bbf1c3d 162 * emc_mutex
8a4e3a85 163 * flow_mutex
8a4e3a85 164 */
72865317 165struct dp_netdev {
8a4e3a85
BP
166 const struct dpif_class *const class;
167 const char *const name;
6b31e073 168 struct dpif *dpif;
6a8267c5
BP
169 struct ovs_refcount ref_cnt;
170 atomic_flag destroyed;
72865317 171
8a4e3a85
BP
172 /* Flows.
173 *
afae68b1
JR
174 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
175 * changes to 'cls' must be made while still holding the 'flow_mutex'.
8a4e3a85
BP
176 */
177 struct ovs_mutex flow_mutex;
afae68b1 178 struct classifier cls;
9f361d6b 179 struct cmap flow_table OVS_GUARDED; /* Flow table. */
8a4e3a85 180
8a4e3a85
BP
181 /* Statistics.
182 *
51852a57
BP
183 * ovsthread_stats is internally synchronized. */
184 struct ovsthread_stats stats; /* Contains 'struct dp_netdev_stats *'. */
72865317 185
8a4e3a85
BP
186 /* Ports.
187 *
59e6d833
BP
188 * Protected by RCU. Take the mutex to add or remove ports. */
189 struct ovs_mutex port_mutex;
190 struct cmap ports;
d33ed218 191 struct seq *port_seq; /* Incremented whenever a port changes. */
6c3eee82 192
6b31e073
RW
193 /* Protects access to ofproto-dpif-upcall interface during revalidator
194 * thread synchronization. */
195 struct fat_rwlock upcall_rwlock;
623540e4
EJ
196 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
197 void *upcall_aux;
6b31e073 198
6c3eee82
BP
199 /* Forwarding threads. */
200 struct latch exit_latch;
e4cfed38
PS
201 struct pmd_thread *pmd_threads;
202 size_t n_pmd_threads;
203 int pmd_count;
9bbf1c3d
DDP
204
205 /* Exact match cache for non-pmd devices.
206 * Pmd devices use instead each thread's flow_cache for this purpose.
207 * Protected by emc_mutex */
208 struct emc_cache flow_cache OVS_GUARDED;
209 struct ovs_mutex emc_mutex;
72865317
BP
210};
211
8a4e3a85 212static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
59e6d833 213 odp_port_t);
ff073a71 214
51852a57
BP
215enum dp_stat_type {
216 DP_STAT_HIT, /* Packets that matched in the flow table. */
217 DP_STAT_MISS, /* Packets that did not match. */
218 DP_STAT_LOST, /* Packets not passed up to the client. */
219 DP_N_STATS
220};
221
222/* Contained by struct dp_netdev's 'stats' member. */
223struct dp_netdev_stats {
224 struct ovs_mutex mutex; /* Protects 'n'. */
225
226 /* Indexed by DP_STAT_*, protected by 'mutex'. */
227 unsigned long long int n[DP_N_STATS] OVS_GUARDED;
228};
229
230
72865317
BP
231/* A port in a netdev-based datapath. */
232struct dp_netdev_port {
59e6d833 233 struct cmap_node node; /* Node in dp_netdev's 'ports'. */
ff073a71 234 odp_port_t port_no;
72865317 235 struct netdev *netdev;
4b609110 236 struct netdev_saved_flags *sf;
55c955bd 237 struct netdev_rxq **rxq;
b284085e 238 struct ovs_refcount ref_cnt;
0cbfe35d 239 char *type; /* Port type as requested by user. */
72865317
BP
240};
241
8a4e3a85
BP
242/* A flow in dp_netdev's 'flow_table'.
243 *
244 *
245 * Thread-safety
246 * =============
247 *
248 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
249 * its dp_netdev's classifier. The text below calls this classifier 'cls'.
250 *
251 * Motivation
252 * ----------
253 *
254 * The thread safety rules described here for "struct dp_netdev_flow" are
255 * motivated by two goals:
256 *
257 * - Prevent threads that read members of "struct dp_netdev_flow" from
258 * reading bad data due to changes by some thread concurrently modifying
259 * those members.
260 *
261 * - Prevent two threads making changes to members of a given "struct
262 * dp_netdev_flow" from interfering with each other.
263 *
264 *
265 * Rules
266 * -----
267 *
ed79f89a
DDP
268 * A flow 'flow' may be accessed without a risk of being freed during an RCU
269 * grace period. Code that needs to hold onto a flow for a while
270 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
8a4e3a85
BP
271 *
272 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
ed79f89a
DDP
273 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
274 * from modification.
8a4e3a85
BP
275 *
276 * Some members, marked 'const', are immutable. Accessing other members
277 * requires synchronization, as noted in more detail below.
278 */
72865317 279struct dp_netdev_flow {
9bbf1c3d 280 bool dead;
2c0ea78f 281 /* Packet classification. */
8a4e3a85 282 const struct cls_rule cr; /* In owning dp_netdev's 'cls'. */
2c0ea78f 283
8a4e3a85 284 /* Hash table index by unmasked flow. */
9f361d6b 285 const struct cmap_node node; /* In owning dp_netdev's 'flow_table'. */
8a4e3a85 286 const struct flow flow; /* The flow that created this entry. */
72865317 287
ed79f89a
DDP
288 /* Number of references.
289 * The classifier owns one reference.
290 * Any thread trying to keep a rule from being freed should hold its own
291 * reference. */
292 struct ovs_refcount ref_cnt;
293
8a4e3a85
BP
294 /* Statistics.
295 *
296 * Reading or writing these members requires 'mutex'. */
679ba04c 297 struct ovsthread_stats stats; /* Contains "struct dp_netdev_flow_stats". */
8a4e3a85 298
45c626a3 299 /* Actions. */
61e7deb1 300 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
72865317
BP
301};
302
ed79f89a 303static void dp_netdev_flow_unref(struct dp_netdev_flow *);
9bbf1c3d 304static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
8a4e3a85 305
679ba04c
BP
306/* Contained by struct dp_netdev_flow's 'stats' member. */
307struct dp_netdev_flow_stats {
308 struct ovs_mutex mutex; /* Guards all the other members. */
309
310 long long int used OVS_GUARDED; /* Last used time, in monotonic msecs. */
311 long long int packet_count OVS_GUARDED; /* Number of packets matched. */
312 long long int byte_count OVS_GUARDED; /* Number of bytes matched. */
313 uint16_t tcp_flags OVS_GUARDED; /* Bitwise-OR of seen tcp_flags values. */
314};
315
a84cb64a
BP
316/* A set of datapath actions within a "struct dp_netdev_flow".
317 *
318 *
319 * Thread-safety
320 * =============
321 *
45c626a3 322 * A struct dp_netdev_actions 'actions' is protected with RCU. */
a84cb64a 323struct dp_netdev_actions {
a84cb64a
BP
324 /* These members are immutable: they do not change during the struct's
325 * lifetime. */
326 struct nlattr *actions; /* Sequence of OVS_ACTION_ATTR_* attributes. */
327 unsigned int size; /* Size of 'actions', in bytes. */
328};
329
330struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
331 size_t);
61e7deb1
BP
332struct dp_netdev_actions *dp_netdev_flow_get_actions(
333 const struct dp_netdev_flow *);
334static void dp_netdev_actions_free(struct dp_netdev_actions *);
a84cb64a 335
e4cfed38
PS
336/* PMD: Poll modes drivers. PMD accesses devices via polling to eliminate
337 * the performance overhead of interrupt processing. Therefore netdev can
338 * not implement rx-wait for these devices. dpif-netdev needs to poll
339 * these device to check for recv buffer. pmd-thread does polling for
340 * devices assigned to itself thread.
341 *
342 * DPDK used PMD for accessing NIC.
343 *
344 * A thread that receives packets from PMD ports, looks them up in the flow
345 * table, and executes the actions it finds.
346 **/
347struct pmd_thread {
6c3eee82 348 struct dp_netdev *dp;
9bbf1c3d 349 struct emc_cache flow_cache;
6c3eee82 350 pthread_t thread;
e4cfed38
PS
351 int id;
352 atomic_uint change_seq;
6c3eee82
BP
353};
354
84067a4c
JR
355#define PMD_INITIAL_SEQ 1
356
72865317
BP
357/* Interface to netdev-based datapath. */
358struct dpif_netdev {
359 struct dpif dpif;
360 struct dp_netdev *dp;
d33ed218 361 uint64_t last_port_seq;
72865317
BP
362};
363
8a4e3a85 364static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
59e6d833 365 struct dp_netdev_port **portp);
8a4e3a85 366static int get_port_by_name(struct dp_netdev *dp, const char *devname,
59e6d833 367 struct dp_netdev_port **portp);
8a4e3a85
BP
368static void dp_netdev_free(struct dp_netdev *)
369 OVS_REQUIRES(dp_netdev_mutex);
72865317 370static void dp_netdev_flow_flush(struct dp_netdev *);
8a4e3a85
BP
371static int do_add_port(struct dp_netdev *dp, const char *devname,
372 const char *type, odp_port_t port_no)
59e6d833 373 OVS_REQUIRES(dp->port_mutex);
c40b890f 374static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
59e6d833 375 OVS_REQUIRES(dp->port_mutex);
614c4892
BP
376static int dpif_netdev_open(const struct dpif_class *, const char *name,
377 bool create, struct dpif **);
8a4e3a85 378static void dp_netdev_execute_actions(struct dp_netdev *dp,
8cbf4f47
DDP
379 struct dpif_packet **, int c,
380 bool may_steal, struct pkt_metadata *,
9bbf1c3d 381 struct emc_cache *flow_cache,
4edb9ae9 382 const struct nlattr *actions,
e4cfed38 383 size_t actions_len);
3c33f0ff
JR
384static void dp_netdev_input(struct dp_netdev *, struct emc_cache *,
385 struct dpif_packet **, int cnt,
386 struct pkt_metadata *);
e4cfed38
PS
387
388static void dp_netdev_set_pmd_threads(struct dp_netdev *, int n);
6b31e073 389static void dp_netdev_disable_upcall(struct dp_netdev *);
72865317 390
9bbf1c3d
DDP
391static void emc_clear_entry(struct emc_entry *ce);
392
393static void
394emc_cache_init(struct emc_cache *flow_cache)
395{
396 int i;
397
398 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
399 flow_cache->entries[i].flow = NULL;
400 flow_cache->entries[i].hash = 0;
401 miniflow_initialize(&flow_cache->entries[i].mf.flow,
402 flow_cache->entries[i].mf.buf);
403 }
404}
405
406static void
407emc_cache_uninit(struct emc_cache *flow_cache)
408{
409 int i;
410
411 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
412 emc_clear_entry(&flow_cache->entries[i]);
413 }
414}
415
72865317
BP
416static struct dpif_netdev *
417dpif_netdev_cast(const struct dpif *dpif)
418{
cb22974d 419 ovs_assert(dpif->dpif_class->open == dpif_netdev_open);
72865317
BP
420 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
421}
422
423static struct dp_netdev *
424get_dp_netdev(const struct dpif *dpif)
425{
426 return dpif_netdev_cast(dpif)->dp;
427}
428
2197d7ab 429static int
2240af25
DDP
430dpif_netdev_enumerate(struct sset *all_dps,
431 const struct dpif_class *dpif_class)
2197d7ab
GL
432{
433 struct shash_node *node;
434
97be1538 435 ovs_mutex_lock(&dp_netdev_mutex);
2197d7ab 436 SHASH_FOR_EACH(node, &dp_netdevs) {
2240af25
DDP
437 struct dp_netdev *dp = node->data;
438 if (dpif_class != dp->class) {
439 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
440 * If the class doesn't match, skip this dpif. */
441 continue;
442 }
2197d7ab
GL
443 sset_add(all_dps, node->name);
444 }
97be1538 445 ovs_mutex_unlock(&dp_netdev_mutex);
5279f8fd 446
2197d7ab
GL
447 return 0;
448}
449
add90f6f
EJ
450static bool
451dpif_netdev_class_is_dummy(const struct dpif_class *class)
452{
453 return class != &dpif_netdev_class;
454}
455
0aeaabc8
JP
/* Maps the user-requested port type to the netdev type actually opened:
 * "internal" becomes "dummy" for dummy classes and "tap" otherwise; any other
 * type passes through unchanged. */
static const char *
dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
{
    if (strcmp(type, "internal")) {
        return type;
    }
    return dpif_netdev_class_is_dummy(class) ? "dummy" : "tap";
}
463
72865317
BP
464static struct dpif *
465create_dpif_netdev(struct dp_netdev *dp)
466{
462278db 467 uint16_t netflow_id = hash_string(dp->name, 0);
72865317 468 struct dpif_netdev *dpif;
72865317 469
6a8267c5 470 ovs_refcount_ref(&dp->ref_cnt);
72865317 471
72865317 472 dpif = xmalloc(sizeof *dpif);
614c4892 473 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
72865317 474 dpif->dp = dp;
d33ed218 475 dpif->last_port_seq = seq_read(dp->port_seq);
72865317
BP
476
477 return &dpif->dpif;
478}
479
4e022ec0
AW
480/* Choose an unused, non-zero port number and return it on success.
481 * Return ODPP_NONE on failure. */
482static odp_port_t
e44768b7 483choose_port(struct dp_netdev *dp, const char *name)
59e6d833 484 OVS_REQUIRES(dp->port_mutex)
e44768b7 485{
4e022ec0 486 uint32_t port_no;
e44768b7
JP
487
488 if (dp->class != &dpif_netdev_class) {
489 const char *p;
490 int start_no = 0;
491
492 /* If the port name begins with "br", start the number search at
493 * 100 to make writing tests easier. */
494 if (!strncmp(name, "br", 2)) {
495 start_no = 100;
496 }
497
498 /* If the port name contains a number, try to assign that port number.
499 * This can make writing unit tests easier because port numbers are
500 * predictable. */
501 for (p = name; *p != '\0'; p++) {
502 if (isdigit((unsigned char) *p)) {
503 port_no = start_no + strtol(p, NULL, 10);
ff073a71
BP
504 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
505 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
4e022ec0 506 return u32_to_odp(port_no);
e44768b7
JP
507 }
508 break;
509 }
510 }
511 }
512
ff073a71
BP
513 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
514 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
4e022ec0 515 return u32_to_odp(port_no);
e44768b7
JP
516 }
517 }
518
4e022ec0 519 return ODPP_NONE;
e44768b7
JP
520}
521
72865317 522static int
614c4892
BP
523create_dp_netdev(const char *name, const struct dpif_class *class,
524 struct dp_netdev **dpp)
8a4e3a85 525 OVS_REQUIRES(dp_netdev_mutex)
72865317
BP
526{
527 struct dp_netdev *dp;
528 int error;
72865317 529
462278db 530 dp = xzalloc(sizeof *dp);
8a4e3a85
BP
531 shash_add(&dp_netdevs, name, dp);
532
533 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
534 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
6a8267c5 535 ovs_refcount_init(&dp->ref_cnt);
1a65ba85 536 atomic_flag_clear(&dp->destroyed);
8a4e3a85
BP
537
538 ovs_mutex_init(&dp->flow_mutex);
539 classifier_init(&dp->cls, NULL);
9f361d6b 540 cmap_init(&dp->flow_table);
8a4e3a85 541
51852a57 542 ovsthread_stats_init(&dp->stats);
ed27e010 543
59e6d833
BP
544 ovs_mutex_init(&dp->port_mutex);
545 cmap_init(&dp->ports);
d33ed218 546 dp->port_seq = seq_create();
6c3eee82 547 latch_init(&dp->exit_latch);
6b31e073
RW
548 fat_rwlock_init(&dp->upcall_rwlock);
549
550 /* Disable upcalls by default. */
551 dp_netdev_disable_upcall(dp);
623540e4 552 dp->upcall_aux = NULL;
6b31e073 553 dp->upcall_cb = NULL;
e44768b7 554
59e6d833 555 ovs_mutex_lock(&dp->port_mutex);
4e022ec0 556 error = do_add_port(dp, name, "internal", ODPP_LOCAL);
59e6d833 557 ovs_mutex_unlock(&dp->port_mutex);
72865317
BP
558 if (error) {
559 dp_netdev_free(dp);
462278db 560 return error;
72865317
BP
561 }
562
3c33f0ff 563 ovs_mutex_init_recursive(&dp->emc_mutex);
9bbf1c3d
DDP
564 emc_cache_init(&dp->flow_cache);
565
462278db 566 *dpp = dp;
72865317
BP
567 return 0;
568}
569
570static int
614c4892 571dpif_netdev_open(const struct dpif_class *class, const char *name,
4a387741 572 bool create, struct dpif **dpifp)
72865317 573{
462278db 574 struct dp_netdev *dp;
5279f8fd 575 int error;
462278db 576
97be1538 577 ovs_mutex_lock(&dp_netdev_mutex);
462278db
BP
578 dp = shash_find_data(&dp_netdevs, name);
579 if (!dp) {
5279f8fd 580 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
72865317 581 } else {
5279f8fd
BP
582 error = (dp->class != class ? EINVAL
583 : create ? EEXIST
584 : 0);
585 }
586 if (!error) {
587 *dpifp = create_dpif_netdev(dp);
6b31e073 588 dp->dpif = *dpifp;
72865317 589 }
97be1538 590 ovs_mutex_unlock(&dp_netdev_mutex);
462278db 591
5279f8fd 592 return error;
72865317
BP
593}
594
8a4e3a85
BP
595/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
596 * through the 'dp_netdevs' shash while freeing 'dp'. */
1ba530f4
BP
597static void
598dp_netdev_free(struct dp_netdev *dp)
8a4e3a85 599 OVS_REQUIRES(dp_netdev_mutex)
1ba530f4 600{
59e6d833 601 struct dp_netdev_port *port;
51852a57
BP
602 struct dp_netdev_stats *bucket;
603 int i;
4ad28026 604
8a4e3a85
BP
605 shash_find_and_delete(&dp_netdevs, dp->name);
606
e4cfed38
PS
607 dp_netdev_set_pmd_threads(dp, 0);
608 free(dp->pmd_threads);
6c3eee82 609
1ba530f4 610 dp_netdev_flow_flush(dp);
59e6d833 611 ovs_mutex_lock(&dp->port_mutex);
a532e683 612 CMAP_FOR_EACH (port, node, &dp->ports) {
c40b890f 613 do_del_port(dp, port);
1ba530f4 614 }
59e6d833 615 ovs_mutex_unlock(&dp->port_mutex);
51852a57
BP
616
617 OVSTHREAD_STATS_FOR_EACH_BUCKET (bucket, i, &dp->stats) {
618 ovs_mutex_destroy(&bucket->mutex);
619 free_cacheline(bucket);
620 }
621 ovsthread_stats_destroy(&dp->stats);
f5126b57 622
2c0ea78f 623 classifier_destroy(&dp->cls);
9f361d6b 624 cmap_destroy(&dp->flow_table);
8a4e3a85 625 ovs_mutex_destroy(&dp->flow_mutex);
d33ed218 626 seq_destroy(dp->port_seq);
59e6d833 627 cmap_destroy(&dp->ports);
6b31e073 628 fat_rwlock_destroy(&dp->upcall_rwlock);
6c3eee82 629 latch_destroy(&dp->exit_latch);
9bbf1c3d
DDP
630
631 emc_cache_uninit(&dp->flow_cache);
632 ovs_mutex_destroy(&dp->emc_mutex);
633
8a4e3a85 634 free(CONST_CAST(char *, dp->name));
72865317
BP
635 free(dp);
636}
637
8a4e3a85
BP
638static void
639dp_netdev_unref(struct dp_netdev *dp)
640{
641 if (dp) {
642 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
643 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
644 ovs_mutex_lock(&dp_netdev_mutex);
24f83812 645 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
8a4e3a85
BP
646 dp_netdev_free(dp);
647 }
648 ovs_mutex_unlock(&dp_netdev_mutex);
649 }
650}
651
72865317
BP
/* Closes 'dpif': drops its datapath reference and frees the handle. */
static void
dpif_netdev_close(struct dpif *dpif)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);

    dp_netdev_unref(dp);
    free(dpif);
}
660
661static int
7dab847a 662dpif_netdev_destroy(struct dpif *dpif)
72865317
BP
663{
664 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd 665
6a8267c5 666 if (!atomic_flag_test_and_set(&dp->destroyed)) {
24f83812 667 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
6a8267c5
BP
668 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
669 OVS_NOT_REACHED();
670 }
671 }
5279f8fd 672
72865317
BP
673 return 0;
674}
675
676static int
a8d9304d 677dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
72865317
BP
678{
679 struct dp_netdev *dp = get_dp_netdev(dpif);
51852a57
BP
680 struct dp_netdev_stats *bucket;
681 size_t i;
5279f8fd 682
9f361d6b 683 stats->n_flows = cmap_count(&dp->flow_table);
8a4e3a85 684
51852a57
BP
685 stats->n_hit = stats->n_missed = stats->n_lost = 0;
686 OVSTHREAD_STATS_FOR_EACH_BUCKET (bucket, i, &dp->stats) {
687 ovs_mutex_lock(&bucket->mutex);
688 stats->n_hit += bucket->n[DP_STAT_HIT];
689 stats->n_missed += bucket->n[DP_STAT_MISS];
690 stats->n_lost += bucket->n[DP_STAT_LOST];
691 ovs_mutex_unlock(&bucket->mutex);
692 }
1ce3fa06 693 stats->n_masks = UINT32_MAX;
847108dc 694 stats->n_mask_hit = UINT64_MAX;
5279f8fd 695
72865317
BP
696 return 0;
697}
698
e4cfed38
PS
699static void
700dp_netdev_reload_pmd_threads(struct dp_netdev *dp)
701{
702 int i;
703
704 for (i = 0; i < dp->n_pmd_threads; i++) {
705 struct pmd_thread *f = &dp->pmd_threads[i];
84067a4c 706 int old_seq;
e4cfed38 707
91a96379 708 atomic_add_relaxed(&f->change_seq, 1, &old_seq);
84067a4c 709 }
e4cfed38
PS
710}
711
59e6d833
BP
712static uint32_t
713hash_port_no(odp_port_t port_no)
714{
715 return hash_int(odp_to_u32(port_no), 0);
716}
717
72865317 718static int
c3827f61 719do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
4e022ec0 720 odp_port_t port_no)
59e6d833 721 OVS_REQUIRES(dp->port_mutex)
72865317 722{
4b609110 723 struct netdev_saved_flags *sf;
72865317
BP
724 struct dp_netdev_port *port;
725 struct netdev *netdev;
2499a8ce 726 enum netdev_flags flags;
0cbfe35d 727 const char *open_type;
72865317 728 int error;
55c955bd 729 int i;
72865317
BP
730
731 /* XXX reject devices already in some dp_netdev. */
732
733 /* Open and validate network device. */
0aeaabc8 734 open_type = dpif_netdev_port_open_type(dp->class, type);
0cbfe35d 735 error = netdev_open(devname, open_type, &netdev);
72865317
BP
736 if (error) {
737 return error;
738 }
72865317
BP
739 /* XXX reject non-Ethernet devices */
740
2499a8ce
AC
741 netdev_get_flags(netdev, &flags);
742 if (flags & NETDEV_LOOPBACK) {
743 VLOG_ERR("%s: cannot add a loopback device", devname);
744 netdev_close(netdev);
745 return EINVAL;
746 }
747
5a034064
AW
748 if (netdev_is_pmd(netdev)) {
749 int n_cores = ovs_numa_get_n_cores();
750
751 if (n_cores == OVS_CORE_UNSPEC) {
752 VLOG_ERR("%s, cannot get cpu core info", devname);
753 return ENOENT;
754 }
755 /* There can only be ovs_numa_get_n_cores() pmd threads,
756 * so creates a tx_q for each. */
757 error = netdev_set_multiq(netdev, n_cores, NR_QUEUE);
758 if (error) {
759 VLOG_ERR("%s, cannot set multiq", devname);
760 return errno;
761 }
762 }
e4cfed38
PS
763 port = xzalloc(sizeof *port);
764 port->port_no = port_no;
765 port->netdev = netdev;
55c955bd 766 port->rxq = xmalloc(sizeof *port->rxq * netdev_n_rxq(netdev));
e4cfed38 767 port->type = xstrdup(type);
55c955bd
PS
768 for (i = 0; i < netdev_n_rxq(netdev); i++) {
769 error = netdev_rxq_open(netdev, &port->rxq[i], i);
770 if (error
771 && !(error == EOPNOTSUPP && dpif_netdev_class_is_dummy(dp->class))) {
772 VLOG_ERR("%s: cannot receive packets on this network device (%s)",
773 devname, ovs_strerror(errno));
774 netdev_close(netdev);
16bea12c
TG
775 free(port->type);
776 free(port->rxq);
777 free(port);
55c955bd
PS
778 return error;
779 }
7b6b0ef4
BP
780 }
781
4b609110 782 error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
72865317 783 if (error) {
55c955bd
PS
784 for (i = 0; i < netdev_n_rxq(netdev); i++) {
785 netdev_rxq_close(port->rxq[i]);
786 }
72865317 787 netdev_close(netdev);
16bea12c 788 free(port->type);
f7791740 789 free(port->rxq);
e4cfed38 790 free(port);
72865317
BP
791 return error;
792 }
4b609110 793 port->sf = sf;
e4cfed38
PS
794
795 if (netdev_is_pmd(netdev)) {
796 dp->pmd_count++;
db73f716 797 dp_netdev_set_pmd_threads(dp, NR_PMD_THREADS);
e4cfed38
PS
798 dp_netdev_reload_pmd_threads(dp);
799 }
800 ovs_refcount_init(&port->ref_cnt);
72865317 801
59e6d833 802 cmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
d33ed218 803 seq_change(dp->port_seq);
72865317
BP
804
805 return 0;
806}
807
247527db
BP
808static int
809dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
4e022ec0 810 odp_port_t *port_nop)
247527db
BP
811{
812 struct dp_netdev *dp = get_dp_netdev(dpif);
3aa30359
BP
813 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
814 const char *dpif_port;
4e022ec0 815 odp_port_t port_no;
5279f8fd 816 int error;
247527db 817
59e6d833 818 ovs_mutex_lock(&dp->port_mutex);
3aa30359 819 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
4e022ec0 820 if (*port_nop != ODPP_NONE) {
ff073a71
BP
821 port_no = *port_nop;
822 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
232dfa4a 823 } else {
3aa30359 824 port_no = choose_port(dp, dpif_port);
5279f8fd 825 error = port_no == ODPP_NONE ? EFBIG : 0;
232dfa4a 826 }
5279f8fd 827 if (!error) {
247527db 828 *port_nop = port_no;
5279f8fd 829 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
247527db 830 }
59e6d833 831 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd
BP
832
833 return error;
72865317
BP
834}
835
836static int
4e022ec0 837dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
72865317
BP
838{
839 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd
BP
840 int error;
841
59e6d833 842 ovs_mutex_lock(&dp->port_mutex);
c40b890f
BP
843 if (port_no == ODPP_LOCAL) {
844 error = EINVAL;
845 } else {
846 struct dp_netdev_port *port;
847
848 error = get_port_by_number(dp, port_no, &port);
849 if (!error) {
850 do_del_port(dp, port);
851 }
852 }
59e6d833 853 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd
BP
854
855 return error;
72865317
BP
856}
857
858static bool
4e022ec0 859is_valid_port_number(odp_port_t port_no)
72865317 860{
ff073a71
BP
861 return port_no != ODPP_NONE;
862}
863
864static struct dp_netdev_port *
865dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
866{
867 struct dp_netdev_port *port;
868
59e6d833 869 CMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
ff073a71
BP
870 if (port->port_no == port_no) {
871 return port;
872 }
873 }
874 return NULL;
72865317
BP
875}
876
877static int
878get_port_by_number(struct dp_netdev *dp,
4e022ec0 879 odp_port_t port_no, struct dp_netdev_port **portp)
72865317
BP
880{
881 if (!is_valid_port_number(port_no)) {
882 *portp = NULL;
883 return EINVAL;
884 } else {
ff073a71 885 *portp = dp_netdev_lookup_port(dp, port_no);
72865317
BP
886 return *portp ? 0 : ENOENT;
887 }
888}
889
b284085e
PS
890static void
891port_ref(struct dp_netdev_port *port)
892{
893 if (port) {
894 ovs_refcount_ref(&port->ref_cnt);
895 }
896}
897
a1fdee13
AW
898static bool
899port_try_ref(struct dp_netdev_port *port)
900{
901 if (port) {
902 return ovs_refcount_try_ref_rcu(&port->ref_cnt);
903 }
904
905 return false;
906}
907
b284085e 908static void
59e6d833 909port_destroy__(struct dp_netdev_port *port)
b284085e 910{
98de6beb 911 int n_rxq = netdev_n_rxq(port->netdev);
59e6d833 912 int i;
55c955bd 913
59e6d833
BP
914 netdev_close(port->netdev);
915 netdev_restore_flags(port->sf);
55c955bd 916
59e6d833
BP
917 for (i = 0; i < n_rxq; i++) {
918 netdev_rxq_close(port->rxq[i]);
919 }
920 free(port->rxq);
921 free(port->type);
922 free(port);
923}
924
925static void
926port_unref(struct dp_netdev_port *port)
927{
24f83812 928 if (port && ovs_refcount_unref_relaxed(&port->ref_cnt) == 1) {
59e6d833 929 ovsrcu_postpone(port_destroy__, port);
b284085e
PS
930 }
931}
932
72865317
BP
933static int
934get_port_by_name(struct dp_netdev *dp,
935 const char *devname, struct dp_netdev_port **portp)
59e6d833 936 OVS_REQUIRES(dp->port_mutex)
72865317
BP
937{
938 struct dp_netdev_port *port;
939
a532e683 940 CMAP_FOR_EACH (port, node, &dp->ports) {
3efb6063 941 if (!strcmp(netdev_get_name(port->netdev), devname)) {
72865317
BP
942 *portp = port;
943 return 0;
944 }
945 }
946 return ENOENT;
947}
948
c40b890f
BP
949static void
950do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
59e6d833 951 OVS_REQUIRES(dp->port_mutex)
72865317 952{
c40b890f 953 cmap_remove(&dp->ports, &port->node, hash_odp_port(port->port_no));
d33ed218 954 seq_change(dp->port_seq);
e4cfed38
PS
955 if (netdev_is_pmd(port->netdev)) {
956 dp_netdev_reload_pmd_threads(dp);
957 }
72865317 958
b284085e 959 port_unref(port);
72865317
BP
960}
961
962static void
4c738a8d
BP
963answer_port_query(const struct dp_netdev_port *port,
964 struct dpif_port *dpif_port)
72865317 965{
3efb6063 966 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
0cbfe35d 967 dpif_port->type = xstrdup(port->type);
4c738a8d 968 dpif_port->port_no = port->port_no;
72865317
BP
969}
970
971static int
4e022ec0 972dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
4c738a8d 973 struct dpif_port *dpif_port)
72865317
BP
974{
975 struct dp_netdev *dp = get_dp_netdev(dpif);
976 struct dp_netdev_port *port;
977 int error;
978
979 error = get_port_by_number(dp, port_no, &port);
4afba28d 980 if (!error && dpif_port) {
4c738a8d 981 answer_port_query(port, dpif_port);
72865317 982 }
5279f8fd 983
72865317
BP
984 return error;
985}
986
987static int
988dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
4c738a8d 989 struct dpif_port *dpif_port)
72865317
BP
990{
991 struct dp_netdev *dp = get_dp_netdev(dpif);
992 struct dp_netdev_port *port;
993 int error;
994
59e6d833 995 ovs_mutex_lock(&dp->port_mutex);
72865317 996 error = get_port_by_name(dp, devname, &port);
4afba28d 997 if (!error && dpif_port) {
4c738a8d 998 answer_port_query(port, dpif_port);
72865317 999 }
59e6d833 1000 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 1001
72865317
BP
1002 return error;
1003}
1004
61e7deb1
BP
1005static void
1006dp_netdev_flow_free(struct dp_netdev_flow *flow)
1007{
1008 struct dp_netdev_flow_stats *bucket;
1009 size_t i;
1010
1011 OVSTHREAD_STATS_FOR_EACH_BUCKET (bucket, i, &flow->stats) {
1012 ovs_mutex_destroy(&bucket->mutex);
1013 free_cacheline(bucket);
1014 }
1015 ovsthread_stats_destroy(&flow->stats);
1016
1017 cls_rule_destroy(CONST_CAST(struct cls_rule *, &flow->cr));
1018 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
61e7deb1
BP
1019 free(flow);
1020}
1021
ed79f89a
DDP
1022static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
1023{
1024 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
1025 ovsrcu_postpone(dp_netdev_flow_free, flow);
1026 }
1027}
1028
72865317 1029static void
8a4e3a85 1030dp_netdev_remove_flow(struct dp_netdev *dp, struct dp_netdev_flow *flow)
8a4e3a85 1031 OVS_REQUIRES(dp->flow_mutex)
72865317 1032{
8a4e3a85 1033 struct cls_rule *cr = CONST_CAST(struct cls_rule *, &flow->cr);
9f361d6b 1034 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2c0ea78f 1035
8a4e3a85 1036 classifier_remove(&dp->cls, cr);
9f361d6b 1037 cmap_remove(&dp->flow_table, node, flow_hash(&flow->flow, 0));
9bbf1c3d 1038 flow->dead = true;
ed79f89a
DDP
1039
1040 dp_netdev_flow_unref(flow);
72865317
BP
1041}
1042
1043static void
1044dp_netdev_flow_flush(struct dp_netdev *dp)
1045{
78c8df12 1046 struct dp_netdev_flow *netdev_flow;
72865317 1047
8a4e3a85 1048 ovs_mutex_lock(&dp->flow_mutex);
6bc3bb82 1049 CMAP_FOR_EACH (netdev_flow, node, &dp->flow_table) {
8a4e3a85 1050 dp_netdev_remove_flow(dp, netdev_flow);
72865317 1051 }
8a4e3a85 1052 ovs_mutex_unlock(&dp->flow_mutex);
72865317
BP
1053}
1054
/* dpif callback: flushes all flows.  Always succeeds. */
static int
dpif_netdev_flow_flush(struct dpif *dpif)
{
    dp_netdev_flow_flush(get_dp_netdev(dpif));
    return 0;
}
1063
b0ec0f27 1064struct dp_netdev_port_state {
59e6d833 1065 struct cmap_position position;
4c738a8d 1066 char *name;
b0ec0f27
BP
1067};
1068
1069static int
1070dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
1071{
1072 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
1073 return 0;
1074}
1075
72865317 1076static int
b0ec0f27 1077dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
4c738a8d 1078 struct dpif_port *dpif_port)
72865317 1079{
b0ec0f27 1080 struct dp_netdev_port_state *state = state_;
72865317 1081 struct dp_netdev *dp = get_dp_netdev(dpif);
59e6d833 1082 struct cmap_node *node;
ff073a71 1083 int retval;
72865317 1084
59e6d833 1085 node = cmap_next_position(&dp->ports, &state->position);
ff073a71
BP
1086 if (node) {
1087 struct dp_netdev_port *port;
5279f8fd 1088
ff073a71
BP
1089 port = CONTAINER_OF(node, struct dp_netdev_port, node);
1090
1091 free(state->name);
1092 state->name = xstrdup(netdev_get_name(port->netdev));
1093 dpif_port->name = state->name;
1094 dpif_port->type = port->type;
1095 dpif_port->port_no = port->port_no;
1096
1097 retval = 0;
1098 } else {
1099 retval = EOF;
72865317 1100 }
5279f8fd 1101
ff073a71 1102 return retval;
b0ec0f27
BP
1103}
1104
1105static int
4c738a8d 1106dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
b0ec0f27 1107{
4c738a8d
BP
1108 struct dp_netdev_port_state *state = state_;
1109 free(state->name);
b0ec0f27
BP
1110 free(state);
1111 return 0;
72865317
BP
1112}
1113
1114static int
67a4917b 1115dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
72865317
BP
1116{
1117 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
d33ed218 1118 uint64_t new_port_seq;
5279f8fd
BP
1119 int error;
1120
d33ed218
BP
1121 new_port_seq = seq_read(dpif->dp->port_seq);
1122 if (dpif->last_port_seq != new_port_seq) {
1123 dpif->last_port_seq = new_port_seq;
5279f8fd 1124 error = ENOBUFS;
72865317 1125 } else {
5279f8fd 1126 error = EAGAIN;
72865317 1127 }
5279f8fd
BP
1128
1129 return error;
72865317
BP
1130}
1131
1132static void
1133dpif_netdev_port_poll_wait(const struct dpif *dpif_)
1134{
1135 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
5279f8fd 1136
d33ed218 1137 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
8a4e3a85
BP
1138}
1139
1140static struct dp_netdev_flow *
1141dp_netdev_flow_cast(const struct cls_rule *cr)
1142{
1143 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
72865317
BP
1144}
1145
9bbf1c3d
DDP
1146static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
1147{
1148 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
1149}
1150
1151static inline bool
1152emc_entry_alive(struct emc_entry *ce)
1153{
1154 return ce->flow && !ce->flow->dead;
1155}
1156
1157static void
1158emc_clear_entry(struct emc_entry *ce)
1159{
1160 if (ce->flow) {
1161 dp_netdev_flow_unref(ce->flow);
1162 ce->flow = NULL;
1163 }
1164}
1165
1166static inline void
1167emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
1168 const struct miniflow *mf, uint32_t hash)
1169{
1170 if (ce->flow != flow) {
1171 if (ce->flow) {
1172 dp_netdev_flow_unref(ce->flow);
1173 }
1174
1175 if (dp_netdev_flow_ref(flow)) {
1176 ce->flow = flow;
1177 } else {
1178 ce->flow = NULL;
1179 }
1180 }
1181 if (mf) {
1182 miniflow_clone_inline(&ce->mf.flow, mf, count_1bits(mf->map));
1183 ce->hash = hash;
1184 }
1185}
1186
1187static inline void
1188emc_insert(struct emc_cache *cache, const struct miniflow *mf, uint32_t hash,
1189 struct dp_netdev_flow *flow)
1190{
1191 struct emc_entry *to_be_replaced = NULL;
1192 struct emc_entry *current_entry;
1193
1194 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, hash) {
1195 if (current_entry->hash == hash
1196 && miniflow_equal(&current_entry->mf.flow, mf)) {
1197
1198 /* We found the entry with the 'mf' miniflow */
1199 emc_change_entry(current_entry, flow, NULL, 0);
1200 return;
1201 }
1202
1203 /* Replacement policy: put the flow in an empty (not alive) entry, or
1204 * in the first entry where it can be */
1205 if (!to_be_replaced
1206 || (emc_entry_alive(to_be_replaced)
1207 && !emc_entry_alive(current_entry))
1208 || current_entry->hash < to_be_replaced->hash) {
1209 to_be_replaced = current_entry;
1210 }
1211 }
1212 /* We didn't find the miniflow in the cache.
1213 * The 'to_be_replaced' entry is where the new flow will be stored */
1214
1215 emc_change_entry(to_be_replaced, flow, mf, hash);
1216}
1217
1218static inline struct dp_netdev_flow *
1219emc_lookup(struct emc_cache *cache, const struct miniflow *mf, uint32_t hash)
1220{
1221 struct emc_entry *current_entry;
1222
1223 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, hash) {
1224 if (current_entry->hash == hash && emc_entry_alive(current_entry)
1225 && miniflow_equal(&current_entry->mf.flow, mf)) {
1226
1227 /* We found the entry with the 'mf' miniflow */
1228 return current_entry->flow;
1229 }
1230 }
1231
1232 return NULL;
1233}
1234
72865317 1235static struct dp_netdev_flow *
4f150744 1236dp_netdev_lookup_flow(const struct dp_netdev *dp, const struct miniflow *key)
2c0ea78f 1237{
8a4e3a85 1238 struct dp_netdev_flow *netdev_flow;
4f150744 1239 struct cls_rule *rule;
2c0ea78f 1240
b7648634 1241 classifier_lookup_miniflow_batch(&dp->cls, &key, &rule, 1);
4f150744 1242 netdev_flow = dp_netdev_flow_cast(rule);
2c0ea78f 1243
8a4e3a85 1244 return netdev_flow;
2c0ea78f
GS
1245}
1246
1247static struct dp_netdev_flow *
1248dp_netdev_find_flow(const struct dp_netdev *dp, const struct flow *flow)
72865317 1249{
1763b4b8 1250 struct dp_netdev_flow *netdev_flow;
72865317 1251
9f361d6b 1252 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, flow_hash(flow, 0),
1763b4b8 1253 &dp->flow_table) {
2c0ea78f 1254 if (flow_equal(&netdev_flow->flow, flow)) {
61e7deb1 1255 return netdev_flow;
72865317
BP
1256 }
1257 }
8a4e3a85 1258
72865317
BP
1259 return NULL;
1260}
1261
1262static void
6fe09f8c 1263get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow,
1763b4b8 1264 struct dpif_flow_stats *stats)
feebdea2 1265{
679ba04c
BP
1266 struct dp_netdev_flow_stats *bucket;
1267 size_t i;
1268
1269 memset(stats, 0, sizeof *stats);
1270 OVSTHREAD_STATS_FOR_EACH_BUCKET (bucket, i, &netdev_flow->stats) {
1271 ovs_mutex_lock(&bucket->mutex);
1272 stats->n_packets += bucket->packet_count;
1273 stats->n_bytes += bucket->byte_count;
1274 stats->used = MAX(stats->used, bucket->used);
1275 stats->tcp_flags |= bucket->tcp_flags;
1276 ovs_mutex_unlock(&bucket->mutex);
1277 }
72865317
BP
1278}
1279
6fe09f8c
JS
1280static void
1281dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
1282 struct ofpbuf *buffer, struct dpif_flow *flow)
1283{
1284 struct flow_wildcards wc;
1285 struct dp_netdev_actions *actions;
1286
1287 minimask_expand(&netdev_flow->cr.match.mask, &wc);
1288 odp_flow_key_from_mask(buffer, &wc.masks, &netdev_flow->flow,
1289 odp_to_u32(wc.masks.in_port.odp_port),
1290 SIZE_MAX, true);
1291 flow->mask = ofpbuf_data(buffer);
1292 flow->mask_len = ofpbuf_size(buffer);
1293
1294 actions = dp_netdev_flow_get_actions(netdev_flow);
1295 flow->actions = actions->actions;
1296 flow->actions_len = actions->size;
1297
1298 get_dpif_flow_stats(netdev_flow, &flow->stats);
1299}
1300
36956a7d 1301static int
8c301900
JR
1302dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
1303 const struct nlattr *mask_key,
1304 uint32_t mask_key_len, const struct flow *flow,
1305 struct flow *mask)
1306{
1307 if (mask_key_len) {
80e44883
BP
1308 enum odp_key_fitness fitness;
1309
1310 fitness = odp_flow_key_to_mask(mask_key, mask_key_len, mask, flow);
1311 if (fitness) {
8c301900
JR
1312 /* This should not happen: it indicates that
1313 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
1314 * disagree on the acceptable form of a mask. Log the problem
1315 * as an error, with enough details to enable debugging. */
1316 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
1317
1318 if (!VLOG_DROP_ERR(&rl)) {
1319 struct ds s;
1320
1321 ds_init(&s);
1322 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
1323 true);
80e44883
BP
1324 VLOG_ERR("internal error parsing flow mask %s (%s)",
1325 ds_cstr(&s), odp_key_fitness_to_string(fitness));
8c301900
JR
1326 ds_destroy(&s);
1327 }
1328
1329 return EINVAL;
1330 }
8c301900
JR
1331 } else {
1332 enum mf_field_id id;
1333 /* No mask key, unwildcard everything except fields whose
1334 * prerequisities are not met. */
1335 memset(mask, 0x0, sizeof *mask);
1336
1337 for (id = 0; id < MFF_N_IDS; ++id) {
1338 /* Skip registers and metadata. */
1339 if (!(id >= MFF_REG0 && id < MFF_REG0 + FLOW_N_REGS)
1340 && id != MFF_METADATA) {
1341 const struct mf_field *mf = mf_from_id(id);
1342 if (mf_are_prereqs_ok(mf, flow)) {
1343 mf_mask_field(mf, mask);
1344 }
1345 }
1346 }
1347 }
1348
f3f750e5
BP
1349 /* Force unwildcard the in_port.
1350 *
1351 * We need to do this even in the case where we unwildcard "everything"
1352 * above because "everything" only includes the 16-bit OpenFlow port number
1353 * mask->in_port.ofp_port, which only covers half of the 32-bit datapath
1354 * port number mask->in_port.odp_port. */
1355 mask->in_port.odp_port = u32_to_odp(UINT32_MAX);
1356
8c301900
JR
1357 return 0;
1358}
1359
1360static int
1361dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
1362 struct flow *flow)
36956a7d 1363{
586ddea5
BP
1364 odp_port_t in_port;
1365
8c301900 1366 if (odp_flow_key_to_flow(key, key_len, flow)) {
36956a7d 1367 /* This should not happen: it indicates that odp_flow_key_from_flow()
8c301900
JR
1368 * and odp_flow_key_to_flow() disagree on the acceptable form of a
1369 * flow. Log the problem as an error, with enough details to enable
1370 * debugging. */
36956a7d
BP
1371 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
1372
1373 if (!VLOG_DROP_ERR(&rl)) {
1374 struct ds s;
1375
1376 ds_init(&s);
8c301900 1377 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
36956a7d
BP
1378 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
1379 ds_destroy(&s);
1380 }
1381
1382 return EINVAL;
1383 }
1384
586ddea5
BP
1385 in_port = flow->in_port.odp_port;
1386 if (!is_valid_port_number(in_port) && in_port != ODPP_NONE) {
18886b60
BP
1387 return EINVAL;
1388 }
1389
36956a7d
BP
1390 return 0;
1391}
1392
72865317 1393static int
6fe09f8c 1394dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
72865317
BP
1395{
1396 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 1397 struct dp_netdev_flow *netdev_flow;
bc4a05c6
BP
1398 struct flow key;
1399 int error;
36956a7d 1400
6fe09f8c 1401 error = dpif_netdev_flow_from_nlattrs(get->key, get->key_len, &key);
bc4a05c6
BP
1402 if (error) {
1403 return error;
1404 }
14608a15 1405
2c0ea78f 1406 netdev_flow = dp_netdev_find_flow(dp, &key);
8a4e3a85 1407
1763b4b8 1408 if (netdev_flow) {
6fe09f8c 1409 dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->flow);
61e7deb1 1410 } else {
5279f8fd 1411 error = ENOENT;
72865317 1412 }
bc4a05c6 1413
5279f8fd 1414 return error;
72865317
BP
1415}
1416
72865317 1417static int
ae2ceebd
EJ
1418dp_netdev_flow_add(struct dp_netdev *dp, struct match *match,
1419 const struct nlattr *actions, size_t actions_len)
8a4e3a85 1420 OVS_REQUIRES(dp->flow_mutex)
72865317 1421{
1763b4b8 1422 struct dp_netdev_flow *netdev_flow;
72865317 1423
1763b4b8 1424 netdev_flow = xzalloc(sizeof *netdev_flow);
ae2ceebd 1425 *CONST_CAST(struct flow *, &netdev_flow->flow) = match->flow;
8a4e3a85 1426
ed79f89a
DDP
1427 ovs_refcount_init(&netdev_flow->ref_cnt);
1428
679ba04c
BP
1429 ovsthread_stats_init(&netdev_flow->stats);
1430
61e7deb1
BP
1431 ovsrcu_set(&netdev_flow->actions,
1432 dp_netdev_actions_create(actions, actions_len));
2c0ea78f 1433
8a4e3a85 1434 cls_rule_init(CONST_CAST(struct cls_rule *, &netdev_flow->cr),
ae2ceebd 1435 match, NETDEV_RULE_PRIORITY);
9f361d6b
JR
1436 cmap_insert(&dp->flow_table,
1437 CONST_CAST(struct cmap_node *, &netdev_flow->node),
ae2ceebd 1438 flow_hash(&match->flow, 0));
8a4e3a85
BP
1439 classifier_insert(&dp->cls,
1440 CONST_CAST(struct cls_rule *, &netdev_flow->cr));
72865317 1441
623540e4
EJ
1442 if (OVS_UNLIKELY(VLOG_IS_DBG_ENABLED())) {
1443 struct ds ds = DS_EMPTY_INITIALIZER;
1444
1445 ds_put_cstr(&ds, "flow_add: ");
1446 match_format(match, &ds, OFP_DEFAULT_PRIORITY);
1447 ds_put_cstr(&ds, ", actions:");
1448 format_odp_actions(&ds, actions, actions_len);
1449
1450 VLOG_DBG_RL(&upcall_rl, "%s", ds_cstr(&ds));
1451
1452 ds_destroy(&ds);
1453 }
1454
72865317
BP
1455 return 0;
1456}
1457
1458static void
1763b4b8 1459clear_stats(struct dp_netdev_flow *netdev_flow)
72865317 1460{
679ba04c
BP
1461 struct dp_netdev_flow_stats *bucket;
1462 size_t i;
1463
1464 OVSTHREAD_STATS_FOR_EACH_BUCKET (bucket, i, &netdev_flow->stats) {
1465 ovs_mutex_lock(&bucket->mutex);
1466 bucket->used = 0;
1467 bucket->packet_count = 0;
1468 bucket->byte_count = 0;
1469 bucket->tcp_flags = 0;
1470 ovs_mutex_unlock(&bucket->mutex);
1471 }
72865317
BP
1472}
1473
1474static int
89625d1e 1475dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
72865317
BP
1476{
1477 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 1478 struct dp_netdev_flow *netdev_flow;
4f150744 1479 struct miniflow miniflow;
ae2ceebd 1480 struct match match;
36956a7d
BP
1481 int error;
1482
ae2ceebd 1483 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow);
8c301900
JR
1484 if (error) {
1485 return error;
1486 }
1487 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
1488 put->mask, put->mask_len,
ae2ceebd 1489 &match.flow, &match.wc.masks);
36956a7d
BP
1490 if (error) {
1491 return error;
1492 }
ae2ceebd 1493 miniflow_init(&miniflow, &match.flow);
72865317 1494
8a4e3a85 1495 ovs_mutex_lock(&dp->flow_mutex);
4f150744 1496 netdev_flow = dp_netdev_lookup_flow(dp, &miniflow);
1763b4b8 1497 if (!netdev_flow) {
89625d1e 1498 if (put->flags & DPIF_FP_CREATE) {
9f361d6b 1499 if (cmap_count(&dp->flow_table) < MAX_FLOWS) {
89625d1e
BP
1500 if (put->stats) {
1501 memset(put->stats, 0, sizeof *put->stats);
feebdea2 1502 }
ae2ceebd 1503 error = dp_netdev_flow_add(dp, &match, put->actions,
5279f8fd 1504 put->actions_len);
72865317 1505 } else {
5279f8fd 1506 error = EFBIG;
72865317
BP
1507 }
1508 } else {
5279f8fd 1509 error = ENOENT;
72865317
BP
1510 }
1511 } else {
2c0ea78f 1512 if (put->flags & DPIF_FP_MODIFY
ae2ceebd 1513 && flow_equal(&match.flow, &netdev_flow->flow)) {
8a4e3a85
BP
1514 struct dp_netdev_actions *new_actions;
1515 struct dp_netdev_actions *old_actions;
1516
1517 new_actions = dp_netdev_actions_create(put->actions,
1518 put->actions_len);
1519
61e7deb1
BP
1520 old_actions = dp_netdev_flow_get_actions(netdev_flow);
1521 ovsrcu_set(&netdev_flow->actions, new_actions);
679ba04c 1522
a84cb64a
BP
1523 if (put->stats) {
1524 get_dpif_flow_stats(netdev_flow, put->stats);
1525 }
1526 if (put->flags & DPIF_FP_ZERO_STATS) {
1527 clear_stats(netdev_flow);
72865317 1528 }
8a4e3a85 1529
61e7deb1 1530 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
2c0ea78f 1531 } else if (put->flags & DPIF_FP_CREATE) {
5279f8fd 1532 error = EEXIST;
2c0ea78f
GS
1533 } else {
1534 /* Overlapping flow. */
1535 error = EINVAL;
72865317
BP
1536 }
1537 }
8a4e3a85 1538 ovs_mutex_unlock(&dp->flow_mutex);
5715de14 1539 miniflow_destroy(&miniflow);
5279f8fd
BP
1540
1541 return error;
72865317
BP
1542}
1543
72865317 1544static int
b99d3cee 1545dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
72865317
BP
1546{
1547 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 1548 struct dp_netdev_flow *netdev_flow;
14608a15 1549 struct flow key;
36956a7d
BP
1550 int error;
1551
b99d3cee 1552 error = dpif_netdev_flow_from_nlattrs(del->key, del->key_len, &key);
36956a7d
BP
1553 if (error) {
1554 return error;
1555 }
72865317 1556
8a4e3a85 1557 ovs_mutex_lock(&dp->flow_mutex);
2c0ea78f 1558 netdev_flow = dp_netdev_find_flow(dp, &key);
1763b4b8 1559 if (netdev_flow) {
b99d3cee 1560 if (del->stats) {
1763b4b8 1561 get_dpif_flow_stats(netdev_flow, del->stats);
feebdea2 1562 }
8a4e3a85 1563 dp_netdev_remove_flow(dp, netdev_flow);
72865317 1564 } else {
5279f8fd 1565 error = ENOENT;
72865317 1566 }
8a4e3a85 1567 ovs_mutex_unlock(&dp->flow_mutex);
5279f8fd
BP
1568
1569 return error;
72865317
BP
1570}
1571
ac64794a
BP
1572struct dpif_netdev_flow_dump {
1573 struct dpif_flow_dump up;
9f361d6b 1574 struct cmap_position pos;
d2ad7ef1
JS
1575 int status;
1576 struct ovs_mutex mutex;
e723fd32
JS
1577};
1578
ac64794a
BP
1579static struct dpif_netdev_flow_dump *
1580dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
72865317 1581{
ac64794a 1582 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
e723fd32
JS
1583}
1584
ac64794a
BP
1585static struct dpif_flow_dump *
1586dpif_netdev_flow_dump_create(const struct dpif *dpif_)
e723fd32 1587{
ac64794a 1588 struct dpif_netdev_flow_dump *dump;
e723fd32 1589
ac64794a
BP
1590 dump = xmalloc(sizeof *dump);
1591 dpif_flow_dump_init(&dump->up, dpif_);
9f361d6b 1592 memset(&dump->pos, 0, sizeof dump->pos);
ac64794a
BP
1593 dump->status = 0;
1594 ovs_mutex_init(&dump->mutex);
1595
1596 return &dump->up;
e723fd32
JS
1597}
1598
1599static int
ac64794a 1600dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
e723fd32 1601{
ac64794a 1602 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
e723fd32 1603
ac64794a
BP
1604 ovs_mutex_destroy(&dump->mutex);
1605 free(dump);
704a1e09
BP
1606 return 0;
1607}
1608
ac64794a
BP
1609struct dpif_netdev_flow_dump_thread {
1610 struct dpif_flow_dump_thread up;
1611 struct dpif_netdev_flow_dump *dump;
8bb113da
RW
1612 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
1613 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
ac64794a
BP
1614};
1615
1616static struct dpif_netdev_flow_dump_thread *
1617dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
1618{
1619 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
1620}
1621
1622static struct dpif_flow_dump_thread *
1623dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
1624{
1625 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
1626 struct dpif_netdev_flow_dump_thread *thread;
1627
1628 thread = xmalloc(sizeof *thread);
1629 dpif_flow_dump_thread_init(&thread->up, &dump->up);
1630 thread->dump = dump;
1631 return &thread->up;
1632}
1633
1634static void
1635dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
1636{
1637 struct dpif_netdev_flow_dump_thread *thread
1638 = dpif_netdev_flow_dump_thread_cast(thread_);
1639
1640 free(thread);
1641}
1642
704a1e09 1643static int
ac64794a 1644dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
8bb113da 1645 struct dpif_flow *flows, int max_flows)
ac64794a
BP
1646{
1647 struct dpif_netdev_flow_dump_thread *thread
1648 = dpif_netdev_flow_dump_thread_cast(thread_);
1649 struct dpif_netdev_flow_dump *dump = thread->dump;
1650 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
8bb113da 1651 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
ac64794a 1652 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
8bb113da
RW
1653 int n_flows = 0;
1654 int i;
14608a15 1655
ac64794a 1656 ovs_mutex_lock(&dump->mutex);
8bb113da 1657 if (!dump->status) {
8bb113da
RW
1658 for (n_flows = 0; n_flows < MIN(max_flows, FLOW_DUMP_MAX_BATCH);
1659 n_flows++) {
9f361d6b 1660 struct cmap_node *node;
8bb113da 1661
9f361d6b 1662 node = cmap_next_position(&dp->flow_table, &dump->pos);
8bb113da
RW
1663 if (!node) {
1664 dump->status = EOF;
1665 break;
1666 }
1667 netdev_flows[n_flows] = CONTAINER_OF(node, struct dp_netdev_flow,
1668 node);
d2ad7ef1 1669 }
8a4e3a85 1670 }
ac64794a 1671 ovs_mutex_unlock(&dump->mutex);
ac64794a 1672
8bb113da
RW
1673 for (i = 0; i < n_flows; i++) {
1674 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
1675 struct odputil_keybuf *keybuf = &thread->keybuf[i];
1676 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
1677 struct dpif_flow *f = &flows[i];
1678 struct dp_netdev_actions *dp_actions;
1679 struct flow_wildcards wc;
1680 struct ofpbuf buf;
1681
1682 minimask_expand(&netdev_flow->cr.match.mask, &wc);
1683
1684 /* Key. */
1685 ofpbuf_use_stack(&buf, keybuf, sizeof *keybuf);
1686 odp_flow_key_from_flow(&buf, &netdev_flow->flow, &wc.masks,
1687 netdev_flow->flow.in_port.odp_port, true);
1688 f->key = ofpbuf_data(&buf);
1689 f->key_len = ofpbuf_size(&buf);
1690
1691 /* Mask. */
1692 ofpbuf_use_stack(&buf, maskbuf, sizeof *maskbuf);
1693 odp_flow_key_from_mask(&buf, &wc.masks, &netdev_flow->flow,
1694 odp_to_u32(wc.masks.in_port.odp_port),
1695 SIZE_MAX, true);
1696 f->mask = ofpbuf_data(&buf);
1697 f->mask_len = ofpbuf_size(&buf);
1698
1699 /* Actions. */
1700 dp_actions = dp_netdev_flow_get_actions(netdev_flow);
1701 f->actions = dp_actions->actions;
1702 f->actions_len = dp_actions->size;
1703
1704 /* Stats. */
1705 get_dpif_flow_stats(netdev_flow, &f->stats);
1706 }
feebdea2 1707
8bb113da 1708 return n_flows;
72865317
BP
1709}
1710
1711static int
758c456d 1712dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
72865317
BP
1713{
1714 struct dp_netdev *dp = get_dp_netdev(dpif);
8cbf4f47 1715 struct dpif_packet packet, *pp;
758c456d 1716 struct pkt_metadata *md = &execute->md;
72865317 1717
1f317cb5
PS
1718 if (ofpbuf_size(execute->packet) < ETH_HEADER_LEN ||
1719 ofpbuf_size(execute->packet) > UINT16_MAX) {
72865317
BP
1720 return EINVAL;
1721 }
1722
91088554 1723 packet.ofpbuf = *execute->packet;
8cbf4f47 1724 pp = &packet;
91088554 1725
9bbf1c3d 1726 ovs_mutex_lock(&dp->emc_mutex);
8cbf4f47 1727 dp_netdev_execute_actions(dp, &pp, 1, false, md,
9bbf1c3d
DDP
1728 &dp->flow_cache, execute->actions,
1729 execute->actions_len);
1730 ovs_mutex_unlock(&dp->emc_mutex);
8a4e3a85 1731
91088554
DDP
1732 /* Even though may_steal is set to false, some actions could modify or
1733 * reallocate the ofpbuf memory. We need to pass those changes to the
1734 * caller */
1735 *execute->packet = packet.ofpbuf;
1736
758c456d 1737 return 0;
72865317
BP
1738}
1739
1a0c894a
BP
1740static void
1741dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
1742{
1743 size_t i;
1744
1745 for (i = 0; i < n_ops; i++) {
1746 struct dpif_op *op = ops[i];
1747
1748 switch (op->type) {
1749 case DPIF_OP_FLOW_PUT:
1750 op->error = dpif_netdev_flow_put(dpif, &op->u.flow_put);
1751 break;
1752
1753 case DPIF_OP_FLOW_DEL:
1754 op->error = dpif_netdev_flow_del(dpif, &op->u.flow_del);
1755 break;
1756
1757 case DPIF_OP_EXECUTE:
1758 op->error = dpif_netdev_execute(dpif, &op->u.execute);
1759 break;
6fe09f8c
JS
1760
1761 case DPIF_OP_FLOW_GET:
1762 op->error = dpif_netdev_flow_get(dpif, &op->u.flow_get);
1763 break;
1a0c894a
BP
1764 }
1765 }
1766}
1767
5bf93d67
EJ
1768static int
1769dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
1770 uint32_t queue_id, uint32_t *priority)
1771{
1772 *priority = queue_id;
1773 return 0;
1774}
1775
72865317 1776\f
a84cb64a
BP
1777/* Creates and returns a new 'struct dp_netdev_actions', with a reference count
1778 * of 1, whose actions are a copy of from the 'ofpacts_len' bytes of
1779 * 'ofpacts'. */
1780struct dp_netdev_actions *
1781dp_netdev_actions_create(const struct nlattr *actions, size_t size)
1782{
1783 struct dp_netdev_actions *netdev_actions;
1784
1785 netdev_actions = xmalloc(sizeof *netdev_actions);
a84cb64a
BP
1786 netdev_actions->actions = xmemdup(actions, size);
1787 netdev_actions->size = size;
1788
1789 return netdev_actions;
1790}
1791
a84cb64a 1792struct dp_netdev_actions *
61e7deb1 1793dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
a84cb64a 1794{
61e7deb1 1795 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
a84cb64a
BP
1796}
1797
61e7deb1
BP
1798static void
1799dp_netdev_actions_free(struct dp_netdev_actions *actions)
a84cb64a 1800{
61e7deb1
BP
1801 free(actions->actions);
1802 free(actions);
a84cb64a
BP
1803}
1804\f
e4cfed38 1805
5794e276 1806static void
f7791740 1807dp_netdev_process_rxq_port(struct dp_netdev *dp,
9bbf1c3d
DDP
1808 struct emc_cache *flow_cache,
1809 struct dp_netdev_port *port,
1810 struct netdev_rxq *rxq)
e4cfed38 1811{
8cbf4f47
DDP
1812 struct dpif_packet *packets[NETDEV_MAX_RX_BATCH];
1813 int error, cnt;
e4cfed38 1814
8cbf4f47 1815 error = netdev_rxq_recv(rxq, packets, &cnt);
e4cfed38 1816 if (!error) {
3c33f0ff
JR
1817 struct pkt_metadata md = PKT_METADATA_INITIALIZER(port->port_no);
1818
1819 *recirc_depth_get() = 0;
1820 dp_netdev_input(dp, flow_cache, packets, cnt, &md);
e4cfed38 1821 } else if (error != EAGAIN && error != EOPNOTSUPP) {
3c33f0ff 1822 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
e4cfed38
PS
1823
1824 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
3c33f0ff 1825 netdev_get_name(port->netdev), ovs_strerror(error));
e4cfed38
PS
1826 }
1827}
1828
1829static void
1830dpif_netdev_run(struct dpif *dpif)
1831{
1832 struct dp_netdev_port *port;
1833 struct dp_netdev *dp = get_dp_netdev(dpif);
1834
9bbf1c3d 1835 ovs_mutex_lock(&dp->emc_mutex);
a532e683 1836 CMAP_FOR_EACH (port, node, &dp->ports) {
55c955bd
PS
1837 if (!netdev_is_pmd(port->netdev)) {
1838 int i;
1839
1840 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
9bbf1c3d
DDP
1841 dp_netdev_process_rxq_port(dp, &dp->flow_cache, port,
1842 port->rxq[i]);
55c955bd 1843 }
e4cfed38
PS
1844 }
1845 }
9bbf1c3d 1846 ovs_mutex_unlock(&dp->emc_mutex);
e4cfed38
PS
1847}
1848
1849static void
1850dpif_netdev_wait(struct dpif *dpif)
1851{
1852 struct dp_netdev_port *port;
1853 struct dp_netdev *dp = get_dp_netdev(dpif);
1854
59e6d833 1855 ovs_mutex_lock(&dp_netdev_mutex);
a532e683 1856 CMAP_FOR_EACH (port, node, &dp->ports) {
55c955bd
PS
1857 if (!netdev_is_pmd(port->netdev)) {
1858 int i;
1859
1860 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
1861 netdev_rxq_wait(port->rxq[i]);
1862 }
e4cfed38
PS
1863 }
1864 }
59e6d833 1865 ovs_mutex_unlock(&dp_netdev_mutex);
e4cfed38
PS
1866}
1867
/* One rx queue assigned to a PMD thread, with a reference on its port so
 * the main thread cannot destroy the port while it is being polled. */
struct rxq_poll {
    struct dp_netdev_port *port;
    struct netdev_rxq *rx;
};
1872
1873static int
1874pmd_load_queues(struct pmd_thread *f,
f7791740 1875 struct rxq_poll **ppoll_list, int poll_cnt)
e4cfed38
PS
1876{
1877 struct dp_netdev *dp = f->dp;
f7791740 1878 struct rxq_poll *poll_list = *ppoll_list;
e4cfed38
PS
1879 struct dp_netdev_port *port;
1880 int id = f->id;
1881 int index;
1882 int i;
1883
1884 /* Simple scheduler for netdev rx polling. */
e4cfed38
PS
1885 for (i = 0; i < poll_cnt; i++) {
1886 port_unref(poll_list[i].port);
1887 }
1888
1889 poll_cnt = 0;
1890 index = 0;
1891
a532e683 1892 CMAP_FOR_EACH (port, node, &f->dp->ports) {
a1fdee13
AW
1893 /* Calls port_try_ref() to prevent the main thread
1894 * from deleting the port. */
1895 if (port_try_ref(port)) {
1896 if (netdev_is_pmd(port->netdev)) {
1897 int i;
1898
1899 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
1900 if ((index % dp->n_pmd_threads) == id) {
1901 poll_list = xrealloc(poll_list,
1902 sizeof *poll_list * (poll_cnt + 1));
1903
1904 port_ref(port);
1905 poll_list[poll_cnt].port = port;
1906 poll_list[poll_cnt].rx = port->rxq[i];
1907 poll_cnt++;
1908 }
1909 index++;
55c955bd 1910 }
e4cfed38 1911 }
a1fdee13
AW
1912 /* Unrefs the port_try_ref(). */
1913 port_unref(port);
e4cfed38
PS
1914 }
1915 }
1916
e4cfed38
PS
1917 *ppoll_list = poll_list;
1918 return poll_cnt;
1919}
1920
6c3eee82 1921static void *
e4cfed38 1922pmd_thread_main(void *f_)
6c3eee82 1923{
e4cfed38 1924 struct pmd_thread *f = f_;
6c3eee82 1925 struct dp_netdev *dp = f->dp;
e4cfed38 1926 unsigned int lc = 0;
f7791740 1927 struct rxq_poll *poll_list;
84067a4c 1928 unsigned int port_seq = PMD_INITIAL_SEQ;
e4cfed38
PS
1929 int poll_cnt;
1930 int i;
6c3eee82 1931
e4cfed38
PS
1932 poll_cnt = 0;
1933 poll_list = NULL;
1934
8617afff 1935 pmd_thread_setaffinity_cpu(f->id);
e4cfed38 1936reload:
9bbf1c3d 1937 emc_cache_init(&f->flow_cache);
e4cfed38 1938 poll_cnt = pmd_load_queues(f, &poll_list, poll_cnt);
6c3eee82 1939
e4cfed38 1940 for (;;) {
6c3eee82
BP
1941 int i;
1942
e4cfed38 1943 for (i = 0; i < poll_cnt; i++) {
9bbf1c3d
DDP
1944 dp_netdev_process_rxq_port(dp, &f->flow_cache, poll_list[i].port,
1945 poll_list[i].rx);
e4cfed38
PS
1946 }
1947
1948 if (lc++ > 1024) {
84067a4c 1949 unsigned int seq;
6c3eee82 1950
e4cfed38 1951 lc = 0;
84067a4c
JR
1952
1953 ovsrcu_quiesce();
1954
91a96379 1955 atomic_read_relaxed(&f->change_seq, &seq);
84067a4c
JR
1956 if (seq != port_seq) {
1957 port_seq = seq;
6c3eee82
BP
1958 break;
1959 }
1960 }
e4cfed38 1961 }
6c3eee82 1962
9bbf1c3d
DDP
1963 emc_cache_uninit(&f->flow_cache);
1964
e4cfed38
PS
1965 if (!latch_is_set(&f->dp->exit_latch)){
1966 goto reload;
1967 }
6c3eee82 1968
e4cfed38
PS
1969 for (i = 0; i < poll_cnt; i++) {
1970 port_unref(poll_list[i].port);
6c3eee82 1971 }
6c3eee82 1972
e4cfed38 1973 free(poll_list);
6c3eee82
BP
1974 return NULL;
1975}
1976
/* Blocks all upcalls by taking 'dp->upcall_rwlock' for writing; the fast
 * path only performs upcalls under a successful tryrdlock of this lock, so
 * holding the write lock makes misses get dropped instead of sent up. */
static void
dp_netdev_disable_upcall(struct dp_netdev *dp)
    OVS_ACQUIRES(dp->upcall_rwlock)
{
    fat_rwlock_wrlock(&dp->upcall_rwlock);
}
1983
/* dpif_class callback: disables upcalls for the datapath behind 'dpif'.
 * Thread-safety analysis is suppressed because the lock is released by the
 * matching dpif_netdev_enable_upcall() call. */
static void
dpif_netdev_disable_upcall(struct dpif *dpif)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    dp_netdev_disable_upcall(dp);
}
1991
/* Re-enables upcalls by releasing the write lock taken in
 * dp_netdev_disable_upcall(). */
static void
dp_netdev_enable_upcall(struct dp_netdev *dp)
    OVS_RELEASES(dp->upcall_rwlock)
{
    fat_rwlock_unlock(&dp->upcall_rwlock);
}
1998
/* dpif_class callback: re-enables upcalls for the datapath behind 'dpif'.
 * Must pair with a prior dpif_netdev_disable_upcall(). */
static void
dpif_netdev_enable_upcall(struct dpif *dpif)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    dp_netdev_enable_upcall(dp);
}
2006
/* Resizes the set of PMD threads for 'dp' to exactly 'n' threads.
 *
 * All existing PMD threads are stopped first: 'exit_latch' is set, each
 * thread is kicked out of its poll loop via dp_netdev_reload_pmd_threads(),
 * and then joined.  Afterwards 'n' fresh threads are created.  No-op when
 * the thread count is already 'n'. */
static void
dp_netdev_set_pmd_threads(struct dp_netdev *dp, int n)
{
    int i;

    if (n == dp->n_pmd_threads) {
        return;
    }

    /* Stop existing threads. */
    latch_set(&dp->exit_latch);
    dp_netdev_reload_pmd_threads(dp);
    for (i = 0; i < dp->n_pmd_threads; i++) {
        struct pmd_thread *f = &dp->pmd_threads[i];

        xpthread_join(f->thread, NULL);
    }
    /* Clear the latch so the new threads don't exit immediately. */
    latch_poll(&dp->exit_latch);
    free(dp->pmd_threads);

    /* Start new threads. */
    dp->pmd_threads = xmalloc(n * sizeof *dp->pmd_threads);
    dp->n_pmd_threads = n;

    for (i = 0; i < n; i++) {
        struct pmd_thread *f = &dp->pmd_threads[i];

        f->dp = dp;
        f->id = i;
        atomic_init(&f->change_seq, PMD_INITIAL_SEQ);

        /* Each thread will distribute all devices rx-queues among
         * themselves. */
        f->thread = ovs_thread_create("pmd", pmd_thread_main, f);
    }
}
e4cfed38 2043
6c3eee82 2044\f
/* ovsthread_stats constructor: allocates a zeroed, cache-line-aligned
 * per-thread stats bucket for a flow and initializes its mutex. */
static void *
dp_netdev_flow_stats_new_cb(void)
{
    struct dp_netdev_flow_stats *bucket = xzalloc_cacheline(sizeof *bucket);
    ovs_mutex_init(&bucket->mutex);
    return bucket;
}
2052
/* Credits 'cnt' packets totaling 'size' bytes (plus their OR'd 'tcp_flags')
 * to 'netdev_flow''s per-thread stats bucket and refreshes its last-used
 * timestamp. */
static void
dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow,
                    int cnt, int size,
                    uint16_t tcp_flags)
{
    long long int now = time_msec();
    struct dp_netdev_flow_stats *bucket;

    /* Each thread gets its own lazily-created bucket, so the mutex below is
     * effectively uncontended in the common case. */
    bucket = ovsthread_stats_bucket_get(&netdev_flow->stats,
                                        dp_netdev_flow_stats_new_cb);

    ovs_mutex_lock(&bucket->mutex);
    /* MAX() guards against time going backwards between threads. */
    bucket->used = MAX(now, bucket->used);
    bucket->packet_count += cnt;
    bucket->byte_count += size;
    bucket->tcp_flags |= tcp_flags;
    ovs_mutex_unlock(&bucket->mutex);
}
2071
/* ovsthread_stats constructor: allocates a zeroed, cache-line-aligned
 * per-thread datapath stats bucket and initializes its mutex. */
static void *
dp_netdev_stats_new_cb(void)
{
    struct dp_netdev_stats *bucket = xzalloc_cacheline(sizeof *bucket);
    ovs_mutex_init(&bucket->mutex);
    return bucket;
}
2079
/* Adds 'cnt' to datapath statistic 'type' (hit/miss/...) in the calling
 * thread's stats bucket for 'dp'. */
static void
dp_netdev_count_packet(struct dp_netdev *dp, enum dp_stat_type type, int cnt)
{
    struct dp_netdev_stats *bucket;

    bucket = ovsthread_stats_bucket_get(&dp->stats, dp_netdev_stats_new_cb);
    ovs_mutex_lock(&bucket->mutex);
    bucket->n[type] += cnt;
    ovs_mutex_unlock(&bucket->mutex);
}
2090
/* Performs an upcall for 'packet_' to the registered callback.
 *
 * 'flow' is the packet's extracted flow; 'wc' its wildcards (may be NULL
 * for action upcalls).  'actions' receives actions to execute immediately;
 * 'put_actions' (if non-NULL) receives actions to install.  Counts a MISS
 * statistic for DPIF_UC_MISS upcalls.  Returns ENODEV when no callback is
 * registered, otherwise the callback's return value. */
static int
dp_netdev_upcall(struct dp_netdev *dp, struct dpif_packet *packet_,
                 struct flow *flow, struct flow_wildcards *wc,
                 enum dpif_upcall_type type, const struct nlattr *userdata,
                 struct ofpbuf *actions, struct ofpbuf *put_actions)
{
    struct ofpbuf *packet = &packet_->ofpbuf;

    if (type == DPIF_UC_MISS) {
        dp_netdev_count_packet(dp, DP_STAT_MISS, 1);
    }

    if (OVS_UNLIKELY(!dp->upcall_cb)) {
        return ENODEV;
    }

    /* Rate-limited debug logging of the upcall's flow key and packet. */
    if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
        struct ds ds = DS_EMPTY_INITIALIZER;
        struct ofpbuf key;
        char *packet_str;

        ofpbuf_init(&key, 0);
        odp_flow_key_from_flow(&key, flow, &wc->masks, flow->in_port.odp_port,
                               true);

        packet_str = ofp_packet_to_string(ofpbuf_data(packet),
                                          ofpbuf_size(packet));

        odp_flow_key_format(ofpbuf_data(&key), ofpbuf_size(&key), &ds);

        VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
                 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);

        ofpbuf_uninit(&key);
        free(packet_str);
        ds_destroy(&ds);
    }

    return dp->upcall_cb(packet, flow, type, userdata, actions, wc,
                         put_actions, dp->upcall_aux);
}
2132
9bbf1c3d
DDP
2133static inline uint32_t
2134dpif_netdev_packet_get_dp_hash(struct dpif_packet *packet,
2135 const struct miniflow *mf)
2136{
2137 uint32_t hash;
2138
2139 hash = dpif_packet_get_dp_hash(packet);
2140 if (OVS_UNLIKELY(!hash)) {
2141 hash = miniflow_hash_5tuple(mf, 0);
2142 dpif_packet_set_dp_hash(packet, hash);
2143 }
2144 return hash;
2145}
2146
/* A group of packets that all matched the same flow, accumulated so the
 * flow's actions and stats update can be applied once per batch. */
struct packet_batch {
    unsigned int packet_count;  /* Number of packets in 'packets'. */
    unsigned int byte_count;    /* Total bytes across the batch. */
    uint16_t tcp_flags;         /* OR of the packets' TCP flags. */

    struct dp_netdev_flow *flow;    /* Flow all these packets matched. */

    struct dpif_packet *packets[NETDEV_MAX_RX_BATCH];
    struct pkt_metadata md;     /* Metadata shared by the whole batch. */
};
2157
2158static inline void
9bbf1c3d
DDP
2159packet_batch_update(struct packet_batch *batch, struct dpif_packet *packet,
2160 const struct miniflow *mf)
8cbf4f47
DDP
2161{
2162 batch->tcp_flags |= miniflow_get_tcp_flags(mf);
2163 batch->packets[batch->packet_count++] = packet;
2164 batch->byte_count += ofpbuf_size(&packet->ofpbuf);
2165}
2166
2167static inline void
567bbb2e 2168packet_batch_init(struct packet_batch *batch, struct dp_netdev_flow *flow,
84d6d5eb 2169 struct pkt_metadata *md)
8cbf4f47
DDP
2170{
2171 batch->flow = flow;
2172 batch->md = *md;
8cbf4f47
DDP
2173
2174 batch->packet_count = 0;
2175 batch->byte_count = 0;
2176 batch->tcp_flags = 0;
8cbf4f47
DDP
2177}
2178
/* Executes a completed batch: updates the flow's usage statistics, then
 * runs the flow's current actions over all packets at once (the whole point
 * of batching), and counts them as datapath hits. */
static inline void
packet_batch_execute(struct packet_batch *batch, struct dp_netdev *dp,
                     struct emc_cache *flow_cache)
{
    struct dp_netdev_actions *actions;
    struct dp_netdev_flow *flow = batch->flow;

    dp_netdev_flow_used(batch->flow, batch->packet_count, batch->byte_count,
                        batch->tcp_flags);

    actions = dp_netdev_flow_get_actions(flow);

    /* may_steal == true: the action execution owns the packets from here. */
    dp_netdev_execute_actions(dp, batch->packets, batch->packet_count, true,
                              &batch->md, flow_cache,
                              actions->actions, actions->size);

    dp_netdev_count_packet(dp, DP_STAT_HIT, batch->packet_count);
}
2197
/* Adds 'pkt' to the batch for 'flow' in 'batches', creating a new batch if
 * needed.  Returns false if 'flow' is NULL or if all 'max_batches' slots
 * are taken by other flows; the caller then falls back to per-packet
 * handling. */
static inline bool
dp_netdev_queue_batches(struct dpif_packet *pkt, struct pkt_metadata *md,
                        struct dp_netdev_flow *flow, const struct miniflow *mf,
                        struct packet_batch *batches, size_t *n_batches,
                        size_t max_batches)
{
    struct packet_batch *batch = NULL;
    int j;

    if (OVS_UNLIKELY(!flow)) {
        return false;
    }
    /* XXX: This O(n^2) algorithm makes sense if we're operating under the
     * assumption that the number of distinct flows (and therefore the
     * number of distinct batches) is quite small.  If this turns out not
     * to be the case, it may make sense to pre sort based on the
     * netdev_flow pointer.  That done we can get the appropriate batching
     * in O(n * log(n)) instead. */
    for (j = *n_batches - 1; j >= 0; j--) {
        if (batches[j].flow == flow) {
            batch = &batches[j];
            packet_batch_update(batch, pkt, mf);
            return true;
        }
    }
    if (OVS_UNLIKELY(*n_batches >= max_batches)) {
        return false;
    }

    batch = &batches[(*n_batches)++];
    packet_batch_init(batch, flow, md);
    packet_batch_update(batch, pkt, mf);
    return true;
}
2232
/* Exchanges the packet pointers stored at '*a' and '*b'. */
static inline void
dpif_packet_swap(struct dpif_packet **a, struct dpif_packet **b)
{
    struct dpif_packet *saved = *b;

    *b = *a;
    *a = saved;
}
2240
/* Try to process all ('cnt') the 'packets' using only the exact match cache
 * 'flow_cache'. If a flow is not found for a packet 'packets[i]', or if there
 * is no matching batch for a packet's flow, the miniflow is copied into 'keys'
 * and the packet pointer is moved at the beginning of the 'packets' array.
 *
 * The function returns the number of packets that needs to be processed in the
 * 'packets' array (they have been moved to the beginning of the vector).
 */
static inline size_t
emc_processing(struct dp_netdev *dp, struct emc_cache *flow_cache,
               struct dpif_packet **packets, size_t cnt,
               struct pkt_metadata *md, struct netdev_flow_key *keys)
{
    struct netdev_flow_key key;
    struct packet_batch batches[4];
    size_t n_batches, i;
    size_t notfound_cnt = 0;    /* Also the write index for misses. */

    n_batches = 0;
    miniflow_initialize(&key.flow, key.buf);
    for (i = 0; i < cnt; i++) {
        struct dp_netdev_flow *flow;
        uint32_t hash;

        /* Runts that cannot hold an Ethernet header are dropped outright. */
        if (OVS_UNLIKELY(ofpbuf_size(&packets[i]->ofpbuf) < ETH_HEADER_LEN)) {
            dpif_packet_delete(packets[i]);
            continue;
        }

        miniflow_extract(&packets[i]->ofpbuf, md, &key.flow);

        hash = dpif_netdev_packet_get_dp_hash(packets[i], &key.flow);

        flow = emc_lookup(flow_cache, &key.flow, hash);
        if (OVS_UNLIKELY(!dp_netdev_queue_batches(packets[i], md,
                                                  flow, &key.flow,
                                                  batches, &n_batches,
                                                  ARRAY_SIZE(batches)))) {
            /* Compact misses to the front of 'packets' so the caller can
             * run the slow path over a contiguous prefix. */
            if (i != notfound_cnt) {
                dpif_packet_swap(&packets[i], &packets[notfound_cnt]);
            }

            keys[notfound_cnt++] = key;
        }
    }

    for (i = 0; i < n_batches; i++) {
        packet_batch_execute(&batches[i], dp, flow_cache);
    }

    return notfound_cnt;
}
2293
/* Slow(er) path for the 'cnt' packets (with miniflows in 'keys') that missed
 * the exact match cache: looks them up in the classifier in one batch, makes
 * miss upcalls for packets with no rule (installing the resulting flows),
 * then batches and executes the matched packets, seeding the EMC for next
 * time. */
static inline void
fast_path_processing(struct dp_netdev *dp, struct emc_cache *flow_cache,
                     struct dpif_packet **packets, size_t cnt,
                     struct pkt_metadata *md, struct netdev_flow_key *keys)
{
#if !defined(__CHECKER__) && !defined(_WIN32)
    const size_t PKT_ARRAY_SIZE = cnt;
#else
    /* Sparse or MSVC doesn't like variable length array. */
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_RX_BATCH };
#endif
    struct packet_batch batches[PKT_ARRAY_SIZE];
    const struct miniflow *mfs[PKT_ARRAY_SIZE]; /* NULL at bad packets. */
    struct cls_rule *rules[PKT_ARRAY_SIZE];
    size_t n_batches, i;
    bool any_miss;

    for (i = 0; i < cnt; i++) {
        mfs[i] = &keys[i].flow;
    }
    any_miss = !classifier_lookup_miniflow_batch(&dp->cls, mfs, rules, cnt);
    /* Upcalls only happen if nobody holds the upcall write lock; otherwise
     * missed packets are simply skipped (and thus dropped) below. */
    if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
        uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
        struct ofpbuf actions, put_actions;
        struct match match;

        ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
        ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);

        for (i = 0; i < cnt; i++) {
            const struct dp_netdev_flow *netdev_flow;
            struct ofpbuf *add_actions;
            int error;

            if (OVS_LIKELY(rules[i] || !mfs[i])) {
                continue;
            }

            /* It's possible that an earlier slow path execution installed
             * the rule this flow needs.  In this case, it's a lot cheaper
             * to catch it here than execute a miss. */
            netdev_flow = dp_netdev_lookup_flow(dp, mfs[i]);
            if (netdev_flow) {
                rules[i] = CONST_CAST(struct cls_rule *, &netdev_flow->cr);
                continue;
            }

            miniflow_expand(mfs[i], &match.flow);

            ofpbuf_clear(&actions);
            ofpbuf_clear(&put_actions);

            error = dp_netdev_upcall(dp, packets[i], &match.flow, &match.wc,
                                     DPIF_UC_MISS, NULL, &actions,
                                     &put_actions);
            if (OVS_UNLIKELY(error && error != ENOSPC)) {
                continue;
            }

            /* We can't allow the packet batching in the next loop to execute
             * the actions.  Otherwise, if there are any slow path actions,
             * we'll send the packet up twice. */
            dp_netdev_execute_actions(dp, &packets[i], 1, false, md,
                                      flow_cache, ofpbuf_data(&actions),
                                      ofpbuf_size(&actions));

            add_actions = ofpbuf_size(&put_actions)
                ? &put_actions
                : &actions;

            ovs_mutex_lock(&dp->flow_mutex);
            /* XXX: There's a brief race where this flow could have already
             * been installed since we last did the flow lookup.  This could be
             * solved by moving the mutex lock outside the loop, but that's an
             * awful long time to be locking everyone out of making flow
             * installs.  If we move to a per-core classifier, it would be
             * reasonable. */
            if (OVS_LIKELY(error != ENOSPC)
                && !dp_netdev_lookup_flow(dp, mfs[i])) {
                dp_netdev_flow_add(dp, &match, ofpbuf_data(add_actions),
                                   ofpbuf_size(add_actions));
            }
            ovs_mutex_unlock(&dp->flow_mutex);
        }

        ofpbuf_uninit(&actions);
        ofpbuf_uninit(&put_actions);
        fat_rwlock_unlock(&dp->upcall_rwlock);
    }

    n_batches = 0;
    for (i = 0; i < cnt; i++) {
        struct dpif_packet *packet = packets[i];
        struct dp_netdev_flow *flow;

        if (OVS_UNLIKELY(!rules[i] || !mfs[i])) {
            continue;
        }

        flow = dp_netdev_flow_cast(rules[i]);
        /* Seed the exact match cache so these flows hit fast next time. */
        emc_insert(flow_cache, mfs[i], dpif_packet_get_dp_hash(packet), flow);
        dp_netdev_queue_batches(packet, md, flow, mfs[i], batches, &n_batches,
                                ARRAY_SIZE(batches));
    }

    for (i = 0; i < n_batches; i++) {
        packet_batch_execute(&batches[i], dp, flow_cache);
    }
}
2403
/* Datapath entry point for a batch of received packets: first tries the
 * exact match cache; any leftovers (compacted to the front of 'packets' by
 * emc_processing(), with their keys in 'keys') go through the classifier /
 * upcall fast path. */
static void
dp_netdev_input(struct dp_netdev *dp, struct emc_cache *flow_cache,
                struct dpif_packet **packets, int cnt, struct pkt_metadata *md)
{
#if !defined(__CHECKER__) && !defined(_WIN32)
    const size_t PKT_ARRAY_SIZE = cnt;
#else
    /* Sparse or MSVC doesn't like variable length array. */
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_RX_BATCH };
#endif
    struct netdev_flow_key keys[PKT_ARRAY_SIZE];
    size_t newcnt;

    newcnt = emc_processing(dp, flow_cache, packets, cnt, md, keys);
    if (OVS_UNLIKELY(newcnt)) {
        fast_path_processing(dp, flow_cache, packets, newcnt, md, keys);
    }
}
2422
/* Context threaded through odp_execute_actions() to dp_execute_cb(). */
struct dp_netdev_execute_aux {
    struct dp_netdev *dp;           /* Datapath executing the actions. */
    struct emc_cache *flow_cache;   /* Calling thread's exact match cache. */
};
2427
/* dpif_class callback: registers 'cb' (with opaque 'aux') as the function
 * invoked by dp_netdev_upcall() for misses and userspace actions. */
static void
dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
                               void *aux)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    dp->upcall_aux = aux;
    dp->upcall_cb = cb;
}
2436
/* odp_execute_actions() callback: applies a single datapath action 'a' to
 * the 'cnt' 'packets'.
 *
 * When 'may_steal' is true this function owns the packets and must either
 * hand them off (output) or delete them on every path; when false the
 * caller retains ownership and clones are made where needed (recirc). */
static void
dp_execute_cb(void *aux_, struct dpif_packet **packets, int cnt,
              struct pkt_metadata *md,
              const struct nlattr *a, bool may_steal)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    struct dp_netdev_execute_aux *aux = aux_;
    uint32_t *depth = recirc_depth_get();
    struct dp_netdev *dp = aux->dp;
    int type = nl_attr_type(a);
    struct dp_netdev_port *p;
    int i;

    switch ((enum ovs_action_attr)type) {
    case OVS_ACTION_ATTR_OUTPUT:
        p = dp_netdev_lookup_port(dp, u32_to_odp(nl_attr_get_u32(a)));
        if (OVS_LIKELY(p)) {
            netdev_send(p->netdev, NETDEV_QID_NONE, packets, cnt, may_steal);
        } else if (may_steal) {
            /* Unknown port: we own the packets, so free them. */
            for (i = 0; i < cnt; i++) {
                dpif_packet_delete(packets[i]);
            }
        }
        break;

    case OVS_ACTION_ATTR_USERSPACE:
        /* Skipped entirely (packets dropped if stolen) when upcalls are
         * disabled via the upcall rwlock. */
        if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
            const struct nlattr *userdata;
            struct ofpbuf actions;
            struct flow flow;

            userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
            ofpbuf_init(&actions, 0);

            for (i = 0; i < cnt; i++) {
                int error;

                ofpbuf_clear(&actions);

                flow_extract(&packets[i]->ofpbuf, md, &flow);
                error = dp_netdev_upcall(dp, packets[i], &flow, NULL,
                                         DPIF_UC_ACTION, userdata, &actions,
                                         NULL);
                if (!error || error == ENOSPC) {
                    /* may_steal == false: we delete the packet below. */
                    dp_netdev_execute_actions(dp, &packets[i], 1, false, md,
                                              aux->flow_cache,
                                              ofpbuf_data(&actions),
                                              ofpbuf_size(&actions));
                }

                if (may_steal) {
                    dpif_packet_delete(packets[i]);
                }
            }
            ofpbuf_uninit(&actions);
            fat_rwlock_unlock(&dp->upcall_rwlock);
        }

        break;

    case OVS_ACTION_ATTR_HASH: {
        const struct ovs_action_hash *hash_act;
        uint32_t hash;

        hash_act = nl_attr_get(a);

        for (i = 0; i < cnt; i++) {

            if (hash_act->hash_alg == OVS_HASH_ALG_L4) {
                /* Hash need not be symmetric, nor does it need to include
                 * L2 fields. */
                hash = hash_2words(dpif_packet_get_dp_hash(packets[i]),
                                   hash_act->hash_basis);
            } else {
                VLOG_WARN("Unknown hash algorithm specified "
                          "for the hash action.");
                hash = 2;
            }

            if (!hash) {
                hash = 1; /* 0 is not valid */
            }

            /* Metadata is shared by the batch, so it can only carry the
             * first packet's hash. */
            if (i == 0) {
                md->dp_hash = hash;
            }
            dpif_packet_set_dp_hash(packets[i], hash);
        }
        break;
    }

    case OVS_ACTION_ATTR_RECIRC:
        if (*depth < MAX_RECIRC_DEPTH) {

            (*depth)++;
            for (i = 0; i < cnt; i++) {
                struct dpif_packet *recirc_pkt;
                struct pkt_metadata recirc_md = *md;

                /* If we don't own the packet, recirculate a clone. */
                recirc_pkt = (may_steal) ? packets[i]
                                         : dpif_packet_clone(packets[i]);

                recirc_md.recirc_id = nl_attr_get_u32(a);

                /* Hash is private to each packet */
                recirc_md.dp_hash = dpif_packet_get_dp_hash(packets[i]);

                dp_netdev_input(dp, aux->flow_cache, &recirc_pkt, 1,
                                &recirc_md);
            }
            (*depth)--;

            break;
        } else {
            VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
            if (may_steal) {
                for (i = 0; i < cnt; i++) {
                    dpif_packet_delete(packets[i]);
                }
            }
        }
        break;

    /* These actions are translated by odp_execute_actions() itself and
     * must never reach this callback. */
    case OVS_ACTION_ATTR_PUSH_VLAN:
    case OVS_ACTION_ATTR_POP_VLAN:
    case OVS_ACTION_ATTR_PUSH_MPLS:
    case OVS_ACTION_ATTR_POP_MPLS:
    case OVS_ACTION_ATTR_SET:
    case OVS_ACTION_ATTR_SET_MASKED:
    case OVS_ACTION_ATTR_SAMPLE:
    case OVS_ACTION_ATTR_UNSPEC:
    case __OVS_ACTION_ATTR_MAX:
        OVS_NOT_REACHED();
    }
}
2572
/* Executes 'actions' (of total length 'actions_len') on the 'cnt'
 * 'packets', dispatching datapath-specific actions to dp_execute_cb().
 * If 'may_steal' is true, ownership of the packets passes to the actions. */
static void
dp_netdev_execute_actions(struct dp_netdev *dp,
                          struct dpif_packet **packets, int cnt,
                          bool may_steal, struct pkt_metadata *md,
                          struct emc_cache *flow_cache,
                          const struct nlattr *actions, size_t actions_len)
{
    struct dp_netdev_execute_aux aux = {dp, flow_cache};

    odp_execute_actions(&aux, packets, cnt, may_steal, md, actions,
                        actions_len, dp_execute_cb);
}
2585
/* The userspace ("netdev") datapath provider.  Entries are positional per
 * 'struct dpif_class'; NULLs mark operations this datapath does not
 * support (upcalls use the callback interface instead of recv queues). */
const struct dpif_class dpif_netdev_class = {
    "netdev",
    dpif_netdev_enumerate,
    dpif_netdev_port_open_type,
    dpif_netdev_open,
    dpif_netdev_close,
    dpif_netdev_destroy,
    dpif_netdev_run,
    dpif_netdev_wait,
    dpif_netdev_get_stats,
    dpif_netdev_port_add,
    dpif_netdev_port_del,
    dpif_netdev_port_query_by_number,
    dpif_netdev_port_query_by_name,
    NULL, /* port_get_pid */
    dpif_netdev_port_dump_start,
    dpif_netdev_port_dump_next,
    dpif_netdev_port_dump_done,
    dpif_netdev_port_poll,
    dpif_netdev_port_poll_wait,
    dpif_netdev_flow_flush,
    dpif_netdev_flow_dump_create,
    dpif_netdev_flow_dump_destroy,
    dpif_netdev_flow_dump_thread_create,
    dpif_netdev_flow_dump_thread_destroy,
    dpif_netdev_flow_dump_next,
    dpif_netdev_operate,
    NULL, /* recv_set */
    NULL, /* handlers_set */
    dpif_netdev_queue_to_priority,
    NULL, /* recv */
    NULL, /* recv_wait */
    NULL, /* recv_purge */
    dpif_netdev_register_upcall_cb,
    dpif_netdev_enable_upcall,
    dpif_netdev_disable_upcall,
};
614c4892 2623
/* unixctl handler "dpif-dummy/change-port-number <dp> <port> <number>":
 * renumbers port argv[2] of dummy datapath argv[1] to argv[3].  Because a
 * cmap node cannot be re-inserted, the old port struct is duplicated with
 * the new number and the original freed via RCU. */
static void
dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
                              const char *argv[], void *aux OVS_UNUSED)
{
    struct dp_netdev_port *old_port;
    struct dp_netdev_port *new_port;
    struct dp_netdev *dp;
    odp_port_t port_no;

    ovs_mutex_lock(&dp_netdev_mutex);
    dp = shash_find_data(&dp_netdevs, argv[1]);
    if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
        ovs_mutex_unlock(&dp_netdev_mutex);
        unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
        return;
    }
    /* Take a reference so the datapath outlives the global mutex. */
    ovs_refcount_ref(&dp->ref_cnt);
    ovs_mutex_unlock(&dp_netdev_mutex);

    ovs_mutex_lock(&dp->port_mutex);
    if (get_port_by_name(dp, argv[2], &old_port)) {
        unixctl_command_reply_error(conn, "unknown port");
        goto exit;
    }

    /* NOTE(review): atoi() silently maps garbage to 0, which the next check
     * rejects as "bad port number" — acceptable for a test-only command. */
    port_no = u32_to_odp(atoi(argv[3]));
    if (!port_no || port_no == ODPP_NONE) {
        unixctl_command_reply_error(conn, "bad port number");
        goto exit;
    }
    if (dp_netdev_lookup_port(dp, port_no)) {
        unixctl_command_reply_error(conn, "port number already in use");
        goto exit;
    }

    /* Remove old port. */
    cmap_remove(&dp->ports, &old_port->node, hash_port_no(old_port->port_no));
    ovsrcu_postpone(free, old_port);

    /* Insert new port (cmap semantics mean we cannot re-insert 'old_port'). */
    new_port = xmemdup(old_port, sizeof *old_port);
    new_port->port_no = port_no;
    cmap_insert(&dp->ports, &new_port->node, hash_port_no(port_no));

    seq_change(dp->port_seq);
    unixctl_command_reply(conn, NULL);

exit:
    ovs_mutex_unlock(&dp->port_mutex);
    dp_netdev_unref(dp);
}
2675
/* unixctl handler "dpif-dummy/delete-port <dp> <port>": removes port
 * argv[2] from dummy datapath argv[1].  The local port cannot be deleted. */
static void
dpif_dummy_delete_port(struct unixctl_conn *conn, int argc OVS_UNUSED,
                       const char *argv[], void *aux OVS_UNUSED)
{
    struct dp_netdev_port *port;
    struct dp_netdev *dp;

    ovs_mutex_lock(&dp_netdev_mutex);
    dp = shash_find_data(&dp_netdevs, argv[1]);
    if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
        ovs_mutex_unlock(&dp_netdev_mutex);
        unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
        return;
    }
    /* Keep the datapath alive while we operate outside the global mutex. */
    ovs_refcount_ref(&dp->ref_cnt);
    ovs_mutex_unlock(&dp_netdev_mutex);

    ovs_mutex_lock(&dp->port_mutex);
    if (get_port_by_name(dp, argv[2], &port)) {
        unixctl_command_reply_error(conn, "unknown port");
    } else if (port->port_no == ODPP_LOCAL) {
        unixctl_command_reply_error(conn, "can't delete local port");
    } else {
        do_del_port(dp, port);
        unixctl_command_reply(conn, NULL);
    }
    ovs_mutex_unlock(&dp->port_mutex);

    dp_netdev_unref(dp);
}
2706
/* Registers a dummy dpif provider named 'type', built as a copy of
 * dpif_netdev_class with only the type string changed.  The class and its
 * type string are intentionally never freed (providers live forever). */
static void
dpif_dummy_register__(const char *type)
{
    struct dpif_class *class;

    class = xmalloc(sizeof *class);
    *class = dpif_netdev_class;
    class->type = xstrdup(type);
    dp_register_provider(class);
}
2717
/* Enables dummy datapath support for testing.  With 'override', every
 * already-registered provider type is replaced by a dummy equivalent; a
 * plain "dummy" provider and the dummy unixctl commands are always added. */
void
dpif_dummy_register(bool override)
{
    if (override) {
        struct sset types;
        const char *type;

        sset_init(&types);
        dp_enumerate_types(&types);
        SSET_FOR_EACH (type, &types) {
            /* Only re-register types we successfully unregistered. */
            if (!dp_unregister_provider(type)) {
                dpif_dummy_register__(type);
            }
        }
        sset_destroy(&types);
    }

    dpif_dummy_register__("dummy");

    unixctl_command_register("dpif-dummy/change-port-number",
                             "dp port new-number",
                             3, 3, dpif_dummy_change_port_number, NULL);
    unixctl_command_register("dpif-dummy/delete-port", "dp port",
                             2, 2, dpif_dummy_delete_port, NULL);
}