]> git.proxmox.com Git - mirror_ovs.git/blame - lib/dpif-netdev.c
dpif-netdev: Move rxq management into functions.
[mirror_ovs.git] / lib / dpif-netdev.c
CommitLineData
72865317 1/*
d262ac2c 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016 Nicira, Inc.
72865317
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
db73f716 18#include "dpif-netdev.h"
72865317 19
72865317
BP
20#include <ctype.h>
21#include <errno.h>
22#include <fcntl.h>
23#include <inttypes.h>
72865317 24#include <netinet/in.h>
9d82ec47 25#include <sys/socket.h>
7f3adc00 26#include <net/if.h>
cdee00fd 27#include <stdint.h>
72865317
BP
28#include <stdlib.h>
29#include <string.h>
30#include <sys/ioctl.h>
31#include <sys/stat.h>
72865317
BP
32#include <unistd.h>
33
9f861c91 34#include "bitmap.h"
59e6d833 35#include "cmap.h"
72865317 36#include "csum.h"
e14deea0 37#include "dp-packet.h"
614c4892 38#include "dpif.h"
72865317 39#include "dpif-provider.h"
614c4892 40#include "dummy.h"
36956a7d 41#include "dynamic-string.h"
afae68b1 42#include "fat-rwlock.h"
72865317 43#include "flow.h"
9f361d6b 44#include "cmap.h"
fbe0962b 45#include "coverage.h"
762d146a 46#include "hmapx.h"
6c3eee82 47#include "latch.h"
72865317 48#include "list.h"
0de8783a 49#include "match.h"
72865317 50#include "netdev.h"
8617afff 51#include "netdev-dpdk.h"
de281153 52#include "netdev-vport.h"
cdee00fd 53#include "netlink.h"
f094af7b 54#include "odp-execute.h"
72865317
BP
55#include "odp-util.h"
56#include "ofp-print.h"
57#include "ofpbuf.h"
5a034064 58#include "ovs-numa.h"
61e7deb1 59#include "ovs-rcu.h"
72865317
BP
60#include "packets.h"
61#include "poll-loop.h"
0de8783a 62#include "pvector.h"
26c6b6cd 63#include "random.h"
d33ed218 64#include "seq.h"
462278db 65#include "shash.h"
0cbfe35d 66#include "sset.h"
72865317 67#include "timeval.h"
53902038 68#include "tnl-neigh-cache.h"
7f9b8504 69#include "tnl-ports.h"
74cc3969 70#include "unixctl.h"
72865317 71#include "util.h"
e6211adc 72#include "openvswitch/vlog.h"
5136ce49 73
d98e6007 74VLOG_DEFINE_THIS_MODULE(dpif_netdev);
72865317 75
8bb113da 76#define FLOW_DUMP_MAX_BATCH 50
adcf00ba
AZ
77/* Use per thread recirc_depth to prevent recirculation loop. */
78#define MAX_RECIRC_DEPTH 5
79DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
e4cfed38 80
72865317 81/* Configuration parameters. */
72865317
BP
82enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
83
8a4e3a85
BP
84/* Protects against changes to 'dp_netdevs'. */
85static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
86
87/* Contains all 'struct dp_netdev's. */
88static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
89 = SHASH_INITIALIZER(&dp_netdevs);
90
623540e4 91static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
6b31e073 92
2494ccd7
JS
93static struct odp_support dp_netdev_support = {
94 .max_mpls_depth = SIZE_MAX,
95 .recirc = true,
96};
97
79df317f 98/* Stores a miniflow with inline values */
9bbf1c3d 99
9bbf1c3d 100struct netdev_flow_key {
caeb4906
JR
101 uint32_t hash; /* Hash function differs for different users. */
102 uint32_t len; /* Length of the following miniflow (incl. map). */
0de8783a 103 struct miniflow mf;
8fd47924 104 uint64_t buf[FLOW_MAX_PACKET_U64S];
9bbf1c3d
DDP
105};
106
107/* Exact match cache for frequently used flows
108 *
109 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
110 * search its entries for a miniflow that matches exactly the miniflow of the
0de8783a 111 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
9bbf1c3d
DDP
112 *
113 * A cache entry holds a reference to its 'dp_netdev_flow'.
114 *
115 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
116 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
117 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
118 * value is the index of a cache entry where the miniflow could be.
119 *
120 *
121 * Thread-safety
122 * =============
123 *
124 * Each pmd_thread has its own private exact match cache.
125 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
126 */
127
fc82e877 128#define EM_FLOW_HASH_SHIFT 13
9bbf1c3d
DDP
129#define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
130#define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
131#define EM_FLOW_HASH_SEGS 2
132
133struct emc_entry {
9bbf1c3d 134 struct dp_netdev_flow *flow;
0de8783a 135 struct netdev_flow_key key; /* key.hash used for emc hash value. */
9bbf1c3d
DDP
136};
137
138struct emc_cache {
139 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
67ad54cb 140 int sweep_idx; /* For emc_cache_slow_sweep(). */
9bbf1c3d
DDP
141};
142
143/* Iterate in the exact match cache through every entry that might contain a
144 * miniflow with hash 'HASH'. */
145#define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
146 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
147 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
148 i__ < EM_FLOW_HASH_SEGS; \
149 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
0de8783a
JR
150\f
151/* Simple non-wildcarding single-priority classifier. */
152
153struct dpcls {
154 struct cmap subtables_map;
155 struct pvector subtables;
156};
9bbf1c3d 157
0de8783a
JR
158/* A rule to be inserted to the classifier. */
159struct dpcls_rule {
160 struct cmap_node cmap_node; /* Within struct dpcls_subtable 'rules'. */
161 struct netdev_flow_key *mask; /* Subtable's mask. */
162 struct netdev_flow_key flow; /* Matching key. */
163 /* 'flow' must be the last field, additional space is allocated here. */
164};
165
166static void dpcls_init(struct dpcls *);
167static void dpcls_destroy(struct dpcls *);
168static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
169 const struct netdev_flow_key *mask);
170static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
171static bool dpcls_lookup(const struct dpcls *cls,
172 const struct netdev_flow_key keys[],
173 struct dpcls_rule **rules, size_t cnt);
174\f
8a4e3a85
BP
175/* Datapath based on the network device interface from netdev.h.
176 *
177 *
178 * Thread-safety
179 * =============
180 *
181 * Some members, marked 'const', are immutable. Accessing other members
182 * requires synchronization, as noted in more detail below.
183 *
184 * Acquisition order is, from outermost to innermost:
185 *
186 * dp_netdev_mutex (global)
59e6d833 187 * port_mutex
8a4e3a85 188 */
72865317 189struct dp_netdev {
8a4e3a85
BP
190 const struct dpif_class *const class;
191 const char *const name;
6b31e073 192 struct dpif *dpif;
6a8267c5
BP
193 struct ovs_refcount ref_cnt;
194 atomic_flag destroyed;
72865317 195
8a4e3a85
BP
196 /* Ports.
197 *
59e6d833
BP
198 * Protected by RCU. Take the mutex to add or remove ports. */
199 struct ovs_mutex port_mutex;
200 struct cmap ports;
d33ed218 201 struct seq *port_seq; /* Incremented whenever a port changes. */
6c3eee82 202
6b31e073
RW
203 /* Protects access to ofproto-dpif-upcall interface during revalidator
204 * thread synchronization. */
205 struct fat_rwlock upcall_rwlock;
623540e4
EJ
206 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
207 void *upcall_aux;
6b31e073 208
e4e74c3a
AW
209 /* Callback function for notifying the purging of dp flows (during
210 * reseting pmd deletion). */
211 dp_purge_callback *dp_purge_cb;
212 void *dp_purge_aux;
213
65f13b50
AW
214 /* Stores all 'struct dp_netdev_pmd_thread's. */
215 struct cmap poll_threads;
216
217 /* Protects the access of the 'struct dp_netdev_pmd_thread'
218 * instance for non-pmd thread. */
219 struct ovs_mutex non_pmd_mutex;
220
221 /* Each pmd thread will store its pointer to
222 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
223 ovsthread_key_t per_pmd_key;
f2eee189 224
a14b8947 225 /* Cpu mask for pin of pmd threads. */
f2eee189 226 char *pmd_cmask;
a36de779 227 uint64_t last_tnl_conf_seq;
72865317
BP
228};
229
8a4e3a85 230static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
59e6d833 231 odp_port_t);
ff073a71 232
51852a57 233enum dp_stat_type {
abcf3ef4
DDP
234 DP_STAT_EXACT_HIT, /* Packets that had an exact match (emc). */
235 DP_STAT_MASKED_HIT, /* Packets that matched in the flow table. */
51852a57
BP
236 DP_STAT_MISS, /* Packets that did not match. */
237 DP_STAT_LOST, /* Packets not passed up to the client. */
238 DP_N_STATS
239};
240
55e3ca97
DDP
241enum pmd_cycles_counter_type {
242 PMD_CYCLES_POLLING, /* Cycles spent polling NICs. */
243 PMD_CYCLES_PROCESSING, /* Cycles spent processing packets */
244 PMD_N_CYCLES
245};
246
72865317
BP
247/* A port in a netdev-based datapath. */
248struct dp_netdev_port {
35303d71 249 odp_port_t port_no;
72865317 250 struct netdev *netdev;
efa2bcbb 251 struct cmap_node node; /* Node in dp_netdev's 'ports'. */
4b609110 252 struct netdev_saved_flags *sf;
55c955bd 253 struct netdev_rxq **rxq;
b284085e 254 struct ovs_refcount ref_cnt;
0cbfe35d 255 char *type; /* Port type as requested by user. */
a14b8947
IM
256 int latest_requested_n_rxq; /* Latest requested from netdev number
257 of rx queues. */
72865317
BP
258};
259
1c1e46ed
AW
260/* Contained by struct dp_netdev_flow's 'stats' member. */
261struct dp_netdev_flow_stats {
eb94da30
DDP
262 atomic_llong used; /* Last used time, in monotonic msecs. */
263 atomic_ullong packet_count; /* Number of packets matched. */
264 atomic_ullong byte_count; /* Number of bytes matched. */
265 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
1c1e46ed
AW
266};
267
268/* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
8a4e3a85
BP
269 *
270 *
271 * Thread-safety
272 * =============
273 *
274 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
1c1e46ed 275 * its pmd thread's classifier. The text below calls this classifier 'cls'.
8a4e3a85
BP
276 *
277 * Motivation
278 * ----------
279 *
280 * The thread safety rules described here for "struct dp_netdev_flow" are
281 * motivated by two goals:
282 *
283 * - Prevent threads that read members of "struct dp_netdev_flow" from
284 * reading bad data due to changes by some thread concurrently modifying
285 * those members.
286 *
287 * - Prevent two threads making changes to members of a given "struct
288 * dp_netdev_flow" from interfering with each other.
289 *
290 *
291 * Rules
292 * -----
293 *
ed79f89a
DDP
294 * A flow 'flow' may be accessed without a risk of being freed during an RCU
295 * grace period. Code that needs to hold onto a flow for a while
296 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
8a4e3a85
BP
297 *
298 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
ed79f89a
DDP
299 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
300 * from modification.
8a4e3a85
BP
301 *
302 * Some members, marked 'const', are immutable. Accessing other members
303 * requires synchronization, as noted in more detail below.
304 */
72865317 305struct dp_netdev_flow {
11e5cf1f 306 const struct flow flow; /* Unmasked flow that created this entry. */
8a4e3a85 307 /* Hash table index by unmasked flow. */
1c1e46ed
AW
308 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
309 /* 'flow_table'. */
70e5ed6f 310 const ovs_u128 ufid; /* Unique flow identifier. */
bd5131ba 311 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
1c1e46ed 312 /* flow. */
72865317 313
ed79f89a
DDP
314 /* Number of references.
315 * The classifier owns one reference.
316 * Any thread trying to keep a rule from being freed should hold its own
317 * reference. */
318 struct ovs_refcount ref_cnt;
319
11e5cf1f
DDP
320 bool dead;
321
1c1e46ed
AW
322 /* Statistics. */
323 struct dp_netdev_flow_stats stats;
8a4e3a85 324
45c626a3 325 /* Actions. */
61e7deb1 326 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
0de8783a 327
11e5cf1f
DDP
328 /* While processing a group of input packets, the datapath uses the next
329 * member to store a pointer to the output batch for the flow. It is
330 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
331 * packet_batch_init() and packet_batch_execute()). */
332 struct packet_batch *batch;
333
0de8783a
JR
334 /* Packet classification. */
335 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
336 /* 'cr' must be the last member. */
72865317
BP
337};
338
ed79f89a 339static void dp_netdev_flow_unref(struct dp_netdev_flow *);
9bbf1c3d 340static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
70e5ed6f
JS
341static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
342 struct flow *);
8a4e3a85 343
a84cb64a
BP
344/* A set of datapath actions within a "struct dp_netdev_flow".
345 *
346 *
347 * Thread-safety
348 * =============
349 *
45c626a3 350 * A struct dp_netdev_actions 'actions' is protected with RCU. */
a84cb64a 351struct dp_netdev_actions {
a84cb64a
BP
352 /* These members are immutable: they do not change during the struct's
353 * lifetime. */
a84cb64a 354 unsigned int size; /* Size of 'actions', in bytes. */
9ff55ae2 355 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
a84cb64a
BP
356};
357
358struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
359 size_t);
61e7deb1
BP
360struct dp_netdev_actions *dp_netdev_flow_get_actions(
361 const struct dp_netdev_flow *);
362static void dp_netdev_actions_free(struct dp_netdev_actions *);
a84cb64a 363
1c1e46ed
AW
364/* Contained by struct dp_netdev_pmd_thread's 'stats' member. */
365struct dp_netdev_pmd_stats {
366 /* Indexed by DP_STAT_*. */
eb94da30 367 atomic_ullong n[DP_N_STATS];
1c1e46ed
AW
368};
369
55e3ca97
DDP
370/* Contained by struct dp_netdev_pmd_thread's 'cycle' member. */
371struct dp_netdev_pmd_cycles {
372 /* Indexed by PMD_CYCLES_*. */
373 atomic_ullong n[PMD_N_CYCLES];
374};
375
ae7ad0a1
IM
376/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
377struct rxq_poll {
378 struct dp_netdev_port *port;
379 struct netdev_rxq *rx;
380 struct ovs_list node;
381};
382
e4cfed38
PS
383/* PMD: Poll modes drivers. PMD accesses devices via polling to eliminate
384 * the performance overhead of interrupt processing. Therefore netdev can
385 * not implement rx-wait for these devices. dpif-netdev needs to poll
386 * these device to check for recv buffer. pmd-thread does polling for
1c1e46ed 387 * devices assigned to itself.
e4cfed38
PS
388 *
389 * DPDK used PMD for accessing NIC.
390 *
65f13b50
AW
391 * Note, instance with cpu core id NON_PMD_CORE_ID will be reserved for
392 * I/O of all non-pmd threads. There will be no actual thread created
393 * for the instance.
1c1e46ed
AW
394 *
395 * Each struct has its own flow table and classifier. Packets received
396 * from managed ports are looked up in the corresponding pmd thread's
397 * flow table, and are executed with the found actions.
398 * */
65f13b50 399struct dp_netdev_pmd_thread {
6c3eee82 400 struct dp_netdev *dp;
1c1e46ed 401 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
65f13b50 402 struct cmap_node node; /* In 'dp->poll_threads'. */
accf8626
AW
403
404 pthread_cond_t cond; /* For synchronizing pmd thread reload. */
405 struct ovs_mutex cond_mutex; /* Mutex for condition variable. */
406
65f13b50
AW
407 /* Per thread exact-match cache. Note, the instance for cpu core
408 * NON_PMD_CORE_ID can be accessed by multiple threads, and thusly
409 * need to be protected (e.g. by 'dp_netdev_mutex'). All other
410 * instances will only be accessed by its own pmd thread. */
9bbf1c3d 411 struct emc_cache flow_cache;
1c1e46ed
AW
412
413 /* Classifier and Flow-Table.
414 *
415 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
416 * changes to 'cls' must be made while still holding the 'flow_mutex'.
417 */
418 struct ovs_mutex flow_mutex;
419 struct dpcls cls;
420 struct cmap flow_table OVS_GUARDED; /* Flow table. */
421
422 /* Statistics. */
423 struct dp_netdev_pmd_stats stats;
424
55e3ca97
DDP
425 /* Cycles counters */
426 struct dp_netdev_pmd_cycles cycles;
427
428 /* Used to count cicles. See 'cycles_counter_end()' */
429 unsigned long long last_cycles;
430
65f13b50
AW
431 struct latch exit_latch; /* For terminating the pmd thread. */
432 atomic_uint change_seq; /* For reloading pmd ports. */
6c3eee82 433 pthread_t thread;
65f13b50
AW
434 int index; /* Idx of this pmd thread among pmd*/
435 /* threads on same numa node. */
bd5131ba 436 unsigned core_id; /* CPU core id of this pmd thread. */
65f13b50 437 int numa_id; /* numa node id of this pmd thread. */
347ba9bb 438 atomic_int tx_qid; /* Queue id used by this pmd thread to
3bcc10c0 439 * send packets on all netdevs */
6553d06b 440
ae7ad0a1
IM
441 struct ovs_mutex poll_mutex; /* Mutex for poll_list. */
442 /* List of rx queues to poll. */
443 struct ovs_list poll_list OVS_GUARDED;
444 int poll_cnt; /* Number of elemints in poll_list. */
445
6553d06b
DDP
446 /* Only a pmd thread can write on its own 'cycles' and 'stats'.
447 * The main thread keeps 'stats_zero' and 'cycles_zero' as base
448 * values and subtracts them from 'stats' and 'cycles' before
449 * reporting to the user */
450 unsigned long long stats_zero[DP_N_STATS];
451 uint64_t cycles_zero[PMD_N_CYCLES];
6c3eee82
BP
452};
453
84067a4c
JR
454#define PMD_INITIAL_SEQ 1
455
72865317
BP
456/* Interface to netdev-based datapath. */
457struct dpif_netdev {
458 struct dpif dpif;
459 struct dp_netdev *dp;
d33ed218 460 uint64_t last_port_seq;
72865317
BP
461};
462
8a4e3a85 463static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
59e6d833 464 struct dp_netdev_port **portp);
8a4e3a85 465static int get_port_by_name(struct dp_netdev *dp, const char *devname,
59e6d833 466 struct dp_netdev_port **portp);
8a4e3a85
BP
467static void dp_netdev_free(struct dp_netdev *)
468 OVS_REQUIRES(dp_netdev_mutex);
8a4e3a85
BP
469static int do_add_port(struct dp_netdev *dp, const char *devname,
470 const char *type, odp_port_t port_no)
59e6d833 471 OVS_REQUIRES(dp->port_mutex);
c40b890f 472static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
59e6d833 473 OVS_REQUIRES(dp->port_mutex);
614c4892
BP
474static int dpif_netdev_open(const struct dpif_class *, const char *name,
475 bool create, struct dpif **);
65f13b50 476static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
e14deea0 477 struct dp_packet **, int c,
41ccaa24 478 bool may_steal,
4edb9ae9 479 const struct nlattr *actions,
e4cfed38 480 size_t actions_len);
65f13b50 481static void dp_netdev_input(struct dp_netdev_pmd_thread *,
a90ed026
DDP
482 struct dp_packet **, int cnt, odp_port_t port_no);
483static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
484 struct dp_packet **, int cnt);
41ccaa24 485
6b31e073 486static void dp_netdev_disable_upcall(struct dp_netdev *);
ae7ad0a1 487static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
65f13b50
AW
488static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
489 struct dp_netdev *dp, int index,
bd5131ba 490 unsigned core_id, int numa_id);
1c1e46ed 491static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
f2eee189 492static void dp_netdev_set_nonpmd(struct dp_netdev *dp);
b19befae 493static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
bd5131ba 494 unsigned core_id);
1c1e46ed
AW
495static struct dp_netdev_pmd_thread *
496dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
65f13b50
AW
497static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp);
498static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id);
499static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id);
cc245ce8
IM
500static void dp_netdev_pmd_clear_poll_list(struct dp_netdev_pmd_thread *pmd);
501static void dp_netdev_del_port_from_pmd(struct dp_netdev_port *port,
502 struct dp_netdev_pmd_thread *pmd);
503static void dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
504 struct dp_netdev_port *port);
505static void
506dp_netdev_add_port_to_pmds(struct dp_netdev *dp, struct dp_netdev_port *port);
ae7ad0a1
IM
507static void
508dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
509 struct dp_netdev_port *port, struct netdev_rxq *rx);
510static struct dp_netdev_pmd_thread *
511dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id);
f2eee189 512static void dp_netdev_reset_pmd_threads(struct dp_netdev *dp);
1c1e46ed
AW
513static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
514static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
515static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
72865317 516
67ad54cb 517static inline bool emc_entry_alive(struct emc_entry *ce);
9bbf1c3d
DDP
518static void emc_clear_entry(struct emc_entry *ce);
519
520static void
521emc_cache_init(struct emc_cache *flow_cache)
522{
523 int i;
524
67ad54cb 525 flow_cache->sweep_idx = 0;
9bbf1c3d
DDP
526 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
527 flow_cache->entries[i].flow = NULL;
0de8783a 528 flow_cache->entries[i].key.hash = 0;
09b0fa9c 529 flow_cache->entries[i].key.len = sizeof(struct miniflow);
5fcff47b 530 flowmap_init(&flow_cache->entries[i].key.mf.map);
9bbf1c3d
DDP
531 }
532}
533
534static void
535emc_cache_uninit(struct emc_cache *flow_cache)
536{
537 int i;
538
539 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
540 emc_clear_entry(&flow_cache->entries[i]);
541 }
542}
543
67ad54cb
AW
544/* Check and clear dead flow references slowly (one entry at each
545 * invocation). */
546static void
547emc_cache_slow_sweep(struct emc_cache *flow_cache)
548{
549 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
550
551 if (!emc_entry_alive(entry)) {
552 emc_clear_entry(entry);
553 }
554 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
555}
556
c4ea7529
BP
557/* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
558bool
559dpif_is_netdev(const struct dpif *dpif)
560{
561 return dpif->dpif_class->open == dpif_netdev_open;
562}
563
72865317
BP
564static struct dpif_netdev *
565dpif_netdev_cast(const struct dpif *dpif)
566{
c4ea7529 567 ovs_assert(dpif_is_netdev(dpif));
72865317
BP
568 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
569}
570
571static struct dp_netdev *
572get_dp_netdev(const struct dpif *dpif)
573{
574 return dpif_netdev_cast(dpif)->dp;
575}
6553d06b
DDP
576\f
577enum pmd_info_type {
ce179f11
IM
578 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
579 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
580 PMD_INFO_SHOW_RXQ /* Show poll-lists of pmd threads. */
6553d06b
DDP
581};
582
583static void
584pmd_info_show_stats(struct ds *reply,
585 struct dp_netdev_pmd_thread *pmd,
586 unsigned long long stats[DP_N_STATS],
587 uint64_t cycles[PMD_N_CYCLES])
588{
589 unsigned long long total_packets = 0;
590 uint64_t total_cycles = 0;
591 int i;
592
593 /* These loops subtracts reference values ('*_zero') from the counters.
594 * Since loads and stores are relaxed, it might be possible for a '*_zero'
595 * value to be more recent than the current value we're reading from the
596 * counter. This is not a big problem, since these numbers are not
597 * supposed to be too accurate, but we should at least make sure that
598 * the result is not negative. */
599 for (i = 0; i < DP_N_STATS; i++) {
600 if (stats[i] > pmd->stats_zero[i]) {
601 stats[i] -= pmd->stats_zero[i];
602 } else {
603 stats[i] = 0;
604 }
605
606 if (i != DP_STAT_LOST) {
607 /* Lost packets are already included in DP_STAT_MISS */
608 total_packets += stats[i];
609 }
610 }
611
612 for (i = 0; i < PMD_N_CYCLES; i++) {
613 if (cycles[i] > pmd->cycles_zero[i]) {
614 cycles[i] -= pmd->cycles_zero[i];
615 } else {
616 cycles[i] = 0;
617 }
618
619 total_cycles += cycles[i];
620 }
621
622 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
623 ? "main thread" : "pmd thread");
624
625 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
626 ds_put_format(reply, " numa_id %d", pmd->numa_id);
627 }
d5c199ea 628 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
bd5131ba 629 ds_put_format(reply, " core_id %u", pmd->core_id);
6553d06b
DDP
630 }
631 ds_put_cstr(reply, ":\n");
632
633 ds_put_format(reply,
634 "\temc hits:%llu\n\tmegaflow hits:%llu\n"
635 "\tmiss:%llu\n\tlost:%llu\n",
636 stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT],
637 stats[DP_STAT_MISS], stats[DP_STAT_LOST]);
638
639 if (total_cycles == 0) {
640 return;
641 }
642
643 ds_put_format(reply,
644 "\tpolling cycles:%"PRIu64" (%.02f%%)\n"
645 "\tprocessing cycles:%"PRIu64" (%.02f%%)\n",
646 cycles[PMD_CYCLES_POLLING],
647 cycles[PMD_CYCLES_POLLING] / (double)total_cycles * 100,
648 cycles[PMD_CYCLES_PROCESSING],
649 cycles[PMD_CYCLES_PROCESSING] / (double)total_cycles * 100);
650
651 if (total_packets == 0) {
652 return;
653 }
654
655 ds_put_format(reply,
656 "\tavg cycles per packet: %.02f (%"PRIu64"/%llu)\n",
657 total_cycles / (double)total_packets,
658 total_cycles, total_packets);
659
660 ds_put_format(reply,
661 "\tavg processing cycles per packet: "
662 "%.02f (%"PRIu64"/%llu)\n",
663 cycles[PMD_CYCLES_PROCESSING] / (double)total_packets,
664 cycles[PMD_CYCLES_PROCESSING], total_packets);
665}
666
667static void
668pmd_info_clear_stats(struct ds *reply OVS_UNUSED,
669 struct dp_netdev_pmd_thread *pmd,
670 unsigned long long stats[DP_N_STATS],
671 uint64_t cycles[PMD_N_CYCLES])
672{
673 int i;
674
675 /* We cannot write 'stats' and 'cycles' (because they're written by other
676 * threads) and we shouldn't change 'stats' (because they're used to count
677 * datapath stats, which must not be cleared here). Instead, we save the
678 * current values and subtract them from the values to be displayed in the
679 * future */
680 for (i = 0; i < DP_N_STATS; i++) {
681 pmd->stats_zero[i] = stats[i];
682 }
683 for (i = 0; i < PMD_N_CYCLES; i++) {
684 pmd->cycles_zero[i] = cycles[i];
685 }
686}
687
ce179f11
IM
688static void
689pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
690{
691 if (pmd->core_id != NON_PMD_CORE_ID) {
692 struct rxq_poll *poll;
693 const char *prev_name = NULL;
694
695 ds_put_format(reply, "pmd thread numa_id %d core_id %u:\n",
696 pmd->numa_id, pmd->core_id);
697
698 ovs_mutex_lock(&pmd->poll_mutex);
699 LIST_FOR_EACH (poll, node, &pmd->poll_list) {
700 const char *name = netdev_get_name(poll->port->netdev);
701
702 if (!prev_name || strcmp(name, prev_name)) {
703 if (prev_name) {
704 ds_put_cstr(reply, "\n");
705 }
706 ds_put_format(reply, "\tport: %s\tqueue-id:",
707 netdev_get_name(poll->port->netdev));
708 }
709 ds_put_format(reply, " %d", netdev_rxq_get_queue_id(poll->rx));
710 prev_name = name;
711 }
712 ovs_mutex_unlock(&pmd->poll_mutex);
713 ds_put_cstr(reply, "\n");
714 }
715}
716
6553d06b
DDP
717static void
718dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
719 void *aux)
720{
721 struct ds reply = DS_EMPTY_INITIALIZER;
722 struct dp_netdev_pmd_thread *pmd;
723 struct dp_netdev *dp = NULL;
724 enum pmd_info_type type = *(enum pmd_info_type *) aux;
725
726 ovs_mutex_lock(&dp_netdev_mutex);
727
728 if (argc == 2) {
729 dp = shash_find_data(&dp_netdevs, argv[1]);
730 } else if (shash_count(&dp_netdevs) == 1) {
731 /* There's only one datapath */
732 dp = shash_first(&dp_netdevs)->data;
733 }
734
735 if (!dp) {
736 ovs_mutex_unlock(&dp_netdev_mutex);
737 unixctl_command_reply_error(conn,
738 "please specify an existing datapath");
739 return;
740 }
741
742 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
ce179f11
IM
743 if (type == PMD_INFO_SHOW_RXQ) {
744 pmd_info_show_rxq(&reply, pmd);
745 } else {
746 unsigned long long stats[DP_N_STATS];
747 uint64_t cycles[PMD_N_CYCLES];
748 int i;
6553d06b 749
ce179f11
IM
750 /* Read current stats and cycle counters */
751 for (i = 0; i < ARRAY_SIZE(stats); i++) {
752 atomic_read_relaxed(&pmd->stats.n[i], &stats[i]);
753 }
754 for (i = 0; i < ARRAY_SIZE(cycles); i++) {
755 atomic_read_relaxed(&pmd->cycles.n[i], &cycles[i]);
756 }
6553d06b 757
ce179f11
IM
758 if (type == PMD_INFO_CLEAR_STATS) {
759 pmd_info_clear_stats(&reply, pmd, stats, cycles);
760 } else if (type == PMD_INFO_SHOW_STATS) {
761 pmd_info_show_stats(&reply, pmd, stats, cycles);
762 }
6553d06b
DDP
763 }
764 }
765
766 ovs_mutex_unlock(&dp_netdev_mutex);
767
768 unixctl_command_reply(conn, ds_cstr(&reply));
769 ds_destroy(&reply);
770}
771\f
772static int
773dpif_netdev_init(void)
774{
775 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
ce179f11
IM
776 clear_aux = PMD_INFO_CLEAR_STATS,
777 poll_aux = PMD_INFO_SHOW_RXQ;
6553d06b
DDP
778
779 unixctl_command_register("dpif-netdev/pmd-stats-show", "[dp]",
780 0, 1, dpif_netdev_pmd_info,
781 (void *)&show_aux);
782 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[dp]",
783 0, 1, dpif_netdev_pmd_info,
784 (void *)&clear_aux);
ce179f11
IM
785 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[dp]",
786 0, 1, dpif_netdev_pmd_info,
787 (void *)&poll_aux);
6553d06b
DDP
788 return 0;
789}
72865317 790
2197d7ab 791static int
2240af25
DDP
792dpif_netdev_enumerate(struct sset *all_dps,
793 const struct dpif_class *dpif_class)
2197d7ab
GL
794{
795 struct shash_node *node;
796
97be1538 797 ovs_mutex_lock(&dp_netdev_mutex);
2197d7ab 798 SHASH_FOR_EACH(node, &dp_netdevs) {
2240af25
DDP
799 struct dp_netdev *dp = node->data;
800 if (dpif_class != dp->class) {
801 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
802 * If the class doesn't match, skip this dpif. */
803 continue;
804 }
2197d7ab
GL
805 sset_add(all_dps, node->name);
806 }
97be1538 807 ovs_mutex_unlock(&dp_netdev_mutex);
5279f8fd 808
2197d7ab
GL
809 return 0;
810}
811
add90f6f
EJ
812static bool
813dpif_netdev_class_is_dummy(const struct dpif_class *class)
814{
815 return class != &dpif_netdev_class;
816}
817
0aeaabc8
JP
818static const char *
819dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
820{
821 return strcmp(type, "internal") ? type
add90f6f 822 : dpif_netdev_class_is_dummy(class) ? "dummy"
0aeaabc8
JP
823 : "tap";
824}
825
72865317
BP
826static struct dpif *
827create_dpif_netdev(struct dp_netdev *dp)
828{
462278db 829 uint16_t netflow_id = hash_string(dp->name, 0);
72865317 830 struct dpif_netdev *dpif;
72865317 831
6a8267c5 832 ovs_refcount_ref(&dp->ref_cnt);
72865317 833
72865317 834 dpif = xmalloc(sizeof *dpif);
614c4892 835 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
72865317 836 dpif->dp = dp;
d33ed218 837 dpif->last_port_seq = seq_read(dp->port_seq);
72865317
BP
838
839 return &dpif->dpif;
840}
841
4e022ec0
AW
842/* Choose an unused, non-zero port number and return it on success.
843 * Return ODPP_NONE on failure. */
844static odp_port_t
e44768b7 845choose_port(struct dp_netdev *dp, const char *name)
59e6d833 846 OVS_REQUIRES(dp->port_mutex)
e44768b7 847{
4e022ec0 848 uint32_t port_no;
e44768b7
JP
849
850 if (dp->class != &dpif_netdev_class) {
851 const char *p;
852 int start_no = 0;
853
854 /* If the port name begins with "br", start the number search at
855 * 100 to make writing tests easier. */
856 if (!strncmp(name, "br", 2)) {
857 start_no = 100;
858 }
859
860 /* If the port name contains a number, try to assign that port number.
861 * This can make writing unit tests easier because port numbers are
862 * predictable. */
863 for (p = name; *p != '\0'; p++) {
864 if (isdigit((unsigned char) *p)) {
865 port_no = start_no + strtol(p, NULL, 10);
ff073a71
BP
866 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
867 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
4e022ec0 868 return u32_to_odp(port_no);
e44768b7
JP
869 }
870 break;
871 }
872 }
873 }
874
ff073a71
BP
875 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
876 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
4e022ec0 877 return u32_to_odp(port_no);
e44768b7
JP
878 }
879 }
880
4e022ec0 881 return ODPP_NONE;
e44768b7
JP
882}
883
72865317 884static int
614c4892
BP
885create_dp_netdev(const char *name, const struct dpif_class *class,
886 struct dp_netdev **dpp)
8a4e3a85 887 OVS_REQUIRES(dp_netdev_mutex)
72865317
BP
888{
889 struct dp_netdev *dp;
890 int error;
72865317 891
462278db 892 dp = xzalloc(sizeof *dp);
8a4e3a85
BP
893 shash_add(&dp_netdevs, name, dp);
894
895 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
896 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
6a8267c5 897 ovs_refcount_init(&dp->ref_cnt);
1a65ba85 898 atomic_flag_clear(&dp->destroyed);
8a4e3a85 899
59e6d833
BP
900 ovs_mutex_init(&dp->port_mutex);
901 cmap_init(&dp->ports);
d33ed218 902 dp->port_seq = seq_create();
6b31e073
RW
903 fat_rwlock_init(&dp->upcall_rwlock);
904
905 /* Disable upcalls by default. */
906 dp_netdev_disable_upcall(dp);
623540e4 907 dp->upcall_aux = NULL;
6b31e073 908 dp->upcall_cb = NULL;
e44768b7 909
65f13b50
AW
910 cmap_init(&dp->poll_threads);
911 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
912 ovsthread_key_create(&dp->per_pmd_key, NULL);
913
f2eee189 914 dp_netdev_set_nonpmd(dp);
65f13b50 915
59e6d833 916 ovs_mutex_lock(&dp->port_mutex);
4e022ec0 917 error = do_add_port(dp, name, "internal", ODPP_LOCAL);
59e6d833 918 ovs_mutex_unlock(&dp->port_mutex);
72865317
BP
919 if (error) {
920 dp_netdev_free(dp);
462278db 921 return error;
72865317
BP
922 }
923
a36de779 924 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
462278db 925 *dpp = dp;
72865317
BP
926 return 0;
927}
928
929static int
614c4892 930dpif_netdev_open(const struct dpif_class *class, const char *name,
4a387741 931 bool create, struct dpif **dpifp)
72865317 932{
462278db 933 struct dp_netdev *dp;
5279f8fd 934 int error;
462278db 935
97be1538 936 ovs_mutex_lock(&dp_netdev_mutex);
462278db
BP
937 dp = shash_find_data(&dp_netdevs, name);
938 if (!dp) {
5279f8fd 939 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
72865317 940 } else {
5279f8fd
BP
941 error = (dp->class != class ? EINVAL
942 : create ? EEXIST
943 : 0);
944 }
945 if (!error) {
946 *dpifp = create_dpif_netdev(dp);
6b31e073 947 dp->dpif = *dpifp;
72865317 948 }
97be1538 949 ovs_mutex_unlock(&dp_netdev_mutex);
462278db 950
5279f8fd 951 return error;
72865317
BP
952}
953
88ace79b
DDP
954static void
955dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
956 OVS_NO_THREAD_SAFETY_ANALYSIS
957{
958 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
959 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
960
961 /* Before freeing a lock we should release it */
962 fat_rwlock_unlock(&dp->upcall_rwlock);
963 fat_rwlock_destroy(&dp->upcall_rwlock);
964}
965
8a4e3a85
BP
966/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
967 * through the 'dp_netdevs' shash while freeing 'dp'. */
1ba530f4
BP
968static void
969dp_netdev_free(struct dp_netdev *dp)
8a4e3a85 970 OVS_REQUIRES(dp_netdev_mutex)
1ba530f4 971{
59e6d833 972 struct dp_netdev_port *port;
4ad28026 973
8a4e3a85
BP
974 shash_find_and_delete(&dp_netdevs, dp->name);
975
65f13b50
AW
976 dp_netdev_destroy_all_pmds(dp);
977 ovs_mutex_destroy(&dp->non_pmd_mutex);
978 ovsthread_key_delete(dp->per_pmd_key);
6c3eee82 979
59e6d833 980 ovs_mutex_lock(&dp->port_mutex);
a532e683 981 CMAP_FOR_EACH (port, node, &dp->ports) {
d916785c 982 /* PMD threads are destroyed here. do_del_port() cannot quiesce */
c40b890f 983 do_del_port(dp, port);
1ba530f4 984 }
59e6d833 985 ovs_mutex_unlock(&dp->port_mutex);
d916785c 986 cmap_destroy(&dp->poll_threads);
51852a57 987
d33ed218 988 seq_destroy(dp->port_seq);
59e6d833 989 cmap_destroy(&dp->ports);
88ace79b
DDP
990
991 /* Upcalls must be disabled at this point */
992 dp_netdev_destroy_upcall_lock(dp);
9bbf1c3d 993
f2eee189 994 free(dp->pmd_cmask);
8a4e3a85 995 free(CONST_CAST(char *, dp->name));
72865317
BP
996 free(dp);
997}
998
8a4e3a85
BP
999static void
1000dp_netdev_unref(struct dp_netdev *dp)
1001{
1002 if (dp) {
1003 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1004 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1005 ovs_mutex_lock(&dp_netdev_mutex);
24f83812 1006 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
8a4e3a85
BP
1007 dp_netdev_free(dp);
1008 }
1009 ovs_mutex_unlock(&dp_netdev_mutex);
1010 }
1011}
1012
72865317
BP
1013static void
1014dpif_netdev_close(struct dpif *dpif)
1015{
1016 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd 1017
8a4e3a85 1018 dp_netdev_unref(dp);
72865317
BP
1019 free(dpif);
1020}
1021
1022static int
7dab847a 1023dpif_netdev_destroy(struct dpif *dpif)
72865317
BP
1024{
1025 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd 1026
6a8267c5 1027 if (!atomic_flag_test_and_set(&dp->destroyed)) {
24f83812 1028 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
6a8267c5
BP
1029 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1030 OVS_NOT_REACHED();
1031 }
1032 }
5279f8fd 1033
72865317
BP
1034 return 0;
1035}
1036
eb94da30
DDP
1037/* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1038 * load/store semantics. While the increment is not atomic, the load and
1039 * store operations are, making it impossible to read inconsistent values.
1040 *
1041 * This is used to update thread local stats counters. */
1042static void
1043non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1044{
1045 unsigned long long tmp;
1046
1047 atomic_read_relaxed(var, &tmp);
1048 tmp += n;
1049 atomic_store_relaxed(var, tmp);
1050}
1051
72865317 1052static int
a8d9304d 1053dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
72865317
BP
1054{
1055 struct dp_netdev *dp = get_dp_netdev(dpif);
1c1e46ed 1056 struct dp_netdev_pmd_thread *pmd;
8a4e3a85 1057
1c1e46ed
AW
1058 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1059 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
eb94da30 1060 unsigned long long n;
1c1e46ed 1061 stats->n_flows += cmap_count(&pmd->flow_table);
eb94da30 1062
abcf3ef4
DDP
1063 atomic_read_relaxed(&pmd->stats.n[DP_STAT_MASKED_HIT], &n);
1064 stats->n_hit += n;
1065 atomic_read_relaxed(&pmd->stats.n[DP_STAT_EXACT_HIT], &n);
eb94da30
DDP
1066 stats->n_hit += n;
1067 atomic_read_relaxed(&pmd->stats.n[DP_STAT_MISS], &n);
1068 stats->n_missed += n;
1069 atomic_read_relaxed(&pmd->stats.n[DP_STAT_LOST], &n);
1070 stats->n_lost += n;
51852a57 1071 }
1ce3fa06 1072 stats->n_masks = UINT32_MAX;
847108dc 1073 stats->n_mask_hit = UINT64_MAX;
5279f8fd 1074
72865317
BP
1075 return 0;
1076}
1077
e4cfed38 1078static void
65f13b50 1079dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
e4cfed38 1080{
65f13b50
AW
1081 int old_seq;
1082
accf8626
AW
1083 if (pmd->core_id == NON_PMD_CORE_ID) {
1084 return;
1085 }
1086
1087 ovs_mutex_lock(&pmd->cond_mutex);
65f13b50 1088 atomic_add_relaxed(&pmd->change_seq, 1, &old_seq);
accf8626
AW
1089 ovs_mutex_cond_wait(&pmd->cond, &pmd->cond_mutex);
1090 ovs_mutex_unlock(&pmd->cond_mutex);
65f13b50 1091}
e4cfed38 1092
59e6d833
BP
1093static uint32_t
1094hash_port_no(odp_port_t port_no)
1095{
1096 return hash_int(odp_to_u32(port_no), 0);
1097}
1098
72865317 1099static int
c3827f61 1100do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
4e022ec0 1101 odp_port_t port_no)
59e6d833 1102 OVS_REQUIRES(dp->port_mutex)
72865317 1103{
4b609110 1104 struct netdev_saved_flags *sf;
72865317
BP
1105 struct dp_netdev_port *port;
1106 struct netdev *netdev;
2499a8ce 1107 enum netdev_flags flags;
0cbfe35d 1108 const char *open_type;
72865317 1109 int error;
55c955bd 1110 int i;
72865317 1111
17050610
BP
1112 /* Reject devices already in 'dp'. */
1113 if (!get_port_by_name(dp, devname, &port)) {
1114 return EEXIST;
1115 }
72865317
BP
1116
1117 /* Open and validate network device. */
0aeaabc8 1118 open_type = dpif_netdev_port_open_type(dp->class, type);
0cbfe35d 1119 error = netdev_open(devname, open_type, &netdev);
72865317
BP
1120 if (error) {
1121 return error;
1122 }
72865317
BP
1123 /* XXX reject non-Ethernet devices */
1124
2499a8ce
AC
1125 netdev_get_flags(netdev, &flags);
1126 if (flags & NETDEV_LOOPBACK) {
1127 VLOG_ERR("%s: cannot add a loopback device", devname);
1128 netdev_close(netdev);
1129 return EINVAL;
1130 }
1131
5a034064
AW
1132 if (netdev_is_pmd(netdev)) {
1133 int n_cores = ovs_numa_get_n_cores();
1134
1135 if (n_cores == OVS_CORE_UNSPEC) {
1136 VLOG_ERR("%s, cannot get cpu core info", devname);
1137 return ENOENT;
1138 }
1139 /* There can only be ovs_numa_get_n_cores() pmd threads,
3bcc10c0
DDP
1140 * so creates a txq for each, and one extra for the non
1141 * pmd threads. */
a14b8947
IM
1142 error = netdev_set_multiq(netdev, n_cores + 1,
1143 netdev_requested_n_rxq(netdev));
7251515e 1144 if (error && (error != EOPNOTSUPP)) {
5a034064
AW
1145 VLOG_ERR("%s, cannot set multiq", devname);
1146 return errno;
1147 }
1148 }
e4cfed38 1149 port = xzalloc(sizeof *port);
35303d71 1150 port->port_no = port_no;
e4cfed38 1151 port->netdev = netdev;
55c955bd 1152 port->rxq = xmalloc(sizeof *port->rxq * netdev_n_rxq(netdev));
e4cfed38 1153 port->type = xstrdup(type);
a14b8947 1154 port->latest_requested_n_rxq = netdev_requested_n_rxq(netdev);
55c955bd
PS
1155 for (i = 0; i < netdev_n_rxq(netdev); i++) {
1156 error = netdev_rxq_open(netdev, &port->rxq[i], i);
1157 if (error
1158 && !(error == EOPNOTSUPP && dpif_netdev_class_is_dummy(dp->class))) {
1159 VLOG_ERR("%s: cannot receive packets on this network device (%s)",
1160 devname, ovs_strerror(errno));
1161 netdev_close(netdev);
16bea12c
TG
1162 free(port->type);
1163 free(port->rxq);
1164 free(port);
55c955bd
PS
1165 return error;
1166 }
7b6b0ef4
BP
1167 }
1168
4b609110 1169 error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
72865317 1170 if (error) {
55c955bd
PS
1171 for (i = 0; i < netdev_n_rxq(netdev); i++) {
1172 netdev_rxq_close(port->rxq[i]);
1173 }
72865317 1174 netdev_close(netdev);
16bea12c 1175 free(port->type);
f7791740 1176 free(port->rxq);
e4cfed38 1177 free(port);
72865317
BP
1178 return error;
1179 }
4b609110 1180 port->sf = sf;
e4cfed38 1181
f7d63652
AW
1182 ovs_refcount_init(&port->ref_cnt);
1183 cmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1184
e4cfed38 1185 if (netdev_is_pmd(netdev)) {
cc245ce8 1186 dp_netdev_add_port_to_pmds(dp, port);
e4cfed38 1187 }
d33ed218 1188 seq_change(dp->port_seq);
72865317
BP
1189
1190 return 0;
1191}
1192
247527db
BP
1193static int
1194dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
4e022ec0 1195 odp_port_t *port_nop)
247527db
BP
1196{
1197 struct dp_netdev *dp = get_dp_netdev(dpif);
3aa30359
BP
1198 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1199 const char *dpif_port;
4e022ec0 1200 odp_port_t port_no;
5279f8fd 1201 int error;
247527db 1202
59e6d833 1203 ovs_mutex_lock(&dp->port_mutex);
3aa30359 1204 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
4e022ec0 1205 if (*port_nop != ODPP_NONE) {
ff073a71
BP
1206 port_no = *port_nop;
1207 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
232dfa4a 1208 } else {
3aa30359 1209 port_no = choose_port(dp, dpif_port);
5279f8fd 1210 error = port_no == ODPP_NONE ? EFBIG : 0;
232dfa4a 1211 }
5279f8fd 1212 if (!error) {
247527db 1213 *port_nop = port_no;
5279f8fd 1214 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
247527db 1215 }
59e6d833 1216 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd
BP
1217
1218 return error;
72865317
BP
1219}
1220
1221static int
4e022ec0 1222dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
72865317
BP
1223{
1224 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd
BP
1225 int error;
1226
59e6d833 1227 ovs_mutex_lock(&dp->port_mutex);
c40b890f
BP
1228 if (port_no == ODPP_LOCAL) {
1229 error = EINVAL;
1230 } else {
1231 struct dp_netdev_port *port;
1232
1233 error = get_port_by_number(dp, port_no, &port);
1234 if (!error) {
1235 do_del_port(dp, port);
1236 }
1237 }
59e6d833 1238 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd
BP
1239
1240 return error;
72865317
BP
1241}
1242
1243static bool
4e022ec0 1244is_valid_port_number(odp_port_t port_no)
72865317 1245{
ff073a71
BP
1246 return port_no != ODPP_NONE;
1247}
1248
1249static struct dp_netdev_port *
1250dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
1251{
1252 struct dp_netdev_port *port;
1253
59e6d833 1254 CMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
35303d71 1255 if (port->port_no == port_no) {
ff073a71
BP
1256 return port;
1257 }
1258 }
1259 return NULL;
72865317
BP
1260}
1261
1262static int
1263get_port_by_number(struct dp_netdev *dp,
4e022ec0 1264 odp_port_t port_no, struct dp_netdev_port **portp)
72865317
BP
1265{
1266 if (!is_valid_port_number(port_no)) {
1267 *portp = NULL;
1268 return EINVAL;
1269 } else {
ff073a71 1270 *portp = dp_netdev_lookup_port(dp, port_no);
72865317
BP
1271 return *portp ? 0 : ENOENT;
1272 }
1273}
1274
b284085e
PS
1275static void
1276port_ref(struct dp_netdev_port *port)
1277{
1278 if (port) {
1279 ovs_refcount_ref(&port->ref_cnt);
1280 }
1281}
1282
59e6d833
BP
1283static void
1284port_unref(struct dp_netdev_port *port)
1285{
24f83812 1286 if (port && ovs_refcount_unref_relaxed(&port->ref_cnt) == 1) {
accf8626
AW
1287 int n_rxq = netdev_n_rxq(port->netdev);
1288 int i;
1289
1290 netdev_close(port->netdev);
1291 netdev_restore_flags(port->sf);
1292
1293 for (i = 0; i < n_rxq; i++) {
1294 netdev_rxq_close(port->rxq[i]);
1295 }
1296 free(port->rxq);
1297 free(port->type);
1298 free(port);
b284085e
PS
1299 }
1300}
1301
72865317
BP
1302static int
1303get_port_by_name(struct dp_netdev *dp,
1304 const char *devname, struct dp_netdev_port **portp)
59e6d833 1305 OVS_REQUIRES(dp->port_mutex)
72865317
BP
1306{
1307 struct dp_netdev_port *port;
1308
a532e683 1309 CMAP_FOR_EACH (port, node, &dp->ports) {
3efb6063 1310 if (!strcmp(netdev_get_name(port->netdev), devname)) {
72865317
BP
1311 *portp = port;
1312 return 0;
1313 }
1314 }
1315 return ENOENT;
1316}
1317
347ba9bb
IM
1318static int
1319get_n_pmd_threads(struct dp_netdev *dp)
1320{
1321 /* There is one non pmd thread in dp->poll_threads */
1322 return cmap_count(&dp->poll_threads) - 1;
1323}
1324
65f13b50
AW
1325static int
1326get_n_pmd_threads_on_numa(struct dp_netdev *dp, int numa_id)
1327{
1328 struct dp_netdev_pmd_thread *pmd;
1329 int n_pmds = 0;
1330
1331 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1332 if (pmd->numa_id == numa_id) {
1333 n_pmds++;
1334 }
1335 }
1336
1337 return n_pmds;
1338}
1339
1340/* Returns 'true' if there is a port with pmd netdev and the netdev
1341 * is on numa node 'numa_id'. */
1342static bool
1343has_pmd_port_for_numa(struct dp_netdev *dp, int numa_id)
1344{
1345 struct dp_netdev_port *port;
1346
1347 CMAP_FOR_EACH (port, node, &dp->ports) {
1348 if (netdev_is_pmd(port->netdev)
1349 && netdev_get_numa_id(port->netdev) == numa_id) {
1350 return true;
1351 }
1352 }
1353
1354 return false;
1355}
1356
1357
c40b890f
BP
1358static void
1359do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
59e6d833 1360 OVS_REQUIRES(dp->port_mutex)
72865317 1361{
35303d71 1362 cmap_remove(&dp->ports, &port->node, hash_odp_port(port->port_no));
d33ed218 1363 seq_change(dp->port_seq);
e4cfed38 1364 if (netdev_is_pmd(port->netdev)) {
65f13b50
AW
1365 int numa_id = netdev_get_numa_id(port->netdev);
1366
ae7ad0a1
IM
1367 /* PMD threads can not be on invalid numa node. */
1368 ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
65f13b50 1369 /* If there is no netdev on the numa node, deletes the pmd threads
ae7ad0a1 1370 * for that numa. Else, deletes the queues from polling lists. */
65f13b50
AW
1371 if (!has_pmd_port_for_numa(dp, numa_id)) {
1372 dp_netdev_del_pmds_on_numa(dp, numa_id);
ae7ad0a1 1373 } else {
cc245ce8 1374 dp_netdev_del_port_from_all_pmds(dp, port);
65f13b50 1375 }
e4cfed38 1376 }
72865317 1377
b284085e 1378 port_unref(port);
72865317
BP
1379}
1380
1381static void
4c738a8d
BP
1382answer_port_query(const struct dp_netdev_port *port,
1383 struct dpif_port *dpif_port)
72865317 1384{
3efb6063 1385 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
0cbfe35d 1386 dpif_port->type = xstrdup(port->type);
35303d71 1387 dpif_port->port_no = port->port_no;
72865317
BP
1388}
1389
1390static int
4e022ec0 1391dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
4c738a8d 1392 struct dpif_port *dpif_port)
72865317
BP
1393{
1394 struct dp_netdev *dp = get_dp_netdev(dpif);
1395 struct dp_netdev_port *port;
1396 int error;
1397
1398 error = get_port_by_number(dp, port_no, &port);
4afba28d 1399 if (!error && dpif_port) {
4c738a8d 1400 answer_port_query(port, dpif_port);
72865317 1401 }
5279f8fd 1402
72865317
BP
1403 return error;
1404}
1405
1406static int
1407dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
4c738a8d 1408 struct dpif_port *dpif_port)
72865317
BP
1409{
1410 struct dp_netdev *dp = get_dp_netdev(dpif);
1411 struct dp_netdev_port *port;
1412 int error;
1413
59e6d833 1414 ovs_mutex_lock(&dp->port_mutex);
72865317 1415 error = get_port_by_name(dp, devname, &port);
4afba28d 1416 if (!error && dpif_port) {
4c738a8d 1417 answer_port_query(port, dpif_port);
72865317 1418 }
59e6d833 1419 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 1420
72865317
BP
1421 return error;
1422}
1423
61e7deb1
BP
1424static void
1425dp_netdev_flow_free(struct dp_netdev_flow *flow)
1426{
61e7deb1 1427 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
61e7deb1
BP
1428 free(flow);
1429}
1430
ed79f89a
DDP
1431static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
1432{
1433 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
1434 ovsrcu_postpone(dp_netdev_flow_free, flow);
1435 }
1436}
1437
70e5ed6f
JS
1438static uint32_t
1439dp_netdev_flow_hash(const ovs_u128 *ufid)
1440{
1441 return ufid->u32[0];
1442}
1443
72865317 1444static void
1c1e46ed
AW
1445dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
1446 struct dp_netdev_flow *flow)
1447 OVS_REQUIRES(pmd->flow_mutex)
72865317 1448{
9f361d6b 1449 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2c0ea78f 1450
1c1e46ed 1451 dpcls_remove(&pmd->cls, &flow->cr);
361d808d
JR
1452 flow->cr.mask = NULL; /* Accessing rule's mask after this is not safe. */
1453
1c1e46ed 1454 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
9bbf1c3d 1455 flow->dead = true;
ed79f89a
DDP
1456
1457 dp_netdev_flow_unref(flow);
72865317
BP
1458}
1459
1460static void
1c1e46ed 1461dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
72865317 1462{
78c8df12 1463 struct dp_netdev_flow *netdev_flow;
72865317 1464
1c1e46ed
AW
1465 ovs_mutex_lock(&pmd->flow_mutex);
1466 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
1467 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
72865317 1468 }
1c1e46ed 1469 ovs_mutex_unlock(&pmd->flow_mutex);
72865317
BP
1470}
1471
1472static int
1473dpif_netdev_flow_flush(struct dpif *dpif)
1474{
1475 struct dp_netdev *dp = get_dp_netdev(dpif);
1c1e46ed
AW
1476 struct dp_netdev_pmd_thread *pmd;
1477
1478 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1479 dp_netdev_pmd_flow_flush(pmd);
1480 }
5279f8fd 1481
72865317
BP
1482 return 0;
1483}
1484
b0ec0f27 1485struct dp_netdev_port_state {
59e6d833 1486 struct cmap_position position;
4c738a8d 1487 char *name;
b0ec0f27
BP
1488};
1489
1490static int
1491dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
1492{
1493 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
1494 return 0;
1495}
1496
72865317 1497static int
b0ec0f27 1498dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
4c738a8d 1499 struct dpif_port *dpif_port)
72865317 1500{
b0ec0f27 1501 struct dp_netdev_port_state *state = state_;
72865317 1502 struct dp_netdev *dp = get_dp_netdev(dpif);
59e6d833 1503 struct cmap_node *node;
ff073a71 1504 int retval;
72865317 1505
59e6d833 1506 node = cmap_next_position(&dp->ports, &state->position);
ff073a71
BP
1507 if (node) {
1508 struct dp_netdev_port *port;
5279f8fd 1509
ff073a71
BP
1510 port = CONTAINER_OF(node, struct dp_netdev_port, node);
1511
1512 free(state->name);
1513 state->name = xstrdup(netdev_get_name(port->netdev));
1514 dpif_port->name = state->name;
1515 dpif_port->type = port->type;
35303d71 1516 dpif_port->port_no = port->port_no;
ff073a71
BP
1517
1518 retval = 0;
1519 } else {
1520 retval = EOF;
72865317 1521 }
5279f8fd 1522
ff073a71 1523 return retval;
b0ec0f27
BP
1524}
1525
1526static int
4c738a8d 1527dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
b0ec0f27 1528{
4c738a8d
BP
1529 struct dp_netdev_port_state *state = state_;
1530 free(state->name);
b0ec0f27
BP
1531 free(state);
1532 return 0;
72865317
BP
1533}
1534
1535static int
67a4917b 1536dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
72865317
BP
1537{
1538 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
d33ed218 1539 uint64_t new_port_seq;
5279f8fd
BP
1540 int error;
1541
d33ed218
BP
1542 new_port_seq = seq_read(dpif->dp->port_seq);
1543 if (dpif->last_port_seq != new_port_seq) {
1544 dpif->last_port_seq = new_port_seq;
5279f8fd 1545 error = ENOBUFS;
72865317 1546 } else {
5279f8fd 1547 error = EAGAIN;
72865317 1548 }
5279f8fd
BP
1549
1550 return error;
72865317
BP
1551}
1552
1553static void
1554dpif_netdev_port_poll_wait(const struct dpif *dpif_)
1555{
1556 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
5279f8fd 1557
d33ed218 1558 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
8a4e3a85
BP
1559}
1560
1561static struct dp_netdev_flow *
0de8783a 1562dp_netdev_flow_cast(const struct dpcls_rule *cr)
8a4e3a85
BP
1563{
1564 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
72865317
BP
1565}
1566
9bbf1c3d
DDP
1567static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
1568{
1569 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
1570}
1571
79df317f
DDP
1572/* netdev_flow_key utilities.
1573 *
1574 * netdev_flow_key is basically a miniflow. We use these functions
1575 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
1576 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
1577 *
1578 * - Since we are dealing exclusively with miniflows created by
1579 * miniflow_extract(), if the map is different the miniflow is different.
1580 * Therefore we can be faster by comparing the map and the miniflow in a
1581 * single memcmp().
5fcff47b 1582 * - These functions can be inlined by the compiler. */
79df317f 1583
361d808d 1584/* Given the number of bits set in miniflow's maps, returns the size of the
caeb4906 1585 * 'netdev_flow_key.mf' */
361d808d
JR
1586static inline size_t
1587netdev_flow_key_size(size_t flow_u64s)
79df317f 1588{
361d808d 1589 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
79df317f
DDP
1590}
1591
79df317f
DDP
1592static inline bool
1593netdev_flow_key_equal(const struct netdev_flow_key *a,
0de8783a
JR
1594 const struct netdev_flow_key *b)
1595{
caeb4906
JR
1596 /* 'b->len' may be not set yet. */
1597 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
0de8783a
JR
1598}
1599
1600/* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
1601 * The maps are compared bitwise, so both 'key->mf' 'mf' must have been
1602 * generated by miniflow_extract. */
1603static inline bool
1604netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
1605 const struct miniflow *mf)
79df317f 1606{
caeb4906 1607 return !memcmp(&key->mf, mf, key->len);
79df317f
DDP
1608}
1609
1610static inline void
1611netdev_flow_key_clone(struct netdev_flow_key *dst,
0de8783a
JR
1612 const struct netdev_flow_key *src)
1613{
caeb4906
JR
1614 memcpy(dst, src,
1615 offsetof(struct netdev_flow_key, mf) + src->len);
0de8783a
JR
1616}
1617
1618/* Slow. */
1619static void
1620netdev_flow_key_from_flow(struct netdev_flow_key *dst,
1621 const struct flow *src)
1622{
cf62fa4c 1623 struct dp_packet packet;
0de8783a 1624 uint64_t buf_stub[512 / 8];
0de8783a 1625
cf62fa4c
PS
1626 dp_packet_use_stub(&packet, buf_stub, sizeof buf_stub);
1627 pkt_metadata_from_flow(&packet.md, src);
0de8783a 1628 flow_compose(&packet, src);
cf62fa4c
PS
1629 miniflow_extract(&packet, &dst->mf);
1630 dp_packet_uninit(&packet);
0de8783a 1631
361d808d 1632 dst->len = netdev_flow_key_size(miniflow_n_values(&dst->mf));
0de8783a
JR
1633 dst->hash = 0; /* Not computed yet. */
1634}
1635
1636/* Initialize a netdev_flow_key 'mask' from 'match'. */
1637static inline void
1638netdev_flow_mask_init(struct netdev_flow_key *mask,
1639 const struct match *match)
1640{
09b0fa9c 1641 uint64_t *dst = miniflow_values(&mask->mf);
5fcff47b 1642 struct flowmap fmap;
0de8783a 1643 uint32_t hash = 0;
5fcff47b 1644 size_t idx;
0de8783a
JR
1645
1646 /* Only check masks that make sense for the flow. */
5fcff47b
JR
1647 flow_wc_map(&match->flow, &fmap);
1648 flowmap_init(&mask->mf.map);
0de8783a 1649
5fcff47b
JR
1650 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
1651 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
0de8783a 1652
5fcff47b
JR
1653 if (mask_u64) {
1654 flowmap_set(&mask->mf.map, idx, 1);
1655 *dst++ = mask_u64;
1656 hash = hash_add64(hash, mask_u64);
0de8783a 1657 }
0de8783a
JR
1658 }
1659
5fcff47b 1660 map_t map;
0de8783a 1661
5fcff47b
JR
1662 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
1663 hash = hash_add64(hash, map);
1664 }
0de8783a 1665
5fcff47b 1666 size_t n = dst - miniflow_get_values(&mask->mf);
0de8783a 1667
d70e8c28 1668 mask->hash = hash_finish(hash, n * 8);
0de8783a
JR
1669 mask->len = netdev_flow_key_size(n);
1670}
1671
361d808d 1672/* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
0de8783a
JR
1673static inline void
1674netdev_flow_key_init_masked(struct netdev_flow_key *dst,
1675 const struct flow *flow,
1676 const struct netdev_flow_key *mask)
79df317f 1677{
09b0fa9c
JR
1678 uint64_t *dst_u64 = miniflow_values(&dst->mf);
1679 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
0de8783a 1680 uint32_t hash = 0;
d70e8c28 1681 uint64_t value;
0de8783a
JR
1682
1683 dst->len = mask->len;
361d808d 1684 dst->mf = mask->mf; /* Copy maps. */
0de8783a 1685
5fcff47b 1686 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
d70e8c28
JR
1687 *dst_u64 = value & *mask_u64++;
1688 hash = hash_add64(hash, *dst_u64++);
0de8783a 1689 }
09b0fa9c
JR
1690 dst->hash = hash_finish(hash,
1691 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
0de8783a
JR
1692}
1693
5fcff47b
JR
1694/* Iterate through netdev_flow_key TNL u64 values specified by 'FLOWMAP'. */
1695#define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP) \
1696 MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)
0de8783a
JR
1697
1698/* Returns a hash value for the bits of 'key' where there are 1-bits in
1699 * 'mask'. */
1700static inline uint32_t
1701netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
1702 const struct netdev_flow_key *mask)
1703{
09b0fa9c 1704 const uint64_t *p = miniflow_get_values(&mask->mf);
0de8783a 1705 uint32_t hash = 0;
5fcff47b 1706 uint64_t value;
0de8783a 1707
5fcff47b
JR
1708 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) {
1709 hash = hash_add64(hash, value & *p++);
0de8783a
JR
1710 }
1711
09b0fa9c 1712 return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
79df317f
DDP
1713}
1714
9bbf1c3d
DDP
1715static inline bool
1716emc_entry_alive(struct emc_entry *ce)
1717{
1718 return ce->flow && !ce->flow->dead;
1719}
1720
1721static void
1722emc_clear_entry(struct emc_entry *ce)
1723{
1724 if (ce->flow) {
1725 dp_netdev_flow_unref(ce->flow);
1726 ce->flow = NULL;
1727 }
1728}
1729
1730static inline void
1731emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
0de8783a 1732 const struct netdev_flow_key *key)
9bbf1c3d
DDP
1733{
1734 if (ce->flow != flow) {
1735 if (ce->flow) {
1736 dp_netdev_flow_unref(ce->flow);
1737 }
1738
1739 if (dp_netdev_flow_ref(flow)) {
1740 ce->flow = flow;
1741 } else {
1742 ce->flow = NULL;
1743 }
1744 }
0de8783a
JR
1745 if (key) {
1746 netdev_flow_key_clone(&ce->key, key);
9bbf1c3d
DDP
1747 }
1748}
1749
1750static inline void
0de8783a 1751emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
9bbf1c3d
DDP
1752 struct dp_netdev_flow *flow)
1753{
1754 struct emc_entry *to_be_replaced = NULL;
1755 struct emc_entry *current_entry;
1756
0de8783a
JR
1757 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
1758 if (netdev_flow_key_equal(&current_entry->key, key)) {
9bbf1c3d 1759 /* We found the entry with the 'mf' miniflow */
0de8783a 1760 emc_change_entry(current_entry, flow, NULL);
9bbf1c3d
DDP
1761 return;
1762 }
1763
 1764 /* Replacement policy: put the flow in an empty (not alive) entry, or,
 1765 * failing that, in the entry with the lowest key hash. */
1766 if (!to_be_replaced
1767 || (emc_entry_alive(to_be_replaced)
1768 && !emc_entry_alive(current_entry))
0de8783a 1769 || current_entry->key.hash < to_be_replaced->key.hash) {
9bbf1c3d
DDP
1770 to_be_replaced = current_entry;
1771 }
1772 }
1773 /* We didn't find the miniflow in the cache.
1774 * The 'to_be_replaced' entry is where the new flow will be stored */
1775
0de8783a 1776 emc_change_entry(to_be_replaced, flow, key);
9bbf1c3d
DDP
1777}
1778
1779static inline struct dp_netdev_flow *
0de8783a 1780emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
9bbf1c3d
DDP
1781{
1782 struct emc_entry *current_entry;
1783
0de8783a
JR
1784 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
1785 if (current_entry->key.hash == key->hash
1786 && emc_entry_alive(current_entry)
1787 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
9bbf1c3d 1788
0de8783a 1789 /* We found the entry with the 'key->mf' miniflow */
9bbf1c3d
DDP
1790 return current_entry->flow;
1791 }
1792 }
1793
1794 return NULL;
1795}
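
/* Illustrative sketch, not part of the original file: how the EMC helpers
 * above are intended to be combined with the per-thread classifier on the
 * fast path.  The EMC is probed first; on a miss the 'pmd->cls' megaflow
 * classifier is consulted and a hit is fed back into the EMC.
 * dp_netdev_pmd_lookup_flow() and dpif_netdev_packet_get_rss_hash() are
 * defined further down in this file; upcall handling and batching are
 * omitted here. */
static inline struct dp_netdev_flow *
emc_then_dpcls_lookup_sketch(struct dp_netdev_pmd_thread *pmd,
                             struct dp_packet *packet,
                             struct netdev_flow_key *key)
{
    struct dp_netdev_flow *flow;

    miniflow_extract(packet, &key->mf);
    key->len = netdev_flow_key_size(miniflow_n_values(&key->mf));
    key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);

    flow = emc_lookup(&pmd->flow_cache, key);
    if (OVS_UNLIKELY(!flow)) {
        flow = dp_netdev_pmd_lookup_flow(pmd, key);
        if (OVS_LIKELY(flow)) {
            emc_insert(&pmd->flow_cache, key, flow);
        }
    }
    return flow;
}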
1796
72865317 1797static struct dp_netdev_flow *
1c1e46ed
AW
1798dp_netdev_pmd_lookup_flow(const struct dp_netdev_pmd_thread *pmd,
1799 const struct netdev_flow_key *key)
2c0ea78f 1800{
8a4e3a85 1801 struct dp_netdev_flow *netdev_flow;
0de8783a 1802 struct dpcls_rule *rule;
2c0ea78f 1803
1c1e46ed 1804 dpcls_lookup(&pmd->cls, key, &rule, 1);
4f150744 1805 netdev_flow = dp_netdev_flow_cast(rule);
2c0ea78f 1806
8a4e3a85 1807 return netdev_flow;
2c0ea78f
GS
1808}
1809
1810static struct dp_netdev_flow *
1c1e46ed
AW
1811dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
1812 const ovs_u128 *ufidp, const struct nlattr *key,
1813 size_t key_len)
72865317 1814{
1763b4b8 1815 struct dp_netdev_flow *netdev_flow;
70e5ed6f
JS
1816 struct flow flow;
1817 ovs_u128 ufid;
1818
1819 /* If a UFID is not provided, determine one based on the key. */
1820 if (!ufidp && key && key_len
1821 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow)) {
1c1e46ed 1822 dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
70e5ed6f
JS
1823 ufidp = &ufid;
1824 }
72865317 1825
70e5ed6f
JS
1826 if (ufidp) {
1827 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
1c1e46ed 1828 &pmd->flow_table) {
bdd7ecf5 1829 if (ovs_u128_equals(&netdev_flow->ufid, ufidp)) {
70e5ed6f
JS
1830 return netdev_flow;
1831 }
72865317
BP
1832 }
1833 }
8a4e3a85 1834
72865317
BP
1835 return NULL;
1836}
1837
1838static void
eb94da30 1839get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
1763b4b8 1840 struct dpif_flow_stats *stats)
feebdea2 1841{
eb94da30
DDP
1842 struct dp_netdev_flow *netdev_flow;
1843 unsigned long long n;
1844 long long used;
1845 uint16_t flags;
1846
1847 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
1848
1849 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
1850 stats->n_packets = n;
1851 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
1852 stats->n_bytes = n;
1853 atomic_read_relaxed(&netdev_flow->stats.used, &used);
1854 stats->used = used;
1855 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
1856 stats->tcp_flags = flags;
72865317
BP
1857}
1858
7af12bd7
JS
1859/* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
1860 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
1861 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
1862 * protect them. */
6fe09f8c 1863static void
70e5ed6f 1864dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
7af12bd7 1865 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
64bb477f 1866 struct dpif_flow *flow, bool terse)
6fe09f8c 1867{
64bb477f
JS
1868 if (terse) {
1869 memset(flow, 0, sizeof *flow);
1870 } else {
1871 struct flow_wildcards wc;
1872 struct dp_netdev_actions *actions;
1873 size_t offset;
5262eea1
JG
1874 struct odp_flow_key_parms odp_parms = {
1875 .flow = &netdev_flow->flow,
1876 .mask = &wc.masks,
2494ccd7 1877 .support = dp_netdev_support,
5262eea1 1878 };
64bb477f
JS
1879
1880 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
1881
1882 /* Key */
6fd6ed71 1883 offset = key_buf->size;
64bb477f 1884 flow->key = ofpbuf_tail(key_buf);
5262eea1
JG
1885 odp_parms.odp_in_port = netdev_flow->flow.in_port.odp_port;
1886 odp_flow_key_from_flow(&odp_parms, key_buf);
6fd6ed71 1887 flow->key_len = key_buf->size - offset;
64bb477f
JS
1888
1889 /* Mask */
6fd6ed71 1890 offset = mask_buf->size;
64bb477f 1891 flow->mask = ofpbuf_tail(mask_buf);
5262eea1 1892 odp_parms.odp_in_port = wc.masks.in_port.odp_port;
ec1f6f32 1893 odp_parms.key_buf = key_buf;
5262eea1 1894 odp_flow_key_from_mask(&odp_parms, mask_buf);
6fd6ed71 1895 flow->mask_len = mask_buf->size - offset;
64bb477f
JS
1896
1897 /* Actions */
1898 actions = dp_netdev_flow_get_actions(netdev_flow);
1899 flow->actions = actions->actions;
1900 flow->actions_len = actions->size;
1901 }
6fe09f8c 1902
70e5ed6f
JS
1903 flow->ufid = netdev_flow->ufid;
1904 flow->ufid_present = true;
1c1e46ed 1905 flow->pmd_id = netdev_flow->pmd_id;
6fe09f8c
JS
1906 get_dpif_flow_stats(netdev_flow, &flow->stats);
1907}
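
/* Illustrative sketch, not part of the original file: a minimal caller of
 * dp_netdev_flow_to_dpif_flow(), using separate stack buffers for the
 * netlink-formatted key and mask (the flow dumper below uses the same
 * pattern).  Note that 'f->actions' ends up pointing at RCU-protected
 * memory, so it must not be used past a quiescent period. */
static void
flow_to_dpif_flow_sketch(const struct dp_netdev_flow *netdev_flow,
                         struct dpif_flow *f)
{
    struct odputil_keybuf keybuf, maskbuf;
    struct ofpbuf key, mask;

    ofpbuf_use_stack(&key, &keybuf, sizeof keybuf);
    ofpbuf_use_stack(&mask, &maskbuf, sizeof maskbuf);
    dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f, false);
}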
1908
36956a7d 1909static int
8c301900
JR
1910dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
1911 const struct nlattr *mask_key,
1912 uint32_t mask_key_len, const struct flow *flow,
9f861c91 1913 struct flow_wildcards *wc)
8c301900 1914{
ca8d3442
DDP
1915 enum odp_key_fitness fitness;
1916
1917 fitness = odp_flow_key_to_mask_udpif(mask_key, mask_key_len, key,
1918 key_len, wc, flow);
1919 if (fitness) {
1920 /* This should not happen: it indicates that
1921 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
1922 * disagree on the acceptable form of a mask. Log the problem
1923 * as an error, with enough details to enable debugging. */
1924 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
1925
1926 if (!VLOG_DROP_ERR(&rl)) {
1927 struct ds s;
8c301900 1928
ca8d3442
DDP
1929 ds_init(&s);
1930 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
1931 true);
1932 VLOG_ERR("internal error parsing flow mask %s (%s)",
1933 ds_cstr(&s), odp_key_fitness_to_string(fitness));
1934 ds_destroy(&s);
8c301900 1935 }
ca8d3442
DDP
1936
1937 return EINVAL;
8c301900
JR
1938 }
1939
1940 return 0;
1941}
1942
1943static int
1944dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
1945 struct flow *flow)
36956a7d 1946{
586ddea5
BP
1947 odp_port_t in_port;
1948
6728d578 1949 if (odp_flow_key_to_flow_udpif(key, key_len, flow)) {
36956a7d 1950 /* This should not happen: it indicates that odp_flow_key_from_flow()
8c301900
JR
1951 * and odp_flow_key_to_flow() disagree on the acceptable form of a
1952 * flow. Log the problem as an error, with enough details to enable
1953 * debugging. */
36956a7d
BP
1954 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
1955
1956 if (!VLOG_DROP_ERR(&rl)) {
1957 struct ds s;
1958
1959 ds_init(&s);
8c301900 1960 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
36956a7d
BP
1961 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
1962 ds_destroy(&s);
1963 }
1964
1965 return EINVAL;
1966 }
1967
586ddea5
BP
1968 in_port = flow->in_port.odp_port;
1969 if (!is_valid_port_number(in_port) && in_port != ODPP_NONE) {
18886b60
BP
1970 return EINVAL;
1971 }
1972
07659514 1973 /* Userspace datapath doesn't support conntrack. */
9daf2348
JS
1974 if (flow->ct_state || flow->ct_zone || flow->ct_mark
1975 || !ovs_u128_is_zero(&flow->ct_label)) {
07659514
JS
1976 return EINVAL;
1977 }
1978
36956a7d
BP
1979 return 0;
1980}
1981
72865317 1982static int
6fe09f8c 1983dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
72865317
BP
1984{
1985 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 1986 struct dp_netdev_flow *netdev_flow;
1c1e46ed 1987 struct dp_netdev_pmd_thread *pmd;
bd5131ba
DDP
1988 unsigned pmd_id = get->pmd_id == PMD_ID_NULL
1989 ? NON_PMD_CORE_ID : get->pmd_id;
70e5ed6f 1990 int error = 0;
8a4e3a85 1991
1c1e46ed
AW
1992 pmd = dp_netdev_get_pmd(dp, pmd_id);
1993 if (!pmd) {
1994 return EINVAL;
1995 }
1996
1997 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
1998 get->key_len);
1763b4b8 1999 if (netdev_flow) {
70e5ed6f 2000 dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
64bb477f 2001 get->flow, false);
70e5ed6f 2002 } else {
5279f8fd 2003 error = ENOENT;
72865317 2004 }
1c1e46ed
AW
2005 dp_netdev_pmd_unref(pmd);
2006
bc4a05c6 2007
5279f8fd 2008 return error;
72865317
BP
2009}
2010
0de8783a 2011static struct dp_netdev_flow *
1c1e46ed
AW
2012dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
2013 struct match *match, const ovs_u128 *ufid,
ae2ceebd 2014 const struct nlattr *actions, size_t actions_len)
1c1e46ed 2015 OVS_REQUIRES(pmd->flow_mutex)
72865317 2016{
0de8783a
JR
2017 struct dp_netdev_flow *flow;
2018 struct netdev_flow_key mask;
ed79f89a 2019
0de8783a
JR
2020 netdev_flow_mask_init(&mask, match);
2021 /* Make sure wc does not have metadata. */
5fcff47b
JR
2022 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
2023 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
679ba04c 2024
0de8783a 2025 /* Do not allocate extra space. */
caeb4906 2026 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
1c1e46ed 2027 memset(&flow->stats, 0, sizeof flow->stats);
0de8783a 2028 flow->dead = false;
11e5cf1f 2029 flow->batch = NULL;
bd5131ba 2030 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
0de8783a 2031 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
70e5ed6f 2032 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
0de8783a 2033 ovs_refcount_init(&flow->ref_cnt);
0de8783a 2034 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
2c0ea78f 2035
0de8783a 2036 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
1c1e46ed 2037 dpcls_insert(&pmd->cls, &flow->cr, &mask);
72865317 2038
4c75aaab
EJ
2039 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
2040 dp_netdev_flow_hash(&flow->ufid));
2041
623540e4 2042 if (OVS_UNLIKELY(VLOG_IS_DBG_ENABLED())) {
0de8783a 2043 struct match match;
623540e4
EJ
2044 struct ds ds = DS_EMPTY_INITIALIZER;
2045
4d8f90b1 2046 match.tun_md.valid = false;
0de8783a
JR
2047 match.flow = flow->flow;
2048 miniflow_expand(&flow->cr.mask->mf, &match.wc.masks);
2049
623540e4 2050 ds_put_cstr(&ds, "flow_add: ");
70e5ed6f
JS
2051 odp_format_ufid(ufid, &ds);
2052 ds_put_cstr(&ds, " ");
0de8783a 2053 match_format(&match, &ds, OFP_DEFAULT_PRIORITY);
623540e4
EJ
2054 ds_put_cstr(&ds, ", actions:");
2055 format_odp_actions(&ds, actions, actions_len);
2056
2057 VLOG_DBG_RL(&upcall_rl, "%s", ds_cstr(&ds));
2058
2059 ds_destroy(&ds);
2060 }
2061
0de8783a 2062 return flow;
72865317
BP
2063}
2064
72865317 2065static int
89625d1e 2066dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
72865317
BP
2067{
2068 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 2069 struct dp_netdev_flow *netdev_flow;
0de8783a 2070 struct netdev_flow_key key;
1c1e46ed 2071 struct dp_netdev_pmd_thread *pmd;
ae2ceebd 2072 struct match match;
70e5ed6f 2073 ovs_u128 ufid;
bd5131ba
DDP
2074 unsigned pmd_id = put->pmd_id == PMD_ID_NULL
2075 ? NON_PMD_CORE_ID : put->pmd_id;
36956a7d
BP
2076 int error;
2077
ae2ceebd 2078 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow);
8c301900
JR
2079 if (error) {
2080 return error;
2081 }
2082 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
2083 put->mask, put->mask_len,
9f861c91 2084 &match.flow, &match.wc);
36956a7d
BP
2085 if (error) {
2086 return error;
2087 }
0de8783a 2088
1c1e46ed
AW
2089 pmd = dp_netdev_get_pmd(dp, pmd_id);
2090 if (!pmd) {
2091 return EINVAL;
2092 }
2093
0de8783a
JR
2094 /* Must produce a netdev_flow_key for lookup.
2095 * This interface is no longer performance critical, since it is not used
2096 * for upcall processing any more. */
2097 netdev_flow_key_from_flow(&key, &match.flow);
72865317 2098
70e5ed6f
JS
2099 if (put->ufid) {
2100 ufid = *put->ufid;
2101 } else {
2102 dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
2103 }
2104
1c1e46ed
AW
2105 ovs_mutex_lock(&pmd->flow_mutex);
2106 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &key);
1763b4b8 2107 if (!netdev_flow) {
89625d1e 2108 if (put->flags & DPIF_FP_CREATE) {
1c1e46ed 2109 if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
89625d1e
BP
2110 if (put->stats) {
2111 memset(put->stats, 0, sizeof *put->stats);
feebdea2 2112 }
1c1e46ed 2113 dp_netdev_flow_add(pmd, &match, &ufid, put->actions,
70e5ed6f 2114 put->actions_len);
0de8783a 2115 error = 0;
72865317 2116 } else {
5279f8fd 2117 error = EFBIG;
72865317
BP
2118 }
2119 } else {
5279f8fd 2120 error = ENOENT;
72865317
BP
2121 }
2122 } else {
2c0ea78f 2123 if (put->flags & DPIF_FP_MODIFY
ae2ceebd 2124 && flow_equal(&match.flow, &netdev_flow->flow)) {
8a4e3a85
BP
2125 struct dp_netdev_actions *new_actions;
2126 struct dp_netdev_actions *old_actions;
2127
2128 new_actions = dp_netdev_actions_create(put->actions,
2129 put->actions_len);
2130
61e7deb1
BP
2131 old_actions = dp_netdev_flow_get_actions(netdev_flow);
2132 ovsrcu_set(&netdev_flow->actions, new_actions);
679ba04c 2133
a84cb64a
BP
2134 if (put->stats) {
2135 get_dpif_flow_stats(netdev_flow, put->stats);
2136 }
2137 if (put->flags & DPIF_FP_ZERO_STATS) {
97447f55
DDP
 2138            /* XXX: The userspace datapath uses thread-local statistics
 2139             * (for flows), which should be updated only by the owning
 2140             * thread.  Since we cannot write to the stats memory here,
 2141             * we choose not to support this flag.  Please note:
 2142             * - This feature is currently used only by dpctl commands with
 2143             *   option --clear.
 2144             * - Should the need arise, this operation can be implemented
 2145             *   by keeping a base value (to be updated here) for each
 2146             *   counter, and subtracting it before outputting the stats. */
2147 error = EOPNOTSUPP;
72865317 2148 }
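            /* Illustrative sketch, not part of the original file: the
             * base-value approach mentioned in the XXX comment above could
             * look roughly like this, assuming a hypothetical 'stats_base'
             * field on the flow:
             *
             *     struct dpif_flow_stats cur;
             *
             *     get_dpif_flow_stats(netdev_flow, &cur);
             *     netdev_flow->stats_base = cur;
             *
             * with get_dpif_flow_stats() then subtracting 'stats_base' from
             * the owning thread's counters before reporting them. */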
8a4e3a85 2149
61e7deb1 2150 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
2c0ea78f 2151 } else if (put->flags & DPIF_FP_CREATE) {
5279f8fd 2152 error = EEXIST;
2c0ea78f
GS
2153 } else {
2154 /* Overlapping flow. */
2155 error = EINVAL;
72865317
BP
2156 }
2157 }
1c1e46ed
AW
2158 ovs_mutex_unlock(&pmd->flow_mutex);
2159 dp_netdev_pmd_unref(pmd);
5279f8fd
BP
2160
2161 return error;
72865317
BP
2162}
2163
72865317 2164static int
b99d3cee 2165dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
72865317
BP
2166{
2167 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 2168 struct dp_netdev_flow *netdev_flow;
1c1e46ed 2169 struct dp_netdev_pmd_thread *pmd;
bd5131ba
DDP
2170 unsigned pmd_id = del->pmd_id == PMD_ID_NULL
2171 ? NON_PMD_CORE_ID : del->pmd_id;
70e5ed6f 2172 int error = 0;
72865317 2173
1c1e46ed
AW
2174 pmd = dp_netdev_get_pmd(dp, pmd_id);
2175 if (!pmd) {
2176 return EINVAL;
2177 }
2178
2179 ovs_mutex_lock(&pmd->flow_mutex);
2180 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
2181 del->key_len);
1763b4b8 2182 if (netdev_flow) {
b99d3cee 2183 if (del->stats) {
1763b4b8 2184 get_dpif_flow_stats(netdev_flow, del->stats);
feebdea2 2185 }
1c1e46ed 2186 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
72865317 2187 } else {
5279f8fd 2188 error = ENOENT;
72865317 2189 }
1c1e46ed
AW
2190 ovs_mutex_unlock(&pmd->flow_mutex);
2191 dp_netdev_pmd_unref(pmd);
5279f8fd
BP
2192
2193 return error;
72865317
BP
2194}
2195
ac64794a
BP
2196struct dpif_netdev_flow_dump {
2197 struct dpif_flow_dump up;
1c1e46ed
AW
2198 struct cmap_position poll_thread_pos;
2199 struct cmap_position flow_pos;
2200 struct dp_netdev_pmd_thread *cur_pmd;
d2ad7ef1
JS
2201 int status;
2202 struct ovs_mutex mutex;
e723fd32
JS
2203};
2204
ac64794a
BP
2205static struct dpif_netdev_flow_dump *
2206dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
72865317 2207{
ac64794a 2208 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
e723fd32
JS
2209}
2210
ac64794a 2211static struct dpif_flow_dump *
64bb477f 2212dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse)
e723fd32 2213{
ac64794a 2214 struct dpif_netdev_flow_dump *dump;
e723fd32 2215
1c1e46ed 2216 dump = xzalloc(sizeof *dump);
ac64794a 2217 dpif_flow_dump_init(&dump->up, dpif_);
64bb477f 2218 dump->up.terse = terse;
ac64794a
BP
2219 ovs_mutex_init(&dump->mutex);
2220
2221 return &dump->up;
e723fd32
JS
2222}
2223
2224static int
ac64794a 2225dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
e723fd32 2226{
ac64794a 2227 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
e723fd32 2228
ac64794a
BP
2229 ovs_mutex_destroy(&dump->mutex);
2230 free(dump);
704a1e09
BP
2231 return 0;
2232}
2233
ac64794a
BP
2234struct dpif_netdev_flow_dump_thread {
2235 struct dpif_flow_dump_thread up;
2236 struct dpif_netdev_flow_dump *dump;
8bb113da
RW
2237 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
2238 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
ac64794a
BP
2239};
2240
2241static struct dpif_netdev_flow_dump_thread *
2242dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
2243{
2244 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
2245}
2246
2247static struct dpif_flow_dump_thread *
2248dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
2249{
2250 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
2251 struct dpif_netdev_flow_dump_thread *thread;
2252
2253 thread = xmalloc(sizeof *thread);
2254 dpif_flow_dump_thread_init(&thread->up, &dump->up);
2255 thread->dump = dump;
2256 return &thread->up;
2257}
2258
2259static void
2260dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
2261{
2262 struct dpif_netdev_flow_dump_thread *thread
2263 = dpif_netdev_flow_dump_thread_cast(thread_);
2264
2265 free(thread);
2266}
2267
704a1e09 2268static int
ac64794a 2269dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
8bb113da 2270 struct dpif_flow *flows, int max_flows)
ac64794a
BP
2271{
2272 struct dpif_netdev_flow_dump_thread *thread
2273 = dpif_netdev_flow_dump_thread_cast(thread_);
2274 struct dpif_netdev_flow_dump *dump = thread->dump;
8bb113da 2275 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
8bb113da
RW
2276 int n_flows = 0;
2277 int i;
14608a15 2278
ac64794a 2279 ovs_mutex_lock(&dump->mutex);
8bb113da 2280 if (!dump->status) {
1c1e46ed
AW
2281 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
2282 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
2283 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
2284 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
2285
2286 /* First call to dump_next(), extracts the first pmd thread.
2287 * If there is no pmd thread, returns immediately. */
2288 if (!pmd) {
2289 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2290 if (!pmd) {
2291 ovs_mutex_unlock(&dump->mutex);
2292 return n_flows;
8bb113da 2293
8bb113da 2294 }
d2ad7ef1 2295 }
1c1e46ed
AW
2296
2297 do {
2298 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
2299 struct cmap_node *node;
2300
2301 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
2302 if (!node) {
2303 break;
2304 }
2305 netdev_flows[n_flows] = CONTAINER_OF(node,
2306 struct dp_netdev_flow,
2307 node);
2308 }
 2309            /* When finished dumping the current pmd thread, moves to
 2310             * the next. */
2311 if (n_flows < flow_limit) {
2312 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
2313 dp_netdev_pmd_unref(pmd);
2314 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2315 if (!pmd) {
2316 dump->status = EOF;
2317 break;
2318 }
2319 }
 2320            /* Keeps the reference for the next caller. */
2321 dump->cur_pmd = pmd;
2322
2323 /* If the current dump is empty, do not exit the loop, since the
2324 * remaining pmds could have flows to be dumped. Just dumps again
2325 * on the new 'pmd'. */
2326 } while (!n_flows);
8a4e3a85 2327 }
ac64794a 2328 ovs_mutex_unlock(&dump->mutex);
ac64794a 2329
8bb113da
RW
2330 for (i = 0; i < n_flows; i++) {
2331 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
2332 struct odputil_keybuf *keybuf = &thread->keybuf[i];
2333 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
2334 struct dpif_flow *f = &flows[i];
7af12bd7 2335 struct ofpbuf key, mask;
8bb113da 2336
7af12bd7
JS
2337 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
2338 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
64bb477f
JS
2339 dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
2340 dump->up.terse);
8bb113da 2341 }
feebdea2 2342
8bb113da 2343 return n_flows;
72865317
BP
2344}
2345
2346static int
758c456d 2347dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
65f13b50 2348 OVS_NO_THREAD_SAFETY_ANALYSIS
72865317
BP
2349{
2350 struct dp_netdev *dp = get_dp_netdev(dpif);
65f13b50 2351 struct dp_netdev_pmd_thread *pmd;
cf62fa4c 2352 struct dp_packet *pp;
72865317 2353
cf62fa4c
PS
2354 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
2355 dp_packet_size(execute->packet) > UINT16_MAX) {
72865317
BP
2356 return EINVAL;
2357 }
2358
65f13b50
AW
2359 /* Tries finding the 'pmd'. If NULL is returned, that means
2360 * the current thread is a non-pmd thread and should use
b19befae 2361 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
65f13b50
AW
2362 pmd = ovsthread_getspecific(dp->per_pmd_key);
2363 if (!pmd) {
b19befae 2364 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
65f13b50
AW
2365 }
2366
 2367    /* If the current thread is a non-pmd thread, acquires
2368 * the 'non_pmd_mutex'. */
2369 if (pmd->core_id == NON_PMD_CORE_ID) {
2370 ovs_mutex_lock(&dp->non_pmd_mutex);
433330a8 2371 ovs_mutex_lock(&dp->port_mutex);
65f13b50 2372 }
1c1e46ed 2373
cf62fa4c 2374 pp = execute->packet;
41ccaa24 2375 dp_netdev_execute_actions(pmd, &pp, 1, false, execute->actions,
9bbf1c3d 2376 execute->actions_len);
65f13b50 2377 if (pmd->core_id == NON_PMD_CORE_ID) {
1c1e46ed 2378 dp_netdev_pmd_unref(pmd);
433330a8 2379 ovs_mutex_unlock(&dp->port_mutex);
65f13b50
AW
2380 ovs_mutex_unlock(&dp->non_pmd_mutex);
2381 }
8a4e3a85 2382
758c456d 2383 return 0;
72865317
BP
2384}
2385
1a0c894a
BP
2386static void
2387dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
2388{
2389 size_t i;
2390
2391 for (i = 0; i < n_ops; i++) {
2392 struct dpif_op *op = ops[i];
2393
2394 switch (op->type) {
2395 case DPIF_OP_FLOW_PUT:
2396 op->error = dpif_netdev_flow_put(dpif, &op->u.flow_put);
2397 break;
2398
2399 case DPIF_OP_FLOW_DEL:
2400 op->error = dpif_netdev_flow_del(dpif, &op->u.flow_del);
2401 break;
2402
2403 case DPIF_OP_EXECUTE:
2404 op->error = dpif_netdev_execute(dpif, &op->u.execute);
2405 break;
6fe09f8c
JS
2406
2407 case DPIF_OP_FLOW_GET:
2408 op->error = dpif_netdev_flow_get(dpif, &op->u.flow_get);
2409 break;
1a0c894a
BP
2410 }
2411 }
2412}
2413
f2eee189
AW
2414/* Returns true if the configuration for rx queues or cpu mask
 2415 * has changed. */
2416static bool
a14b8947 2417pmd_config_changed(const struct dp_netdev *dp, const char *cmask)
f2eee189 2418{
a14b8947
IM
2419 struct dp_netdev_port *port;
2420
2421 CMAP_FOR_EACH (port, node, &dp->ports) {
2422 struct netdev *netdev = port->netdev;
2423 int requested_n_rxq = netdev_requested_n_rxq(netdev);
2424 if (netdev_is_pmd(netdev)
2425 && port->latest_requested_n_rxq != requested_n_rxq) {
2426 return true;
f2eee189
AW
2427 }
2428 }
a14b8947
IM
2429
2430 if (dp->pmd_cmask != NULL && cmask != NULL) {
2431 return strcmp(dp->pmd_cmask, cmask);
2432 } else {
2433 return (dp->pmd_cmask != NULL || cmask != NULL);
2434 }
f2eee189
AW
2435}
2436
2437/* Resets pmd threads if the configuration for 'rxq's or cpu mask changes. */
2438static int
a14b8947 2439dpif_netdev_pmd_set(struct dpif *dpif, const char *cmask)
f2eee189
AW
2440{
2441 struct dp_netdev *dp = get_dp_netdev(dpif);
2442
a14b8947 2443 if (pmd_config_changed(dp, cmask)) {
f2eee189
AW
2444 struct dp_netdev_port *port;
2445
2446 dp_netdev_destroy_all_pmds(dp);
2447
2448 CMAP_FOR_EACH (port, node, &dp->ports) {
a14b8947
IM
2449 struct netdev *netdev = port->netdev;
2450 int requested_n_rxq = netdev_requested_n_rxq(netdev);
2451 if (netdev_is_pmd(port->netdev)
2452 && port->latest_requested_n_rxq != requested_n_rxq) {
f2eee189
AW
2453 int i, err;
2454
2455 /* Closes the existing 'rxq's. */
2456 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
2457 netdev_rxq_close(port->rxq[i]);
2458 port->rxq[i] = NULL;
2459 }
2460
2461 /* Sets the new rx queue config. */
3bcc10c0
DDP
2462 err = netdev_set_multiq(port->netdev,
2463 ovs_numa_get_n_cores() + 1,
a14b8947 2464 requested_n_rxq);
7251515e 2465 if (err && (err != EOPNOTSUPP)) {
f2eee189
AW
2466 VLOG_ERR("Failed to set dpdk interface %s rx_queue to:"
2467 " %u", netdev_get_name(port->netdev),
a14b8947 2468 requested_n_rxq);
f2eee189
AW
2469 return err;
2470 }
a14b8947 2471 port->latest_requested_n_rxq = requested_n_rxq;
f2eee189
AW
2472 /* If the set_multiq() above succeeds, reopens the 'rxq's. */
2473 port->rxq = xrealloc(port->rxq, sizeof *port->rxq
2474 * netdev_n_rxq(port->netdev));
2475 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
2476 netdev_rxq_open(port->netdev, &port->rxq[i], i);
2477 }
2478 }
2479 }
f2eee189
AW
2480 /* Reconfigures the cpu mask. */
2481 ovs_numa_set_cpu_mask(cmask);
2482 free(dp->pmd_cmask);
2483 dp->pmd_cmask = cmask ? xstrdup(cmask) : NULL;
2484
2485 /* Restores the non-pmd. */
2486 dp_netdev_set_nonpmd(dp);
2487 /* Restores all pmd threads. */
2488 dp_netdev_reset_pmd_threads(dp);
2489 }
2490
2491 return 0;
2492}
2493
5bf93d67
EJ
2494static int
2495dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
2496 uint32_t queue_id, uint32_t *priority)
2497{
2498 *priority = queue_id;
2499 return 0;
2500}
2501
72865317 2502\f
9ff55ae2
DDP
2503/* Creates and returns a new 'struct dp_netdev_actions', whose actions are
2504 * a copy of the 'ofpacts_len' bytes of 'ofpacts'. */
a84cb64a
BP
2505struct dp_netdev_actions *
2506dp_netdev_actions_create(const struct nlattr *actions, size_t size)
2507{
2508 struct dp_netdev_actions *netdev_actions;
2509
9ff55ae2
DDP
2510 netdev_actions = xmalloc(sizeof *netdev_actions + size);
2511 memcpy(netdev_actions->actions, actions, size);
a84cb64a
BP
2512 netdev_actions->size = size;
2513
2514 return netdev_actions;
2515}
2516
a84cb64a 2517struct dp_netdev_actions *
61e7deb1 2518dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
a84cb64a 2519{
61e7deb1 2520 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
a84cb64a
BP
2521}
2522
61e7deb1
BP
2523static void
2524dp_netdev_actions_free(struct dp_netdev_actions *actions)
a84cb64a 2525{
61e7deb1 2526 free(actions);
a84cb64a
BP
2527}
2528\f
55e3ca97
DDP
2529static inline unsigned long long
2530cycles_counter(void)
2531{
2532#ifdef DPDK_NETDEV
2533 return rte_get_tsc_cycles();
2534#else
2535 return 0;
2536#endif
2537}
2538
2539/* Fake mutex to make sure that the calls to cycles_count_* are balanced */
2540extern struct ovs_mutex cycles_counter_fake_mutex;
2541
2542/* Start counting cycles. Must be followed by 'cycles_count_end()' */
2543static inline void
2544cycles_count_start(struct dp_netdev_pmd_thread *pmd)
2545 OVS_ACQUIRES(&cycles_counter_fake_mutex)
2546 OVS_NO_THREAD_SAFETY_ANALYSIS
2547{
2548 pmd->last_cycles = cycles_counter();
2549}
2550
2551/* Stop counting cycles and add them to the counter 'type' */
2552static inline void
2553cycles_count_end(struct dp_netdev_pmd_thread *pmd,
2554 enum pmd_cycles_counter_type type)
2555 OVS_RELEASES(&cycles_counter_fake_mutex)
2556 OVS_NO_THREAD_SAFETY_ANALYSIS
2557{
2558 unsigned long long interval = cycles_counter() - pmd->last_cycles;
2559
2560 non_atomic_ullong_add(&pmd->cycles.n[type], interval);
2561}
e4cfed38 2562
5794e276 2563static void
65f13b50 2564dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
9bbf1c3d
DDP
2565 struct dp_netdev_port *port,
2566 struct netdev_rxq *rxq)
e4cfed38 2567{
cd159f1a 2568 struct dp_packet *packets[NETDEV_MAX_BURST];
8cbf4f47 2569 int error, cnt;
e4cfed38 2570
55e3ca97 2571 cycles_count_start(pmd);
8cbf4f47 2572 error = netdev_rxq_recv(rxq, packets, &cnt);
55e3ca97 2573 cycles_count_end(pmd, PMD_CYCLES_POLLING);
e4cfed38 2574 if (!error) {
3c33f0ff 2575 *recirc_depth_get() = 0;
41ccaa24 2576
55e3ca97 2577 cycles_count_start(pmd);
a90ed026 2578 dp_netdev_input(pmd, packets, cnt, port->port_no);
55e3ca97 2579 cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
e4cfed38 2580 } else if (error != EAGAIN && error != EOPNOTSUPP) {
3c33f0ff 2581 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
e4cfed38
PS
2582
2583 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
3c33f0ff 2584 netdev_get_name(port->netdev), ovs_strerror(error));
e4cfed38
PS
2585 }
2586}
2587
a36de779
PS
 2588/* Returns true if the datapath flows need to be revalidated. */
2589static bool
e4cfed38
PS
2590dpif_netdev_run(struct dpif *dpif)
2591{
2592 struct dp_netdev_port *port;
2593 struct dp_netdev *dp = get_dp_netdev(dpif);
b19befae
AW
2594 struct dp_netdev_pmd_thread *non_pmd = dp_netdev_get_pmd(dp,
2595 NON_PMD_CORE_ID);
a36de779 2596 uint64_t new_tnl_seq;
e4cfed38 2597
65f13b50 2598 ovs_mutex_lock(&dp->non_pmd_mutex);
a532e683 2599 CMAP_FOR_EACH (port, node, &dp->ports) {
55c955bd
PS
2600 if (!netdev_is_pmd(port->netdev)) {
2601 int i;
2602
2603 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
65f13b50 2604 dp_netdev_process_rxq_port(non_pmd, port, port->rxq[i]);
55c955bd 2605 }
e4cfed38
PS
2606 }
2607 }
65f13b50 2608 ovs_mutex_unlock(&dp->non_pmd_mutex);
1c1e46ed
AW
2609 dp_netdev_pmd_unref(non_pmd);
2610
53902038 2611 tnl_neigh_cache_run();
7f9b8504 2612 tnl_port_map_run();
a36de779
PS
2613 new_tnl_seq = seq_read(tnl_conf_seq);
2614
2615 if (dp->last_tnl_conf_seq != new_tnl_seq) {
2616 dp->last_tnl_conf_seq = new_tnl_seq;
2617 return true;
2618 }
2619 return false;
e4cfed38
PS
2620}
2621
2622static void
2623dpif_netdev_wait(struct dpif *dpif)
2624{
2625 struct dp_netdev_port *port;
2626 struct dp_netdev *dp = get_dp_netdev(dpif);
2627
59e6d833 2628 ovs_mutex_lock(&dp_netdev_mutex);
a532e683 2629 CMAP_FOR_EACH (port, node, &dp->ports) {
55c955bd
PS
2630 if (!netdev_is_pmd(port->netdev)) {
2631 int i;
2632
2633 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
2634 netdev_rxq_wait(port->rxq[i]);
2635 }
e4cfed38
PS
2636 }
2637 }
59e6d833 2638 ovs_mutex_unlock(&dp_netdev_mutex);
a36de779 2639 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
e4cfed38
PS
2640}
2641
e4cfed38 2642static int
65f13b50 2643pmd_load_queues(struct dp_netdev_pmd_thread *pmd,
f7791740 2644 struct rxq_poll **ppoll_list, int poll_cnt)
ae7ad0a1 2645 OVS_REQUIRES(pmd->poll_mutex)
e4cfed38 2646{
f7791740 2647 struct rxq_poll *poll_list = *ppoll_list;
ae7ad0a1
IM
2648 struct rxq_poll *poll;
2649 int i;
e4cfed38 2650
e4cfed38 2651 for (i = 0; i < poll_cnt; i++) {
65f13b50 2652 port_unref(poll_list[i].port);
e4cfed38
PS
2653 }
2654
ae7ad0a1 2655 poll_list = xrealloc(poll_list, pmd->poll_cnt * sizeof *poll_list);
a1fdee13 2656
ae7ad0a1
IM
2657 i = 0;
2658 LIST_FOR_EACH (poll, node, &pmd->poll_list) {
2659 port_ref(poll->port);
2660 poll_list[i++] = *poll;
e4cfed38
PS
2661 }
2662
e4cfed38 2663 *ppoll_list = poll_list;
ae7ad0a1 2664 return pmd->poll_cnt;
e4cfed38
PS
2665}
2666
6c3eee82 2667static void *
e4cfed38 2668pmd_thread_main(void *f_)
6c3eee82 2669{
65f13b50 2670 struct dp_netdev_pmd_thread *pmd = f_;
e4cfed38 2671 unsigned int lc = 0;
f7791740 2672 struct rxq_poll *poll_list;
84067a4c 2673 unsigned int port_seq = PMD_INITIAL_SEQ;
e4cfed38
PS
2674 int poll_cnt;
2675 int i;
6c3eee82 2676
e4cfed38
PS
2677 poll_cnt = 0;
2678 poll_list = NULL;
2679
65f13b50
AW
2680 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
2681 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
2682 pmd_thread_setaffinity_cpu(pmd->core_id);
e4cfed38 2683reload:
65f13b50 2684 emc_cache_init(&pmd->flow_cache);
ae7ad0a1
IM
2685
2686 ovs_mutex_lock(&pmd->poll_mutex);
65f13b50 2687 poll_cnt = pmd_load_queues(pmd, &poll_list, poll_cnt);
ae7ad0a1 2688 ovs_mutex_unlock(&pmd->poll_mutex);
6c3eee82 2689
7dd671f0
MK
2690 /* List port/core affinity */
2691 for (i = 0; i < poll_cnt; i++) {
ce179f11
IM
2692 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
2693 pmd->core_id, netdev_get_name(poll_list[i].port->netdev),
2694 netdev_rxq_get_queue_id(poll_list[i].rx));
7dd671f0
MK
2695 }
2696
accf8626
AW
2697 /* Signal here to make sure the pmd finishes
2698 * reloading the updated configuration. */
2699 dp_netdev_pmd_reload_done(pmd);
2700
e4cfed38 2701 for (;;) {
e4cfed38 2702 for (i = 0; i < poll_cnt; i++) {
65f13b50 2703 dp_netdev_process_rxq_port(pmd, poll_list[i].port, poll_list[i].rx);
e4cfed38
PS
2704 }
2705
2706 if (lc++ > 1024) {
84067a4c 2707 unsigned int seq;
6c3eee82 2708
e4cfed38 2709 lc = 0;
84067a4c 2710
67ad54cb 2711 emc_cache_slow_sweep(&pmd->flow_cache);
fbe0962b 2712 coverage_try_clear();
84067a4c
JR
2713 ovsrcu_quiesce();
2714
65f13b50 2715 atomic_read_relaxed(&pmd->change_seq, &seq);
84067a4c
JR
2716 if (seq != port_seq) {
2717 port_seq = seq;
6c3eee82
BP
2718 break;
2719 }
2720 }
e4cfed38 2721 }
6c3eee82 2722
65f13b50 2723 emc_cache_uninit(&pmd->flow_cache);
9bbf1c3d 2724
65f13b50 2725    if (!latch_is_set(&pmd->exit_latch)) {
e4cfed38
PS
2726 goto reload;
2727 }
6c3eee82 2728
e4cfed38 2729 for (i = 0; i < poll_cnt; i++) {
ae7ad0a1 2730 port_unref(poll_list[i].port);
6c3eee82 2731 }
6c3eee82 2732
accf8626
AW
2733 dp_netdev_pmd_reload_done(pmd);
2734
e4cfed38 2735 free(poll_list);
6c3eee82
BP
2736 return NULL;
2737}
2738
6b31e073
RW
2739static void
2740dp_netdev_disable_upcall(struct dp_netdev *dp)
2741 OVS_ACQUIRES(dp->upcall_rwlock)
2742{
2743 fat_rwlock_wrlock(&dp->upcall_rwlock);
2744}
2745
2746static void
2747dpif_netdev_disable_upcall(struct dpif *dpif)
2748 OVS_NO_THREAD_SAFETY_ANALYSIS
2749{
2750 struct dp_netdev *dp = get_dp_netdev(dpif);
2751 dp_netdev_disable_upcall(dp);
2752}
2753
2754static void
2755dp_netdev_enable_upcall(struct dp_netdev *dp)
2756 OVS_RELEASES(dp->upcall_rwlock)
2757{
2758 fat_rwlock_unlock(&dp->upcall_rwlock);
2759}
2760
2761static void
2762dpif_netdev_enable_upcall(struct dpif *dpif)
2763 OVS_NO_THREAD_SAFETY_ANALYSIS
2764{
2765 struct dp_netdev *dp = get_dp_netdev(dpif);
2766 dp_netdev_enable_upcall(dp);
2767}
2768
ae7ad0a1 2769static void
accf8626
AW
2770dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
2771{
2772 ovs_mutex_lock(&pmd->cond_mutex);
2773 xpthread_cond_signal(&pmd->cond);
2774 ovs_mutex_unlock(&pmd->cond_mutex);
2775}
2776
1c1e46ed
AW
 2777/* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
 2778 * the pointer if it succeeds, otherwise NULL.
 2779 *
 2780 * The caller must unref the returned reference. */
65f13b50 2781static struct dp_netdev_pmd_thread *
bd5131ba 2782dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
65f13b50
AW
2783{
2784 struct dp_netdev_pmd_thread *pmd;
55847abe 2785 const struct cmap_node *pnode;
65f13b50 2786
b19befae 2787 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
1c1e46ed
AW
2788 if (!pnode) {
2789 return NULL;
2790 }
65f13b50
AW
2791 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
2792
1c1e46ed 2793 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
65f13b50
AW
2794}
2795
f2eee189
AW
2796/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
2797static void
2798dp_netdev_set_nonpmd(struct dp_netdev *dp)
2799{
2800 struct dp_netdev_pmd_thread *non_pmd;
2801
2802 non_pmd = xzalloc(sizeof *non_pmd);
2803 dp_netdev_configure_pmd(non_pmd, dp, 0, NON_PMD_CORE_ID,
2804 OVS_NUMA_UNSPEC);
2805}
2806
1c1e46ed
AW
2807/* Caller must have valid pointer to 'pmd'. */
2808static bool
2809dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
2810{
2811 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
2812}
2813
2814static void
2815dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
2816{
2817 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
2818 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
2819 }
2820}
2821
 2822/* Given cmap position 'pos', tries to ref the next node. If try_ref()
 2823 * fails, keeps checking the next node until reaching the end of the cmap.
 2824 *
 2825 * The caller must unref the returned reference. */
2826static struct dp_netdev_pmd_thread *
2827dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
2828{
2829 struct dp_netdev_pmd_thread *next;
2830
2831 do {
2832 struct cmap_node *node;
2833
2834 node = cmap_next_position(&dp->poll_threads, pos);
2835 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
2836 : NULL;
2837 } while (next && !dp_netdev_pmd_try_ref(next));
2838
2839 return next;
2840}
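
/* Illustrative sketch, not part of the original file: iterating over all pmd
 * threads with dp_netdev_pmd_get_next().  The cmap position starts out
 * zeroed, and each returned reference must be dropped with
 * dp_netdev_pmd_unref() once the caller is done with it (the flow dumper
 * above follows the same pattern). */
static void
for_each_pmd_sketch(struct dp_netdev *dp)
{
    struct cmap_position pos;
    struct dp_netdev_pmd_thread *pmd;

    memset(&pos, 0, sizeof pos);
    while ((pmd = dp_netdev_pmd_get_next(dp, &pos)) != NULL) {
        /* ... use 'pmd' ... */
        dp_netdev_pmd_unref(pmd);
    }
}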
2841
65f13b50 2842/* Configures the 'pmd' based on the input argument. */
6c3eee82 2843static void
65f13b50 2844dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
bd5131ba 2845 int index, unsigned core_id, int numa_id)
65f13b50
AW
2846{
2847 pmd->dp = dp;
2848 pmd->index = index;
2849 pmd->core_id = core_id;
2850 pmd->numa_id = numa_id;
ae7ad0a1 2851 pmd->poll_cnt = 0;
1c1e46ed 2852
347ba9bb
IM
2853 atomic_init(&pmd->tx_qid,
2854 (core_id == NON_PMD_CORE_ID)
2855 ? ovs_numa_get_n_cores()
2856 : get_n_pmd_threads(dp));
2857
1c1e46ed 2858 ovs_refcount_init(&pmd->ref_cnt);
65f13b50
AW
2859 latch_init(&pmd->exit_latch);
2860 atomic_init(&pmd->change_seq, PMD_INITIAL_SEQ);
accf8626
AW
2861 xpthread_cond_init(&pmd->cond, NULL);
2862 ovs_mutex_init(&pmd->cond_mutex);
1c1e46ed 2863 ovs_mutex_init(&pmd->flow_mutex);
ae7ad0a1 2864 ovs_mutex_init(&pmd->poll_mutex);
1c1e46ed
AW
2865 dpcls_init(&pmd->cls);
2866 cmap_init(&pmd->flow_table);
ae7ad0a1 2867 list_init(&pmd->poll_list);
65f13b50
AW
2868 /* init the 'flow_cache' since there is no
2869 * actual thread created for NON_PMD_CORE_ID. */
2870 if (core_id == NON_PMD_CORE_ID) {
2871 emc_cache_init(&pmd->flow_cache);
2872 }
2873 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
2874 hash_int(core_id, 0));
2875}
2876
1c1e46ed
AW
2877static void
2878dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
2879{
2880 dp_netdev_pmd_flow_flush(pmd);
2881 dpcls_destroy(&pmd->cls);
2882 cmap_destroy(&pmd->flow_table);
2883 ovs_mutex_destroy(&pmd->flow_mutex);
2884 latch_destroy(&pmd->exit_latch);
2885 xpthread_cond_destroy(&pmd->cond);
2886 ovs_mutex_destroy(&pmd->cond_mutex);
ae7ad0a1 2887 ovs_mutex_destroy(&pmd->poll_mutex);
1c1e46ed
AW
2888 free(pmd);
2889}
2890
2891/* Stops the pmd thread, removes it from the 'dp->poll_threads',
2892 * and unrefs the struct. */
65f13b50 2893static void
e4e74c3a 2894dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
6c3eee82 2895{
65f13b50 2896 /* Uninit the 'flow_cache' since there is
1c1e46ed 2897     * no actual thread to uninit it for NON_PMD_CORE_ID. */
65f13b50
AW
2898 if (pmd->core_id == NON_PMD_CORE_ID) {
2899 emc_cache_uninit(&pmd->flow_cache);
2900 } else {
2901 latch_set(&pmd->exit_latch);
2902 dp_netdev_reload_pmd__(pmd);
2903 ovs_numa_unpin_core(pmd->core_id);
2904 xpthread_join(pmd->thread, NULL);
2905 }
ae7ad0a1
IM
2906
2907 /* Unref all ports and free poll_list. */
cc245ce8 2908 dp_netdev_pmd_clear_poll_list(pmd);
ae7ad0a1 2909
e4e74c3a
AW
2910 /* Purges the 'pmd''s flows after stopping the thread, but before
2911 * destroying the flows, so that the flow stats can be collected. */
2912 if (dp->dp_purge_cb) {
2913 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
2914 }
65f13b50 2915 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
1c1e46ed 2916 dp_netdev_pmd_unref(pmd);
65f13b50 2917}
6c3eee82 2918
65f13b50
AW
2919/* Destroys all pmd threads. */
2920static void
2921dp_netdev_destroy_all_pmds(struct dp_netdev *dp)
2922{
2923 struct dp_netdev_pmd_thread *pmd;
d916785c
DDP
2924 struct dp_netdev_pmd_thread **pmd_list;
2925 size_t k = 0, n_pmds;
2926
2927 n_pmds = cmap_count(&dp->poll_threads);
2928 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
65f13b50
AW
2929
2930 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
d916785c
DDP
2931 /* We cannot call dp_netdev_del_pmd(), since it alters
2932 * 'dp->poll_threads' (while we're iterating it) and it
2933 * might quiesce. */
2934 ovs_assert(k < n_pmds);
2935 pmd_list[k++] = pmd;
6c3eee82 2936 }
d916785c
DDP
2937
2938 for (size_t i = 0; i < k; i++) {
2939 dp_netdev_del_pmd(dp, pmd_list[i]);
2940 }
2941 free(pmd_list);
65f13b50 2942}
6c3eee82 2943
347ba9bb
IM
2944/* Deletes all pmd threads on numa node 'numa_id' and
2945 * fixes tx_qids of other threads to keep them sequential. */
65f13b50
AW
2946static void
2947dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id)
2948{
2949 struct dp_netdev_pmd_thread *pmd;
347ba9bb
IM
2950 int n_pmds_on_numa, n_pmds;
2951 int *free_idx, k = 0;
d916785c 2952 struct dp_netdev_pmd_thread **pmd_list;
347ba9bb
IM
2953
2954 n_pmds_on_numa = get_n_pmd_threads_on_numa(dp, numa_id);
d916785c
DDP
2955 free_idx = xcalloc(n_pmds_on_numa, sizeof *free_idx);
2956 pmd_list = xcalloc(n_pmds_on_numa, sizeof *pmd_list);
6c3eee82 2957
65f13b50 2958 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
d916785c
DDP
2959 /* We cannot call dp_netdev_del_pmd(), since it alters
2960 * 'dp->poll_threads' (while we're iterating it) and it
2961 * might quiesce. */
65f13b50 2962 if (pmd->numa_id == numa_id) {
347ba9bb 2963 atomic_read_relaxed(&pmd->tx_qid, &free_idx[k]);
d916785c
DDP
2964 pmd_list[k] = pmd;
2965 ovs_assert(k < n_pmds_on_numa);
347ba9bb 2966 k++;
65f13b50 2967 }
6c3eee82 2968 }
347ba9bb 2969
d916785c
DDP
2970 for (int i = 0; i < k; i++) {
2971 dp_netdev_del_pmd(dp, pmd_list[i]);
2972 }
2973
347ba9bb
IM
2974 n_pmds = get_n_pmd_threads(dp);
2975 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2976 int old_tx_qid;
2977
2978 atomic_read_relaxed(&pmd->tx_qid, &old_tx_qid);
2979
2980 if (old_tx_qid >= n_pmds) {
2981 int new_tx_qid = free_idx[--k];
2982
2983 atomic_store_relaxed(&pmd->tx_qid, new_tx_qid);
2984 }
2985 }
2986
d916785c 2987 free(pmd_list);
347ba9bb 2988 free(free_idx);
65f13b50 2989}
6c3eee82 2990
cc245ce8
IM
2991/* Deletes all rx queues from pmd->poll_list. */
2992static void
2993dp_netdev_pmd_clear_poll_list(struct dp_netdev_pmd_thread *pmd)
2994{
2995 struct rxq_poll *poll;
2996
2997 ovs_mutex_lock(&pmd->poll_mutex);
2998 LIST_FOR_EACH_POP (poll, node, &pmd->poll_list) {
2999 port_unref(poll->port);
3000 free(poll);
3001 }
3002 pmd->poll_cnt = 0;
3003 ovs_mutex_unlock(&pmd->poll_mutex);
3004}
3005
3006/* Deletes all rx queues of 'port' from poll_list of pmd thread and
3007 * reloads it if poll_list was changed. */
3008static void
3009dp_netdev_del_port_from_pmd(struct dp_netdev_port *port,
3010 struct dp_netdev_pmd_thread *pmd)
3011{
3012 struct rxq_poll *poll, *next;
3013 bool found = false;
3014
3015 ovs_mutex_lock(&pmd->poll_mutex);
3016 LIST_FOR_EACH_SAFE (poll, next, node, &pmd->poll_list) {
3017 if (poll->port == port) {
3018 found = true;
3019 port_unref(poll->port);
3020 list_remove(&poll->node);
3021 pmd->poll_cnt--;
3022 free(poll);
3023 }
3024 }
3025 ovs_mutex_unlock(&pmd->poll_mutex);
3026 if (found) {
3027 dp_netdev_reload_pmd__(pmd);
3028 }
3029}
3030
3031/* Deletes all rx queues of 'port' from all pmd threads of dp and
3032 * reloads them if needed. */
3033static void
3034dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
3035 struct dp_netdev_port *port)
3036{
3037 int numa_id = netdev_get_numa_id(port->netdev);
3038 struct dp_netdev_pmd_thread *pmd;
3039
3040 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3041 if (pmd->numa_id == numa_id) {
3042 dp_netdev_del_port_from_pmd(port, pmd);
3043 }
3044 }
3045}
3046
ae7ad0a1
IM
 3047/* Returns the PMD thread on this numa node with the fewest rx queues to poll.
 3048 * Returns NULL if there are no PMD threads on this numa node.
 3049 * Can be called safely only by the main thread. */
3050static struct dp_netdev_pmd_thread *
3051dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id)
3052{
3053 int min_cnt = -1;
3054 struct dp_netdev_pmd_thread *pmd, *res = NULL;
3055
3056 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3057 if (pmd->numa_id == numa_id
3058 && (min_cnt > pmd->poll_cnt || res == NULL)) {
3059 min_cnt = pmd->poll_cnt;
3060 res = pmd;
3061 }
3062 }
3063
3064 return res;
3065}
3066
3067/* Adds rx queue to poll_list of PMD thread. */
3068static void
3069dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
3070 struct dp_netdev_port *port, struct netdev_rxq *rx)
3071 OVS_REQUIRES(pmd->poll_mutex)
3072{
3073 struct rxq_poll *poll = xmalloc(sizeof *poll);
3074
3075 port_ref(port);
3076 poll->port = port;
3077 poll->rx = rx;
3078
3079 list_push_back(&pmd->poll_list, &poll->node);
3080 pmd->poll_cnt++;
3081}
3082
cc245ce8
IM
 3083/* Distributes all rx queues of 'port' among the PMD threads on its numa node
 3084 * and reloads them if needed. */
3085static void
3086dp_netdev_add_port_to_pmds(struct dp_netdev *dp, struct dp_netdev_port *port)
3087{
3088 int numa_id = netdev_get_numa_id(port->netdev);
3089 struct dp_netdev_pmd_thread *pmd;
3090 struct hmapx to_reload;
3091 struct hmapx_node *node;
3092 int i;
3093
3094 hmapx_init(&to_reload);
3095 /* Cannot create pmd threads for invalid numa node. */
3096 ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
3097
3098 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
3099 pmd = dp_netdev_less_loaded_pmd_on_numa(dp, numa_id);
3100 if (!pmd) {
 3101            /* There are no pmd threads on this numa node. */
3102 dp_netdev_set_pmds_on_numa(dp, numa_id);
3103 /* Assigning of rx queues done. */
3104 break;
3105 }
3106
3107 ovs_mutex_lock(&pmd->poll_mutex);
3108 dp_netdev_add_rxq_to_pmd(pmd, port, port->rxq[i]);
3109 ovs_mutex_unlock(&pmd->poll_mutex);
3110
3111 hmapx_add(&to_reload, pmd);
3112 }
3113
3114 HMAPX_FOR_EACH (node, &to_reload) {
3115 pmd = (struct dp_netdev_pmd_thread *) node->data;
3116 dp_netdev_reload_pmd__(pmd);
3117 }
3118
3119 hmapx_destroy(&to_reload);
3120}
3121
65f13b50
AW
3122/* Checks the numa node id of 'netdev' and starts pmd threads for
3123 * the numa node. */
3124static void
3125dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
3126{
3127 int n_pmds;
e4cfed38 3128
65f13b50
AW
3129 if (!ovs_numa_numa_id_is_valid(numa_id)) {
3130 VLOG_ERR("Cannot create pmd threads due to numa id (%d)"
3131 "invalid", numa_id);
3132 return ;
3133 }
3134
3135 n_pmds = get_n_pmd_threads_on_numa(dp, numa_id);
3136
 3137    /* If there are already pmd threads created for the numa node
 3138     * on which 'netdev' is, do nothing.  Otherwise, creates the
 3139     * pmd threads for the numa node. */
3140 if (!n_pmds) {
ae7ad0a1 3141 int can_have, n_unpinned, i, index = 0;
2aca813c 3142 struct dp_netdev_pmd_thread **pmds;
ae7ad0a1 3143 struct dp_netdev_port *port;
65f13b50
AW
3144
3145 n_unpinned = ovs_numa_get_n_unpinned_cores_on_numa(numa_id);
3146 if (!n_unpinned) {
3147 VLOG_ERR("Cannot create pmd threads due to out of unpinned "
3148 "cores on numa node");
3149 return;
3150 }
6c3eee82 3151
f2eee189
AW
3152 /* If cpu mask is specified, uses all unpinned cores, otherwise
3153 * tries creating NR_PMD_THREADS pmd threads. */
3154 can_have = dp->pmd_cmask ? n_unpinned : MIN(n_unpinned, NR_PMD_THREADS);
2aca813c 3155 pmds = xzalloc(can_have * sizeof *pmds);
65f13b50 3156 for (i = 0; i < can_have; i++) {
bd5131ba 3157 unsigned core_id = ovs_numa_get_unpinned_core_on_numa(numa_id);
2aca813c
IM
3158 pmds[i] = xzalloc(sizeof **pmds);
3159 dp_netdev_configure_pmd(pmds[i], dp, i, core_id, numa_id);
3160 }
ae7ad0a1
IM
3161
3162 /* Distributes rx queues of this numa node between new pmd threads. */
3163 CMAP_FOR_EACH (port, node, &dp->ports) {
3164 if (netdev_is_pmd(port->netdev)
3165 && netdev_get_numa_id(port->netdev) == numa_id) {
3166 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
3167 /* Make thread-safety analyser happy. */
3168 ovs_mutex_lock(&pmds[index]->poll_mutex);
3169 dp_netdev_add_rxq_to_pmd(pmds[index], port, port->rxq[i]);
3170 ovs_mutex_unlock(&pmds[index]->poll_mutex);
3171 index = (index + 1) % can_have;
3172 }
3173 }
3174 }
3175
3176 /* Actual start of pmd threads. */
2aca813c 3177 for (i = 0; i < can_have; i++) {
2aca813c 3178 pmds[i]->thread = ovs_thread_create("pmd", pmd_thread_main, pmds[i]);
65f13b50 3179 }
2aca813c 3180 free(pmds);
65f13b50 3181 VLOG_INFO("Created %d pmd threads on numa node %d", can_have, numa_id);
6c3eee82
BP
3182 }
3183}
e4cfed38 3184
6c3eee82 3185\f
f2eee189
AW
 3186/* Called after the pmd thread configuration changes.  Restarts the pmd
 3187 * threads with the new configuration. */
3188static void
3189dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
3190{
3191 struct dp_netdev_port *port;
3192
3193 CMAP_FOR_EACH (port, node, &dp->ports) {
3194 if (netdev_is_pmd(port->netdev)) {
3195 int numa_id = netdev_get_numa_id(port->netdev);
3196
3197 dp_netdev_set_pmds_on_numa(dp, numa_id);
3198 }
3199 }
3200}
3201
b5cbbcf6
AZ
3202static char *
3203dpif_netdev_get_datapath_version(void)
3204{
3205 return xstrdup("<built-in>");
3206}
3207
72865317 3208static void
1c1e46ed 3209dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
11bfdadd 3210 uint16_t tcp_flags, long long now)
72865317 3211{
eb94da30 3212 uint16_t flags;
72865317 3213
eb94da30
DDP
3214 atomic_store_relaxed(&netdev_flow->stats.used, now);
3215 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
3216 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
3217 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3218 flags |= tcp_flags;
3219 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
51852a57
BP
3220}
3221
3222static void
1c1e46ed
AW
3223dp_netdev_count_packet(struct dp_netdev_pmd_thread *pmd,
3224 enum dp_stat_type type, int cnt)
51852a57 3225{
eb94da30 3226 non_atomic_ullong_add(&pmd->stats.n[type], cnt);
51852a57
BP
3227}
3228
623540e4 3229static int
e14deea0 3230dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
7af12bd7 3231 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
623540e4
EJ
3232 enum dpif_upcall_type type, const struct nlattr *userdata,
3233 struct ofpbuf *actions, struct ofpbuf *put_actions)
3234{
1c1e46ed 3235 struct dp_netdev *dp = pmd->dp;
6728d578
JG
3236 struct flow_tnl orig_tunnel;
3237 int err;
623540e4 3238
623540e4
EJ
3239 if (OVS_UNLIKELY(!dp->upcall_cb)) {
3240 return ENODEV;
3241 }
3242
6728d578
JG
3243 /* Upcall processing expects the Geneve options to be in the translated
3244 * format but we need to retain the raw format for datapath use. */
3245 orig_tunnel.flags = flow->tunnel.flags;
3246 if (flow->tunnel.flags & FLOW_TNL_F_UDPIF) {
3247 orig_tunnel.metadata.present.len = flow->tunnel.metadata.present.len;
3248 memcpy(orig_tunnel.metadata.opts.gnv, flow->tunnel.metadata.opts.gnv,
3249 flow->tunnel.metadata.present.len);
3250 err = tun_metadata_from_geneve_udpif(&orig_tunnel, &orig_tunnel,
3251 &flow->tunnel);
3252 if (err) {
3253 return err;
3254 }
3255 }
3256
623540e4
EJ
3257 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
3258 struct ds ds = DS_EMPTY_INITIALIZER;
623540e4 3259 char *packet_str;
cf62fa4c 3260 struct ofpbuf key;
5262eea1
JG
3261 struct odp_flow_key_parms odp_parms = {
3262 .flow = flow,
3263 .mask = &wc->masks,
3264 .odp_in_port = flow->in_port.odp_port,
2494ccd7 3265 .support = dp_netdev_support,
5262eea1 3266 };
623540e4
EJ
3267
3268 ofpbuf_init(&key, 0);
5262eea1 3269 odp_flow_key_from_flow(&odp_parms, &key);
cf62fa4c
PS
3270 packet_str = ofp_packet_to_string(dp_packet_data(packet_),
3271 dp_packet_size(packet_));
623540e4 3272
6fd6ed71 3273 odp_flow_key_format(key.data, key.size, &ds);
623540e4
EJ
3274
3275 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
3276 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
3277
3278 ofpbuf_uninit(&key);
3279 free(packet_str);
6fd6ed71 3280
623540e4
EJ
3281 ds_destroy(&ds);
3282 }
3283
6728d578
JG
3284 err = dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
3285 actions, wc, put_actions, dp->upcall_aux);
3286 if (err && err != ENOSPC) {
3287 return err;
3288 }
3289
3290 /* Translate tunnel metadata masks to datapath format. */
3291 if (wc) {
3292 if (wc->masks.tunnel.metadata.present.map) {
4e548ad9 3293 struct geneve_opt opts[TLV_TOT_OPT_SIZE /
6728d578
JG
3294 sizeof(struct geneve_opt)];
3295
3f32cfeb
JG
3296 if (orig_tunnel.flags & FLOW_TNL_F_UDPIF) {
3297 tun_metadata_to_geneve_udpif_mask(&flow->tunnel,
3298 &wc->masks.tunnel,
3299 orig_tunnel.metadata.opts.gnv,
3300 orig_tunnel.metadata.present.len,
3301 opts);
3302 } else {
3303 orig_tunnel.metadata.present.len = 0;
3304 }
6728d578
JG
3305
3306 memset(&wc->masks.tunnel.metadata, 0,
3307 sizeof wc->masks.tunnel.metadata);
3308 memcpy(&wc->masks.tunnel.metadata.opts.gnv, opts,
3309 orig_tunnel.metadata.present.len);
3310 }
3311 wc->masks.tunnel.metadata.present.len = 0xff;
3312 }
3313
3314 /* Restore tunnel metadata. We need to use the saved options to ensure
3315 * that any unknown options are not lost. The generated mask will have
3316 * the same structure, matching on types and lengths but wildcarding
3317 * option data we don't care about. */
3318 if (orig_tunnel.flags & FLOW_TNL_F_UDPIF) {
3319 memcpy(&flow->tunnel.metadata.opts.gnv, orig_tunnel.metadata.opts.gnv,
3320 orig_tunnel.metadata.present.len);
3321 flow->tunnel.metadata.present.len = orig_tunnel.metadata.present.len;
3322 flow->tunnel.flags |= FLOW_TNL_F_UDPIF;
3323 }
3324
3325 return err;
623540e4
EJ
3326}
3327
9bbf1c3d 3328static inline uint32_t
048963aa
DDP
3329dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
3330 const struct miniflow *mf)
9bbf1c3d 3331{
048963aa 3332 uint32_t hash, recirc_depth;
9bbf1c3d 3333
f2f44f5d
DDP
3334 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
3335 hash = dp_packet_get_rss_hash(packet);
3336 } else {
9bbf1c3d 3337 hash = miniflow_hash_5tuple(mf, 0);
2bc1bbd2 3338 dp_packet_set_rss_hash(packet, hash);
9bbf1c3d 3339 }
048963aa
DDP
3340
3341 /* The RSS hash must account for the recirculation depth to avoid
3342 * collisions in the exact match cache. */
3343 recirc_depth = *recirc_depth_get_unsafe();
3344 if (OVS_UNLIKELY(recirc_depth)) {
3345 hash = hash_finish(hash, recirc_depth);
3346 dp_packet_set_rss_hash(packet, hash);
3347 }
9bbf1c3d
DDP
3348 return hash;
3349}
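/* Editor's note: a minimal stand-alone sketch (not OVS API, name is
 * hypothetical) of the idea implemented above: mixing the recirculation depth
 * into the RSS hash so the same 5-tuple hashes to different exact-match-cache
 * entries on each recirculation pass.  The mixer below is illustrative only;
 * the datapath itself uses hash_finish() and re-stores the result in the
 * packet. */
static inline uint32_t
example_emc_hash(uint32_t rss_hash, uint32_t recirc_depth)
{
    uint32_t hash = rss_hash;

    if (recirc_depth) {
        /* Any reasonable finishing mix will do for the illustration. */
        hash ^= recirc_depth * 0x9e3779b9u;
        hash ^= hash >> 16;
    }
    return hash;
}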
3350
567bbb2e 3351struct packet_batch {
8cbf4f47
DDP
3352 unsigned int packet_count;
3353 unsigned int byte_count;
3354 uint16_t tcp_flags;
3355
3356 struct dp_netdev_flow *flow;
3357
cd159f1a 3358 struct dp_packet *packets[NETDEV_MAX_BURST];
8cbf4f47
DDP
3359};
3360
3361static inline void
e14deea0 3362packet_batch_update(struct packet_batch *batch, struct dp_packet *packet,
9bbf1c3d 3363 const struct miniflow *mf)
8cbf4f47
DDP
3364{
3365 batch->tcp_flags |= miniflow_get_tcp_flags(mf);
3366 batch->packets[batch->packet_count++] = packet;
cf62fa4c 3367 batch->byte_count += dp_packet_size(packet);
8cbf4f47
DDP
3368}
3369
3370static inline void
41ccaa24 3371packet_batch_init(struct packet_batch *batch, struct dp_netdev_flow *flow)
8cbf4f47 3372{
11e5cf1f 3373 flow->batch = batch;
8cbf4f47 3374
11e5cf1f 3375 batch->flow = flow;
8cbf4f47
DDP
3376 batch->packet_count = 0;
3377 batch->byte_count = 0;
3378 batch->tcp_flags = 0;
8cbf4f47
DDP
3379}
3380
3381static inline void
65f13b50 3382packet_batch_execute(struct packet_batch *batch,
abcf3ef4 3383 struct dp_netdev_pmd_thread *pmd,
11bfdadd 3384 long long now)
8cbf4f47
DDP
3385{
3386 struct dp_netdev_actions *actions;
3387 struct dp_netdev_flow *flow = batch->flow;
3388
11e5cf1f 3389 dp_netdev_flow_used(flow, batch->packet_count, batch->byte_count,
11bfdadd 3390 batch->tcp_flags, now);
8cbf4f47
DDP
3391
3392 actions = dp_netdev_flow_get_actions(flow);
3393
65f13b50 3394 dp_netdev_execute_actions(pmd, batch->packets, batch->packet_count, true,
41ccaa24 3395 actions->actions, actions->size);
8cbf4f47
DDP
3396}
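/* Editor's note: illustrative sketch (the helper name is hypothetical) of the
 * per-flow batching lifecycle defined above: a batch is bound to one flow,
 * accumulates packets and stats, and is executed once per input burst.
 * Assumes cnt <= NETDEV_MAX_BURST, the size of batch->packets. */
static inline void
example_batch_one_flow(struct dp_netdev_pmd_thread *pmd,
                       struct dp_netdev_flow *flow,
                       struct dp_packet **pkts, const struct miniflow **mfs,
                       size_t cnt, long long now)
{
    struct packet_batch batch;
    size_t i;

    packet_batch_init(&batch, flow);                  /* Bind batch to flow. */
    for (i = 0; i < cnt; i++) {
        packet_batch_update(&batch, pkts[i], mfs[i]); /* Queue and count. */
    }
    packet_batch_execute(&batch, pmd, now);           /* Run actions once. */
}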
3397
8aaa125d 3398static inline void
e14deea0 3399dp_netdev_queue_batches(struct dp_packet *pkt,
9bbf1c3d 3400 struct dp_netdev_flow *flow, const struct miniflow *mf,
8aaa125d 3401 struct packet_batch *batches, size_t *n_batches)
9bbf1c3d 3402{
8aaa125d 3403 struct packet_batch *batch = flow->batch;
11e5cf1f 3404
f9fe365b
AZ
3405 if (OVS_UNLIKELY(!batch)) {
3406 batch = &batches[(*n_batches)++];
3407 packet_batch_init(batch, flow);
9bbf1c3d
DDP
3408 }
3409
9bbf1c3d 3410 packet_batch_update(batch, pkt, mf);
9bbf1c3d
DDP
3411}
3412
9bbf1c3d 3413/* Try to process all ('cnt') the 'packets' using only the exact match cache
a90ed026 3414 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
8aaa125d
DDP
3415 * miniflow is copied into 'keys' and the packet pointer is moved to the
3416 * beginning of the 'packets' array.
9bbf1c3d
DDP
3417 *
3418 * The function returns the number of packets that need to be processed in the
3419 * 'packets' array (they have been moved to the beginning of the vector).
a90ed026
DDP
3420 *
3421 * If 'md_is_valid' is false, the metadata in 'packets' is not valid and must be
3422 * initialized by this function using 'port_no'.
9bbf1c3d
DDP
3423 */
3424static inline size_t
e14deea0 3425emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet **packets,
8aaa125d 3426 size_t cnt, struct netdev_flow_key *keys,
a90ed026
DDP
3427 struct packet_batch batches[], size_t *n_batches,
3428 bool md_is_valid, odp_port_t port_no)
72865317 3429{
65f13b50 3430 struct emc_cache *flow_cache = &pmd->flow_cache;
b89c678b 3431 struct netdev_flow_key *key = &keys[0];
3d88a620 3432 size_t i, n_missed = 0, n_dropped = 0;
8cbf4f47 3433
84d6d5eb 3434 for (i = 0; i < cnt; i++) {
9bbf1c3d 3435 struct dp_netdev_flow *flow;
5a2fed48 3436 struct dp_packet *packet = packets[i];
9bbf1c3d 3437
5a2fed48
AZ
3438 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
3439 dp_packet_delete(packet);
3d88a620 3440 n_dropped++;
84d6d5eb
EJ
3441 continue;
3442 }
8cbf4f47 3443
72a5e2b8 3444 if (i != cnt - 1) {
a90ed026 3445 /* Prefetch next packet data and metadata. */
72a5e2b8 3446 OVS_PREFETCH(dp_packet_data(packets[i+1]));
a90ed026 3447 pkt_metadata_prefetch_init(&packets[i+1]->md);
72a5e2b8
DDP
3448 }
3449
a90ed026
DDP
3450 if (!md_is_valid) {
3451 pkt_metadata_init(&packet->md, port_no);
3452 }
5a2fed48 3453 miniflow_extract(packet, &key->mf);
d262ac2c 3454 key->len = 0; /* Not computed yet. */
5a2fed48 3455 key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
9bbf1c3d 3456
d262ac2c 3457 flow = emc_lookup(flow_cache, key);
8aaa125d 3458 if (OVS_LIKELY(flow)) {
5a2fed48 3459 dp_netdev_queue_batches(packet, flow, &key->mf, batches,
8aaa125d
DDP
3460 n_batches);
3461 } else {
d1aa0b94
AZ
3462 /* Exact match cache missed. Group missed packets together at
3463 * the beginning of the 'packets' array. */
b89c678b 3464 packets[n_missed] = packet;
400486f7
DDP
3465 /* 'keys[n_missed]' contains the key of the current packet and it
3466 * must be returned to the caller. The next key should be extracted
3467 * to 'keys[n_missed + 1]'. */
3468 key = &keys[++n_missed];
9bbf1c3d
DDP
3469 }
3470 }
3471
3d88a620 3472 dp_netdev_count_packet(pmd, DP_STAT_EXACT_HIT, cnt - n_dropped - n_missed);
4f150744 3473
3d88a620 3474 return n_missed;
9bbf1c3d
DDP
3475}
3476
3477static inline void
65f13b50 3478fast_path_processing(struct dp_netdev_pmd_thread *pmd,
e14deea0 3479 struct dp_packet **packets, size_t cnt,
8aaa125d
DDP
3480 struct netdev_flow_key *keys,
3481 struct packet_batch batches[], size_t *n_batches)
9bbf1c3d 3482{
1a0d5831 3483#if !defined(__CHECKER__) && !defined(_WIN32)
9bbf1c3d
DDP
3484 const size_t PKT_ARRAY_SIZE = cnt;
3485#else
1a0d5831 3486 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 3487 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
9bbf1c3d 3488#endif
0de8783a 3489 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
65f13b50
AW
3490 struct dp_netdev *dp = pmd->dp;
3491 struct emc_cache *flow_cache = &pmd->flow_cache;
8aaa125d 3492 int miss_cnt = 0, lost_cnt = 0;
9bbf1c3d 3493 bool any_miss;
8aaa125d 3494 size_t i;
9bbf1c3d
DDP
3495
3496 for (i = 0; i < cnt; i++) {
0de8783a 3497 /* Key length is needed in all cases; the hash is computed on demand. */
361d808d 3498 keys[i].len = netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
9bbf1c3d 3499 }
1c1e46ed 3500 any_miss = !dpcls_lookup(&pmd->cls, keys, rules, cnt);
623540e4
EJ
3501 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
3502 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
3503 struct ofpbuf actions, put_actions;
7af12bd7 3504 ovs_u128 ufid;
623540e4
EJ
3505
3506 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
3507 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
3508
3509 for (i = 0; i < cnt; i++) {
0de8783a 3510 struct dp_netdev_flow *netdev_flow;
623540e4 3511 struct ofpbuf *add_actions;
0de8783a 3512 struct match match;
623540e4
EJ
3513 int error;
3514
0de8783a 3515 if (OVS_LIKELY(rules[i])) {
623540e4
EJ
3516 continue;
3517 }
3518
3519 /* It's possible that an earlier slow path execution installed
0de8783a 3520 * a rule covering this flow. In this case, it's a lot cheaper
623540e4 3521 * to catch it here than execute a miss. */
1c1e46ed 3522 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &keys[i]);
623540e4 3523 if (netdev_flow) {
0de8783a 3524 rules[i] = &netdev_flow->cr;
623540e4
EJ
3525 continue;
3526 }
3527
60fc3b7b
DDP
3528 miss_cnt++;
3529
4d8f90b1 3530 match.tun_md.valid = false;
0de8783a 3531 miniflow_expand(&keys[i].mf, &match.flow);
623540e4
EJ
3532
3533 ofpbuf_clear(&actions);
3534 ofpbuf_clear(&put_actions);
3535
7af12bd7 3536 dpif_flow_hash(dp->dpif, &match.flow, sizeof match.flow, &ufid);
1c1e46ed 3537 error = dp_netdev_upcall(pmd, packets[i], &match.flow, &match.wc,
7af12bd7 3538 &ufid, DPIF_UC_MISS, NULL, &actions,
0de8783a 3539 &put_actions);
623540e4 3540 if (OVS_UNLIKELY(error && error != ENOSPC)) {
7ad20cbd 3541 dp_packet_delete(packets[i]);
60fc3b7b 3542 lost_cnt++;
623540e4
EJ
3543 continue;
3544 }
3545
449b8131
JR
3546 /* The Netlink encoding of datapath flow keys cannot express
3547 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3548 * tag is interpreted as exact match on the fact that there is no
3549 * VLAN. Unless we refactor a lot of code that translates between
3550 * Netlink and struct flow representations, we have to do the same
3551 * here. */
3552 if (!match.wc.masks.vlan_tci) {
3553 match.wc.masks.vlan_tci = htons(0xffff);
3554 }
3555
623540e4
EJ
3556 /* We can't allow the packet batching in the next loop to execute
3557 * the actions. Otherwise, if there are any slow path actions,
3558 * we'll send the packet up twice. */
41ccaa24 3559 dp_netdev_execute_actions(pmd, &packets[i], 1, true,
6fd6ed71 3560 actions.data, actions.size);
623540e4 3561
6fd6ed71 3562 add_actions = put_actions.size ? &put_actions : &actions;
0de8783a
JR
3563 if (OVS_LIKELY(error != ENOSPC)) {
3564 /* XXX: There's a race window where a flow covering this packet
3565 * could have already been installed since we last did the flow
3566 * lookup before upcall. This could be solved by moving the
3567 * mutex lock outside the loop, but that's an awful long time
3568 * to be locking everyone out of making flow installs. If we
3569 * move to a per-core classifier, it would be reasonable. */
1c1e46ed
AW
3570 ovs_mutex_lock(&pmd->flow_mutex);
3571 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &keys[i]);
0de8783a 3572 if (OVS_LIKELY(!netdev_flow)) {
1c1e46ed 3573 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
6fd6ed71
PS
3574 add_actions->data,
3575 add_actions->size);
0de8783a 3576 }
1c1e46ed 3577 ovs_mutex_unlock(&pmd->flow_mutex);
0de8783a 3578
0de8783a 3579 emc_insert(flow_cache, &keys[i], netdev_flow);
623540e4 3580 }
623540e4
EJ
3581 }
3582
3583 ofpbuf_uninit(&actions);
3584 ofpbuf_uninit(&put_actions);
3585 fat_rwlock_unlock(&dp->upcall_rwlock);
60fc3b7b 3586 dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
ac8c2081 3587 } else if (OVS_UNLIKELY(any_miss)) {
ac8c2081 3588 for (i = 0; i < cnt; i++) {
0de8783a 3589 if (OVS_UNLIKELY(!rules[i])) {
e14deea0 3590 dp_packet_delete(packets[i]);
8aaa125d
DDP
3591 lost_cnt++;
3592 miss_cnt++;
ac8c2081
DDP
3593 }
3594 }
623540e4 3595 }
84d6d5eb 3596
8cbf4f47 3597 for (i = 0; i < cnt; i++) {
e14deea0 3598 struct dp_packet *packet = packets[i];
84d6d5eb 3599 struct dp_netdev_flow *flow;
8cbf4f47 3600
0de8783a 3601 if (OVS_UNLIKELY(!rules[i])) {
84d6d5eb
EJ
3602 continue;
3603 }
3604
84d6d5eb 3605 flow = dp_netdev_flow_cast(rules[i]);
0de8783a 3606
0de8783a 3607 emc_insert(flow_cache, &keys[i], flow);
8aaa125d 3608 dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
8cbf4f47
DDP
3609 }
3610
8aaa125d
DDP
3611 dp_netdev_count_packet(pmd, DP_STAT_MASKED_HIT, cnt - miss_cnt);
3612 dp_netdev_count_packet(pmd, DP_STAT_MISS, miss_cnt);
3613 dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
72865317
BP
3614}
3615
a90ed026
DDP
3616/* Packets enter the datapath from a port (or from recirculation) here.
3617 *
3618 * For performance reasons a caller may choose not to initialize the metadata
3619 * in 'packets': in this case 'md_is_valid' is false and this function needs to
3620 * initialize it using 'port_no'. If the metadata in 'packets' is already
3621 * valid, 'md_is_valid' must be true and 'port_no' will be ignored. */
adcf00ba 3622static void
a90ed026
DDP
3623dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
3624 struct dp_packet **packets, int cnt,
3625 bool md_is_valid, odp_port_t port_no)
9bbf1c3d 3626{
1a0d5831 3627#if !defined(__CHECKER__) && !defined(_WIN32)
9bbf1c3d
DDP
3628 const size_t PKT_ARRAY_SIZE = cnt;
3629#else
1a0d5831 3630 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 3631 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
9bbf1c3d
DDP
3632#endif
3633 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
8aaa125d 3634 struct packet_batch batches[PKT_ARRAY_SIZE];
11bfdadd 3635 long long now = time_msec();
8aaa125d 3636 size_t newcnt, n_batches, i;
9bbf1c3d 3637
8aaa125d 3638 n_batches = 0;
a90ed026
DDP
3639 newcnt = emc_processing(pmd, packets, cnt, keys, batches, &n_batches,
3640 md_is_valid, port_no);
9bbf1c3d 3641 if (OVS_UNLIKELY(newcnt)) {
8aaa125d
DDP
3642 fast_path_processing(pmd, packets, newcnt, keys, batches, &n_batches);
3643 }
3644
603f2ce0
EJ
3645 for (i = 0; i < n_batches; i++) {
3646 batches[i].flow->batch = NULL;
3647 }
3648
8aaa125d
DDP
3649 for (i = 0; i < n_batches; i++) {
3650 packet_batch_execute(&batches[i], pmd, now);
9bbf1c3d
DDP
3651 }
3652}
3653
a90ed026
DDP
3654static void
3655dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
3656 struct dp_packet **packets, int cnt,
3657 odp_port_t port_no)
3658{
3659 dp_netdev_input__(pmd, packets, cnt, false, port_no);
3660}
3661
3662static void
3663dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
3664 struct dp_packet **packets, int cnt)
3665{
3666 dp_netdev_input__(pmd, packets, cnt, true, 0);
3667}
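/* Editor's note: illustrative sketch (hypothetical helper) of how the two
 * entry points above differ: packets fresh from a port carry no valid
 * metadata, so the input port must be supplied, while recirculated packets
 * keep the metadata initialized on their first pass. */
static inline void
example_feed_datapath(struct dp_netdev_pmd_thread *pmd,
                      struct dp_packet **packets, int cnt,
                      bool recirculated, odp_port_t port_no)
{
    if (recirculated) {
        dp_netdev_recirculate(pmd, packets, cnt);    /* Metadata already valid. */
    } else {
        dp_netdev_input(pmd, packets, cnt, port_no); /* Metadata set from port_no. */
    }
}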
3668
9080a111 3669struct dp_netdev_execute_aux {
65f13b50 3670 struct dp_netdev_pmd_thread *pmd;
9080a111
JR
3671};
3672
e4e74c3a
AW
3673static void
3674dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
3675 void *aux)
3676{
3677 struct dp_netdev *dp = get_dp_netdev(dpif);
3678 dp->dp_purge_aux = aux;
3679 dp->dp_purge_cb = cb;
3680}
3681
6b31e073 3682static void
623540e4
EJ
3683dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
3684 void *aux)
6b31e073
RW
3685{
3686 struct dp_netdev *dp = get_dp_netdev(dpif);
623540e4 3687 dp->upcall_aux = aux;
6b31e073
RW
3688 dp->upcall_cb = cb;
3689}
3690
ac8c2081 3691static void
3bcc10c0 3692dp_netdev_drop_packets(struct dp_packet **packets, int cnt, bool may_steal)
ac8c2081 3693{
ac8c2081 3694 if (may_steal) {
a36de779
PS
3695 int i;
3696
ac8c2081 3697 for (i = 0; i < cnt; i++) {
e14deea0 3698 dp_packet_delete(packets[i]);
ac8c2081
DDP
3699 }
3700 }
3701}
3702
a36de779
PS
3703static int
3704push_tnl_action(const struct dp_netdev *dp,
3705 const struct nlattr *attr,
e14deea0 3706 struct dp_packet **packets, int cnt)
a36de779
PS
3707{
3708 struct dp_netdev_port *tun_port;
3709 const struct ovs_action_push_tnl *data;
3710
3711 data = nl_attr_get(attr);
3712
3713 tun_port = dp_netdev_lookup_port(dp, u32_to_odp(data->tnl_port));
3714 if (!tun_port) {
3715 return -EINVAL;
3716 }
3717 netdev_push_header(tun_port->netdev, packets, cnt, data);
3718
3719 return 0;
3720}
3721
3722static void
28e2fa02
DDP
3723dp_netdev_clone_pkt_batch(struct dp_packet **dst_pkts,
3724 struct dp_packet **src_pkts, int cnt)
a36de779
PS
3725{
3726 int i;
3727
3728 for (i = 0; i < cnt; i++) {
28e2fa02 3729 dst_pkts[i] = dp_packet_clone(src_pkts[i]);
a36de779
PS
3730 }
3731}
3732
9080a111 3733static void
e14deea0 3734dp_execute_cb(void *aux_, struct dp_packet **packets, int cnt,
09f9da0b 3735 const struct nlattr *a, bool may_steal)
8a4e3a85 3736 OVS_NO_THREAD_SAFETY_ANALYSIS
9080a111
JR
3737{
3738 struct dp_netdev_execute_aux *aux = aux_;
623540e4 3739 uint32_t *depth = recirc_depth_get();
28e2fa02
DDP
3740 struct dp_netdev_pmd_thread *pmd = aux->pmd;
3741 struct dp_netdev *dp = pmd->dp;
09f9da0b 3742 int type = nl_attr_type(a);
8a4e3a85 3743 struct dp_netdev_port *p;
8cbf4f47 3744 int i;
9080a111 3745
09f9da0b
JR
3746 switch ((enum ovs_action_attr)type) {
3747 case OVS_ACTION_ATTR_OUTPUT:
623540e4 3748 p = dp_netdev_lookup_port(dp, u32_to_odp(nl_attr_get_u32(a)));
26a5075b 3749 if (OVS_LIKELY(p)) {
347ba9bb
IM
3750 int tx_qid;
3751
3752 atomic_read_relaxed(&pmd->tx_qid, &tx_qid);
3753
3754 netdev_send(p->netdev, tx_qid, packets, cnt, may_steal);
ac8c2081 3755 return;
8a4e3a85 3756 }
09f9da0b
JR
3757 break;
3758
a36de779
PS
3759 case OVS_ACTION_ATTR_TUNNEL_PUSH:
3760 if (*depth < MAX_RECIRC_DEPTH) {
cd159f1a 3761 struct dp_packet *tnl_pkt[NETDEV_MAX_BURST];
a36de779
PS
3762 int err;
3763
3764 if (!may_steal) {
3765 dp_netdev_clone_pkt_batch(tnl_pkt, packets, cnt);
3766 packets = tnl_pkt;
3767 }
3768
3769 err = push_tnl_action(dp, a, packets, cnt);
3770 if (!err) {
3771 (*depth)++;
a90ed026 3772 dp_netdev_recirculate(pmd, packets, cnt);
a36de779
PS
3773 (*depth)--;
3774 } else {
3775 dp_netdev_drop_packets(tnl_pkt, cnt, !may_steal);
3776 }
3777 return;
3778 }
3779 break;
3780
3781 case OVS_ACTION_ATTR_TUNNEL_POP:
3782 if (*depth < MAX_RECIRC_DEPTH) {
3783 odp_port_t portno = u32_to_odp(nl_attr_get_u32(a));
3784
3785 p = dp_netdev_lookup_port(dp, portno);
3786 if (p) {
cd159f1a 3787 struct dp_packet *tnl_pkt[NETDEV_MAX_BURST];
a36de779
PS
3788 int err;
3789
3790 if (!may_steal) {
3791 dp_netdev_clone_pkt_batch(tnl_pkt, packets, cnt);
3792 packets = tnl_pkt;
3793 }
3794
3795 err = netdev_pop_header(p->netdev, packets, cnt);
3796 if (!err) {
3797
3798 for (i = 0; i < cnt; i++) {
3799 packets[i]->md.in_port.odp_port = portno;
3800 }
3801
3802 (*depth)++;
a90ed026 3803 dp_netdev_recirculate(pmd, packets, cnt);
a36de779
PS
3804 (*depth)--;
3805 } else {
3806 dp_netdev_drop_packets(tnl_pkt, cnt, !may_steal);
3807 }
3808 return;
3809 }
3810 }
3811 break;
3812
623540e4
EJ
3813 case OVS_ACTION_ATTR_USERSPACE:
3814 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
3815 const struct nlattr *userdata;
3816 struct ofpbuf actions;
3817 struct flow flow;
7af12bd7 3818 ovs_u128 ufid;
4fc65926 3819
623540e4
EJ
3820 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
3821 ofpbuf_init(&actions, 0);
8cbf4f47 3822
623540e4
EJ
3823 for (i = 0; i < cnt; i++) {
3824 int error;
3825
3826 ofpbuf_clear(&actions);
3827
cf62fa4c 3828 flow_extract(packets[i], &flow);
7af12bd7 3829 dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
1c1e46ed 3830 error = dp_netdev_upcall(pmd, packets[i], &flow, NULL, &ufid,
7af12bd7 3831 DPIF_UC_ACTION, userdata, &actions,
623540e4
EJ
3832 NULL);
3833 if (!error || error == ENOSPC) {
ac8c2081 3834 dp_netdev_execute_actions(pmd, &packets[i], 1, may_steal,
6fd6ed71 3835 actions.data, actions.size);
ac8c2081 3836 } else if (may_steal) {
e14deea0 3837 dp_packet_delete(packets[i]);
623540e4 3838 }
db73f716 3839 }
623540e4
EJ
3840 ofpbuf_uninit(&actions);
3841 fat_rwlock_unlock(&dp->upcall_rwlock);
6b31e073 3842
ac8c2081
DDP
3843 return;
3844 }
09f9da0b 3845 break;
572f732a 3846
adcf00ba
AZ
3847 case OVS_ACTION_ATTR_RECIRC:
3848 if (*depth < MAX_RECIRC_DEPTH) {
cd159f1a 3849 struct dp_packet *recirc_pkts[NETDEV_MAX_BURST];
572f732a 3850
28e2fa02
DDP
3851 if (!may_steal) {
3852 dp_netdev_clone_pkt_batch(recirc_pkts, packets, cnt);
3853 packets = recirc_pkts;
3854 }
8cbf4f47 3855
28e2fa02
DDP
3856 for (i = 0; i < cnt; i++) {
3857 packets[i]->md.recirc_id = nl_attr_get_u32(a);
8cbf4f47 3858 }
28e2fa02
DDP
3859
3860 (*depth)++;
a90ed026 3861 dp_netdev_recirculate(pmd, packets, cnt);
adcf00ba
AZ
3862 (*depth)--;
3863
ac8c2081 3864 return;
adcf00ba 3865 }
ac8c2081
DDP
3866
3867 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
572f732a 3868 break;
572f732a 3869
07659514
JS
3870 case OVS_ACTION_ATTR_CT:
3871 /* If a flow with this action is slow-pathed, datapath assistance is
3872 * required to implement it. However, we don't support this action
3873 * in the userspace datapath. */
3874 VLOG_WARN("Cannot execute conntrack action in userspace.");
3875 break;
3876
09f9da0b
JR
3877 case OVS_ACTION_ATTR_PUSH_VLAN:
3878 case OVS_ACTION_ATTR_POP_VLAN:
3879 case OVS_ACTION_ATTR_PUSH_MPLS:
3880 case OVS_ACTION_ATTR_POP_MPLS:
3881 case OVS_ACTION_ATTR_SET:
6d670e7f 3882 case OVS_ACTION_ATTR_SET_MASKED:
09f9da0b 3883 case OVS_ACTION_ATTR_SAMPLE:
53e1d6f1 3884 case OVS_ACTION_ATTR_HASH:
09f9da0b
JR
3885 case OVS_ACTION_ATTR_UNSPEC:
3886 case __OVS_ACTION_ATTR_MAX:
3887 OVS_NOT_REACHED();
da546e07 3888 }
ac8c2081
DDP
3889
3890 dp_netdev_drop_packets(packets, cnt, may_steal);
98403001
BP
3891}
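/* Editor's note: illustrative sketch (hypothetical consumer) of the
 * 'may_steal' convention used in dp_execute_cb() above: an action that hands
 * packets to something that frees them must work on clones when it is not
 * allowed to steal the caller's packets.  Assumes cnt <= NETDEV_MAX_BURST. */
static inline void
example_consuming_action(struct dp_packet **packets, int cnt, bool may_steal)
{
    struct dp_packet *clones[NETDEV_MAX_BURST];

    if (!may_steal) {
        dp_netdev_clone_pkt_batch(clones, packets, cnt); /* Keep originals. */
        packets = clones;
    }
    /* ... hand 'packets' to a path that consumes them; here they are simply
     * freed as a placeholder. */
    dp_netdev_drop_packets(packets, cnt, true);
}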
3892
4edb9ae9 3893static void
65f13b50 3894dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
e14deea0 3895 struct dp_packet **packets, int cnt,
41ccaa24 3896 bool may_steal,
9080a111 3897 const struct nlattr *actions, size_t actions_len)
72865317 3898{
41ccaa24 3899 struct dp_netdev_execute_aux aux = { pmd };
9080a111 3900
41ccaa24 3901 odp_execute_actions(&aux, packets, cnt, may_steal, actions,
8cbf4f47 3902 actions_len, dp_execute_cb);
72865317
BP
3903}
3904
3905const struct dpif_class dpif_netdev_class = {
72865317 3906 "netdev",
6553d06b 3907 dpif_netdev_init,
2197d7ab 3908 dpif_netdev_enumerate,
0aeaabc8 3909 dpif_netdev_port_open_type,
72865317
BP
3910 dpif_netdev_open,
3911 dpif_netdev_close,
7dab847a 3912 dpif_netdev_destroy,
e4cfed38
PS
3913 dpif_netdev_run,
3914 dpif_netdev_wait,
72865317 3915 dpif_netdev_get_stats,
72865317
BP
3916 dpif_netdev_port_add,
3917 dpif_netdev_port_del,
3918 dpif_netdev_port_query_by_number,
3919 dpif_netdev_port_query_by_name,
98403001 3920 NULL, /* port_get_pid */
b0ec0f27
BP
3921 dpif_netdev_port_dump_start,
3922 dpif_netdev_port_dump_next,
3923 dpif_netdev_port_dump_done,
72865317
BP
3924 dpif_netdev_port_poll,
3925 dpif_netdev_port_poll_wait,
72865317 3926 dpif_netdev_flow_flush,
ac64794a
BP
3927 dpif_netdev_flow_dump_create,
3928 dpif_netdev_flow_dump_destroy,
3929 dpif_netdev_flow_dump_thread_create,
3930 dpif_netdev_flow_dump_thread_destroy,
704a1e09 3931 dpif_netdev_flow_dump_next,
1a0c894a 3932 dpif_netdev_operate,
6b31e073
RW
3933 NULL, /* recv_set */
3934 NULL, /* handlers_set */
f2eee189 3935 dpif_netdev_pmd_set,
5bf93d67 3936 dpif_netdev_queue_to_priority,
6b31e073
RW
3937 NULL, /* recv */
3938 NULL, /* recv_wait */
3939 NULL, /* recv_purge */
e4e74c3a 3940 dpif_netdev_register_dp_purge_cb,
6b31e073
RW
3941 dpif_netdev_register_upcall_cb,
3942 dpif_netdev_enable_upcall,
3943 dpif_netdev_disable_upcall,
b5cbbcf6 3944 dpif_netdev_get_datapath_version,
b77d9629
DDP
3945 NULL, /* ct_dump_start */
3946 NULL, /* ct_dump_next */
3947 NULL, /* ct_dump_done */
a0f7b6d5 3948 NULL, /* ct_flush */
72865317 3949};
614c4892 3950
74cc3969
BP
3951static void
3952dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
3953 const char *argv[], void *aux OVS_UNUSED)
3954{
59e6d833
BP
3955 struct dp_netdev_port *old_port;
3956 struct dp_netdev_port *new_port;
74cc3969 3957 struct dp_netdev *dp;
ff073a71 3958 odp_port_t port_no;
74cc3969 3959
8a4e3a85 3960 ovs_mutex_lock(&dp_netdev_mutex);
74cc3969
BP
3961 dp = shash_find_data(&dp_netdevs, argv[1]);
3962 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
8a4e3a85 3963 ovs_mutex_unlock(&dp_netdev_mutex);
74cc3969
BP
3964 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
3965 return;
3966 }
8a4e3a85
BP
3967 ovs_refcount_ref(&dp->ref_cnt);
3968 ovs_mutex_unlock(&dp_netdev_mutex);
74cc3969 3969
59e6d833
BP
3970 ovs_mutex_lock(&dp->port_mutex);
3971 if (get_port_by_name(dp, argv[2], &old_port)) {
74cc3969 3972 unixctl_command_reply_error(conn, "unknown port");
8a4e3a85 3973 goto exit;
74cc3969
BP
3974 }
3975
ff073a71
BP
3976 port_no = u32_to_odp(atoi(argv[3]));
3977 if (!port_no || port_no == ODPP_NONE) {
74cc3969 3978 unixctl_command_reply_error(conn, "bad port number");
8a4e3a85 3979 goto exit;
74cc3969 3980 }
ff073a71 3981 if (dp_netdev_lookup_port(dp, port_no)) {
74cc3969 3982 unixctl_command_reply_error(conn, "port number already in use");
8a4e3a85 3983 goto exit;
74cc3969 3984 }
59e6d833
BP
3985
3986 /* Remove old port. */
35303d71 3987 cmap_remove(&dp->ports, &old_port->node, hash_port_no(old_port->port_no));
59e6d833
BP
3988 ovsrcu_postpone(free, old_port);
3989
3990 /* Insert new port (cmap semantics mean we cannot re-insert 'old_port'). */
3991 new_port = xmemdup(old_port, sizeof *old_port);
35303d71 3992 new_port->port_no = port_no;
59e6d833
BP
3993 cmap_insert(&dp->ports, &new_port->node, hash_port_no(port_no));
3994
d33ed218 3995 seq_change(dp->port_seq);
74cc3969 3996 unixctl_command_reply(conn, NULL);
8a4e3a85
BP
3997
3998exit:
59e6d833 3999 ovs_mutex_unlock(&dp->port_mutex);
8a4e3a85 4000 dp_netdev_unref(dp);
74cc3969
BP
4001}
4002
c40b890f
BP
4003static void
4004dpif_dummy_delete_port(struct unixctl_conn *conn, int argc OVS_UNUSED,
4005 const char *argv[], void *aux OVS_UNUSED)
4006{
4007 struct dp_netdev_port *port;
4008 struct dp_netdev *dp;
4009
4010 ovs_mutex_lock(&dp_netdev_mutex);
4011 dp = shash_find_data(&dp_netdevs, argv[1]);
4012 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
4013 ovs_mutex_unlock(&dp_netdev_mutex);
4014 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
4015 return;
4016 }
4017 ovs_refcount_ref(&dp->ref_cnt);
4018 ovs_mutex_unlock(&dp_netdev_mutex);
4019
4020 ovs_mutex_lock(&dp->port_mutex);
4021 if (get_port_by_name(dp, argv[2], &port)) {
4022 unixctl_command_reply_error(conn, "unknown port");
35303d71 4023 } else if (port->port_no == ODPP_LOCAL) {
c40b890f
BP
4024 unixctl_command_reply_error(conn, "can't delete local port");
4025 } else {
4026 do_del_port(dp, port);
4027 unixctl_command_reply(conn, NULL);
4028 }
4029 ovs_mutex_unlock(&dp->port_mutex);
4030
4031 dp_netdev_unref(dp);
4032}
4033
0cbfe35d
BP
4034static void
4035dpif_dummy_register__(const char *type)
4036{
4037 struct dpif_class *class;
4038
4039 class = xmalloc(sizeof *class);
4040 *class = dpif_netdev_class;
4041 class->type = xstrdup(type);
4042 dp_register_provider(class);
4043}
4044
8420c7ad
BP
4045static void
4046dpif_dummy_override(const char *type)
4047{
65d43fdc
YT
4048 int error;
4049
4050 /*
4051 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
4052 * a userland-only build. This is useful for the testsuite.
4053 */
4054 error = dp_unregister_provider(type);
4055 if (error == 0 || error == EAFNOSUPPORT) {
8420c7ad
BP
4056 dpif_dummy_register__(type);
4057 }
4058}
4059
614c4892 4060void
8420c7ad 4061dpif_dummy_register(enum dummy_level level)
614c4892 4062{
8420c7ad 4063 if (level == DUMMY_OVERRIDE_ALL) {
0cbfe35d
BP
4064 struct sset types;
4065 const char *type;
4066
4067 sset_init(&types);
4068 dp_enumerate_types(&types);
4069 SSET_FOR_EACH (type, &types) {
8420c7ad 4070 dpif_dummy_override(type);
0cbfe35d
BP
4071 }
4072 sset_destroy(&types);
8420c7ad
BP
4073 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
4074 dpif_dummy_override("system");
614c4892 4075 }
0cbfe35d
BP
4076
4077 dpif_dummy_register__("dummy");
74cc3969
BP
4078
4079 unixctl_command_register("dpif-dummy/change-port-number",
74467d5c 4080 "dp port new-number",
74cc3969 4081 3, 3, dpif_dummy_change_port_number, NULL);
74467d5c 4082 unixctl_command_register("dpif-dummy/delete-port", "dp port",
c40b890f 4083 2, 2, dpif_dummy_delete_port, NULL);
614c4892 4084}
0de8783a
JR
4085\f
4086/* Datapath Classifier. */
4087
4088/* A set of rules that all have the same fields wildcarded. */
4089struct dpcls_subtable {
4090 /* The fields are only used by writers. */
4091 struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */
4092
4093 /* These fields are accessed by readers. */
4094 struct cmap rules; /* Contains "struct dpcls_rule"s. */
4095 struct netdev_flow_key mask; /* Wildcards for fields (const). */
4096 /* 'mask' must be the last field, additional space is allocated here. */
4097};
4098
4099/* Initializes 'cls' as a classifier that initially contains no classification
4100 * rules. */
4101static void
4102dpcls_init(struct dpcls *cls)
4103{
4104 cmap_init(&cls->subtables_map);
4105 pvector_init(&cls->subtables);
4106}
4107
4108static void
4109dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
4110{
4111 pvector_remove(&cls->subtables, subtable);
4112 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
4113 subtable->mask.hash);
4114 cmap_destroy(&subtable->rules);
4115 ovsrcu_postpone(free, subtable);
4116}
4117
4118/* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
4119 * caller's responsibility.
4120 * May only be called after all the readers have been terminated. */
4121static void
4122dpcls_destroy(struct dpcls *cls)
4123{
4124 if (cls) {
4125 struct dpcls_subtable *subtable;
4126
4127 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
361d808d 4128 ovs_assert(cmap_count(&subtable->rules) == 0);
0de8783a
JR
4129 dpcls_destroy_subtable(cls, subtable);
4130 }
4131 cmap_destroy(&cls->subtables_map);
4132 pvector_destroy(&cls->subtables);
4133 }
4134}
4135
4136static struct dpcls_subtable *
4137dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
4138{
4139 struct dpcls_subtable *subtable;
4140
4141 /* Need to add one. */
caeb4906
JR
4142 subtable = xmalloc(sizeof *subtable
4143 - sizeof subtable->mask.mf + mask->len);
0de8783a
JR
4144 cmap_init(&subtable->rules);
4145 netdev_flow_key_clone(&subtable->mask, mask);
4146 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
4147 pvector_insert(&cls->subtables, subtable, 0);
802f84ff 4148 pvector_publish(&cls->subtables);
0de8783a
JR
4149
4150 return subtable;
4151}
4152
4153static inline struct dpcls_subtable *
4154dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
4155{
4156 struct dpcls_subtable *subtable;
4157
4158 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
4159 &cls->subtables_map) {
4160 if (netdev_flow_key_equal(&subtable->mask, mask)) {
4161 return subtable;
4162 }
4163 }
4164 return dpcls_create_subtable(cls, mask);
4165}
4166
4167/* Insert 'rule' into 'cls'. */
4168static void
4169dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
4170 const struct netdev_flow_key *mask)
4171{
4172 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
4173
4174 rule->mask = &subtable->mask;
4175 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
4176}
4177
4178/* Removes 'rule' from 'cls', also destructing the 'rule'. */
4179static void
4180dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
4181{
4182 struct dpcls_subtable *subtable;
4183
4184 ovs_assert(rule->mask);
4185
4186 INIT_CONTAINER(subtable, rule->mask, mask);
4187
4188 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
4189 == 0) {
4190 dpcls_destroy_subtable(cls, subtable);
802f84ff 4191 pvector_publish(&cls->subtables);
0de8783a
JR
4192 }
4193}
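/* Editor's note: illustrative lifecycle (hypothetical helper) for the
 * classifier above, assuming 'rule->flow' already holds the masked key and
 * its hash.  One subtable exists per distinct mask; removing the last rule
 * of a subtable destroys that subtable. */
static inline void
example_dpcls_lifecycle(struct dpcls_rule *rule,
                        const struct netdev_flow_key *mask)
{
    struct dpcls cls;

    dpcls_init(&cls);                /* No subtables yet. */
    dpcls_insert(&cls, rule, mask);  /* Creates the subtable for 'mask'. */
    dpcls_remove(&cls, rule);        /* Last rule gone, subtable destroyed. */
    dpcls_destroy(&cls);
}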
4194
361d808d
JR
4195/* Returns true if 'target' satisfies 'rule': for each 1-bit in the rule's
4196 * mask, the corresponding bits in 'target' and in the rule's key are equal. */
0de8783a
JR
4197static inline bool
4198dpcls_rule_matches_key(const struct dpcls_rule *rule,
4199 const struct netdev_flow_key *target)
4200{
09b0fa9c
JR
4201 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
4202 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
5fcff47b 4203 uint64_t value;
0de8783a 4204
5fcff47b
JR
4205 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
4206 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
0de8783a
JR
4207 return false;
4208 }
4209 }
4210 return true;
4211}
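/* Editor's note: stand-alone sketch (not OVS code) of the masked-match
 * semantics above, reduced to one 64-bit word: the rule stores its key
 * already masked, so a packet word matches when its bits under the mask
 * equal the stored key bits. */
static inline bool
example_masked_match(uint64_t target, uint64_t masked_key, uint64_t mask)
{
    return (target & mask) == masked_key;
}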
4212
4213/* For each miniflow in 'keys' performs a classifier lookup writing the result
4214 * into the corresponding slot in 'rules'. If a particular entry in 'keys' is
4215 * NULL it is skipped.
4216 *
4217 * This function is optimized for use in the userspace datapath and therefore
4218 * does not implement a lot of features available in the standard
4219 * classifier_lookup() function. Specifically, it does not implement
4220 * priorities, instead returning any rule which matches the flow.
4221 *
4222 * Returns true if all flows found a corresponding rule. */
4223static bool
4224dpcls_lookup(const struct dpcls *cls, const struct netdev_flow_key keys[],
4225 struct dpcls_rule **rules, const size_t cnt)
4226{
4227 /* The batch size 16 was experimentally found faster than 8 or 32. */
4228 typedef uint16_t map_type;
4229#define MAP_BITS (sizeof(map_type) * CHAR_BIT)
4230
4231#if !defined(__CHECKER__) && !defined(_WIN32)
4232 const int N_MAPS = DIV_ROUND_UP(cnt, MAP_BITS);
4233#else
cd159f1a 4234 enum { N_MAPS = DIV_ROUND_UP(NETDEV_MAX_BURST, MAP_BITS) };
0de8783a
JR
4235#endif
4236 map_type maps[N_MAPS];
4237 struct dpcls_subtable *subtable;
4238
4239 memset(maps, 0xff, sizeof maps);
4240 if (cnt % MAP_BITS) {
4241 maps[N_MAPS - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra bits. */
4242 }
4243 memset(rules, 0, cnt * sizeof *rules);
4244
4245 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
4246 const struct netdev_flow_key *mkeys = keys;
4247 struct dpcls_rule **mrules = rules;
4248 map_type remains = 0;
4249 int m;
4250
4251 BUILD_ASSERT_DECL(sizeof remains == sizeof *maps);
4252
4253 for (m = 0; m < N_MAPS; m++, mkeys += MAP_BITS, mrules += MAP_BITS) {
4254 uint32_t hashes[MAP_BITS];
4255 const struct cmap_node *nodes[MAP_BITS];
4256 unsigned long map = maps[m];
4257 int i;
4258
4259 if (!map) {
4260 continue; /* Skip empty maps. */
4261 }
4262
4263 /* Compute hashes for the remaining keys. */
3ee6026a 4264 ULLONG_FOR_EACH_1(i, map) {
0de8783a
JR
4265 hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i],
4266 &subtable->mask);
4267 }
4268 /* Lookup. */
4269 map = cmap_find_batch(&subtable->rules, map, hashes, nodes);
4270 /* Check results. */
3ee6026a 4271 ULLONG_FOR_EACH_1(i, map) {
0de8783a
JR
4272 struct dpcls_rule *rule;
4273
4274 CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
4275 if (OVS_LIKELY(dpcls_rule_matches_key(rule, &mkeys[i]))) {
4276 mrules[i] = rule;
4277 goto next;
4278 }
4279 }
3ee6026a 4280 ULLONG_SET0(map, i); /* Did not match. */
0de8783a
JR
4281 next:
4282 ; /* Keep Sparse happy. */
4283 }
4284 maps[m] &= ~map; /* Clear the found rules. */
4285 remains |= maps[m];
4286 }
4287 if (!remains) {
4288 return true; /* All found. */
4289 }
4290 }
4291 return false; /* Some misses. */
4292}
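/* Editor's note: a stand-alone illustration (not part of the datapath) of the
 * bitmap bookkeeping in dpcls_lookup() above.  Each bit of the 16-bit map
 * stands for one packet that still lacks a rule; every subtable clears the
 * bits it matched, and the subtable loop can stop early once no bits remain.
 * For example, missing = 0x1f (five packets) and found = 0x16 (packets 1, 2
 * and 4) leaves 0x09, i.e. packets 0 and 3 go on to the next subtable. */
static inline uint16_t
example_clear_found(uint16_t missing, uint16_t found_in_subtable)
{
    return missing & ~found_in_subtable;
}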