72865317 1/*
ff073a71 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
72865317
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
db73f716 18#include "dpif-netdev.h"
72865317 19
72865317
BP
20#include <ctype.h>
21#include <errno.h>
22#include <fcntl.h>
23#include <inttypes.h>
72865317 24#include <netinet/in.h>
9d82ec47 25#include <sys/socket.h>
7f3adc00 26#include <net/if.h>
cdee00fd 27#include <stdint.h>
72865317
BP
28#include <stdlib.h>
29#include <string.h>
30#include <sys/ioctl.h>
31#include <sys/stat.h>
72865317
BP
32#include <unistd.h>
33
59e6d833 34#include "cmap.h"
72865317 35#include "csum.h"
e14deea0 36#include "dp-packet.h"
614c4892 37#include "dpif.h"
72865317 38#include "dpif-provider.h"
614c4892 39#include "dummy.h"
36956a7d 40#include "dynamic-string.h"
afae68b1 41#include "fat-rwlock.h"
72865317 42#include "flow.h"
9f361d6b 43#include "cmap.h"
6c3eee82 44#include "latch.h"
72865317 45#include "list.h"
0de8783a 46#include "match.h"
8c301900 47#include "meta-flow.h"
72865317 48#include "netdev.h"
8617afff 49#include "netdev-dpdk.h"
de281153 50#include "netdev-vport.h"
cdee00fd 51#include "netlink.h"
f094af7b 52#include "odp-execute.h"
72865317
BP
53#include "odp-util.h"
54#include "ofp-print.h"
55#include "ofpbuf.h"
5a034064 56#include "ovs-numa.h"
61e7deb1 57#include "ovs-rcu.h"
72865317
BP
58#include "packets.h"
59#include "poll-loop.h"
0de8783a 60#include "pvector.h"
26c6b6cd 61#include "random.h"
d33ed218 62#include "seq.h"
462278db 63#include "shash.h"
0cbfe35d 64#include "sset.h"
72865317 65#include "timeval.h"
a36de779 66#include "tnl-arp-cache.h"
74cc3969 67#include "unixctl.h"
72865317 68#include "util.h"
e6211adc 69#include "openvswitch/vlog.h"
5136ce49 70
d98e6007 71VLOG_DEFINE_THIS_MODULE(dpif_netdev);
72865317 72
8bb113da 73#define FLOW_DUMP_MAX_BATCH 50
adcf00ba
AZ
74/* Use per-thread recirc_depth to prevent recirculation loops. */
75#define MAX_RECIRC_DEPTH 5
76DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
e4cfed38 77
72865317 78/* Configuration parameters. */
72865317
BP
79enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
80
8a4e3a85
BP
81/* Protects against changes to 'dp_netdevs'. */
82static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
83
84/* Contains all 'struct dp_netdev's. */
85static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
86 = SHASH_INITIALIZER(&dp_netdevs);
87
623540e4 88static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
6b31e073 89
79df317f 90/* Stores a miniflow with inline values */
9bbf1c3d 91
9bbf1c3d 92struct netdev_flow_key {
caeb4906
JR
93 uint32_t hash; /* Hash function differs for different users. */
94 uint32_t len; /* Length of the following miniflow (incl. map). */
0de8783a 95 struct miniflow mf;
d70e8c28 96 uint64_t buf[FLOW_MAX_PACKET_U64S - MINI_N_INLINE];
9bbf1c3d
DDP
97};
98
99/* Exact match cache for frequently used flows
100 *
101 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
102 * search its entries for a miniflow that matches exactly the miniflow of the
0de8783a 103 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
9bbf1c3d
DDP
104 *
105 * A cache entry holds a reference to its 'dp_netdev_flow'.
106 *
107 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
108 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
109 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
110 * value is the index of a cache entry where the miniflow could be.
111 *
112 *
113 * Thread-safety
114 * =============
115 *
116 * Each pmd_thread has its own private exact match cache.
117 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
118 */
119
fc82e877 120#define EM_FLOW_HASH_SHIFT 13
9bbf1c3d
DDP
121#define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
122#define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
123#define EM_FLOW_HASH_SEGS 2
124
125struct emc_entry {
9bbf1c3d 126 struct dp_netdev_flow *flow;
0de8783a 127 struct netdev_flow_key key; /* key.hash used for emc hash value. */
9bbf1c3d
DDP
128};
129
130struct emc_cache {
131 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
67ad54cb 132 int sweep_idx; /* For emc_cache_slow_sweep(). */
9bbf1c3d
DDP
133};
134
135/* Iterate in the exact match cache through every entry that might contain a
136 * miniflow with hash 'HASH'. */
137#define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
138 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
139 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
140 i__ < EM_FLOW_HASH_SEGS; \
141 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
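/* A worked example of the probing scheme above, using the values defined
 * here (illustrative numbers only): with EM_FLOW_HASH_SHIFT 13 the cache has
 * 8192 entries and EM_FLOW_HASH_MASK is 0x1fff.  A packet hash of 0x12345
 * can only live in entry 0x345 (0x12345 & 0x1fff) or entry 0x9
 * (0x12345 >> 13), so a lookup or an insertion probes exactly those
 * EM_FLOW_HASH_SEGS slots and nothing else. */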
0de8783a
JR
142\f
143/* Simple non-wildcarding single-priority classifier. */
144
145struct dpcls {
146 struct cmap subtables_map;
147 struct pvector subtables;
148};
9bbf1c3d 149
0de8783a
JR
150/* A rule to be inserted to the classifier. */
151struct dpcls_rule {
152 struct cmap_node cmap_node; /* Within struct dpcls_subtable 'rules'. */
153 struct netdev_flow_key *mask; /* Subtable's mask. */
154 struct netdev_flow_key flow; /* Matching key. */
155 /* 'flow' must be the last field, additional space is allocated here. */
156};
157
158static void dpcls_init(struct dpcls *);
159static void dpcls_destroy(struct dpcls *);
160static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
161 const struct netdev_flow_key *mask);
162static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
163static bool dpcls_lookup(const struct dpcls *cls,
164 const struct netdev_flow_key keys[],
165 struct dpcls_rule **rules, size_t cnt);
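/* A minimal usage sketch of this classifier, assuming a flow 'netdev_flow'
 * and a subtable mask 'mask' have already been prepared (the variable names
 * are illustrative, not part of this file):
 *
 *     struct dpcls cls;
 *     struct netdev_flow_key keys[2];
 *     struct dpcls_rule *rules[2];
 *
 *     dpcls_init(&cls);
 *     dpcls_insert(&cls, &netdev_flow->cr, &mask);
 *     ... fill 'keys' from received packets ...
 *     dpcls_lookup(&cls, keys, rules, 2);
 *     ... on return, rules[i] is the rule matching keys[i], if any ...
 *     dpcls_remove(&cls, &netdev_flow->cr);
 *     dpcls_destroy(&cls);
 */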
166\f
8a4e3a85
BP
167/* Datapath based on the network device interface from netdev.h.
168 *
169 *
170 * Thread-safety
171 * =============
172 *
173 * Some members, marked 'const', are immutable. Accessing other members
174 * requires synchronization, as noted in more detail below.
175 *
176 * Acquisition order is, from outermost to innermost:
177 *
178 * dp_netdev_mutex (global)
59e6d833 179 * port_mutex
8a4e3a85 180 */
72865317 181struct dp_netdev {
8a4e3a85
BP
182 const struct dpif_class *const class;
183 const char *const name;
6b31e073 184 struct dpif *dpif;
6a8267c5
BP
185 struct ovs_refcount ref_cnt;
186 atomic_flag destroyed;
72865317 187
8a4e3a85
BP
188 /* Ports.
189 *
59e6d833
BP
190 * Protected by RCU. Take the mutex to add or remove ports. */
191 struct ovs_mutex port_mutex;
192 struct cmap ports;
d33ed218 193 struct seq *port_seq; /* Incremented whenever a port changes. */
6c3eee82 194
6b31e073
RW
195 /* Protects access to ofproto-dpif-upcall interface during revalidator
196 * thread synchronization. */
197 struct fat_rwlock upcall_rwlock;
623540e4
EJ
198 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
199 void *upcall_aux;
6b31e073 200
65f13b50
AW
201 /* Stores all 'struct dp_netdev_pmd_thread's. */
202 struct cmap poll_threads;
203
204 /* Protects the access of the 'struct dp_netdev_pmd_thread'
205 * instance for non-pmd thread. */
206 struct ovs_mutex non_pmd_mutex;
207
208 /* Each pmd thread will store its pointer to
209 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
210 ovsthread_key_t per_pmd_key;
f2eee189
AW
211
212 /* Number of rx queues for each dpdk interface and the cpu mask
213 * for pin of pmd threads. */
214 size_t n_dpdk_rxqs;
215 char *pmd_cmask;
a36de779 216 uint64_t last_tnl_conf_seq;
72865317
BP
217};
218
8a4e3a85 219static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
59e6d833 220 odp_port_t);
ff073a71 221
51852a57 222enum dp_stat_type {
abcf3ef4
DDP
223 DP_STAT_EXACT_HIT, /* Packets that had an exact match (emc). */
224 DP_STAT_MASKED_HIT, /* Packets that matched in the flow table. */
51852a57
BP
225 DP_STAT_MISS, /* Packets that did not match. */
226 DP_STAT_LOST, /* Packets not passed up to the client. */
227 DP_N_STATS
228};
229
55e3ca97
DDP
230enum pmd_cycles_counter_type {
231 PMD_CYCLES_POLLING, /* Cycles spent polling NICs. */
232 PMD_CYCLES_PROCESSING, /* Cycles spent processing packets */
233 PMD_N_CYCLES
234};
235
72865317
BP
236/* A port in a netdev-based datapath. */
237struct dp_netdev_port {
efa2bcbb 238 struct pkt_metadata md;
72865317 239 struct netdev *netdev;
efa2bcbb 240 struct cmap_node node; /* Node in dp_netdev's 'ports'. */
4b609110 241 struct netdev_saved_flags *sf;
55c955bd 242 struct netdev_rxq **rxq;
b284085e 243 struct ovs_refcount ref_cnt;
0cbfe35d 244 char *type; /* Port type as requested by user. */
72865317
BP
245};
246
1c1e46ed
AW
247/* Contained by struct dp_netdev_flow's 'stats' member. */
248struct dp_netdev_flow_stats {
eb94da30
DDP
249 atomic_llong used; /* Last used time, in monotonic msecs. */
250 atomic_ullong packet_count; /* Number of packets matched. */
251 atomic_ullong byte_count; /* Number of bytes matched. */
252 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
1c1e46ed
AW
253};
254
255/* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
8a4e3a85
BP
256 *
257 *
258 * Thread-safety
259 * =============
260 *
261 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
1c1e46ed 262 * its pmd thread's classifier. The text below calls this classifier 'cls'.
8a4e3a85
BP
263 *
264 * Motivation
265 * ----------
266 *
267 * The thread safety rules described here for "struct dp_netdev_flow" are
268 * motivated by two goals:
269 *
270 * - Prevent threads that read members of "struct dp_netdev_flow" from
271 * reading bad data due to changes by some thread concurrently modifying
272 * those members.
273 *
274 * - Prevent two threads making changes to members of a given "struct
275 * dp_netdev_flow" from interfering with each other.
276 *
277 *
278 * Rules
279 * -----
280 *
ed79f89a
DDP
281 * A flow 'flow' may be accessed without a risk of being freed during an RCU
282 * grace period. Code that needs to hold onto a flow for a while
283 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
8a4e3a85
BP
284 *
285 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
ed79f89a
DDP
286 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
287 * from modification.
8a4e3a85
BP
288 *
289 * Some members, marked 'const', are immutable. Accessing other members
290 * requires synchronization, as noted in more detail below.
291 */
72865317 292struct dp_netdev_flow {
11e5cf1f 293 const struct flow flow; /* Unmasked flow that created this entry. */
8a4e3a85 294 /* Hash table index by unmasked flow. */
1c1e46ed
AW
295 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
296 /* 'flow_table'. */
70e5ed6f 297 const ovs_u128 ufid; /* Unique flow identifier. */
bd5131ba 298 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
1c1e46ed 299 /* flow. */
72865317 300
ed79f89a
DDP
301 /* Number of references.
302 * The classifier owns one reference.
303 * Any thread trying to keep a rule from being freed should hold its own
304 * reference. */
305 struct ovs_refcount ref_cnt;
306
11e5cf1f
DDP
307 bool dead;
308
1c1e46ed
AW
309 /* Statistics. */
310 struct dp_netdev_flow_stats stats;
8a4e3a85 311
45c626a3 312 /* Actions. */
61e7deb1 313 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
0de8783a 314
11e5cf1f
DDP
315 /* While processing a group of input packets, the datapath uses the next
316 * member to store a pointer to the output batch for the flow. It is
317 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
318 * packet_batch_init() and packet_batch_execute()). */
319 struct packet_batch *batch;
320
0de8783a
JR
321 /* Packet classification. */
322 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
323 /* 'cr' must be the last member. */
72865317
BP
324};
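/* Illustrative sketch of the reference rules described above (not part of
 * the original code): a thread that wants to keep a looked-up flow past the
 * current RCU grace period takes its own reference and drops it when done.
 *
 *     struct dp_netdev_flow *flow = dp_netdev_pmd_lookup_flow(pmd, key);
 *     if (flow && dp_netdev_flow_ref(flow)) {
 *         ... 'flow' stays valid even after the RCU grace period ends ...
 *         dp_netdev_flow_unref(flow);
 *     }
 */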
325
ed79f89a 326static void dp_netdev_flow_unref(struct dp_netdev_flow *);
9bbf1c3d 327static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
70e5ed6f
JS
328static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
329 struct flow *);
8a4e3a85 330
a84cb64a
BP
331/* A set of datapath actions within a "struct dp_netdev_flow".
332 *
333 *
334 * Thread-safety
335 * =============
336 *
45c626a3 337 * A struct dp_netdev_actions 'actions' is protected with RCU. */
a84cb64a 338struct dp_netdev_actions {
a84cb64a
BP
339 /* These members are immutable: they do not change during the struct's
340 * lifetime. */
a84cb64a 341 unsigned int size; /* Size of 'actions', in bytes. */
9ff55ae2 342 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
a84cb64a
BP
343};
344
345struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
346 size_t);
61e7deb1
BP
347struct dp_netdev_actions *dp_netdev_flow_get_actions(
348 const struct dp_netdev_flow *);
349static void dp_netdev_actions_free(struct dp_netdev_actions *);
a84cb64a 350
1c1e46ed
AW
351/* Contained by struct dp_netdev_pmd_thread's 'stats' member. */
352struct dp_netdev_pmd_stats {
353 /* Indexed by DP_STAT_*. */
eb94da30 354 atomic_ullong n[DP_N_STATS];
1c1e46ed
AW
355};
356
55e3ca97
DDP
357/* Contained by struct dp_netdev_pmd_thread's 'cycle' member. */
358struct dp_netdev_pmd_cycles {
359 /* Indexed by PMD_CYCLES_*. */
360 atomic_ullong n[PMD_N_CYCLES];
361};
362
e4cfed38
PS
363/* PMD: Poll mode drivers. A PMD accesses devices via polling to eliminate
364 * the performance overhead of interrupt processing. Therefore netdev
365 * cannot implement rx-wait for these devices. dpif-netdev needs to poll
366 * these devices to check their receive buffers. A pmd thread polls the
1c1e46ed 367 * devices assigned to itself.
e4cfed38
PS
368 *
369 * DPDK uses PMDs to access NICs.
370 *
65f13b50
AW
371 * Note: the instance with cpu core id NON_PMD_CORE_ID is reserved for
372 * I/O of all non-pmd threads. No actual thread is created
373 * for that instance.
1c1e46ed
AW
374 *
375 * Each struct has its own flow table and classifier. Packets received
376 * from managed ports are looked up in the corresponding pmd thread's
377 * flow table, and are executed with the found actions.
378 */
65f13b50 379struct dp_netdev_pmd_thread {
6c3eee82 380 struct dp_netdev *dp;
1c1e46ed 381 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
65f13b50 382 struct cmap_node node; /* In 'dp->poll_threads'. */
accf8626
AW
383
384 pthread_cond_t cond; /* For synchronizing pmd thread reload. */
385 struct ovs_mutex cond_mutex; /* Mutex for condition variable. */
386
65f13b50
AW
387 /* Per thread exact-match cache. Note, the instance for cpu core
388 * NON_PMD_CORE_ID can be accessed by multiple threads, and thus
389 * needs to be protected (e.g. by 'dp_netdev_mutex'). All other
390 * instances will only be accessed by their own pmd thread. */
9bbf1c3d 391 struct emc_cache flow_cache;
1c1e46ed
AW
392
393 /* Classifier and Flow-Table.
394 *
395 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
396 * changes to 'cls' must be made while still holding the 'flow_mutex'.
397 */
398 struct ovs_mutex flow_mutex;
399 struct dpcls cls;
400 struct cmap flow_table OVS_GUARDED; /* Flow table. */
401
402 /* Statistics. */
403 struct dp_netdev_pmd_stats stats;
404
55e3ca97
DDP
405 /* Cycles counters */
406 struct dp_netdev_pmd_cycles cycles;
407
408 /* Used to count cycles. See 'cycles_counter_end()'. */
409 unsigned long long last_cycles;
410
65f13b50
AW
411 struct latch exit_latch; /* For terminating the pmd thread. */
412 atomic_uint change_seq; /* For reloading pmd ports. */
6c3eee82 413 pthread_t thread;
65f13b50
AW
414 int index; /* Idx of this pmd thread among pmd*/
415 /* threads on same numa node. */
bd5131ba 416 unsigned core_id; /* CPU core id of this pmd thread. */
65f13b50 417 int numa_id; /* numa node id of this pmd thread. */
6553d06b
DDP
418
419 /* Only a pmd thread can write on its own 'cycles' and 'stats'.
420 * The main thread keeps 'stats_zero' and 'cycles_zero' as base
421 * values and subtracts them from 'stats' and 'cycles' before
422 * reporting to the user */
423 unsigned long long stats_zero[DP_N_STATS];
424 uint64_t cycles_zero[PMD_N_CYCLES];
6c3eee82
BP
425};
426
84067a4c
JR
427#define PMD_INITIAL_SEQ 1
428
72865317
BP
429/* Interface to netdev-based datapath. */
430struct dpif_netdev {
431 struct dpif dpif;
432 struct dp_netdev *dp;
d33ed218 433 uint64_t last_port_seq;
72865317
BP
434};
435
8a4e3a85 436static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
59e6d833 437 struct dp_netdev_port **portp);
8a4e3a85 438static int get_port_by_name(struct dp_netdev *dp, const char *devname,
59e6d833 439 struct dp_netdev_port **portp);
8a4e3a85
BP
440static void dp_netdev_free(struct dp_netdev *)
441 OVS_REQUIRES(dp_netdev_mutex);
8a4e3a85
BP
442static int do_add_port(struct dp_netdev *dp, const char *devname,
443 const char *type, odp_port_t port_no)
59e6d833 444 OVS_REQUIRES(dp->port_mutex);
c40b890f 445static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
59e6d833 446 OVS_REQUIRES(dp->port_mutex);
614c4892
BP
447static int dpif_netdev_open(const struct dpif_class *, const char *name,
448 bool create, struct dpif **);
65f13b50 449static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
e14deea0 450 struct dp_packet **, int c,
41ccaa24 451 bool may_steal,
4edb9ae9 452 const struct nlattr *actions,
e4cfed38 453 size_t actions_len);
65f13b50 454static void dp_netdev_input(struct dp_netdev_pmd_thread *,
e14deea0 455 struct dp_packet **, int cnt);
41ccaa24 456
6b31e073 457static void dp_netdev_disable_upcall(struct dp_netdev *);
accf8626 458void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
65f13b50
AW
459static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
460 struct dp_netdev *dp, int index,
bd5131ba 461 unsigned core_id, int numa_id);
1c1e46ed 462static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
f2eee189 463static void dp_netdev_set_nonpmd(struct dp_netdev *dp);
b19befae 464static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
bd5131ba 465 unsigned core_id);
1c1e46ed
AW
466static struct dp_netdev_pmd_thread *
467dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
65f13b50
AW
468static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp);
469static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id);
470static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id);
f2eee189 471static void dp_netdev_reset_pmd_threads(struct dp_netdev *dp);
1c1e46ed
AW
472static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
473static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
474static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
72865317 475
67ad54cb 476static inline bool emc_entry_alive(struct emc_entry *ce);
9bbf1c3d
DDP
477static void emc_clear_entry(struct emc_entry *ce);
478
479static void
480emc_cache_init(struct emc_cache *flow_cache)
481{
482 int i;
483
caeb4906
JR
484 BUILD_ASSERT(offsetof(struct miniflow, inline_values) == sizeof(uint64_t));
485
67ad54cb 486 flow_cache->sweep_idx = 0;
9bbf1c3d
DDP
487 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
488 flow_cache->entries[i].flow = NULL;
0de8783a 489 flow_cache->entries[i].key.hash = 0;
caeb4906
JR
490 flow_cache->entries[i].key.len
491 = offsetof(struct miniflow, inline_values);
0de8783a
JR
492 miniflow_initialize(&flow_cache->entries[i].key.mf,
493 flow_cache->entries[i].key.buf);
9bbf1c3d
DDP
494 }
495}
496
497static void
498emc_cache_uninit(struct emc_cache *flow_cache)
499{
500 int i;
501
502 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
503 emc_clear_entry(&flow_cache->entries[i]);
504 }
505}
506
67ad54cb
AW
507/* Check and clear dead flow references slowly (one entry at each
508 * invocation). */
509static void
510emc_cache_slow_sweep(struct emc_cache *flow_cache)
511{
512 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
513
514 if (!emc_entry_alive(entry)) {
515 emc_clear_entry(entry);
516 }
517 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
518}
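/* Since one invocation examines a single slot, a full pass over the
 * EM_FLOW_HASH_ENTRIES (8192) entries takes 8192 calls; the masking with
 * EM_FLOW_HASH_MASK above simply wraps 'sweep_idx' back to slot 0. */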
519
72865317
BP
520static struct dpif_netdev *
521dpif_netdev_cast(const struct dpif *dpif)
522{
cb22974d 523 ovs_assert(dpif->dpif_class->open == dpif_netdev_open);
72865317
BP
524 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
525}
526
527static struct dp_netdev *
528get_dp_netdev(const struct dpif *dpif)
529{
530 return dpif_netdev_cast(dpif)->dp;
531}
6553d06b
DDP
532\f
533enum pmd_info_type {
534 PMD_INFO_SHOW_STATS, /* show how cpu cycles are spent */
535 PMD_INFO_CLEAR_STATS /* set the cycles count to 0 */
536};
537
538static void
539pmd_info_show_stats(struct ds *reply,
540 struct dp_netdev_pmd_thread *pmd,
541 unsigned long long stats[DP_N_STATS],
542 uint64_t cycles[PMD_N_CYCLES])
543{
544 unsigned long long total_packets = 0;
545 uint64_t total_cycles = 0;
546 int i;
547
548 /* These loops subtract reference values ('*_zero') from the counters.
549 * Since loads and stores are relaxed, it might be possible for a '*_zero'
550 * value to be more recent than the current value we're reading from the
551 * counter. This is not a big problem, since these numbers are not
552 * supposed to be too accurate, but we should at least make sure that
553 * the result is not negative. */
554 for (i = 0; i < DP_N_STATS; i++) {
555 if (stats[i] > pmd->stats_zero[i]) {
556 stats[i] -= pmd->stats_zero[i];
557 } else {
558 stats[i] = 0;
559 }
560
561 if (i != DP_STAT_LOST) {
562 /* Lost packets are already included in DP_STAT_MISS */
563 total_packets += stats[i];
564 }
565 }
566
567 for (i = 0; i < PMD_N_CYCLES; i++) {
568 if (cycles[i] > pmd->cycles_zero[i]) {
569 cycles[i] -= pmd->cycles_zero[i];
570 } else {
571 cycles[i] = 0;
572 }
573
574 total_cycles += cycles[i];
575 }
576
577 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
578 ? "main thread" : "pmd thread");
579
580 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
581 ds_put_format(reply, " numa_id %d", pmd->numa_id);
582 }
583 if (pmd->core_id != OVS_CORE_UNSPEC) {
bd5131ba 584 ds_put_format(reply, " core_id %u", pmd->core_id);
6553d06b
DDP
585 }
586 ds_put_cstr(reply, ":\n");
587
588 ds_put_format(reply,
589 "\temc hits:%llu\n\tmegaflow hits:%llu\n"
590 "\tmiss:%llu\n\tlost:%llu\n",
591 stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT],
592 stats[DP_STAT_MISS], stats[DP_STAT_LOST]);
593
594 if (total_cycles == 0) {
595 return;
596 }
597
598 ds_put_format(reply,
599 "\tpolling cycles:%"PRIu64" (%.02f%%)\n"
600 "\tprocessing cycles:%"PRIu64" (%.02f%%)\n",
601 cycles[PMD_CYCLES_POLLING],
602 cycles[PMD_CYCLES_POLLING] / (double)total_cycles * 100,
603 cycles[PMD_CYCLES_PROCESSING],
604 cycles[PMD_CYCLES_PROCESSING] / (double)total_cycles * 100);
605
606 if (total_packets == 0) {
607 return;
608 }
609
610 ds_put_format(reply,
611 "\tavg cycles per packet: %.02f (%"PRIu64"/%llu)\n",
612 total_cycles / (double)total_packets,
613 total_cycles, total_packets);
614
615 ds_put_format(reply,
616 "\tavg processing cycles per packet: "
617 "%.02f (%"PRIu64"/%llu)\n",
618 cycles[PMD_CYCLES_PROCESSING] / (double)total_packets,
619 cycles[PMD_CYCLES_PROCESSING], total_packets);
620}
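/* With the format strings above, the reply for one thread looks like this
 * (the numbers are illustrative only):
 *
 *     pmd thread numa_id 0 core_id 2:
 *             emc hits:100
 *             megaflow hits:50
 *             miss:10
 *             lost:0
 *             polling cycles:200000 (66.67%)
 *             processing cycles:100000 (33.33%)
 *             avg cycles per packet: 1875.00 (300000/160)
 *             avg processing cycles per packet: 625.00 (100000/160)
 */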
621
622static void
623pmd_info_clear_stats(struct ds *reply OVS_UNUSED,
624 struct dp_netdev_pmd_thread *pmd,
625 unsigned long long stats[DP_N_STATS],
626 uint64_t cycles[PMD_N_CYCLES])
627{
628 int i;
629
630 /* We cannot write 'stats' and 'cycles' (because they're written by other
631 * threads) and we shouldn't change 'stats' (because they're used to count
632 * datapath stats, which must not be cleared here). Instead, we save the
633 * current values and subtract them from the values to be displayed in the
634 * future */
635 for (i = 0; i < DP_N_STATS; i++) {
636 pmd->stats_zero[i] = stats[i];
637 }
638 for (i = 0; i < PMD_N_CYCLES; i++) {
639 pmd->cycles_zero[i] = cycles[i];
640 }
641}
642
643static void
644dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
645 void *aux)
646{
647 struct ds reply = DS_EMPTY_INITIALIZER;
648 struct dp_netdev_pmd_thread *pmd;
649 struct dp_netdev *dp = NULL;
650 enum pmd_info_type type = *(enum pmd_info_type *) aux;
651
652 ovs_mutex_lock(&dp_netdev_mutex);
653
654 if (argc == 2) {
655 dp = shash_find_data(&dp_netdevs, argv[1]);
656 } else if (shash_count(&dp_netdevs) == 1) {
657 /* There's only one datapath */
658 dp = shash_first(&dp_netdevs)->data;
659 }
660
661 if (!dp) {
662 ovs_mutex_unlock(&dp_netdev_mutex);
663 unixctl_command_reply_error(conn,
664 "please specify an existing datapath");
665 return;
666 }
667
668 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
669 unsigned long long stats[DP_N_STATS];
670 uint64_t cycles[PMD_N_CYCLES];
671 int i;
672
673 /* Read current stats and cycle counters */
674 for (i = 0; i < ARRAY_SIZE(stats); i++) {
675 atomic_read_relaxed(&pmd->stats.n[i], &stats[i]);
676 }
677 for (i = 0; i < ARRAY_SIZE(cycles); i++) {
678 atomic_read_relaxed(&pmd->cycles.n[i], &cycles[i]);
679 }
680
681 if (type == PMD_INFO_CLEAR_STATS) {
682 pmd_info_clear_stats(&reply, pmd, stats, cycles);
683 } else if (type == PMD_INFO_SHOW_STATS) {
684 pmd_info_show_stats(&reply, pmd, stats, cycles);
685 }
686 }
687
688 ovs_mutex_unlock(&dp_netdev_mutex);
689
690 unixctl_command_reply(conn, ds_cstr(&reply));
691 ds_destroy(&reply);
692}
693\f
694static int
695dpif_netdev_init(void)
696{
697 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
698 clear_aux = PMD_INFO_CLEAR_STATS;
699
700 unixctl_command_register("dpif-netdev/pmd-stats-show", "[dp]",
701 0, 1, dpif_netdev_pmd_info,
702 (void *)&show_aux);
703 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[dp]",
704 0, 1, dpif_netdev_pmd_info,
705 (void *)&clear_aux);
706 return 0;
707}
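/* These unixctl commands are reached through ovs-appctl, e.g.:
 *
 *     ovs-appctl dpif-netdev/pmd-stats-show
 *     ovs-appctl dpif-netdev/pmd-stats-clear [dp]
 *
 * The datapath argument may be omitted when only one dpif-netdev datapath
 * exists, as handled in dpif_netdev_pmd_info() above. */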
72865317 708
2197d7ab 709static int
2240af25
DDP
710dpif_netdev_enumerate(struct sset *all_dps,
711 const struct dpif_class *dpif_class)
2197d7ab
GL
712{
713 struct shash_node *node;
714
97be1538 715 ovs_mutex_lock(&dp_netdev_mutex);
2197d7ab 716 SHASH_FOR_EACH(node, &dp_netdevs) {
2240af25
DDP
717 struct dp_netdev *dp = node->data;
718 if (dpif_class != dp->class) {
719 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
720 * If the class doesn't match, skip this dpif. */
721 continue;
722 }
2197d7ab
GL
723 sset_add(all_dps, node->name);
724 }
97be1538 725 ovs_mutex_unlock(&dp_netdev_mutex);
5279f8fd 726
2197d7ab
GL
727 return 0;
728}
729
add90f6f
EJ
730static bool
731dpif_netdev_class_is_dummy(const struct dpif_class *class)
732{
733 return class != &dpif_netdev_class;
734}
735
0aeaabc8
JP
736static const char *
737dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
738{
739 return strcmp(type, "internal") ? type
add90f6f 740 : dpif_netdev_class_is_dummy(class) ? "dummy"
0aeaabc8
JP
741 : "tap";
742}
743
72865317
BP
744static struct dpif *
745create_dpif_netdev(struct dp_netdev *dp)
746{
462278db 747 uint16_t netflow_id = hash_string(dp->name, 0);
72865317 748 struct dpif_netdev *dpif;
72865317 749
6a8267c5 750 ovs_refcount_ref(&dp->ref_cnt);
72865317 751
72865317 752 dpif = xmalloc(sizeof *dpif);
614c4892 753 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
72865317 754 dpif->dp = dp;
d33ed218 755 dpif->last_port_seq = seq_read(dp->port_seq);
72865317
BP
756
757 return &dpif->dpif;
758}
759
4e022ec0
AW
760/* Choose an unused, non-zero port number and return it on success.
761 * Return ODPP_NONE on failure. */
762static odp_port_t
e44768b7 763choose_port(struct dp_netdev *dp, const char *name)
59e6d833 764 OVS_REQUIRES(dp->port_mutex)
e44768b7 765{
4e022ec0 766 uint32_t port_no;
e44768b7
JP
767
768 if (dp->class != &dpif_netdev_class) {
769 const char *p;
770 int start_no = 0;
771
772 /* If the port name begins with "br", start the number search at
773 * 100 to make writing tests easier. */
774 if (!strncmp(name, "br", 2)) {
775 start_no = 100;
776 }
777
778 /* If the port name contains a number, try to assign that port number.
779 * This can make writing unit tests easier because port numbers are
780 * predictable. */
781 for (p = name; *p != '\0'; p++) {
782 if (isdigit((unsigned char) *p)) {
783 port_no = start_no + strtol(p, NULL, 10);
ff073a71
BP
784 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
785 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
4e022ec0 786 return u32_to_odp(port_no);
e44768b7
JP
787 }
788 break;
789 }
790 }
791 }
792
ff073a71
BP
793 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
794 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
4e022ec0 795 return u32_to_odp(port_no);
e44768b7
JP
796 }
797 }
798
4e022ec0 799 return ODPP_NONE;
e44768b7
JP
800}
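/* For the dummy datapaths used in tests, the rules above make port numbers
 * predictable: "br10" is assigned port 110 (100 + 10) and "p7" is assigned
 * port 7, provided the number is free; any other name falls through to the
 * lowest unused port number. */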
801
72865317 802static int
614c4892
BP
803create_dp_netdev(const char *name, const struct dpif_class *class,
804 struct dp_netdev **dpp)
8a4e3a85 805 OVS_REQUIRES(dp_netdev_mutex)
72865317
BP
806{
807 struct dp_netdev *dp;
808 int error;
72865317 809
462278db 810 dp = xzalloc(sizeof *dp);
8a4e3a85
BP
811 shash_add(&dp_netdevs, name, dp);
812
813 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
814 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
6a8267c5 815 ovs_refcount_init(&dp->ref_cnt);
1a65ba85 816 atomic_flag_clear(&dp->destroyed);
8a4e3a85 817
59e6d833
BP
818 ovs_mutex_init(&dp->port_mutex);
819 cmap_init(&dp->ports);
d33ed218 820 dp->port_seq = seq_create();
6b31e073
RW
821 fat_rwlock_init(&dp->upcall_rwlock);
822
823 /* Disable upcalls by default. */
824 dp_netdev_disable_upcall(dp);
623540e4 825 dp->upcall_aux = NULL;
6b31e073 826 dp->upcall_cb = NULL;
e44768b7 827
65f13b50
AW
828 cmap_init(&dp->poll_threads);
829 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
830 ovsthread_key_create(&dp->per_pmd_key, NULL);
831
832 /* Reserves the core NON_PMD_CORE_ID for all non-pmd threads. */
833 ovs_numa_try_pin_core_specific(NON_PMD_CORE_ID);
f2eee189
AW
834 dp_netdev_set_nonpmd(dp);
835 dp->n_dpdk_rxqs = NR_QUEUE;
65f13b50 836
59e6d833 837 ovs_mutex_lock(&dp->port_mutex);
4e022ec0 838 error = do_add_port(dp, name, "internal", ODPP_LOCAL);
59e6d833 839 ovs_mutex_unlock(&dp->port_mutex);
72865317
BP
840 if (error) {
841 dp_netdev_free(dp);
462278db 842 return error;
72865317
BP
843 }
844
a36de779 845 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
462278db 846 *dpp = dp;
72865317
BP
847 return 0;
848}
849
850static int
614c4892 851dpif_netdev_open(const struct dpif_class *class, const char *name,
4a387741 852 bool create, struct dpif **dpifp)
72865317 853{
462278db 854 struct dp_netdev *dp;
5279f8fd 855 int error;
462278db 856
97be1538 857 ovs_mutex_lock(&dp_netdev_mutex);
462278db
BP
858 dp = shash_find_data(&dp_netdevs, name);
859 if (!dp) {
5279f8fd 860 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
72865317 861 } else {
5279f8fd
BP
862 error = (dp->class != class ? EINVAL
863 : create ? EEXIST
864 : 0);
865 }
866 if (!error) {
867 *dpifp = create_dpif_netdev(dp);
6b31e073 868 dp->dpif = *dpifp;
72865317 869 }
97be1538 870 ovs_mutex_unlock(&dp_netdev_mutex);
462278db 871
5279f8fd 872 return error;
72865317
BP
873}
874
88ace79b
DDP
875static void
876dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
877 OVS_NO_THREAD_SAFETY_ANALYSIS
878{
879 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
880 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
881
882 /* Before freeing a lock we should release it */
883 fat_rwlock_unlock(&dp->upcall_rwlock);
884 fat_rwlock_destroy(&dp->upcall_rwlock);
885}
886
8a4e3a85
BP
887/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
888 * through the 'dp_netdevs' shash while freeing 'dp'. */
1ba530f4
BP
889static void
890dp_netdev_free(struct dp_netdev *dp)
8a4e3a85 891 OVS_REQUIRES(dp_netdev_mutex)
1ba530f4 892{
59e6d833 893 struct dp_netdev_port *port;
4ad28026 894
8a4e3a85
BP
895 shash_find_and_delete(&dp_netdevs, dp->name);
896
65f13b50 897 dp_netdev_destroy_all_pmds(dp);
8bd89cdc 898 cmap_destroy(&dp->poll_threads);
65f13b50
AW
899 ovs_mutex_destroy(&dp->non_pmd_mutex);
900 ovsthread_key_delete(dp->per_pmd_key);
6c3eee82 901
59e6d833 902 ovs_mutex_lock(&dp->port_mutex);
a532e683 903 CMAP_FOR_EACH (port, node, &dp->ports) {
c40b890f 904 do_del_port(dp, port);
1ba530f4 905 }
59e6d833 906 ovs_mutex_unlock(&dp->port_mutex);
51852a57 907
d33ed218 908 seq_destroy(dp->port_seq);
59e6d833 909 cmap_destroy(&dp->ports);
88ace79b
DDP
910
911 /* Upcalls must be disabled at this point */
912 dp_netdev_destroy_upcall_lock(dp);
9bbf1c3d 913
f2eee189 914 free(dp->pmd_cmask);
8a4e3a85 915 free(CONST_CAST(char *, dp->name));
72865317
BP
916 free(dp);
917}
918
8a4e3a85
BP
919static void
920dp_netdev_unref(struct dp_netdev *dp)
921{
922 if (dp) {
923 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
924 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
925 ovs_mutex_lock(&dp_netdev_mutex);
24f83812 926 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
8a4e3a85
BP
927 dp_netdev_free(dp);
928 }
929 ovs_mutex_unlock(&dp_netdev_mutex);
930 }
931}
932
72865317
BP
933static void
934dpif_netdev_close(struct dpif *dpif)
935{
936 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd 937
8a4e3a85 938 dp_netdev_unref(dp);
72865317
BP
939 free(dpif);
940}
941
942static int
7dab847a 943dpif_netdev_destroy(struct dpif *dpif)
72865317
BP
944{
945 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd 946
6a8267c5 947 if (!atomic_flag_test_and_set(&dp->destroyed)) {
24f83812 948 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
6a8267c5
BP
949 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
950 OVS_NOT_REACHED();
951 }
952 }
5279f8fd 953
72865317
BP
954 return 0;
955}
956
eb94da30
DDP
957/* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
958 * load/store semantics. While the increment is not atomic, the load and
959 * store operations are, making it impossible to read inconsistent values.
960 *
961 * This is used to update thread local stats counters. */
962static void
963non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
964{
965 unsigned long long tmp;
966
967 atomic_read_relaxed(var, &tmp);
968 tmp += n;
969 atomic_store_relaxed(var, tmp);
970}
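/* Typical (illustrative) use for the per-thread counters defined above:
 *
 *     non_atomic_ullong_add(&pmd->stats.n[DP_STAT_MISS], 1);
 *
 * Only the owning pmd thread updates its counters, so the non-atomic
 * read-modify-write is safe, while other threads still see a consistent
 * (if slightly stale) value through the relaxed atomic load. */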
971
72865317 972static int
a8d9304d 973dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
72865317
BP
974{
975 struct dp_netdev *dp = get_dp_netdev(dpif);
1c1e46ed 976 struct dp_netdev_pmd_thread *pmd;
8a4e3a85 977
1c1e46ed
AW
978 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
979 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
eb94da30 980 unsigned long long n;
1c1e46ed 981 stats->n_flows += cmap_count(&pmd->flow_table);
eb94da30 982
abcf3ef4
DDP
983 atomic_read_relaxed(&pmd->stats.n[DP_STAT_MASKED_HIT], &n);
984 stats->n_hit += n;
985 atomic_read_relaxed(&pmd->stats.n[DP_STAT_EXACT_HIT], &n);
eb94da30
DDP
986 stats->n_hit += n;
987 atomic_read_relaxed(&pmd->stats.n[DP_STAT_MISS], &n);
988 stats->n_missed += n;
989 atomic_read_relaxed(&pmd->stats.n[DP_STAT_LOST], &n);
990 stats->n_lost += n;
51852a57 991 }
1ce3fa06 992 stats->n_masks = UINT32_MAX;
847108dc 993 stats->n_mask_hit = UINT64_MAX;
5279f8fd 994
72865317
BP
995 return 0;
996}
997
e4cfed38 998static void
65f13b50 999dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
e4cfed38 1000{
65f13b50
AW
1001 int old_seq;
1002
accf8626
AW
1003 if (pmd->core_id == NON_PMD_CORE_ID) {
1004 return;
1005 }
1006
1007 ovs_mutex_lock(&pmd->cond_mutex);
65f13b50 1008 atomic_add_relaxed(&pmd->change_seq, 1, &old_seq);
accf8626
AW
1009 ovs_mutex_cond_wait(&pmd->cond, &pmd->cond_mutex);
1010 ovs_mutex_unlock(&pmd->cond_mutex);
65f13b50 1011}
e4cfed38 1012
65f13b50
AW
1013/* Causes all pmd threads to reload their tx/rx devices.
1014 * Must be called after adding/removing ports. */
1015static void
1016dp_netdev_reload_pmds(struct dp_netdev *dp)
1017{
1018 struct dp_netdev_pmd_thread *pmd;
e4cfed38 1019
65f13b50
AW
1020 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1021 dp_netdev_reload_pmd__(pmd);
84067a4c 1022 }
e4cfed38
PS
1023}
1024
59e6d833
BP
1025static uint32_t
1026hash_port_no(odp_port_t port_no)
1027{
1028 return hash_int(odp_to_u32(port_no), 0);
1029}
1030
72865317 1031static int
c3827f61 1032do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
4e022ec0 1033 odp_port_t port_no)
59e6d833 1034 OVS_REQUIRES(dp->port_mutex)
72865317 1035{
4b609110 1036 struct netdev_saved_flags *sf;
72865317
BP
1037 struct dp_netdev_port *port;
1038 struct netdev *netdev;
2499a8ce 1039 enum netdev_flags flags;
0cbfe35d 1040 const char *open_type;
72865317 1041 int error;
55c955bd 1042 int i;
72865317 1043
17050610
BP
1044 /* Reject devices already in 'dp'. */
1045 if (!get_port_by_name(dp, devname, &port)) {
1046 return EEXIST;
1047 }
72865317
BP
1048
1049 /* Open and validate network device. */
0aeaabc8 1050 open_type = dpif_netdev_port_open_type(dp->class, type);
0cbfe35d 1051 error = netdev_open(devname, open_type, &netdev);
72865317
BP
1052 if (error) {
1053 return error;
1054 }
72865317
BP
1055 /* XXX reject non-Ethernet devices */
1056
2499a8ce
AC
1057 netdev_get_flags(netdev, &flags);
1058 if (flags & NETDEV_LOOPBACK) {
1059 VLOG_ERR("%s: cannot add a loopback device", devname);
1060 netdev_close(netdev);
1061 return EINVAL;
1062 }
1063
5a034064
AW
1064 if (netdev_is_pmd(netdev)) {
1065 int n_cores = ovs_numa_get_n_cores();
1066
1067 if (n_cores == OVS_CORE_UNSPEC) {
1068 VLOG_ERR("%s, cannot get cpu core info", devname);
1069 return ENOENT;
1070 }
1071 /* There can only be ovs_numa_get_n_cores() pmd threads,
f2eee189
AW
1072 * so creates a txq for each. */
1073 error = netdev_set_multiq(netdev, n_cores, dp->n_dpdk_rxqs);
7251515e 1074 if (error && (error != EOPNOTSUPP)) {
5a034064
AW
1075 VLOG_ERR("%s, cannot set multiq", devname);
1076 return errno;
1077 }
1078 }
e4cfed38 1079 port = xzalloc(sizeof *port);
efa2bcbb 1080 port->md = PKT_METADATA_INITIALIZER(port_no);
e4cfed38 1081 port->netdev = netdev;
55c955bd 1082 port->rxq = xmalloc(sizeof *port->rxq * netdev_n_rxq(netdev));
e4cfed38 1083 port->type = xstrdup(type);
55c955bd
PS
1084 for (i = 0; i < netdev_n_rxq(netdev); i++) {
1085 error = netdev_rxq_open(netdev, &port->rxq[i], i);
1086 if (error
1087 && !(error == EOPNOTSUPP && dpif_netdev_class_is_dummy(dp->class))) {
1088 VLOG_ERR("%s: cannot receive packets on this network device (%s)",
1089 devname, ovs_strerror(errno));
1090 netdev_close(netdev);
16bea12c
TG
1091 free(port->type);
1092 free(port->rxq);
1093 free(port);
55c955bd
PS
1094 return error;
1095 }
7b6b0ef4
BP
1096 }
1097
4b609110 1098 error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
72865317 1099 if (error) {
55c955bd
PS
1100 for (i = 0; i < netdev_n_rxq(netdev); i++) {
1101 netdev_rxq_close(port->rxq[i]);
1102 }
72865317 1103 netdev_close(netdev);
16bea12c 1104 free(port->type);
f7791740 1105 free(port->rxq);
e4cfed38 1106 free(port);
72865317
BP
1107 return error;
1108 }
4b609110 1109 port->sf = sf;
e4cfed38 1110
f7d63652
AW
1111 ovs_refcount_init(&port->ref_cnt);
1112 cmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1113
e4cfed38 1114 if (netdev_is_pmd(netdev)) {
65f13b50
AW
1115 dp_netdev_set_pmds_on_numa(dp, netdev_get_numa_id(netdev));
1116 dp_netdev_reload_pmds(dp);
e4cfed38 1117 }
d33ed218 1118 seq_change(dp->port_seq);
72865317
BP
1119
1120 return 0;
1121}
1122
247527db
BP
1123static int
1124dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
4e022ec0 1125 odp_port_t *port_nop)
247527db
BP
1126{
1127 struct dp_netdev *dp = get_dp_netdev(dpif);
3aa30359
BP
1128 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1129 const char *dpif_port;
4e022ec0 1130 odp_port_t port_no;
5279f8fd 1131 int error;
247527db 1132
59e6d833 1133 ovs_mutex_lock(&dp->port_mutex);
3aa30359 1134 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
4e022ec0 1135 if (*port_nop != ODPP_NONE) {
ff073a71
BP
1136 port_no = *port_nop;
1137 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
232dfa4a 1138 } else {
3aa30359 1139 port_no = choose_port(dp, dpif_port);
5279f8fd 1140 error = port_no == ODPP_NONE ? EFBIG : 0;
232dfa4a 1141 }
5279f8fd 1142 if (!error) {
247527db 1143 *port_nop = port_no;
5279f8fd 1144 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
247527db 1145 }
59e6d833 1146 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd
BP
1147
1148 return error;
72865317
BP
1149}
1150
1151static int
4e022ec0 1152dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
72865317
BP
1153{
1154 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd
BP
1155 int error;
1156
59e6d833 1157 ovs_mutex_lock(&dp->port_mutex);
c40b890f
BP
1158 if (port_no == ODPP_LOCAL) {
1159 error = EINVAL;
1160 } else {
1161 struct dp_netdev_port *port;
1162
1163 error = get_port_by_number(dp, port_no, &port);
1164 if (!error) {
1165 do_del_port(dp, port);
1166 }
1167 }
59e6d833 1168 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd
BP
1169
1170 return error;
72865317
BP
1171}
1172
1173static bool
4e022ec0 1174is_valid_port_number(odp_port_t port_no)
72865317 1175{
ff073a71
BP
1176 return port_no != ODPP_NONE;
1177}
1178
1179static struct dp_netdev_port *
1180dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
1181{
1182 struct dp_netdev_port *port;
1183
59e6d833 1184 CMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
efa2bcbb 1185 if (port->md.in_port.odp_port == port_no) {
ff073a71
BP
1186 return port;
1187 }
1188 }
1189 return NULL;
72865317
BP
1190}
1191
1192static int
1193get_port_by_number(struct dp_netdev *dp,
4e022ec0 1194 odp_port_t port_no, struct dp_netdev_port **portp)
72865317
BP
1195{
1196 if (!is_valid_port_number(port_no)) {
1197 *portp = NULL;
1198 return EINVAL;
1199 } else {
ff073a71 1200 *portp = dp_netdev_lookup_port(dp, port_no);
72865317
BP
1201 return *portp ? 0 : ENOENT;
1202 }
1203}
1204
b284085e
PS
1205static void
1206port_ref(struct dp_netdev_port *port)
1207{
1208 if (port) {
1209 ovs_refcount_ref(&port->ref_cnt);
1210 }
1211}
1212
a1fdee13
AW
1213static bool
1214port_try_ref(struct dp_netdev_port *port)
1215{
1216 if (port) {
1217 return ovs_refcount_try_ref_rcu(&port->ref_cnt);
1218 }
1219
1220 return false;
1221}
1222
59e6d833
BP
1223static void
1224port_unref(struct dp_netdev_port *port)
1225{
24f83812 1226 if (port && ovs_refcount_unref_relaxed(&port->ref_cnt) == 1) {
accf8626
AW
1227 int n_rxq = netdev_n_rxq(port->netdev);
1228 int i;
1229
1230 netdev_close(port->netdev);
1231 netdev_restore_flags(port->sf);
1232
1233 for (i = 0; i < n_rxq; i++) {
1234 netdev_rxq_close(port->rxq[i]);
1235 }
1236 free(port->rxq);
1237 free(port->type);
1238 free(port);
b284085e
PS
1239 }
1240}
1241
72865317
BP
1242static int
1243get_port_by_name(struct dp_netdev *dp,
1244 const char *devname, struct dp_netdev_port **portp)
59e6d833 1245 OVS_REQUIRES(dp->port_mutex)
72865317
BP
1246{
1247 struct dp_netdev_port *port;
1248
a532e683 1249 CMAP_FOR_EACH (port, node, &dp->ports) {
3efb6063 1250 if (!strcmp(netdev_get_name(port->netdev), devname)) {
72865317
BP
1251 *portp = port;
1252 return 0;
1253 }
1254 }
1255 return ENOENT;
1256}
1257
65f13b50
AW
1258static int
1259get_n_pmd_threads_on_numa(struct dp_netdev *dp, int numa_id)
1260{
1261 struct dp_netdev_pmd_thread *pmd;
1262 int n_pmds = 0;
1263
1264 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1265 if (pmd->numa_id == numa_id) {
1266 n_pmds++;
1267 }
1268 }
1269
1270 return n_pmds;
1271}
1272
1273/* Returns 'true' if there is a port with pmd netdev and the netdev
1274 * is on numa node 'numa_id'. */
1275static bool
1276has_pmd_port_for_numa(struct dp_netdev *dp, int numa_id)
1277{
1278 struct dp_netdev_port *port;
1279
1280 CMAP_FOR_EACH (port, node, &dp->ports) {
1281 if (netdev_is_pmd(port->netdev)
1282 && netdev_get_numa_id(port->netdev) == numa_id) {
1283 return true;
1284 }
1285 }
1286
1287 return false;
1288}
1289
1290
c40b890f
BP
1291static void
1292do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
59e6d833 1293 OVS_REQUIRES(dp->port_mutex)
72865317 1294{
efa2bcbb
DDP
1295 cmap_remove(&dp->ports, &port->node,
1296 hash_odp_port(port->md.in_port.odp_port));
d33ed218 1297 seq_change(dp->port_seq);
e4cfed38 1298 if (netdev_is_pmd(port->netdev)) {
65f13b50
AW
1299 int numa_id = netdev_get_numa_id(port->netdev);
1300
1301 /* If there is no netdev on the numa node, delete the pmd threads
1302 * for that numa node. Otherwise, just reload the queues. */
1303 if (!has_pmd_port_for_numa(dp, numa_id)) {
1304 dp_netdev_del_pmds_on_numa(dp, numa_id);
1305 }
1306 dp_netdev_reload_pmds(dp);
e4cfed38 1307 }
72865317 1308
b284085e 1309 port_unref(port);
72865317
BP
1310}
1311
1312static void
4c738a8d
BP
1313answer_port_query(const struct dp_netdev_port *port,
1314 struct dpif_port *dpif_port)
72865317 1315{
3efb6063 1316 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
0cbfe35d 1317 dpif_port->type = xstrdup(port->type);
efa2bcbb 1318 dpif_port->port_no = port->md.in_port.odp_port;
72865317
BP
1319}
1320
1321static int
4e022ec0 1322dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
4c738a8d 1323 struct dpif_port *dpif_port)
72865317
BP
1324{
1325 struct dp_netdev *dp = get_dp_netdev(dpif);
1326 struct dp_netdev_port *port;
1327 int error;
1328
1329 error = get_port_by_number(dp, port_no, &port);
4afba28d 1330 if (!error && dpif_port) {
4c738a8d 1331 answer_port_query(port, dpif_port);
72865317 1332 }
5279f8fd 1333
72865317
BP
1334 return error;
1335}
1336
1337static int
1338dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
4c738a8d 1339 struct dpif_port *dpif_port)
72865317
BP
1340{
1341 struct dp_netdev *dp = get_dp_netdev(dpif);
1342 struct dp_netdev_port *port;
1343 int error;
1344
59e6d833 1345 ovs_mutex_lock(&dp->port_mutex);
72865317 1346 error = get_port_by_name(dp, devname, &port);
4afba28d 1347 if (!error && dpif_port) {
4c738a8d 1348 answer_port_query(port, dpif_port);
72865317 1349 }
59e6d833 1350 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 1351
72865317
BP
1352 return error;
1353}
1354
61e7deb1
BP
1355static void
1356dp_netdev_flow_free(struct dp_netdev_flow *flow)
1357{
61e7deb1 1358 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
61e7deb1
BP
1359 free(flow);
1360}
1361
ed79f89a
DDP
1362static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
1363{
1364 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
1365 ovsrcu_postpone(dp_netdev_flow_free, flow);
1366 }
1367}
1368
70e5ed6f
JS
1369static uint32_t
1370dp_netdev_flow_hash(const ovs_u128 *ufid)
1371{
1372 return ufid->u32[0];
1373}
1374
72865317 1375static void
1c1e46ed
AW
1376dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
1377 struct dp_netdev_flow *flow)
1378 OVS_REQUIRES(pmd->flow_mutex)
72865317 1379{
9f361d6b 1380 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2c0ea78f 1381
1c1e46ed
AW
1382 dpcls_remove(&pmd->cls, &flow->cr);
1383 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
9bbf1c3d 1384 flow->dead = true;
ed79f89a
DDP
1385
1386 dp_netdev_flow_unref(flow);
72865317
BP
1387}
1388
1389static void
1c1e46ed 1390dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
72865317 1391{
78c8df12 1392 struct dp_netdev_flow *netdev_flow;
72865317 1393
1c1e46ed
AW
1394 ovs_mutex_lock(&pmd->flow_mutex);
1395 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
1396 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
72865317 1397 }
1c1e46ed 1398 ovs_mutex_unlock(&pmd->flow_mutex);
72865317
BP
1399}
1400
1401static int
1402dpif_netdev_flow_flush(struct dpif *dpif)
1403{
1404 struct dp_netdev *dp = get_dp_netdev(dpif);
1c1e46ed
AW
1405 struct dp_netdev_pmd_thread *pmd;
1406
1407 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1408 dp_netdev_pmd_flow_flush(pmd);
1409 }
5279f8fd 1410
72865317
BP
1411 return 0;
1412}
1413
b0ec0f27 1414struct dp_netdev_port_state {
59e6d833 1415 struct cmap_position position;
4c738a8d 1416 char *name;
b0ec0f27
BP
1417};
1418
1419static int
1420dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
1421{
1422 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
1423 return 0;
1424}
1425
72865317 1426static int
b0ec0f27 1427dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
4c738a8d 1428 struct dpif_port *dpif_port)
72865317 1429{
b0ec0f27 1430 struct dp_netdev_port_state *state = state_;
72865317 1431 struct dp_netdev *dp = get_dp_netdev(dpif);
59e6d833 1432 struct cmap_node *node;
ff073a71 1433 int retval;
72865317 1434
59e6d833 1435 node = cmap_next_position(&dp->ports, &state->position);
ff073a71
BP
1436 if (node) {
1437 struct dp_netdev_port *port;
5279f8fd 1438
ff073a71
BP
1439 port = CONTAINER_OF(node, struct dp_netdev_port, node);
1440
1441 free(state->name);
1442 state->name = xstrdup(netdev_get_name(port->netdev));
1443 dpif_port->name = state->name;
1444 dpif_port->type = port->type;
efa2bcbb 1445 dpif_port->port_no = port->md.in_port.odp_port;
ff073a71
BP
1446
1447 retval = 0;
1448 } else {
1449 retval = EOF;
72865317 1450 }
5279f8fd 1451
ff073a71 1452 return retval;
b0ec0f27
BP
1453}
1454
1455static int
4c738a8d 1456dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
b0ec0f27 1457{
4c738a8d
BP
1458 struct dp_netdev_port_state *state = state_;
1459 free(state->name);
b0ec0f27
BP
1460 free(state);
1461 return 0;
72865317
BP
1462}
1463
1464static int
67a4917b 1465dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
72865317
BP
1466{
1467 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
d33ed218 1468 uint64_t new_port_seq;
5279f8fd
BP
1469 int error;
1470
d33ed218
BP
1471 new_port_seq = seq_read(dpif->dp->port_seq);
1472 if (dpif->last_port_seq != new_port_seq) {
1473 dpif->last_port_seq = new_port_seq;
5279f8fd 1474 error = ENOBUFS;
72865317 1475 } else {
5279f8fd 1476 error = EAGAIN;
72865317 1477 }
5279f8fd
BP
1478
1479 return error;
72865317
BP
1480}
1481
1482static void
1483dpif_netdev_port_poll_wait(const struct dpif *dpif_)
1484{
1485 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
5279f8fd 1486
d33ed218 1487 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
8a4e3a85
BP
1488}
1489
1490static struct dp_netdev_flow *
0de8783a 1491dp_netdev_flow_cast(const struct dpcls_rule *cr)
8a4e3a85
BP
1492{
1493 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
72865317
BP
1494}
1495
9bbf1c3d
DDP
1496static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
1497{
1498 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
1499}
1500
79df317f
DDP
1501/* netdev_flow_key utilities.
1502 *
1503 * netdev_flow_key is basically a miniflow. We use these functions
1504 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
1505 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
1506 *
1507 * - Since we are dealing exclusively with miniflows created by
1508 * miniflow_extract(), if the map is different the miniflow is different.
1509 * Therefore we can be faster by comparing the map and the miniflow in a
1510 * single memcmp().
1511 * - netdev_flow_key's miniflow always has inline values.
1512 * - These functions can be inlined by the compiler.
1513 *
1514 * The following assertions make sure that what we're doing with miniflow is
1515 * safe
1516 */
1517BUILD_ASSERT_DECL(offsetof(struct miniflow, inline_values)
1518 == sizeof(uint64_t));
79df317f
DDP
1519
1520/* Given the number of bits set in the miniflow map, returns the size of the
caeb4906 1521 * 'netdev_flow_key.mf' */
79df317f
DDP
1522static inline uint32_t
1523netdev_flow_key_size(uint32_t flow_u32s)
1524{
caeb4906 1525 return offsetof(struct miniflow, inline_values) +
0de8783a 1526 MINIFLOW_VALUES_SIZE(flow_u32s);
79df317f
DDP
1527}
1528
79df317f
DDP
1529static inline bool
1530netdev_flow_key_equal(const struct netdev_flow_key *a,
0de8783a
JR
1531 const struct netdev_flow_key *b)
1532{
caeb4906
JR
1533 /* 'b->len' may be not set yet. */
1534 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
0de8783a
JR
1535}
1536
1537/* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
1538 * The maps are compared bitwise, so both 'key->mf' 'mf' must have been
1539 * generated by miniflow_extract. */
1540static inline bool
1541netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
1542 const struct miniflow *mf)
79df317f 1543{
caeb4906 1544 return !memcmp(&key->mf, mf, key->len);
79df317f
DDP
1545}
1546
1547static inline void
1548netdev_flow_key_clone(struct netdev_flow_key *dst,
0de8783a
JR
1549 const struct netdev_flow_key *src)
1550{
caeb4906
JR
1551 memcpy(dst, src,
1552 offsetof(struct netdev_flow_key, mf) + src->len);
0de8783a
JR
1553}
1554
1555/* Slow. */
1556static void
1557netdev_flow_key_from_flow(struct netdev_flow_key *dst,
1558 const struct flow *src)
1559{
cf62fa4c 1560 struct dp_packet packet;
0de8783a 1561 uint64_t buf_stub[512 / 8];
0de8783a
JR
1562
1563 miniflow_initialize(&dst->mf, dst->buf);
1564
cf62fa4c
PS
1565 dp_packet_use_stub(&packet, buf_stub, sizeof buf_stub);
1566 pkt_metadata_from_flow(&packet.md, src);
0de8783a 1567 flow_compose(&packet, src);
cf62fa4c
PS
1568 miniflow_extract(&packet, &dst->mf);
1569 dp_packet_uninit(&packet);
0de8783a
JR
1570
1571 dst->len = netdev_flow_key_size(count_1bits(dst->mf.map));
1572 dst->hash = 0; /* Not computed yet. */
1573}
1574
1575/* Initialize a netdev_flow_key 'mask' from 'match'. */
1576static inline void
1577netdev_flow_mask_init(struct netdev_flow_key *mask,
1578 const struct match *match)
1579{
d70e8c28
JR
1580 const uint64_t *mask_u64 = (const uint64_t *) &match->wc.masks;
1581 uint64_t *dst = mask->mf.inline_values;
0de8783a
JR
1582 uint64_t map, mask_map = 0;
1583 uint32_t hash = 0;
1584 int n;
1585
1586 /* Only check masks that make sense for the flow. */
1587 map = flow_wc_map(&match->flow);
1588
1589 while (map) {
1590 uint64_t rm1bit = rightmost_1bit(map);
1591 int i = raw_ctz(map);
1592
d70e8c28 1593 if (mask_u64[i]) {
0de8783a 1594 mask_map |= rm1bit;
d70e8c28
JR
1595 *dst++ = mask_u64[i];
1596 hash = hash_add64(hash, mask_u64[i]);
0de8783a
JR
1597 }
1598 map -= rm1bit;
1599 }
1600
1601 mask->mf.values_inline = true;
1602 mask->mf.map = mask_map;
1603
aae7c34f 1604 hash = hash_add64(hash, mask_map);
0de8783a
JR
1605
1606 n = dst - mask->mf.inline_values;
1607
d70e8c28 1608 mask->hash = hash_finish(hash, n * 8);
0de8783a
JR
1609 mask->len = netdev_flow_key_size(n);
1610}
1611
1612/* Initializes 'dst' as a copy of 'src' masked with 'mask'. */
1613static inline void
1614netdev_flow_key_init_masked(struct netdev_flow_key *dst,
1615 const struct flow *flow,
1616 const struct netdev_flow_key *mask)
79df317f 1617{
d70e8c28
JR
1618 uint64_t *dst_u64 = dst->mf.inline_values;
1619 const uint64_t *mask_u64 = mask->mf.inline_values;
0de8783a 1620 uint32_t hash = 0;
d70e8c28 1621 uint64_t value;
0de8783a
JR
1622
1623 dst->len = mask->len;
1624 dst->mf.values_inline = true;
1625 dst->mf.map = mask->mf.map;
1626
1627 FLOW_FOR_EACH_IN_MAP(value, flow, mask->mf.map) {
d70e8c28
JR
1628 *dst_u64 = value & *mask_u64++;
1629 hash = hash_add64(hash, *dst_u64++);
0de8783a 1630 }
d70e8c28 1631 dst->hash = hash_finish(hash, (dst_u64 - dst->mf.inline_values) * 8);
0de8783a
JR
1632}
1633
d70e8c28 1634/* Iterate through all netdev_flow_key u64 values specified by 'MAP' */
0de8783a
JR
1635#define NETDEV_FLOW_KEY_FOR_EACH_IN_MAP(VALUE, KEY, MAP) \
1636 for (struct mf_for_each_in_map_aux aux__ \
1637 = { (KEY)->mf.inline_values, (KEY)->mf.map, MAP }; \
1638 mf_get_next_in_map(&aux__, &(VALUE)); \
1639 )
1640
1641/* Returns a hash value for the bits of 'key' where there are 1-bits in
1642 * 'mask'. */
1643static inline uint32_t
1644netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
1645 const struct netdev_flow_key *mask)
1646{
d70e8c28 1647 const uint64_t *p = mask->mf.inline_values;
0de8783a 1648 uint32_t hash = 0;
d70e8c28 1649 uint64_t key_u64;
0de8783a 1650
d70e8c28
JR
1651 NETDEV_FLOW_KEY_FOR_EACH_IN_MAP(key_u64, key, mask->mf.map) {
1652 hash = hash_add64(hash, key_u64 & *p++);
0de8783a
JR
1653 }
1654
d70e8c28 1655 return hash_finish(hash, (p - mask->mf.inline_values) * 8);
79df317f
DDP
1656}
1657
9bbf1c3d
DDP
1658static inline bool
1659emc_entry_alive(struct emc_entry *ce)
1660{
1661 return ce->flow && !ce->flow->dead;
1662}
1663
1664static void
1665emc_clear_entry(struct emc_entry *ce)
1666{
1667 if (ce->flow) {
1668 dp_netdev_flow_unref(ce->flow);
1669 ce->flow = NULL;
1670 }
1671}
1672
1673static inline void
1674emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
0de8783a 1675 const struct netdev_flow_key *key)
9bbf1c3d
DDP
1676{
1677 if (ce->flow != flow) {
1678 if (ce->flow) {
1679 dp_netdev_flow_unref(ce->flow);
1680 }
1681
1682 if (dp_netdev_flow_ref(flow)) {
1683 ce->flow = flow;
1684 } else {
1685 ce->flow = NULL;
1686 }
1687 }
0de8783a
JR
1688 if (key) {
1689 netdev_flow_key_clone(&ce->key, key);
9bbf1c3d
DDP
1690 }
1691}
1692
1693static inline void
0de8783a 1694emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
9bbf1c3d
DDP
1695 struct dp_netdev_flow *flow)
1696{
1697 struct emc_entry *to_be_replaced = NULL;
1698 struct emc_entry *current_entry;
1699
0de8783a
JR
1700 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
1701 if (netdev_flow_key_equal(&current_entry->key, key)) {
9bbf1c3d 1702 /* We found the entry with the 'key->mf' miniflow. */
0de8783a 1703 emc_change_entry(current_entry, flow, NULL);
9bbf1c3d
DDP
1704 return;
1705 }
1706
1707 /* Replacement policy: put the flow in an empty (not alive) entry, or
1708 * in the first entry where it can be placed. */
1709 if (!to_be_replaced
1710 || (emc_entry_alive(to_be_replaced)
1711 && !emc_entry_alive(current_entry))
0de8783a 1712 || current_entry->key.hash < to_be_replaced->key.hash) {
9bbf1c3d
DDP
1713 to_be_replaced = current_entry;
1714 }
1715 }
1716 /* We didn't find the miniflow in the cache.
1717 * The 'to_be_replaced' entry is where the new flow will be stored */
1718
0de8783a 1719 emc_change_entry(to_be_replaced, flow, key);
9bbf1c3d
DDP
1720}
1721
1722static inline struct dp_netdev_flow *
0de8783a 1723emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
9bbf1c3d
DDP
1724{
1725 struct emc_entry *current_entry;
1726
0de8783a
JR
1727 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
1728 if (current_entry->key.hash == key->hash
1729 && emc_entry_alive(current_entry)
1730 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
9bbf1c3d 1731
0de8783a 1732 /* We found the entry with the 'key->mf' miniflow */
9bbf1c3d
DDP
1733 return current_entry->flow;
1734 }
1735 }
1736
1737 return NULL;
1738}
1739
72865317 1740static struct dp_netdev_flow *
1c1e46ed
AW
1741dp_netdev_pmd_lookup_flow(const struct dp_netdev_pmd_thread *pmd,
1742 const struct netdev_flow_key *key)
2c0ea78f 1743{
8a4e3a85 1744 struct dp_netdev_flow *netdev_flow;
0de8783a 1745 struct dpcls_rule *rule;
2c0ea78f 1746
1c1e46ed 1747 dpcls_lookup(&pmd->cls, key, &rule, 1);
4f150744 1748 netdev_flow = dp_netdev_flow_cast(rule);
2c0ea78f 1749
8a4e3a85 1750 return netdev_flow;
2c0ea78f
GS
1751}
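/* Illustrative sketch added in this revision (not part of the original
 * file): the usual two-level lookup, simplified from the fast path below.
 * The exact match cache is probed first; on a miss the classifier is
 * consulted and, when a flow is found, it is fed back into the EMC so that
 * later packets of the same flow hit the cheap cache.  Assumes 'key'
 * already has its 'len' and 'hash' computed, as the fast path guarantees. */
static inline struct dp_netdev_flow *
example_lookup_flow(struct dp_netdev_pmd_thread *pmd,
                    const struct netdev_flow_key *key)
{
    struct dp_netdev_flow *flow = emc_lookup(&pmd->flow_cache, key);

    if (!flow) {
        flow = dp_netdev_pmd_lookup_flow(pmd, key);  /* dpcls lookup. */
        if (flow) {
            emc_insert(&pmd->flow_cache, key, flow);
        }
    }
    return flow;
}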
1752
1753static struct dp_netdev_flow *
1c1e46ed
AW
1754dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
1755 const ovs_u128 *ufidp, const struct nlattr *key,
1756 size_t key_len)
72865317 1757{
1763b4b8 1758 struct dp_netdev_flow *netdev_flow;
70e5ed6f
JS
1759 struct flow flow;
1760 ovs_u128 ufid;
1761
1762 /* If a UFID is not provided, determine one based on the key. */
1763 if (!ufidp && key && key_len
1764 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow)) {
1c1e46ed 1765 dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
70e5ed6f
JS
1766 ufidp = &ufid;
1767 }
72865317 1768
70e5ed6f
JS
1769 if (ufidp) {
1770 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
1c1e46ed 1771 &pmd->flow_table) {
70e5ed6f
JS
1772 if (ovs_u128_equal(&netdev_flow->ufid, ufidp)) {
1773 return netdev_flow;
1774 }
72865317
BP
1775 }
1776 }
8a4e3a85 1777
72865317
BP
1778 return NULL;
1779}
1780
1781static void
eb94da30 1782get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
1763b4b8 1783 struct dpif_flow_stats *stats)
feebdea2 1784{
eb94da30
DDP
1785 struct dp_netdev_flow *netdev_flow;
1786 unsigned long long n;
1787 long long used;
1788 uint16_t flags;
1789
1790 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
1791
1792 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
1793 stats->n_packets = n;
1794 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
1795 stats->n_bytes = n;
1796 atomic_read_relaxed(&netdev_flow->stats.used, &used);
1797 stats->used = used;
1798 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
1799 stats->tcp_flags = flags;
72865317
BP
1800}
1801
7af12bd7
JS
1802/* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
1803 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
1804 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
1805 * protect them. */
6fe09f8c 1806static void
70e5ed6f 1807dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
7af12bd7 1808 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
64bb477f 1809 struct dpif_flow *flow, bool terse)
6fe09f8c 1810{
64bb477f
JS
1811 if (terse) {
1812 memset(flow, 0, sizeof *flow);
1813 } else {
1814 struct flow_wildcards wc;
1815 struct dp_netdev_actions *actions;
1816 size_t offset;
1817
1818 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
1819
1820 /* Key */
6fd6ed71 1821 offset = key_buf->size;
64bb477f
JS
1822 flow->key = ofpbuf_tail(key_buf);
1823 odp_flow_key_from_flow(key_buf, &netdev_flow->flow, &wc.masks,
1824 netdev_flow->flow.in_port.odp_port, true);
6fd6ed71 1825 flow->key_len = key_buf->size - offset;
64bb477f
JS
1826
1827 /* Mask */
6fd6ed71 1828 offset = mask_buf->size;
64bb477f
JS
1829 flow->mask = ofpbuf_tail(mask_buf);
1830 odp_flow_key_from_mask(mask_buf, &wc.masks, &netdev_flow->flow,
1831 odp_to_u32(wc.masks.in_port.odp_port),
1832 SIZE_MAX, true);
6fd6ed71 1833 flow->mask_len = mask_buf->size - offset;
64bb477f
JS
1834
1835 /* Actions */
1836 actions = dp_netdev_flow_get_actions(netdev_flow);
1837 flow->actions = actions->actions;
1838 flow->actions_len = actions->size;
1839 }
6fe09f8c 1840
70e5ed6f
JS
1841 flow->ufid = netdev_flow->ufid;
1842 flow->ufid_present = true;
1c1e46ed 1843 flow->pmd_id = netdev_flow->pmd_id;
6fe09f8c
JS
1844 get_dpif_flow_stats(netdev_flow, &flow->stats);
1845}
1846
36956a7d 1847static int
8c301900
JR
1848dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
1849 const struct nlattr *mask_key,
1850 uint32_t mask_key_len, const struct flow *flow,
1851 struct flow *mask)
1852{
1853 if (mask_key_len) {
80e44883
BP
1854 enum odp_key_fitness fitness;
1855
1856 fitness = odp_flow_key_to_mask(mask_key, mask_key_len, mask, flow);
1857 if (fitness) {
8c301900
JR
1858 /* This should not happen: it indicates that
1859 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
1860 * disagree on the acceptable form of a mask. Log the problem
1861 * as an error, with enough details to enable debugging. */
1862 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
1863
1864 if (!VLOG_DROP_ERR(&rl)) {
1865 struct ds s;
1866
1867 ds_init(&s);
1868 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
1869 true);
80e44883
BP
1870 VLOG_ERR("internal error parsing flow mask %s (%s)",
1871 ds_cstr(&s), odp_key_fitness_to_string(fitness));
8c301900
JR
1872 ds_destroy(&s);
1873 }
1874
1875 return EINVAL;
1876 }
8c301900
JR
1877 } else {
1878 enum mf_field_id id;
1879 /* No mask key, unwildcard everything except fields whose
1880 * prerequisites are not met. */
1881 memset(mask, 0x0, sizeof *mask);
1882
1883 for (id = 0; id < MFF_N_IDS; ++id) {
1884 /* Skip registers and metadata. */
1885 if (!(id >= MFF_REG0 && id < MFF_REG0 + FLOW_N_REGS)
1886 && id != MFF_METADATA) {
1887 const struct mf_field *mf = mf_from_id(id);
1888 if (mf_are_prereqs_ok(mf, flow)) {
1889 mf_mask_field(mf, mask);
1890 }
1891 }
1892 }
1893 }
1894
f3f750e5
BP
1895 /* Force unwildcard the in_port.
1896 *
1897 * We need to do this even in the case where we unwildcard "everything"
1898 * above because "everything" only includes the 16-bit OpenFlow port number
1899 * mask->in_port.ofp_port, which only covers half of the 32-bit datapath
1900 * port number mask->in_port.odp_port. */
1901 mask->in_port.odp_port = u32_to_odp(UINT32_MAX);
1902
8c301900
JR
1903 return 0;
1904}
1905
1906static int
1907dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
1908 struct flow *flow)
36956a7d 1909{
586ddea5
BP
1910 odp_port_t in_port;
1911
8c301900 1912 if (odp_flow_key_to_flow(key, key_len, flow)) {
36956a7d 1913 /* This should not happen: it indicates that odp_flow_key_from_flow()
8c301900
JR
1914 * and odp_flow_key_to_flow() disagree on the acceptable form of a
1915 * flow. Log the problem as an error, with enough details to enable
1916 * debugging. */
36956a7d
BP
1917 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
1918
1919 if (!VLOG_DROP_ERR(&rl)) {
1920 struct ds s;
1921
1922 ds_init(&s);
8c301900 1923 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
36956a7d
BP
1924 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
1925 ds_destroy(&s);
1926 }
1927
1928 return EINVAL;
1929 }
1930
586ddea5
BP
1931 in_port = flow->in_port.odp_port;
1932 if (!is_valid_port_number(in_port) && in_port != ODPP_NONE) {
18886b60
BP
1933 return EINVAL;
1934 }
1935
36956a7d
BP
1936 return 0;
1937}
1938
72865317 1939static int
6fe09f8c 1940dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
72865317
BP
1941{
1942 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 1943 struct dp_netdev_flow *netdev_flow;
1c1e46ed 1944 struct dp_netdev_pmd_thread *pmd;
bd5131ba
DDP
1945 unsigned pmd_id = get->pmd_id == PMD_ID_NULL
1946 ? NON_PMD_CORE_ID : get->pmd_id;
70e5ed6f 1947 int error = 0;
8a4e3a85 1948
1c1e46ed
AW
1949 pmd = dp_netdev_get_pmd(dp, pmd_id);
1950 if (!pmd) {
1951 return EINVAL;
1952 }
1953
1954 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
1955 get->key_len);
1763b4b8 1956 if (netdev_flow) {
70e5ed6f 1957 dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
64bb477f 1958 get->flow, false);
70e5ed6f 1959 } else {
5279f8fd 1960 error = ENOENT;
72865317 1961 }
1c1e46ed
AW
1962 dp_netdev_pmd_unref(pmd);
1963
bc4a05c6 1964
5279f8fd 1965 return error;
72865317
BP
1966}
1967
0de8783a 1968static struct dp_netdev_flow *
1c1e46ed
AW
1969dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
1970 struct match *match, const ovs_u128 *ufid,
ae2ceebd 1971 const struct nlattr *actions, size_t actions_len)
1c1e46ed 1972 OVS_REQUIRES(pmd->flow_mutex)
72865317 1973{
0de8783a
JR
1974 struct dp_netdev_flow *flow;
1975 struct netdev_flow_key mask;
ed79f89a 1976
0de8783a
JR
1977 netdev_flow_mask_init(&mask, match);
1978 /* Make sure wc does not have metadata. */
1979 ovs_assert(!(mask.mf.map & (MINIFLOW_MAP(metadata) | MINIFLOW_MAP(regs))));
679ba04c 1980
0de8783a 1981 /* Do not allocate extra space. */
caeb4906 1982 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
1c1e46ed 1983 memset(&flow->stats, 0, sizeof flow->stats);
0de8783a 1984 flow->dead = false;
11e5cf1f 1985 flow->batch = NULL;
bd5131ba 1986 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
0de8783a 1987 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
70e5ed6f 1988 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
0de8783a 1989 ovs_refcount_init(&flow->ref_cnt);
0de8783a 1990 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
2c0ea78f 1991
0de8783a 1992 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
1c1e46ed 1993 dpcls_insert(&pmd->cls, &flow->cr, &mask);
72865317 1994
4c75aaab
EJ
1995 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
1996 dp_netdev_flow_hash(&flow->ufid));
1997
623540e4 1998 if (OVS_UNLIKELY(VLOG_IS_DBG_ENABLED())) {
0de8783a 1999 struct match match;
623540e4
EJ
2000 struct ds ds = DS_EMPTY_INITIALIZER;
2001
0de8783a
JR
2002 match.flow = flow->flow;
2003 miniflow_expand(&flow->cr.mask->mf, &match.wc.masks);
2004
623540e4 2005 ds_put_cstr(&ds, "flow_add: ");
70e5ed6f
JS
2006 odp_format_ufid(ufid, &ds);
2007 ds_put_cstr(&ds, " ");
0de8783a 2008 match_format(&match, &ds, OFP_DEFAULT_PRIORITY);
623540e4
EJ
2009 ds_put_cstr(&ds, ", actions:");
2010 format_odp_actions(&ds, actions, actions_len);
2011
2012 VLOG_DBG_RL(&upcall_rl, "%s", ds_cstr(&ds));
2013
2014 ds_destroy(&ds);
2015 }
2016
0de8783a 2017 return flow;
72865317
BP
2018}
2019
72865317 2020static int
89625d1e 2021dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
72865317
BP
2022{
2023 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 2024 struct dp_netdev_flow *netdev_flow;
0de8783a 2025 struct netdev_flow_key key;
1c1e46ed 2026 struct dp_netdev_pmd_thread *pmd;
ae2ceebd 2027 struct match match;
70e5ed6f 2028 ovs_u128 ufid;
bd5131ba
DDP
2029 unsigned pmd_id = put->pmd_id == PMD_ID_NULL
2030 ? NON_PMD_CORE_ID : put->pmd_id;
36956a7d
BP
2031 int error;
2032
ae2ceebd 2033 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow);
8c301900
JR
2034 if (error) {
2035 return error;
2036 }
2037 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
2038 put->mask, put->mask_len,
ae2ceebd 2039 &match.flow, &match.wc.masks);
36956a7d
BP
2040 if (error) {
2041 return error;
2042 }
0de8783a 2043
1c1e46ed
AW
2044 pmd = dp_netdev_get_pmd(dp, pmd_id);
2045 if (!pmd) {
2046 return EINVAL;
2047 }
2048
0de8783a
JR
2049 /* Must produce a netdev_flow_key for lookup.
2050 * This interface is no longer performance critical, since it is not used
2051 * for upcall processing any more. */
2052 netdev_flow_key_from_flow(&key, &match.flow);
72865317 2053
70e5ed6f
JS
2054 if (put->ufid) {
2055 ufid = *put->ufid;
2056 } else {
2057 dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
2058 }
2059
1c1e46ed
AW
2060 ovs_mutex_lock(&pmd->flow_mutex);
2061 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &key);
1763b4b8 2062 if (!netdev_flow) {
89625d1e 2063 if (put->flags & DPIF_FP_CREATE) {
1c1e46ed 2064 if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
89625d1e
BP
2065 if (put->stats) {
2066 memset(put->stats, 0, sizeof *put->stats);
feebdea2 2067 }
1c1e46ed 2068 dp_netdev_flow_add(pmd, &match, &ufid, put->actions,
70e5ed6f 2069 put->actions_len);
0de8783a 2070 error = 0;
72865317 2071 } else {
5279f8fd 2072 error = EFBIG;
72865317
BP
2073 }
2074 } else {
5279f8fd 2075 error = ENOENT;
72865317
BP
2076 }
2077 } else {
2c0ea78f 2078 if (put->flags & DPIF_FP_MODIFY
ae2ceebd 2079 && flow_equal(&match.flow, &netdev_flow->flow)) {
8a4e3a85
BP
2080 struct dp_netdev_actions *new_actions;
2081 struct dp_netdev_actions *old_actions;
2082
2083 new_actions = dp_netdev_actions_create(put->actions,
2084 put->actions_len);
2085
61e7deb1
BP
2086 old_actions = dp_netdev_flow_get_actions(netdev_flow);
2087 ovsrcu_set(&netdev_flow->actions, new_actions);
679ba04c 2088
a84cb64a
BP
2089 if (put->stats) {
2090 get_dpif_flow_stats(netdev_flow, put->stats);
2091 }
2092 if (put->flags & DPIF_FP_ZERO_STATS) {
97447f55
DDP
2093 /* XXX: The userspace datapath uses thread local statistics
2094 * (for flows), which should be updated only by the owning
2095 * thread. Since we cannot write on stats memory here,
2096 * we choose not to support this flag. Please note:
2097 * - This feature is currently used only by dpctl commands with
2098 * option --clear.
2099 * - Should the need arise, this operation can be implemented
2100 * by keeping a base value (to be updated here) for each
2101 * counter, and subtracting it before outputting the stats */
2102 error = EOPNOTSUPP;
72865317 2103 }
8a4e3a85 2104
61e7deb1 2105 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
2c0ea78f 2106 } else if (put->flags & DPIF_FP_CREATE) {
5279f8fd 2107 error = EEXIST;
2c0ea78f
GS
2108 } else {
2109 /* Overlapping flow. */
2110 error = EINVAL;
72865317
BP
2111 }
2112 }
1c1e46ed
AW
2113 ovs_mutex_unlock(&pmd->flow_mutex);
2114 dp_netdev_pmd_unref(pmd);
5279f8fd
BP
2115
2116 return error;
72865317
BP
2117}
2118
72865317 2119static int
b99d3cee 2120dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
72865317
BP
2121{
2122 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 2123 struct dp_netdev_flow *netdev_flow;
1c1e46ed 2124 struct dp_netdev_pmd_thread *pmd;
bd5131ba
DDP
2125 unsigned pmd_id = del->pmd_id == PMD_ID_NULL
2126 ? NON_PMD_CORE_ID : del->pmd_id;
70e5ed6f 2127 int error = 0;
72865317 2128
1c1e46ed
AW
2129 pmd = dp_netdev_get_pmd(dp, pmd_id);
2130 if (!pmd) {
2131 return EINVAL;
2132 }
2133
2134 ovs_mutex_lock(&pmd->flow_mutex);
2135 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
2136 del->key_len);
1763b4b8 2137 if (netdev_flow) {
b99d3cee 2138 if (del->stats) {
1763b4b8 2139 get_dpif_flow_stats(netdev_flow, del->stats);
feebdea2 2140 }
1c1e46ed 2141 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
72865317 2142 } else {
5279f8fd 2143 error = ENOENT;
72865317 2144 }
1c1e46ed
AW
2145 ovs_mutex_unlock(&pmd->flow_mutex);
2146 dp_netdev_pmd_unref(pmd);
5279f8fd
BP
2147
2148 return error;
72865317
BP
2149}
2150
ac64794a
BP
2151struct dpif_netdev_flow_dump {
2152 struct dpif_flow_dump up;
1c1e46ed
AW
2153 struct cmap_position poll_thread_pos;
2154 struct cmap_position flow_pos;
2155 struct dp_netdev_pmd_thread *cur_pmd;
d2ad7ef1
JS
2156 int status;
2157 struct ovs_mutex mutex;
e723fd32
JS
2158};
2159
ac64794a
BP
2160static struct dpif_netdev_flow_dump *
2161dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
72865317 2162{
ac64794a 2163 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
e723fd32
JS
2164}
2165
ac64794a 2166static struct dpif_flow_dump *
64bb477f 2167dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse)
e723fd32 2168{
ac64794a 2169 struct dpif_netdev_flow_dump *dump;
e723fd32 2170
1c1e46ed 2171 dump = xzalloc(sizeof *dump);
ac64794a 2172 dpif_flow_dump_init(&dump->up, dpif_);
64bb477f 2173 dump->up.terse = terse;
ac64794a
BP
2174 ovs_mutex_init(&dump->mutex);
2175
2176 return &dump->up;
e723fd32
JS
2177}
2178
2179static int
ac64794a 2180dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
e723fd32 2181{
ac64794a 2182 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
e723fd32 2183
ac64794a
BP
2184 ovs_mutex_destroy(&dump->mutex);
2185 free(dump);
704a1e09
BP
2186 return 0;
2187}
2188
ac64794a
BP
2189struct dpif_netdev_flow_dump_thread {
2190 struct dpif_flow_dump_thread up;
2191 struct dpif_netdev_flow_dump *dump;
8bb113da
RW
2192 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
2193 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
ac64794a
BP
2194};
2195
2196static struct dpif_netdev_flow_dump_thread *
2197dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
2198{
2199 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
2200}
2201
2202static struct dpif_flow_dump_thread *
2203dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
2204{
2205 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
2206 struct dpif_netdev_flow_dump_thread *thread;
2207
2208 thread = xmalloc(sizeof *thread);
2209 dpif_flow_dump_thread_init(&thread->up, &dump->up);
2210 thread->dump = dump;
2211 return &thread->up;
2212}
2213
2214static void
2215dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
2216{
2217 struct dpif_netdev_flow_dump_thread *thread
2218 = dpif_netdev_flow_dump_thread_cast(thread_);
2219
2220 free(thread);
2221}
2222
704a1e09 2223static int
ac64794a 2224dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
8bb113da 2225 struct dpif_flow *flows, int max_flows)
ac64794a
BP
2226{
2227 struct dpif_netdev_flow_dump_thread *thread
2228 = dpif_netdev_flow_dump_thread_cast(thread_);
2229 struct dpif_netdev_flow_dump *dump = thread->dump;
8bb113da 2230 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
8bb113da
RW
2231 int n_flows = 0;
2232 int i;
14608a15 2233
ac64794a 2234 ovs_mutex_lock(&dump->mutex);
8bb113da 2235 if (!dump->status) {
1c1e46ed
AW
2236 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
2237 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
2238 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
2239 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
2240
2241 /* First call to dump_next(), extracts the first pmd thread.
2242 * If there is no pmd thread, returns immediately. */
2243 if (!pmd) {
2244 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2245 if (!pmd) {
2246 ovs_mutex_unlock(&dump->mutex);
2247 return n_flows;
8bb113da 2248
8bb113da 2249 }
d2ad7ef1 2250 }
1c1e46ed
AW
2251
2252 do {
2253 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
2254 struct cmap_node *node;
2255
2256 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
2257 if (!node) {
2258 break;
2259 }
2260 netdev_flows[n_flows] = CONTAINER_OF(node,
2261 struct dp_netdev_flow,
2262 node);
2263 }
2264 /* When finishing dumping the current pmd thread, moves to
2265 * the next. */
2266 if (n_flows < flow_limit) {
2267 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
2268 dp_netdev_pmd_unref(pmd);
2269 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2270 if (!pmd) {
2271 dump->status = EOF;
2272 break;
2273 }
2274 }
2275 /* Keeps the reference for the next caller. */
2276 dump->cur_pmd = pmd;
2277
2278 /* If the current dump is empty, do not exit the loop, since the
2279 * remaining pmds could have flows to be dumped. Just dump again
2280 * on the new 'pmd'. */
2281 } while (!n_flows);
8a4e3a85 2282 }
ac64794a 2283 ovs_mutex_unlock(&dump->mutex);
ac64794a 2284
8bb113da
RW
2285 for (i = 0; i < n_flows; i++) {
2286 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
2287 struct odputil_keybuf *keybuf = &thread->keybuf[i];
2288 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
2289 struct dpif_flow *f = &flows[i];
7af12bd7 2290 struct ofpbuf key, mask;
8bb113da 2291
7af12bd7
JS
2292 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
2293 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
64bb477f
JS
2294 dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
2295 dump->up.terse);
8bb113da 2296 }
feebdea2 2297
8bb113da 2298 return n_flows;
72865317
BP
2299}
2300
2301static int
758c456d 2302dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
65f13b50 2303 OVS_NO_THREAD_SAFETY_ANALYSIS
72865317
BP
2304{
2305 struct dp_netdev *dp = get_dp_netdev(dpif);
65f13b50 2306 struct dp_netdev_pmd_thread *pmd;
cf62fa4c 2307 struct dp_packet *pp;
72865317 2308
cf62fa4c
PS
2309 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
2310 dp_packet_size(execute->packet) > UINT16_MAX) {
72865317
BP
2311 return EINVAL;
2312 }
2313
65f13b50
AW
2314 /* Tries finding the 'pmd'. If NULL is returned, that means
2315 * the current thread is a non-pmd thread and should use
b19befae 2316 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
65f13b50
AW
2317 pmd = ovsthread_getspecific(dp->per_pmd_key);
2318 if (!pmd) {
b19befae 2319 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
65f13b50
AW
2320 }
2321
2322 /* If the current thread is a non-pmd thread, acquires
2323 * the 'non_pmd_mutex'. */
2324 if (pmd->core_id == NON_PMD_CORE_ID) {
2325 ovs_mutex_lock(&dp->non_pmd_mutex);
433330a8 2326 ovs_mutex_lock(&dp->port_mutex);
65f13b50 2327 }
1c1e46ed 2328
cf62fa4c 2329 pp = execute->packet;
41ccaa24 2330 dp_netdev_execute_actions(pmd, &pp, 1, false, execute->actions,
9bbf1c3d 2331 execute->actions_len);
65f13b50 2332 if (pmd->core_id == NON_PMD_CORE_ID) {
1c1e46ed 2333 dp_netdev_pmd_unref(pmd);
433330a8 2334 ovs_mutex_unlock(&dp->port_mutex);
65f13b50
AW
2335 ovs_mutex_unlock(&dp->non_pmd_mutex);
2336 }
8a4e3a85 2337
758c456d 2338 return 0;
72865317
BP
2339}
2340
1a0c894a
BP
2341static void
2342dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
2343{
2344 size_t i;
2345
2346 for (i = 0; i < n_ops; i++) {
2347 struct dpif_op *op = ops[i];
2348
2349 switch (op->type) {
2350 case DPIF_OP_FLOW_PUT:
2351 op->error = dpif_netdev_flow_put(dpif, &op->u.flow_put);
2352 break;
2353
2354 case DPIF_OP_FLOW_DEL:
2355 op->error = dpif_netdev_flow_del(dpif, &op->u.flow_del);
2356 break;
2357
2358 case DPIF_OP_EXECUTE:
2359 op->error = dpif_netdev_execute(dpif, &op->u.execute);
2360 break;
6fe09f8c
JS
2361
2362 case DPIF_OP_FLOW_GET:
2363 op->error = dpif_netdev_flow_get(dpif, &op->u.flow_get);
2364 break;
1a0c894a
BP
2365 }
2366 }
2367}
2368
f2eee189
AW
2369/* Returns true if the configuration for rx queues or cpu mask
2370 * has changed. */
2371static bool
2372pmd_config_changed(const struct dp_netdev *dp, size_t rxqs, const char *cmask)
2373{
2374 if (dp->n_dpdk_rxqs != rxqs) {
2375 return true;
2376 } else {
2377 if (dp->pmd_cmask != NULL && cmask != NULL) {
2378 return strcmp(dp->pmd_cmask, cmask);
2379 } else {
2380 return (dp->pmd_cmask != NULL || cmask != NULL);
2381 }
2382 }
2383}
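/* Worked examples added in this revision (assumptions derived from the code
 * above, not comments from the original file):
 *
 *     pmd_config_changed(dp, dp->n_dpdk_rxqs, dp->pmd_cmask)  -> false
 *     pmd_config_changed(dp, dp->n_dpdk_rxqs + 1, any_cmask)  -> true
 *     pmd_config_changed(dp, dp->n_dpdk_rxqs, NULL)           -> true only if
 *                                      a cpu mask is currently configured. */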
2384
2385/* Resets pmd threads if the configuration for 'rxq's or cpu mask changes. */
2386static int
2387dpif_netdev_pmd_set(struct dpif *dpif, unsigned int n_rxqs, const char *cmask)
2388{
2389 struct dp_netdev *dp = get_dp_netdev(dpif);
2390
2391 if (pmd_config_changed(dp, n_rxqs, cmask)) {
2392 struct dp_netdev_port *port;
2393
2394 dp_netdev_destroy_all_pmds(dp);
2395
2396 CMAP_FOR_EACH (port, node, &dp->ports) {
2397 if (netdev_is_pmd(port->netdev)) {
2398 int i, err;
2399
2400 /* Closes the existing 'rxq's. */
2401 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
2402 netdev_rxq_close(port->rxq[i]);
2403 port->rxq[i] = NULL;
2404 }
2405
2406 /* Sets the new rx queue config. */
2407 err = netdev_set_multiq(port->netdev, ovs_numa_get_n_cores(),
2408 n_rxqs);
7251515e 2409 if (err && (err != EOPNOTSUPP)) {
f2eee189
AW
2410 VLOG_ERR("Failed to set dpdk interface %s rx_queue to:"
2411 " %u", netdev_get_name(port->netdev),
2412 n_rxqs);
2413 return err;
2414 }
2415
2416 /* If the set_multiq() above succeeds, reopens the 'rxq's. */
2417 port->rxq = xrealloc(port->rxq, sizeof *port->rxq
2418 * netdev_n_rxq(port->netdev));
2419 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
2420 netdev_rxq_open(port->netdev, &port->rxq[i], i);
2421 }
2422 }
2423 }
2424 dp->n_dpdk_rxqs = n_rxqs;
2425
2426 /* Reconfigures the cpu mask. */
2427 ovs_numa_set_cpu_mask(cmask);
2428 free(dp->pmd_cmask);
2429 dp->pmd_cmask = cmask ? xstrdup(cmask) : NULL;
2430
2431 /* Restores the non-pmd thread. */
2432 dp_netdev_set_nonpmd(dp);
2433 /* Restores all pmd threads. */
2434 dp_netdev_reset_pmd_threads(dp);
2435 }
2436
2437 return 0;
2438}
2439
5bf93d67
EJ
2440static int
2441dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
2442 uint32_t queue_id, uint32_t *priority)
2443{
2444 *priority = queue_id;
2445 return 0;
2446}
2447
72865317 2448\f
9ff55ae2
DDP
2449/* Creates and returns a new 'struct dp_netdev_actions', whose actions are
2450 * a copy of the 'size' bytes of 'actions'. */
a84cb64a
BP
2451struct dp_netdev_actions *
2452dp_netdev_actions_create(const struct nlattr *actions, size_t size)
2453{
2454 struct dp_netdev_actions *netdev_actions;
2455
9ff55ae2
DDP
2456 netdev_actions = xmalloc(sizeof *netdev_actions + size);
2457 memcpy(netdev_actions->actions, actions, size);
a84cb64a
BP
2458 netdev_actions->size = size;
2459
2460 return netdev_actions;
2461}
2462
a84cb64a 2463struct dp_netdev_actions *
61e7deb1 2464dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
a84cb64a 2465{
61e7deb1 2466 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
a84cb64a
BP
2467}
2468
61e7deb1
BP
2469static void
2470dp_netdev_actions_free(struct dp_netdev_actions *actions)
a84cb64a 2471{
61e7deb1 2472 free(actions);
a84cb64a
BP
2473}
2474\f
55e3ca97
DDP
2475static inline unsigned long long
2476cycles_counter(void)
2477{
2478#ifdef DPDK_NETDEV
2479 return rte_get_tsc_cycles();
2480#else
2481 return 0;
2482#endif
2483}
2484
2485/* Fake mutex to make sure that the calls to cycles_count_* are balanced */
2486extern struct ovs_mutex cycles_counter_fake_mutex;
2487
2488/* Start counting cycles. Must be followed by 'cycles_count_end()' */
2489static inline void
2490cycles_count_start(struct dp_netdev_pmd_thread *pmd)
2491 OVS_ACQUIRES(&cycles_counter_fake_mutex)
2492 OVS_NO_THREAD_SAFETY_ANALYSIS
2493{
2494 pmd->last_cycles = cycles_counter();
2495}
2496
2497/* Stop counting cycles and add them to the counter 'type' */
2498static inline void
2499cycles_count_end(struct dp_netdev_pmd_thread *pmd,
2500 enum pmd_cycles_counter_type type)
2501 OVS_RELEASES(&cycles_counter_fake_mutex)
2502 OVS_NO_THREAD_SAFETY_ANALYSIS
2503{
2504 unsigned long long interval = cycles_counter() - pmd->last_cycles;
2505
2506 non_atomic_ullong_add(&pmd->cycles.n[type], interval);
2507}
e4cfed38 2508
5794e276 2509static void
65f13b50 2510dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
9bbf1c3d
DDP
2511 struct dp_netdev_port *port,
2512 struct netdev_rxq *rxq)
e4cfed38 2513{
cd159f1a 2514 struct dp_packet *packets[NETDEV_MAX_BURST];
8cbf4f47 2515 int error, cnt;
e4cfed38 2516
55e3ca97 2517 cycles_count_start(pmd);
8cbf4f47 2518 error = netdev_rxq_recv(rxq, packets, &cnt);
55e3ca97 2519 cycles_count_end(pmd, PMD_CYCLES_POLLING);
e4cfed38 2520 if (!error) {
41ccaa24 2521 int i;
3c33f0ff
JR
2522
2523 *recirc_depth_get() = 0;
41ccaa24
PS
2524
2525 /* XXX: initialize md in netdev implementation. */
2526 for (i = 0; i < cnt; i++) {
efa2bcbb 2527 packets[i]->md = port->md;
41ccaa24 2528 }
55e3ca97 2529 cycles_count_start(pmd);
41ccaa24 2530 dp_netdev_input(pmd, packets, cnt);
55e3ca97 2531 cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
e4cfed38 2532 } else if (error != EAGAIN && error != EOPNOTSUPP) {
3c33f0ff 2533 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
e4cfed38
PS
2534
2535 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
3c33f0ff 2536 netdev_get_name(port->netdev), ovs_strerror(error));
e4cfed38
PS
2537 }
2538}
2539
a36de779
PS
2540/* Returns true if the datapath flows need to be revalidated. */
2541static bool
e4cfed38
PS
2542dpif_netdev_run(struct dpif *dpif)
2543{
2544 struct dp_netdev_port *port;
2545 struct dp_netdev *dp = get_dp_netdev(dpif);
b19befae
AW
2546 struct dp_netdev_pmd_thread *non_pmd = dp_netdev_get_pmd(dp,
2547 NON_PMD_CORE_ID);
a36de779 2548 uint64_t new_tnl_seq;
e4cfed38 2549
65f13b50 2550 ovs_mutex_lock(&dp->non_pmd_mutex);
a532e683 2551 CMAP_FOR_EACH (port, node, &dp->ports) {
55c955bd
PS
2552 if (!netdev_is_pmd(port->netdev)) {
2553 int i;
2554
2555 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
65f13b50 2556 dp_netdev_process_rxq_port(non_pmd, port, port->rxq[i]);
55c955bd 2557 }
e4cfed38
PS
2558 }
2559 }
65f13b50 2560 ovs_mutex_unlock(&dp->non_pmd_mutex);
1c1e46ed
AW
2561 dp_netdev_pmd_unref(non_pmd);
2562
a36de779
PS
2563 tnl_arp_cache_run();
2564 new_tnl_seq = seq_read(tnl_conf_seq);
2565
2566 if (dp->last_tnl_conf_seq != new_tnl_seq) {
2567 dp->last_tnl_conf_seq = new_tnl_seq;
2568 return true;
2569 }
2570 return false;
e4cfed38
PS
2571}
2572
2573static void
2574dpif_netdev_wait(struct dpif *dpif)
2575{
2576 struct dp_netdev_port *port;
2577 struct dp_netdev *dp = get_dp_netdev(dpif);
2578
59e6d833 2579 ovs_mutex_lock(&dp_netdev_mutex);
a532e683 2580 CMAP_FOR_EACH (port, node, &dp->ports) {
55c955bd
PS
2581 if (!netdev_is_pmd(port->netdev)) {
2582 int i;
2583
2584 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
2585 netdev_rxq_wait(port->rxq[i]);
2586 }
e4cfed38
PS
2587 }
2588 }
59e6d833 2589 ovs_mutex_unlock(&dp_netdev_mutex);
a36de779 2590 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
e4cfed38
PS
2591}
2592
f7791740 2593struct rxq_poll {
e4cfed38 2594 struct dp_netdev_port *port;
55c955bd 2595 struct netdev_rxq *rx;
e4cfed38
PS
2596};
2597
2598static int
65f13b50 2599pmd_load_queues(struct dp_netdev_pmd_thread *pmd,
f7791740 2600 struct rxq_poll **ppoll_list, int poll_cnt)
e4cfed38 2601{
f7791740 2602 struct rxq_poll *poll_list = *ppoll_list;
e4cfed38 2603 struct dp_netdev_port *port;
65f13b50 2604 int n_pmds_on_numa, index, i;
e4cfed38
PS
2605
2606 /* Simple scheduler for netdev rx polling. */
e4cfed38 2607 for (i = 0; i < poll_cnt; i++) {
65f13b50 2608 port_unref(poll_list[i].port);
e4cfed38
PS
2609 }
2610
2611 poll_cnt = 0;
65f13b50 2612 n_pmds_on_numa = get_n_pmd_threads_on_numa(pmd->dp, pmd->numa_id);
e4cfed38
PS
2613 index = 0;
2614
65f13b50 2615 CMAP_FOR_EACH (port, node, &pmd->dp->ports) {
a1fdee13
AW
2616 /* Calls port_try_ref() to prevent the main thread
2617 * from deleting the port. */
2618 if (port_try_ref(port)) {
65f13b50
AW
2619 if (netdev_is_pmd(port->netdev)
2620 && netdev_get_numa_id(port->netdev) == pmd->numa_id) {
a1fdee13
AW
2621 int i;
2622
2623 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
65f13b50 2624 if ((index % n_pmds_on_numa) == pmd->index) {
a1fdee13
AW
2625 poll_list = xrealloc(poll_list,
2626 sizeof *poll_list * (poll_cnt + 1));
2627
2628 port_ref(port);
2629 poll_list[poll_cnt].port = port;
2630 poll_list[poll_cnt].rx = port->rxq[i];
2631 poll_cnt++;
2632 }
2633 index++;
55c955bd 2634 }
e4cfed38 2635 }
a1fdee13
AW
2636 /* Drops the reference taken by port_try_ref() above. */
2637 port_unref(port);
e4cfed38
PS
2638 }
2639 }
2640
e4cfed38
PS
2641 *ppoll_list = poll_list;
2642 return poll_cnt;
2643}
2644
6c3eee82 2645static void *
e4cfed38 2646pmd_thread_main(void *f_)
6c3eee82 2647{
65f13b50 2648 struct dp_netdev_pmd_thread *pmd = f_;
e4cfed38 2649 unsigned int lc = 0;
f7791740 2650 struct rxq_poll *poll_list;
84067a4c 2651 unsigned int port_seq = PMD_INITIAL_SEQ;
e4cfed38
PS
2652 int poll_cnt;
2653 int i;
6c3eee82 2654
e4cfed38
PS
2655 poll_cnt = 0;
2656 poll_list = NULL;
2657
65f13b50
AW
2658 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
2659 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
2660 pmd_thread_setaffinity_cpu(pmd->core_id);
e4cfed38 2661reload:
65f13b50
AW
2662 emc_cache_init(&pmd->flow_cache);
2663 poll_cnt = pmd_load_queues(pmd, &poll_list, poll_cnt);
6c3eee82 2664
accf8626
AW
2665 /* Signal here to make sure the pmd finishes
2666 * reloading the updated configuration. */
2667 dp_netdev_pmd_reload_done(pmd);
2668
e4cfed38 2669 for (;;) {
6c3eee82
BP
2670 int i;
2671
e4cfed38 2672 for (i = 0; i < poll_cnt; i++) {
65f13b50 2673 dp_netdev_process_rxq_port(pmd, poll_list[i].port, poll_list[i].rx);
e4cfed38
PS
2674 }
2675
2676 if (lc++ > 1024) {
84067a4c 2677 unsigned int seq;
6c3eee82 2678
e4cfed38 2679 lc = 0;
84067a4c 2680
67ad54cb 2681 emc_cache_slow_sweep(&pmd->flow_cache);
84067a4c
JR
2682 ovsrcu_quiesce();
2683
65f13b50 2684 atomic_read_relaxed(&pmd->change_seq, &seq);
84067a4c
JR
2685 if (seq != port_seq) {
2686 port_seq = seq;
6c3eee82
BP
2687 break;
2688 }
2689 }
e4cfed38 2690 }
6c3eee82 2691
65f13b50 2692 emc_cache_uninit(&pmd->flow_cache);
9bbf1c3d 2693
65f13b50 2694 if (!latch_is_set(&pmd->exit_latch)){
e4cfed38
PS
2695 goto reload;
2696 }
6c3eee82 2697
e4cfed38
PS
2698 for (i = 0; i < poll_cnt; i++) {
2699 port_unref(poll_list[i].port);
6c3eee82 2700 }
6c3eee82 2701
accf8626
AW
2702 dp_netdev_pmd_reload_done(pmd);
2703
e4cfed38 2704 free(poll_list);
6c3eee82
BP
2705 return NULL;
2706}
2707
6b31e073
RW
2708static void
2709dp_netdev_disable_upcall(struct dp_netdev *dp)
2710 OVS_ACQUIRES(dp->upcall_rwlock)
2711{
2712 fat_rwlock_wrlock(&dp->upcall_rwlock);
2713}
2714
2715static void
2716dpif_netdev_disable_upcall(struct dpif *dpif)
2717 OVS_NO_THREAD_SAFETY_ANALYSIS
2718{
2719 struct dp_netdev *dp = get_dp_netdev(dpif);
2720 dp_netdev_disable_upcall(dp);
2721}
2722
2723static void
2724dp_netdev_enable_upcall(struct dp_netdev *dp)
2725 OVS_RELEASES(dp->upcall_rwlock)
2726{
2727 fat_rwlock_unlock(&dp->upcall_rwlock);
2728}
2729
2730static void
2731dpif_netdev_enable_upcall(struct dpif *dpif)
2732 OVS_NO_THREAD_SAFETY_ANALYSIS
2733{
2734 struct dp_netdev *dp = get_dp_netdev(dpif);
2735 dp_netdev_enable_upcall(dp);
2736}
2737
accf8626
AW
2738void
2739dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
2740{
2741 ovs_mutex_lock(&pmd->cond_mutex);
2742 xpthread_cond_signal(&pmd->cond);
2743 ovs_mutex_unlock(&pmd->cond_mutex);
2744}
2745
1c1e46ed
AW
2746/* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
2747 * the pointer on success, otherwise NULL.
2748 *
2749 * Caller must unref the returned reference. */
65f13b50 2750static struct dp_netdev_pmd_thread *
bd5131ba 2751dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
65f13b50
AW
2752{
2753 struct dp_netdev_pmd_thread *pmd;
55847abe 2754 const struct cmap_node *pnode;
65f13b50 2755
b19befae 2756 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
1c1e46ed
AW
2757 if (!pnode) {
2758 return NULL;
2759 }
65f13b50
AW
2760 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
2761
1c1e46ed 2762 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
65f13b50
AW
2763}
2764
f2eee189
AW
2765/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
2766static void
2767dp_netdev_set_nonpmd(struct dp_netdev *dp)
2768{
2769 struct dp_netdev_pmd_thread *non_pmd;
2770
2771 non_pmd = xzalloc(sizeof *non_pmd);
2772 dp_netdev_configure_pmd(non_pmd, dp, 0, NON_PMD_CORE_ID,
2773 OVS_NUMA_UNSPEC);
2774}
2775
1c1e46ed
AW
2776/* Caller must have valid pointer to 'pmd'. */
2777static bool
2778dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
2779{
2780 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
2781}
2782
2783static void
2784dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
2785{
2786 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
2787 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
2788 }
2789}
2790
2791/* Given cmap position 'pos', tries to ref the next node. If try_ref()
2792 * fails, keeps checking the next node until reaching the end of the cmap.
2793 *
2794 * Caller must unref the returned reference. */
2795static struct dp_netdev_pmd_thread *
2796dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
2797{
2798 struct dp_netdev_pmd_thread *next;
2799
2800 do {
2801 struct cmap_node *node;
2802
2803 node = cmap_next_position(&dp->poll_threads, pos);
2804 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
2805 : NULL;
2806 } while (next && !dp_netdev_pmd_try_ref(next));
2807
2808 return next;
2809}
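/* Illustrative sketch added in this revision (not part of the original
 * file): iterating every pmd thread with dp_netdev_pmd_get_next().  The
 * position only needs to be zeroed before the first call; every returned
 * pmd carries a reference that must be released with dp_netdev_pmd_unref(). */
static void
example_visit_all_pmds(struct dp_netdev *dp)
{
    struct dp_netdev_pmd_thread *pmd;
    struct cmap_position pos;

    memset(&pos, 0, sizeof pos);
    while ((pmd = dp_netdev_pmd_get_next(dp, &pos))) {
        /* ... use 'pmd' here ... */
        dp_netdev_pmd_unref(pmd);
    }
}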
2810
65f13b50 2811/* Configures the 'pmd' based on the input argument. */
6c3eee82 2812static void
65f13b50 2813dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
bd5131ba 2814 int index, unsigned core_id, int numa_id)
65f13b50
AW
2815{
2816 pmd->dp = dp;
2817 pmd->index = index;
2818 pmd->core_id = core_id;
2819 pmd->numa_id = numa_id;
1c1e46ed
AW
2820
2821 ovs_refcount_init(&pmd->ref_cnt);
65f13b50
AW
2822 latch_init(&pmd->exit_latch);
2823 atomic_init(&pmd->change_seq, PMD_INITIAL_SEQ);
accf8626
AW
2824 xpthread_cond_init(&pmd->cond, NULL);
2825 ovs_mutex_init(&pmd->cond_mutex);
1c1e46ed
AW
2826 ovs_mutex_init(&pmd->flow_mutex);
2827 dpcls_init(&pmd->cls);
2828 cmap_init(&pmd->flow_table);
65f13b50
AW
2829 /* init the 'flow_cache' since there is no
2830 * actual thread created for NON_PMD_CORE_ID. */
2831 if (core_id == NON_PMD_CORE_ID) {
2832 emc_cache_init(&pmd->flow_cache);
2833 }
2834 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
2835 hash_int(core_id, 0));
2836}
2837
1c1e46ed
AW
2838static void
2839dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
2840{
2841 dp_netdev_pmd_flow_flush(pmd);
2842 dpcls_destroy(&pmd->cls);
2843 cmap_destroy(&pmd->flow_table);
2844 ovs_mutex_destroy(&pmd->flow_mutex);
2845 latch_destroy(&pmd->exit_latch);
2846 xpthread_cond_destroy(&pmd->cond);
2847 ovs_mutex_destroy(&pmd->cond_mutex);
2848 free(pmd);
2849}
2850
2851/* Stops the pmd thread, removes it from the 'dp->poll_threads',
2852 * and unrefs the struct. */
65f13b50
AW
2853static void
2854dp_netdev_del_pmd(struct dp_netdev_pmd_thread *pmd)
6c3eee82 2855{
65f13b50 2856 /* Uninit the 'flow_cache' since there is
1c1e46ed 2857 * no actual thread to uninit it for NON_PMD_CORE_ID. */
65f13b50
AW
2858 if (pmd->core_id == NON_PMD_CORE_ID) {
2859 emc_cache_uninit(&pmd->flow_cache);
2860 } else {
2861 latch_set(&pmd->exit_latch);
2862 dp_netdev_reload_pmd__(pmd);
2863 ovs_numa_unpin_core(pmd->core_id);
2864 xpthread_join(pmd->thread, NULL);
2865 }
2866 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
1c1e46ed 2867 dp_netdev_pmd_unref(pmd);
65f13b50 2868}
6c3eee82 2869
65f13b50
AW
2870/* Destroys all pmd threads. */
2871static void
2872dp_netdev_destroy_all_pmds(struct dp_netdev *dp)
2873{
2874 struct dp_netdev_pmd_thread *pmd;
2875
2876 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2877 dp_netdev_del_pmd(pmd);
6c3eee82 2878 }
65f13b50 2879}
6c3eee82 2880
65f13b50
AW
2881/* Deletes all pmd threads on numa node 'numa_id'. */
2882static void
2883dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id)
2884{
2885 struct dp_netdev_pmd_thread *pmd;
6c3eee82 2886
65f13b50
AW
2887 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2888 if (pmd->numa_id == numa_id) {
2889 dp_netdev_del_pmd(pmd);
2890 }
6c3eee82 2891 }
65f13b50 2892}
6c3eee82 2893
65f13b50
AW
2894/* Checks the numa node id 'numa_id' and starts pmd threads for
2895 * that numa node if none are running there yet. */
2896static void
2897dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
2898{
2899 int n_pmds;
e4cfed38 2900
65f13b50
AW
2901 if (!ovs_numa_numa_id_is_valid(numa_id)) {
2902 VLOG_ERR("Cannot create pmd threads due to numa id (%d)"
2903 " being invalid", numa_id);
2904 return;
2905 }
2906
2907 n_pmds = get_n_pmd_threads_on_numa(dp, numa_id);
2908
2909 /* If there are already pmd threads created for the numa node,
2910 * do nothing. Otherwise, creates the pmd threads for the
2911 * numa node. */
2912 if (!n_pmds) {
2913 int can_have, n_unpinned, i;
2914
2915 n_unpinned = ovs_numa_get_n_unpinned_cores_on_numa(numa_id);
2916 if (!n_unpinned) {
2917 VLOG_ERR("Cannot create pmd threads due to lack of unpinned "
2918 "cores on numa node");
2919 return;
2920 }
6c3eee82 2921
f2eee189
AW
2922 /* If cpu mask is specified, uses all unpinned cores, otherwise
2923 * tries creating NR_PMD_THREADS pmd threads. */
2924 can_have = dp->pmd_cmask ? n_unpinned : MIN(n_unpinned, NR_PMD_THREADS);
65f13b50
AW
2925 for (i = 0; i < can_have; i++) {
2926 struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
bd5131ba 2927 unsigned core_id = ovs_numa_get_unpinned_core_on_numa(numa_id);
e4cfed38 2928
65f13b50
AW
2929 dp_netdev_configure_pmd(pmd, dp, i, core_id, numa_id);
2930 /* The pmd threads will distribute the devices' rx queues among
2931 * themselves. */
2932 pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
2933 }
2934 VLOG_INFO("Created %d pmd threads on numa node %d", can_have, numa_id);
6c3eee82
BP
2935 }
2936}
e4cfed38 2937
6c3eee82 2938\f
f2eee189
AW
2939/* Called after the pmd threads' configuration has changed. Restarts the
2940 * pmd threads with the new configuration. */
2941static void
2942dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
2943{
2944 struct dp_netdev_port *port;
2945
2946 CMAP_FOR_EACH (port, node, &dp->ports) {
2947 if (netdev_is_pmd(port->netdev)) {
2948 int numa_id = netdev_get_numa_id(port->netdev);
2949
2950 dp_netdev_set_pmds_on_numa(dp, numa_id);
2951 }
2952 }
2953}
2954
b5cbbcf6
AZ
2955static char *
2956dpif_netdev_get_datapath_version(void)
2957{
2958 return xstrdup("<built-in>");
2959}
2960
72865317 2961static void
1c1e46ed 2962dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
11bfdadd 2963 uint16_t tcp_flags, long long now)
72865317 2964{
eb94da30 2965 uint16_t flags;
72865317 2966
eb94da30
DDP
2967 atomic_store_relaxed(&netdev_flow->stats.used, now);
2968 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
2969 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
2970 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
2971 flags |= tcp_flags;
2972 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
51852a57
BP
2973}
2974
2975static void
1c1e46ed
AW
2976dp_netdev_count_packet(struct dp_netdev_pmd_thread *pmd,
2977 enum dp_stat_type type, int cnt)
51852a57 2978{
eb94da30 2979 non_atomic_ullong_add(&pmd->stats.n[type], cnt);
51852a57
BP
2980}
2981
623540e4 2982static int
e14deea0 2983dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
7af12bd7 2984 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
623540e4
EJ
2985 enum dpif_upcall_type type, const struct nlattr *userdata,
2986 struct ofpbuf *actions, struct ofpbuf *put_actions)
2987{
1c1e46ed 2988 struct dp_netdev *dp = pmd->dp;
623540e4 2989
623540e4
EJ
2990 if (OVS_UNLIKELY(!dp->upcall_cb)) {
2991 return ENODEV;
2992 }
2993
2994 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
2995 struct ds ds = DS_EMPTY_INITIALIZER;
623540e4 2996 char *packet_str;
cf62fa4c 2997 struct ofpbuf key;
623540e4
EJ
2998
2999 ofpbuf_init(&key, 0);
3000 odp_flow_key_from_flow(&key, flow, &wc->masks, flow->in_port.odp_port,
3001 true);
cf62fa4c
PS
3002 packet_str = ofp_packet_to_string(dp_packet_data(packet_),
3003 dp_packet_size(packet_));
623540e4 3004
6fd6ed71 3005 odp_flow_key_format(key.data, key.size, &ds);
623540e4
EJ
3006
3007 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
3008 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
3009
3010 ofpbuf_uninit(&key);
3011 free(packet_str);
6fd6ed71 3012
623540e4
EJ
3013 ds_destroy(&ds);
3014 }
3015
cf62fa4c 3016 return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
1c1e46ed 3017 actions, wc, put_actions, dp->upcall_aux);
623540e4
EJ
3018}
3019
9bbf1c3d 3020static inline uint32_t
048963aa
DDP
3021dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
3022 const struct miniflow *mf)
9bbf1c3d 3023{
048963aa 3024 uint32_t hash, recirc_depth;
9bbf1c3d 3025
2bc1bbd2 3026 hash = dp_packet_get_rss_hash(packet);
9bbf1c3d
DDP
3027 if (OVS_UNLIKELY(!hash)) {
3028 hash = miniflow_hash_5tuple(mf, 0);
2bc1bbd2 3029 dp_packet_set_rss_hash(packet, hash);
9bbf1c3d 3030 }
048963aa
DDP
3031
3032 /* The RSS hash must account for the recirculation depth to avoid
3033 * collisions in the exact match cache */
3034 recirc_depth = *recirc_depth_get_unsafe();
3035 if (OVS_UNLIKELY(recirc_depth)) {
3036 hash = hash_finish(hash, recirc_depth);
3037 dp_packet_set_rss_hash(packet, hash);
3038 }
9bbf1c3d
DDP
3039 return hash;
3040}
3041
567bbb2e 3042struct packet_batch {
8cbf4f47
DDP
3043 unsigned int packet_count;
3044 unsigned int byte_count;
3045 uint16_t tcp_flags;
3046
3047 struct dp_netdev_flow *flow;
3048
cd159f1a 3049 struct dp_packet *packets[NETDEV_MAX_BURST];
8cbf4f47
DDP
3050};
3051
3052static inline void
e14deea0 3053packet_batch_update(struct packet_batch *batch, struct dp_packet *packet,
9bbf1c3d 3054 const struct miniflow *mf)
8cbf4f47
DDP
3055{
3056 batch->tcp_flags |= miniflow_get_tcp_flags(mf);
3057 batch->packets[batch->packet_count++] = packet;
cf62fa4c 3058 batch->byte_count += dp_packet_size(packet);
8cbf4f47
DDP
3059}
3060
3061static inline void
41ccaa24 3062packet_batch_init(struct packet_batch *batch, struct dp_netdev_flow *flow)
8cbf4f47 3063{
11e5cf1f 3064 flow->batch = batch;
8cbf4f47 3065
11e5cf1f 3066 batch->flow = flow;
8cbf4f47
DDP
3067 batch->packet_count = 0;
3068 batch->byte_count = 0;
3069 batch->tcp_flags = 0;
8cbf4f47
DDP
3070}
3071
3072static inline void
65f13b50 3073packet_batch_execute(struct packet_batch *batch,
abcf3ef4 3074 struct dp_netdev_pmd_thread *pmd,
11bfdadd 3075 long long now)
8cbf4f47
DDP
3076{
3077 struct dp_netdev_actions *actions;
3078 struct dp_netdev_flow *flow = batch->flow;
3079
11e5cf1f 3080 dp_netdev_flow_used(flow, batch->packet_count, batch->byte_count,
11bfdadd 3081 batch->tcp_flags, now);
8cbf4f47
DDP
3082
3083 actions = dp_netdev_flow_get_actions(flow);
3084
65f13b50 3085 dp_netdev_execute_actions(pmd, batch->packets, batch->packet_count, true,
41ccaa24 3086 actions->actions, actions->size);
8cbf4f47
DDP
3087}
3088
8aaa125d 3089static inline void
e14deea0 3090dp_netdev_queue_batches(struct dp_packet *pkt,
9bbf1c3d 3091 struct dp_netdev_flow *flow, const struct miniflow *mf,
8aaa125d 3092 struct packet_batch *batches, size_t *n_batches)
9bbf1c3d 3093{
8aaa125d 3094 struct packet_batch *batch = flow->batch;
11e5cf1f
DDP
3095
3096 if (OVS_LIKELY(batch)) {
3097 packet_batch_update(batch, pkt, mf);
8aaa125d 3098 return;
9bbf1c3d
DDP
3099 }
3100
3101 batch = &batches[(*n_batches)++];
41ccaa24 3102 packet_batch_init(batch, flow);
9bbf1c3d 3103 packet_batch_update(batch, pkt, mf);
9bbf1c3d
DDP
3104}
3105
3106static inline void
e14deea0 3107dp_packet_swap(struct dp_packet **a, struct dp_packet **b)
9bbf1c3d 3108{
e14deea0 3109 struct dp_packet *tmp = *a;
9bbf1c3d
DDP
3110 *a = *b;
3111 *b = tmp;
3112}
3113
3114/* Try to process all ('cnt') the 'packets' using only the exact match cache
8aaa125d
DDP
3115 * 'flow_cache'. If a flow is not found for a packet 'packets[i]', the
3116 * miniflow is copied into 'keys' and the packet pointer is moved to the
3117 * beginning of the 'packets' array.
9bbf1c3d
DDP
3118 *
3119 * The function returns the number of packets that need to be processed in the
3120 * 'packets' array (they have been moved to the beginning of the vector).
3121 */
3122static inline size_t
e14deea0 3123emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet **packets,
8aaa125d
DDP
3124 size_t cnt, struct netdev_flow_key *keys,
3125 struct packet_batch batches[], size_t *n_batches)
72865317 3126{
65f13b50 3127 struct emc_cache *flow_cache = &pmd->flow_cache;
8aaa125d
DDP
3128 struct netdev_flow_key key;
3129 size_t i, notfound_cnt = 0;
8cbf4f47 3130
0de8783a 3131 miniflow_initialize(&key.mf, key.buf);
84d6d5eb 3132 for (i = 0; i < cnt; i++) {
9bbf1c3d 3133 struct dp_netdev_flow *flow;
9bbf1c3d 3134
cf62fa4c 3135 if (OVS_UNLIKELY(dp_packet_size(packets[i]) < ETH_HEADER_LEN)) {
e14deea0 3136 dp_packet_delete(packets[i]);
84d6d5eb
EJ
3137 continue;
3138 }
8cbf4f47 3139
cf62fa4c 3140 miniflow_extract(packets[i], &key.mf);
0de8783a 3141 key.len = 0; /* Not computed yet. */
048963aa 3142 key.hash = dpif_netdev_packet_get_rss_hash(packets[i], &key.mf);
9bbf1c3d 3143
0de8783a 3144 flow = emc_lookup(flow_cache, &key);
8aaa125d
DDP
3145 if (OVS_LIKELY(flow)) {
3146 dp_netdev_queue_batches(packets[i], flow, &key.mf, batches,
3147 n_batches);
3148 } else {
9bbf1c3d 3149 if (i != notfound_cnt) {
e14deea0 3150 dp_packet_swap(&packets[i], &packets[notfound_cnt]);
9bbf1c3d
DDP
3151 }
3152
3153 keys[notfound_cnt++] = key;
3154 }
3155 }
3156
8aaa125d 3157 dp_netdev_count_packet(pmd, DP_STAT_EXACT_HIT, cnt - notfound_cnt);
4f150744 3158
9bbf1c3d
DDP
3159 return notfound_cnt;
3160}
3161
3162static inline void
65f13b50 3163fast_path_processing(struct dp_netdev_pmd_thread *pmd,
e14deea0 3164 struct dp_packet **packets, size_t cnt,
8aaa125d
DDP
3165 struct netdev_flow_key *keys,
3166 struct packet_batch batches[], size_t *n_batches)
9bbf1c3d 3167{
1a0d5831 3168#if !defined(__CHECKER__) && !defined(_WIN32)
9bbf1c3d
DDP
3169 const size_t PKT_ARRAY_SIZE = cnt;
3170#else
1a0d5831 3171 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 3172 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
9bbf1c3d 3173#endif
0de8783a 3174 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
65f13b50
AW
3175 struct dp_netdev *dp = pmd->dp;
3176 struct emc_cache *flow_cache = &pmd->flow_cache;
8aaa125d 3177 int miss_cnt = 0, lost_cnt = 0;
9bbf1c3d 3178 bool any_miss;
8aaa125d 3179 size_t i;
9bbf1c3d
DDP
3180
3181 for (i = 0; i < cnt; i++) {
0de8783a
JR
3182 /* Key length is needed in all the cases, hash computed on demand. */
3183 keys[i].len = netdev_flow_key_size(count_1bits(keys[i].mf.map));
9bbf1c3d 3184 }
1c1e46ed 3185 any_miss = !dpcls_lookup(&pmd->cls, keys, rules, cnt);
623540e4
EJ
3186 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
3187 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
3188 struct ofpbuf actions, put_actions;
7af12bd7 3189 ovs_u128 ufid;
623540e4
EJ
3190
3191 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
3192 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
3193
3194 for (i = 0; i < cnt; i++) {
0de8783a 3195 struct dp_netdev_flow *netdev_flow;
623540e4 3196 struct ofpbuf *add_actions;
0de8783a 3197 struct match match;
623540e4
EJ
3198 int error;
3199
0de8783a 3200 if (OVS_LIKELY(rules[i])) {
623540e4
EJ
3201 continue;
3202 }
3203
3204 /* It's possible that an earlier slow path execution installed
0de8783a 3205 * a rule covering this flow. In this case, it's a lot cheaper
623540e4 3206 * to catch it here than execute a miss. */
1c1e46ed 3207 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &keys[i]);
623540e4 3208 if (netdev_flow) {
0de8783a 3209 rules[i] = &netdev_flow->cr;
623540e4
EJ
3210 continue;
3211 }
3212
60fc3b7b
DDP
3213 miss_cnt++;
3214
0de8783a 3215 miniflow_expand(&keys[i].mf, &match.flow);
623540e4
EJ
3216
3217 ofpbuf_clear(&actions);
3218 ofpbuf_clear(&put_actions);
3219
7af12bd7 3220 dpif_flow_hash(dp->dpif, &match.flow, sizeof match.flow, &ufid);
1c1e46ed 3221 error = dp_netdev_upcall(pmd, packets[i], &match.flow, &match.wc,
7af12bd7 3222 &ufid, DPIF_UC_MISS, NULL, &actions,
0de8783a 3223 &put_actions);
623540e4 3224 if (OVS_UNLIKELY(error && error != ENOSPC)) {
7ad20cbd 3225 dp_packet_delete(packets[i]);
60fc3b7b 3226 lost_cnt++;
623540e4
EJ
3227 continue;
3228 }
3229
3230 /* We can't allow the packet batching in the next loop to execute
3231 * the actions. Otherwise, if there are any slow path actions,
3232 * we'll send the packet up twice. */
41ccaa24 3233 dp_netdev_execute_actions(pmd, &packets[i], 1, true,
6fd6ed71 3234 actions.data, actions.size);
623540e4 3235
6fd6ed71 3236 add_actions = put_actions.size ? &put_actions : &actions;
0de8783a
JR
3237 if (OVS_LIKELY(error != ENOSPC)) {
3238 /* XXX: There's a race window where a flow covering this packet
3239 * could have already been installed since we last did the flow
3240 * lookup before upcall. This could be solved by moving the
3241 * mutex lock outside the loop, but that's an awful long time
3242 * to be locking everyone out of making flow installs. If we
3243 * move to a per-core classifier, it would be reasonable. */
1c1e46ed
AW
3244 ovs_mutex_lock(&pmd->flow_mutex);
3245 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &keys[i]);
0de8783a 3246 if (OVS_LIKELY(!netdev_flow)) {
1c1e46ed 3247 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
6fd6ed71
PS
3248 add_actions->data,
3249 add_actions->size);
0de8783a 3250 }
1c1e46ed 3251 ovs_mutex_unlock(&pmd->flow_mutex);
0de8783a 3252
0de8783a 3253 emc_insert(flow_cache, &keys[i], netdev_flow);
623540e4 3254 }
623540e4
EJ
3255 }
3256
3257 ofpbuf_uninit(&actions);
3258 ofpbuf_uninit(&put_actions);
3259 fat_rwlock_unlock(&dp->upcall_rwlock);
60fc3b7b 3260 dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
ac8c2081 3261 } else if (OVS_UNLIKELY(any_miss)) {
ac8c2081 3262 for (i = 0; i < cnt; i++) {
0de8783a 3263 if (OVS_UNLIKELY(!rules[i])) {
e14deea0 3264 dp_packet_delete(packets[i]);
8aaa125d
DDP
3265 lost_cnt++;
3266 miss_cnt++;
ac8c2081
DDP
3267 }
3268 }
623540e4 3269 }
84d6d5eb 3270
8cbf4f47 3271 for (i = 0; i < cnt; i++) {
e14deea0 3272 struct dp_packet *packet = packets[i];
84d6d5eb 3273 struct dp_netdev_flow *flow;
8cbf4f47 3274
0de8783a 3275 if (OVS_UNLIKELY(!rules[i])) {
84d6d5eb
EJ
3276 continue;
3277 }
3278
84d6d5eb 3279 flow = dp_netdev_flow_cast(rules[i]);
0de8783a 3280
0de8783a 3281 emc_insert(flow_cache, &keys[i], flow);
8aaa125d 3282 dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
8cbf4f47
DDP
3283 }
3284
8aaa125d
DDP
3285 dp_netdev_count_packet(pmd, DP_STAT_MASKED_HIT, cnt - miss_cnt);
3286 dp_netdev_count_packet(pmd, DP_STAT_MISS, miss_cnt);
3287 dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
72865317
BP
3288}
3289
adcf00ba 3290static void
65f13b50 3291dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
e14deea0 3292 struct dp_packet **packets, int cnt)
9bbf1c3d 3293{
1a0d5831 3294#if !defined(__CHECKER__) && !defined(_WIN32)
9bbf1c3d
DDP
3295 const size_t PKT_ARRAY_SIZE = cnt;
3296#else
1a0d5831 3297 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 3298 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
9bbf1c3d
DDP
3299#endif
3300 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
8aaa125d 3301 struct packet_batch batches[PKT_ARRAY_SIZE];
11bfdadd 3302 long long now = time_msec();
8aaa125d 3303 size_t newcnt, n_batches, i;
9bbf1c3d 3304
8aaa125d
DDP
3305 n_batches = 0;
3306 newcnt = emc_processing(pmd, packets, cnt, keys, batches, &n_batches);
9bbf1c3d 3307 if (OVS_UNLIKELY(newcnt)) {
8aaa125d
DDP
3308 fast_path_processing(pmd, packets, newcnt, keys, batches, &n_batches);
3309 }
3310
603f2ce0
EJ
3311 for (i = 0; i < n_batches; i++) {
3312 batches[i].flow->batch = NULL;
3313 }
3314
8aaa125d
DDP
3315 for (i = 0; i < n_batches; i++) {
3316 packet_batch_execute(&batches[i], pmd, now);
9bbf1c3d
DDP
3317 }
3318}
3319
9080a111 3320struct dp_netdev_execute_aux {
65f13b50 3321 struct dp_netdev_pmd_thread *pmd;
9080a111
JR
3322};
3323
6b31e073 3324static void
623540e4
EJ
3325dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
3326 void *aux)
6b31e073
RW
3327{
3328 struct dp_netdev *dp = get_dp_netdev(dpif);
623540e4 3329 dp->upcall_aux = aux;
6b31e073
RW
3330 dp->upcall_cb = cb;
3331}
3332
ac8c2081 3333static void
e14deea0 3334dp_netdev_drop_packets(struct dp_packet **packets, int cnt, bool may_steal)
ac8c2081 3335{
ac8c2081 3336 if (may_steal) {
a36de779
PS
3337 int i;
3338
ac8c2081 3339 for (i = 0; i < cnt; i++) {
e14deea0 3340 dp_packet_delete(packets[i]);
ac8c2081
DDP
3341 }
3342 }
3343}
3344
a36de779
PS
3345static int
3346push_tnl_action(const struct dp_netdev *dp,
3347 const struct nlattr *attr,
e14deea0 3348 struct dp_packet **packets, int cnt)
a36de779
PS
3349{
3350 struct dp_netdev_port *tun_port;
3351 const struct ovs_action_push_tnl *data;
3352
3353 data = nl_attr_get(attr);
3354
3355 tun_port = dp_netdev_lookup_port(dp, u32_to_odp(data->tnl_port));
3356 if (!tun_port) {
3357 return -EINVAL;
3358 }
3359 netdev_push_header(tun_port->netdev, packets, cnt, data);
3360
3361 return 0;
3362}
3363
3364static void
28e2fa02
DDP
3365dp_netdev_clone_pkt_batch(struct dp_packet **dst_pkts,
3366 struct dp_packet **src_pkts, int cnt)
a36de779
PS
3367{
3368 int i;
3369
3370 for (i = 0; i < cnt; i++) {
28e2fa02 3371 dst_pkts[i] = dp_packet_clone(src_pkts[i]);
a36de779
PS
3372 }
3373}
3374
9080a111 3375static void
e14deea0 3376dp_execute_cb(void *aux_, struct dp_packet **packets, int cnt,
09f9da0b 3377 const struct nlattr *a, bool may_steal)
8a4e3a85 3378 OVS_NO_THREAD_SAFETY_ANALYSIS
9080a111
JR
3379{
3380 struct dp_netdev_execute_aux *aux = aux_;
623540e4 3381 uint32_t *depth = recirc_depth_get();
28e2fa02
DDP
3382 struct dp_netdev_pmd_thread *pmd = aux->pmd;
3383 struct dp_netdev *dp = pmd->dp;
09f9da0b 3384 int type = nl_attr_type(a);
8a4e3a85 3385 struct dp_netdev_port *p;
8cbf4f47 3386 int i;
9080a111 3387
09f9da0b
JR
3388 switch ((enum ovs_action_attr)type) {
3389 case OVS_ACTION_ATTR_OUTPUT:
623540e4 3390 p = dp_netdev_lookup_port(dp, u32_to_odp(nl_attr_get_u32(a)));
26a5075b 3391 if (OVS_LIKELY(p)) {
65f13b50 3392 netdev_send(p->netdev, pmd->core_id, packets, cnt, may_steal);
ac8c2081 3393 return;
8a4e3a85 3394 }
09f9da0b
JR
3395 break;
3396
a36de779
PS
3397 case OVS_ACTION_ATTR_TUNNEL_PUSH:
3398 if (*depth < MAX_RECIRC_DEPTH) {
cd159f1a 3399 struct dp_packet *tnl_pkt[NETDEV_MAX_BURST];
a36de779
PS
3400 int err;
3401
3402 if (!may_steal) {
3403 dp_netdev_clone_pkt_batch(tnl_pkt, packets, cnt);
3404 packets = tnl_pkt;
3405 }
3406
3407 err = push_tnl_action(dp, a, packets, cnt);
3408 if (!err) {
3409 (*depth)++;
3410 dp_netdev_input(pmd, packets, cnt);
3411 (*depth)--;
3412 } else {
3413 dp_netdev_drop_packets(tnl_pkt, cnt, !may_steal);
3414 }
3415 return;
3416 }
3417 break;
3418
3419 case OVS_ACTION_ATTR_TUNNEL_POP:
3420 if (*depth < MAX_RECIRC_DEPTH) {
3421 odp_port_t portno = u32_to_odp(nl_attr_get_u32(a));
3422
3423 p = dp_netdev_lookup_port(dp, portno);
3424 if (p) {
cd159f1a 3425 struct dp_packet *tnl_pkt[NETDEV_MAX_BURST];
a36de779
PS
3426 int err;
3427
3428 if (!may_steal) {
3429 dp_netdev_clone_pkt_batch(tnl_pkt, packets, cnt);
3430 packets = tnl_pkt;
3431 }
3432
3433 err = netdev_pop_header(p->netdev, packets, cnt);
3434 if (!err) {
3435
3436 for (i = 0; i < cnt; i++) {
3437 packets[i]->md.in_port.odp_port = portno;
3438 }
3439
3440 (*depth)++;
3441 dp_netdev_input(pmd, packets, cnt);
3442 (*depth)--;
3443 } else {
3444 dp_netdev_drop_packets(tnl_pkt, cnt, !may_steal);
3445 }
3446 return;
3447 }
3448 }
3449 break;
3450
623540e4
EJ
3451 case OVS_ACTION_ATTR_USERSPACE:
3452 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
3453 const struct nlattr *userdata;
3454 struct ofpbuf actions;
3455 struct flow flow;
7af12bd7 3456 ovs_u128 ufid;
4fc65926 3457
623540e4
EJ
3458 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
3459 ofpbuf_init(&actions, 0);
8cbf4f47 3460
623540e4
EJ
3461 for (i = 0; i < cnt; i++) {
3462 int error;
3463
3464 ofpbuf_clear(&actions);
3465
cf62fa4c 3466 flow_extract(packets[i], &flow);
7af12bd7 3467 dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
1c1e46ed 3468 error = dp_netdev_upcall(pmd, packets[i], &flow, NULL, &ufid,
7af12bd7 3469                                          DPIF_UC_ACTION, userdata, &actions,
623540e4
EJ
3470 NULL);
3471 if (!error || error == ENOSPC) {
ac8c2081 3472 dp_netdev_execute_actions(pmd, &packets[i], 1, may_steal,
6fd6ed71 3473 actions.data, actions.size);
ac8c2081 3474 } else if (may_steal) {
e14deea0 3475 dp_packet_delete(packets[i]);
623540e4 3476 }
db73f716 3477 }
623540e4
EJ
3478 ofpbuf_uninit(&actions);
3479 fat_rwlock_unlock(&dp->upcall_rwlock);
6b31e073 3480
ac8c2081
DDP
3481 return;
3482 }
09f9da0b 3483 break;
572f732a 3484
adcf00ba
AZ
3485 case OVS_ACTION_ATTR_RECIRC:
3486 if (*depth < MAX_RECIRC_DEPTH) {
cd159f1a 3487 struct dp_packet *recirc_pkts[NETDEV_MAX_BURST];
572f732a 3488
28e2fa02
DDP
3489 if (!may_steal) {
3490 dp_netdev_clone_pkt_batch(recirc_pkts, packets, cnt);
3491 packets = recirc_pkts;
3492 }
8cbf4f47 3493
28e2fa02
DDP
3494 for (i = 0; i < cnt; i++) {
3495 packets[i]->md.recirc_id = nl_attr_get_u32(a);
8cbf4f47 3496 }
28e2fa02
DDP
3497
3498 (*depth)++;
3499 dp_netdev_input(pmd, packets, cnt);
adcf00ba
AZ
3500 (*depth)--;
3501
ac8c2081 3502 return;
adcf00ba 3503 }
ac8c2081
DDP
3504
3505 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
572f732a 3506 break;
572f732a 3507
09f9da0b
JR
3508 case OVS_ACTION_ATTR_PUSH_VLAN:
3509 case OVS_ACTION_ATTR_POP_VLAN:
3510 case OVS_ACTION_ATTR_PUSH_MPLS:
3511 case OVS_ACTION_ATTR_POP_MPLS:
3512 case OVS_ACTION_ATTR_SET:
6d670e7f 3513 case OVS_ACTION_ATTR_SET_MASKED:
09f9da0b 3514 case OVS_ACTION_ATTR_SAMPLE:
53e1d6f1 3515 case OVS_ACTION_ATTR_HASH:
09f9da0b
JR
3516 case OVS_ACTION_ATTR_UNSPEC:
3517 case __OVS_ACTION_ATTR_MAX:
3518 OVS_NOT_REACHED();
da546e07 3519 }
ac8c2081
DDP
3520
3521 dp_netdev_drop_packets(packets, cnt, may_steal);
98403001
BP
3522}
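
/* Minimal sketch (not part of the datapath) of the two idioms the
 * TUNNEL_PUSH, TUNNEL_POP and RECIRC cases above combine: cloning the batch
 * when the action may not steal the caller's packets, and bounding re-entry
 * into dp_netdev_input() with the per-thread 'recirc_depth' counter.  The
 * function name 'example_reinject' is hypothetical; everything it calls
 * exists in this file. */
static void
example_reinject(struct dp_netdev_pmd_thread *pmd,
                 struct dp_packet **packets, int cnt, bool may_steal)
{
    struct dp_packet *clones[NETDEV_MAX_BURST];
    uint32_t *depth = recirc_depth_get();   /* Thread-local, no locking. */

    if (*depth >= MAX_RECIRC_DEPTH) {
        VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
        dp_netdev_drop_packets(packets, cnt, may_steal);
        return;
    }

    if (!may_steal) {
        /* The caller keeps its packets, so reinject private copies that the
         * nested call is free to consume. */
        dp_netdev_clone_pkt_batch(clones, packets, cnt);
        packets = clones;
    }

    (*depth)++;
    dp_netdev_input(pmd, packets, cnt);     /* May recurse back into actions. */
    (*depth)--;
}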
3523
4edb9ae9 3524static void
65f13b50 3525dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
e14deea0 3526 struct dp_packet **packets, int cnt,
41ccaa24 3527 bool may_steal,
9080a111 3528 const struct nlattr *actions, size_t actions_len)
72865317 3529{
41ccaa24 3530 struct dp_netdev_execute_aux aux = { pmd };
9080a111 3531
41ccaa24 3532 odp_execute_actions(&aux, packets, cnt, may_steal, actions,
8cbf4f47 3533 actions_len, dp_execute_cb);
72865317
BP
3534}
3535
3536const struct dpif_class dpif_netdev_class = {
72865317 3537 "netdev",
6553d06b 3538 dpif_netdev_init,
2197d7ab 3539 dpif_netdev_enumerate,
0aeaabc8 3540 dpif_netdev_port_open_type,
72865317
BP
3541 dpif_netdev_open,
3542 dpif_netdev_close,
7dab847a 3543 dpif_netdev_destroy,
e4cfed38
PS
3544 dpif_netdev_run,
3545 dpif_netdev_wait,
72865317 3546 dpif_netdev_get_stats,
72865317
BP
3547 dpif_netdev_port_add,
3548 dpif_netdev_port_del,
3549 dpif_netdev_port_query_by_number,
3550 dpif_netdev_port_query_by_name,
98403001 3551 NULL, /* port_get_pid */
b0ec0f27
BP
3552 dpif_netdev_port_dump_start,
3553 dpif_netdev_port_dump_next,
3554 dpif_netdev_port_dump_done,
72865317
BP
3555 dpif_netdev_port_poll,
3556 dpif_netdev_port_poll_wait,
72865317 3557 dpif_netdev_flow_flush,
ac64794a
BP
3558 dpif_netdev_flow_dump_create,
3559 dpif_netdev_flow_dump_destroy,
3560 dpif_netdev_flow_dump_thread_create,
3561 dpif_netdev_flow_dump_thread_destroy,
704a1e09 3562 dpif_netdev_flow_dump_next,
1a0c894a 3563 dpif_netdev_operate,
6b31e073
RW
3564 NULL, /* recv_set */
3565 NULL, /* handlers_set */
f2eee189 3566 dpif_netdev_pmd_set,
5bf93d67 3567 dpif_netdev_queue_to_priority,
6b31e073
RW
3568 NULL, /* recv */
3569 NULL, /* recv_wait */
3570 NULL, /* recv_purge */
3571 dpif_netdev_register_upcall_cb,
3572 dpif_netdev_enable_upcall,
3573 dpif_netdev_disable_upcall,
b5cbbcf6 3574 dpif_netdev_get_datapath_version,
72865317 3575};
614c4892 3576
74cc3969
BP
3577static void
3578dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
3579 const char *argv[], void *aux OVS_UNUSED)
3580{
59e6d833
BP
3581 struct dp_netdev_port *old_port;
3582 struct dp_netdev_port *new_port;
74cc3969 3583 struct dp_netdev *dp;
ff073a71 3584 odp_port_t port_no;
74cc3969 3585
8a4e3a85 3586 ovs_mutex_lock(&dp_netdev_mutex);
74cc3969
BP
3587 dp = shash_find_data(&dp_netdevs, argv[1]);
3588 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
8a4e3a85 3589 ovs_mutex_unlock(&dp_netdev_mutex);
74cc3969
BP
3590 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
3591 return;
3592 }
8a4e3a85
BP
3593 ovs_refcount_ref(&dp->ref_cnt);
3594 ovs_mutex_unlock(&dp_netdev_mutex);
74cc3969 3595
59e6d833
BP
3596 ovs_mutex_lock(&dp->port_mutex);
3597 if (get_port_by_name(dp, argv[2], &old_port)) {
74cc3969 3598 unixctl_command_reply_error(conn, "unknown port");
8a4e3a85 3599 goto exit;
74cc3969
BP
3600 }
3601
ff073a71
BP
3602 port_no = u32_to_odp(atoi(argv[3]));
3603 if (!port_no || port_no == ODPP_NONE) {
74cc3969 3604 unixctl_command_reply_error(conn, "bad port number");
8a4e3a85 3605 goto exit;
74cc3969 3606 }
ff073a71 3607 if (dp_netdev_lookup_port(dp, port_no)) {
74cc3969 3608 unixctl_command_reply_error(conn, "port number already in use");
8a4e3a85 3609 goto exit;
74cc3969 3610 }
59e6d833
BP
3611
3612 /* Remove old port. */
efa2bcbb 3613 cmap_remove(&dp->ports, &old_port->node, hash_port_no(old_port->md.in_port.odp_port));
59e6d833
BP
3614 ovsrcu_postpone(free, old_port);
3615
3616 /* Insert new port (cmap semantics mean we cannot re-insert 'old_port'). */
3617 new_port = xmemdup(old_port, sizeof *old_port);
efa2bcbb 3618 new_port->md.in_port.odp_port = port_no;
59e6d833
BP
3619 cmap_insert(&dp->ports, &new_port->node, hash_port_no(port_no));
3620
d33ed218 3621 seq_change(dp->port_seq);
74cc3969 3622 unixctl_command_reply(conn, NULL);
8a4e3a85
BP
3623
3624exit:
59e6d833 3625 ovs_mutex_unlock(&dp->port_mutex);
8a4e3a85 3626 dp_netdev_unref(dp);
74cc3969
BP
3627}
3628
c40b890f
BP
3629static void
3630dpif_dummy_delete_port(struct unixctl_conn *conn, int argc OVS_UNUSED,
3631 const char *argv[], void *aux OVS_UNUSED)
3632{
3633 struct dp_netdev_port *port;
3634 struct dp_netdev *dp;
3635
3636 ovs_mutex_lock(&dp_netdev_mutex);
3637 dp = shash_find_data(&dp_netdevs, argv[1]);
3638 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
3639 ovs_mutex_unlock(&dp_netdev_mutex);
3640 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
3641 return;
3642 }
3643 ovs_refcount_ref(&dp->ref_cnt);
3644 ovs_mutex_unlock(&dp_netdev_mutex);
3645
3646 ovs_mutex_lock(&dp->port_mutex);
3647 if (get_port_by_name(dp, argv[2], &port)) {
3648 unixctl_command_reply_error(conn, "unknown port");
efa2bcbb 3649 } else if (port->md.in_port.odp_port == ODPP_LOCAL) {
c40b890f
BP
3650 unixctl_command_reply_error(conn, "can't delete local port");
3651 } else {
3652 do_del_port(dp, port);
3653 unixctl_command_reply(conn, NULL);
3654 }
3655 ovs_mutex_unlock(&dp->port_mutex);
3656
3657 dp_netdev_unref(dp);
3658}
3659
0cbfe35d
BP
3660static void
3661dpif_dummy_register__(const char *type)
3662{
3663 struct dpif_class *class;
3664
3665 class = xmalloc(sizeof *class);
3666 *class = dpif_netdev_class;
3667 class->type = xstrdup(type);
3668 dp_register_provider(class);
3669}
3670
614c4892 3671void
0cbfe35d 3672dpif_dummy_register(bool override)
614c4892 3673{
0cbfe35d
BP
3674 if (override) {
3675 struct sset types;
3676 const char *type;
3677
3678 sset_init(&types);
3679 dp_enumerate_types(&types);
3680 SSET_FOR_EACH (type, &types) {
3681 if (!dp_unregister_provider(type)) {
3682 dpif_dummy_register__(type);
3683 }
3684 }
3685 sset_destroy(&types);
614c4892 3686 }
0cbfe35d
BP
3687
3688 dpif_dummy_register__("dummy");
74cc3969
BP
3689
3690 unixctl_command_register("dpif-dummy/change-port-number",
74467d5c 3691 "dp port new-number",
74cc3969 3692 3, 3, dpif_dummy_change_port_number, NULL);
74467d5c 3693 unixctl_command_register("dpif-dummy/delete-port", "dp port",
c40b890f 3694 2, 2, dpif_dummy_delete_port, NULL);
614c4892 3695}
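
/* Usage sketch for the two unixctl commands registered above.  The datapath
 * and port names below ("dp0", "p1") are hypothetical; the argument shapes
 * ("dp port new-number" and "dp port") come from the registrations:
 *
 *     ovs-appctl dpif-dummy/change-port-number dp0 p1 5
 *     ovs-appctl dpif-dummy/delete-port dp0 p1
 *
 * Both commands only operate on dummy datapaths; on anything else they reply
 * "unknown datapath or not a dummy". */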
0de8783a
JR
3696\f
3697/* Datapath Classifier. */
3698
3699/* A set of rules that all have the same fields wildcarded. */
3700struct dpcls_subtable {
3701 /* The fields are only used by writers. */
3702 struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */
3703
3704 /* These fields are accessed by readers. */
3705 struct cmap rules; /* Contains "struct dpcls_rule"s. */
3706 struct netdev_flow_key mask; /* Wildcards for fields (const). */
3707 /* 'mask' must be the last field, additional space is allocated here. */
3708};
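
/* Worked example (hypothetical rules) of how rules map onto subtables: rules
 * that wildcard the same set of fields share a subtable, because they share
 * a mask; a rule with a different mask goes into a different subtable:
 *
 *     rule A: nw_src=10.1.2.0/24      --> subtable with mask {nw_src/24}
 *     rule B: nw_src=192.168.0.0/24   --> same subtable as A
 *     rule C: tp_dst=80/0xffff        --> subtable with mask {tp_dst}
 *
 * dpcls_lookup() below probes each subtable in priority-vector order and
 * keeps the first rule that matches a given key. */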
3709
3710/* Initializes 'cls' as a classifier that initially contains no classification
3711 * rules. */
3712static void
3713dpcls_init(struct dpcls *cls)
3714{
3715 cmap_init(&cls->subtables_map);
3716 pvector_init(&cls->subtables);
3717}
3718
3719static void
3720dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
3721{
3722 pvector_remove(&cls->subtables, subtable);
3723 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
3724 subtable->mask.hash);
3725 cmap_destroy(&subtable->rules);
3726 ovsrcu_postpone(free, subtable);
3727}
3728
3729/* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
3730 * caller's responsibility.
3731 * May only be called after all the readers have been terminated. */
3732static void
3733dpcls_destroy(struct dpcls *cls)
3734{
3735 if (cls) {
3736 struct dpcls_subtable *subtable;
3737
3738 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
3739 dpcls_destroy_subtable(cls, subtable);
3740 }
3741 cmap_destroy(&cls->subtables_map);
3742 pvector_destroy(&cls->subtables);
3743 }
3744}
3745
3746static struct dpcls_subtable *
3747dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
3748{
3749 struct dpcls_subtable *subtable;
3750
3751 /* Need to add one. */
caeb4906
JR
3752 subtable = xmalloc(sizeof *subtable
3753 - sizeof subtable->mask.mf + mask->len);
0de8783a
JR
3754 cmap_init(&subtable->rules);
3755 netdev_flow_key_clone(&subtable->mask, mask);
3756 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
3757 pvector_insert(&cls->subtables, subtable, 0);
802f84ff 3758 pvector_publish(&cls->subtables);
0de8783a
JR
3759
3760 return subtable;
3761}
3762
3763static inline struct dpcls_subtable *
3764dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
3765{
3766 struct dpcls_subtable *subtable;
3767
3768 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
3769 &cls->subtables_map) {
3770 if (netdev_flow_key_equal(&subtable->mask, mask)) {
3771 return subtable;
3772 }
3773 }
3774 return dpcls_create_subtable(cls, mask);
3775}
3776
3777/* Insert 'rule' into 'cls'. */
3778static void
3779dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
3780 const struct netdev_flow_key *mask)
3781{
3782 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
3783
3784 rule->mask = &subtable->mask;
3785 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
3786}
3787
3788/* Removes 'rule' from 'cls', also destructing the 'rule'. */
3789static void
3790dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
3791{
3792 struct dpcls_subtable *subtable;
3793
3794 ovs_assert(rule->mask);
3795
3796 INIT_CONTAINER(subtable, rule->mask, mask);
3797
3798 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
3799 == 0) {
3800 dpcls_destroy_subtable(cls, subtable);
802f84ff 3801 pvector_publish(&cls->subtables);
0de8783a
JR
3802 }
3803}
3804
3805/* Returns true if 'target' satisfies 'rule', that is, if for each 1-bit in
3806 * 'rule->mask' the corresponding bits in 'rule->flow' and 'target' are equal.
3807 * Note: 'rule->flow' and 'rule->mask' share the same miniflow map, and
3808 * 'rule->flow' is already masked. */
3809static inline bool
3810dpcls_rule_matches_key(const struct dpcls_rule *rule,
3811 const struct netdev_flow_key *target)
3812{
d70e8c28
JR
3813 const uint64_t *keyp = rule->flow.mf.inline_values;
3814 const uint64_t *maskp = rule->mask->mf.inline_values;
3815 uint64_t target_u64;
0de8783a 3816
d70e8c28
JR
3817 NETDEV_FLOW_KEY_FOR_EACH_IN_MAP(target_u64, target, rule->flow.mf.map) {
3818 if (OVS_UNLIKELY((target_u64 & *maskp++) != *keyp++)) {
0de8783a
JR
3819 return false;
3820 }
3821 }
3822 return true;
3823}
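
/* Standalone illustration (hypothetical values) of the masked comparison
 * above.  Because the rule's flow is stored pre-masked, only the target word
 * needs to be ANDed with the mask word before comparing:
 *
 *     target word : 0x0a0101fe
 *     mask word   : 0xffffff00      (e.g. a /24-style wildcard)
 *     key word    : 0x0a010100      (already masked when the rule was made)
 *
 *     (0x0a0101fe & 0xffffff00) == 0x0a010100  --> this word matches
 *
 * The loop bails out on the first word where the masked target differs from
 * the stored key. */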
3824
3825/* For each of the 'cnt' keys in 'keys' performs a classifier lookup, writing
3826 * the result into the corresponding slot in 'rules'.  A slot is left NULL
3827 * when no subtable yields a match for that key.
3828 *
3829 * This function is optimized for use in the userspace datapath and therefore
3830 * does not implement a lot of features available in the standard
3831 * classifier_lookup() function. Specifically, it does not implement
3832 * priorities, instead returning any rule which matches the flow.
3833 *
3834 * Returns true if every key found a corresponding rule. */
3835static bool
3836dpcls_lookup(const struct dpcls *cls, const struct netdev_flow_key keys[],
3837 struct dpcls_rule **rules, const size_t cnt)
3838{
3839 /* The batch size 16 was experimentally found faster than 8 or 32. */
3840 typedef uint16_t map_type;
3841#define MAP_BITS (sizeof(map_type) * CHAR_BIT)
3842
3843#if !defined(__CHECKER__) && !defined(_WIN32)
3844 const int N_MAPS = DIV_ROUND_UP(cnt, MAP_BITS);
3845#else
cd159f1a 3846 enum { N_MAPS = DIV_ROUND_UP(NETDEV_MAX_BURST, MAP_BITS) };
0de8783a
JR
3847#endif
3848 map_type maps[N_MAPS];
3849 struct dpcls_subtable *subtable;
3850
3851 memset(maps, 0xff, sizeof maps);
3852 if (cnt % MAP_BITS) {
3853 maps[N_MAPS - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra bits. */
3854 }
3855 memset(rules, 0, cnt * sizeof *rules);
3856
3857 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
3858 const struct netdev_flow_key *mkeys = keys;
3859 struct dpcls_rule **mrules = rules;
3860 map_type remains = 0;
3861 int m;
3862
3863 BUILD_ASSERT_DECL(sizeof remains == sizeof *maps);
3864
3865 for (m = 0; m < N_MAPS; m++, mkeys += MAP_BITS, mrules += MAP_BITS) {
3866 uint32_t hashes[MAP_BITS];
3867 const struct cmap_node *nodes[MAP_BITS];
3868 unsigned long map = maps[m];
3869 int i;
3870
3871 if (!map) {
3872 continue; /* Skip empty maps. */
3873 }
3874
3875 /* Compute hashes for the remaining keys. */
3876 ULONG_FOR_EACH_1(i, map) {
3877 hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i],
3878 &subtable->mask);
3879 }
3880 /* Lookup. */
3881 map = cmap_find_batch(&subtable->rules, map, hashes, nodes);
3882 /* Check results. */
3883 ULONG_FOR_EACH_1(i, map) {
3884 struct dpcls_rule *rule;
3885
3886 CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
3887 if (OVS_LIKELY(dpcls_rule_matches_key(rule, &mkeys[i]))) {
3888 mrules[i] = rule;
3889 goto next;
3890 }
3891 }
3892 ULONG_SET0(map, i); /* Did not match. */
3893 next:
3894 ; /* Keep Sparse happy. */
3895 }
3896 maps[m] &= ~map; /* Clear the found rules. */
3897 remains |= maps[m];
3898 }
3899 if (!remains) {
3900 return true; /* All found. */
3901 }
3902 }
3903 return false; /* Some misses. */
3904}
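
/* Worked example (hypothetical batch size) of the bitmap bookkeeping in
 * dpcls_lookup() above, using the 16-bit map_type: for cnt = 21 keys,
 * N_MAPS = 2 and the maps start as all-ones; the last map is shifted right
 * by MAP_BITS - (21 % 16) = 11 bits so only its low 5 bits remain set:
 *
 *     maps[0] = 0xffff      // keys 0..15 still need a rule
 *     maps[1] = 0x001f      // keys 16..20 still need a rule
 *
 * Each subtable pass clears the bits of keys that found a rule there
 * ('maps[m] &= ~map'); bits still set afterwards ('remains') are retried
 * against the next subtable, and the function returns true as soon as no
 * bits remain. */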