dpif-netdev: Populate dpif class field in offload struct.
[mirror_ovs.git] / lib / dpif-netdev.c
1 /*
2 * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "dpif-netdev.h"
19 #include "dpif-netdev-private.h"
20
21 #include <ctype.h>
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <inttypes.h>
25 #include <net/if.h>
26 #include <sys/types.h>
27 #include <netinet/in.h>
28 #include <stdint.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <sys/ioctl.h>
32 #include <sys/socket.h>
33 #include <sys/stat.h>
34 #include <unistd.h>
35
36 #include "bitmap.h"
37 #include "cmap.h"
38 #include "conntrack.h"
39 #include "coverage.h"
40 #include "ct-dpif.h"
41 #include "csum.h"
42 #include "dp-packet.h"
43 #include "dpif.h"
44 #include "dpif-netdev-perf.h"
45 #include "dpif-provider.h"
46 #include "dummy.h"
47 #include "fat-rwlock.h"
48 #include "flow.h"
49 #include "hmapx.h"
50 #include "id-pool.h"
51 #include "ipf.h"
52 #include "netdev.h"
53 #include "netdev-offload.h"
54 #include "netdev-provider.h"
55 #include "netdev-vport.h"
56 #include "netlink.h"
57 #include "odp-execute.h"
58 #include "odp-util.h"
59 #include "openvswitch/dynamic-string.h"
60 #include "openvswitch/list.h"
61 #include "openvswitch/match.h"
62 #include "openvswitch/ofp-parse.h"
63 #include "openvswitch/ofp-print.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openvswitch/shash.h"
66 #include "openvswitch/vlog.h"
67 #include "ovs-numa.h"
68 #include "ovs-rcu.h"
69 #include "packets.h"
70 #include "openvswitch/poll-loop.h"
71 #include "pvector.h"
72 #include "random.h"
73 #include "seq.h"
74 #include "smap.h"
75 #include "sset.h"
76 #include "timeval.h"
77 #include "tnl-neigh-cache.h"
78 #include "tnl-ports.h"
79 #include "unixctl.h"
80 #include "util.h"
81 #include "uuid.h"
82
83 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
84
85 /* Auto Load Balancing Defaults */
86 #define ALB_ACCEPTABLE_IMPROVEMENT 25
87 #define ALB_PMD_LOAD_THRESHOLD 95
88 #define ALB_PMD_REBALANCE_POLL_INTERVAL 1 /* 1 Min */
89 #define MIN_TO_MSEC 60000
90
91 #define FLOW_DUMP_MAX_BATCH 50
92 /* Use per-thread recirc_depth to prevent recirculation loops. */
93 #define MAX_RECIRC_DEPTH 6
94 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
95
96 /* Use instant packet send by default. */
97 #define DEFAULT_TX_FLUSH_INTERVAL 0
98
99 /* Configuration parameters. */
100 enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
101 enum { MAX_METERS = 65536 }; /* Maximum number of meters. */
102 enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */
103 enum { N_METER_LOCKS = 64 };          /* Number of locks guarding 'meters'. */
104
105 COVERAGE_DEFINE(datapath_drop_meter);
106 COVERAGE_DEFINE(datapath_drop_upcall_error);
107 COVERAGE_DEFINE(datapath_drop_lock_error);
108 COVERAGE_DEFINE(datapath_drop_userspace_action_error);
109 COVERAGE_DEFINE(datapath_drop_tunnel_push_error);
110 COVERAGE_DEFINE(datapath_drop_tunnel_pop_error);
111 COVERAGE_DEFINE(datapath_drop_recirc_error);
112 COVERAGE_DEFINE(datapath_drop_invalid_port);
113 COVERAGE_DEFINE(datapath_drop_invalid_tnl_port);
114 COVERAGE_DEFINE(datapath_drop_rx_invalid_packet);
115
116 /* Protects against changes to 'dp_netdevs'. */
117 static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
118
119 /* Contains all 'struct dp_netdev's. */
120 static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
121 = SHASH_INITIALIZER(&dp_netdevs);
122
123 static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
124
125 #define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
126 | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
127 | CS_SRC_NAT | CS_DST_NAT)
128 #define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
129
130 static struct odp_support dp_netdev_support = {
131 .max_vlan_headers = SIZE_MAX,
132 .max_mpls_depth = SIZE_MAX,
133 .recirc = true,
134 .ct_state = true,
135 .ct_zone = true,
136 .ct_mark = true,
137 .ct_label = true,
138 .ct_state_nat = true,
139 .ct_orig_tuple = true,
140 .ct_orig_tuple6 = true,
141 };
142
143 /* EMC cache and SMC cache compose the datapath flow cache (DFC)
144 *
145 * Exact match cache for frequently used flows
146 *
147 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
148 * search its entries for a miniflow that matches exactly the miniflow of the
149 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
150 *
151 * A cache entry holds a reference to its 'dp_netdev_flow'.
152 *
153 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
154 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
155 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
156 * value is the index of a cache entry where the miniflow could be.
157 *
158 *
159 * Signature match cache (SMC)
160 *
161 * This cache stores a 16-bit signature for each flow without storing keys, and
162  * stores the corresponding 16-bit flow_table index of the 'dp_netdev_flow'.
163  * Each flow thus occupies 32 bits, which is much more memory efficient than
164  * the EMC.  SMC uses a set-associative design in which each bucket contains
165  * SMC_ENTRY_PER_BUCKET entries.
166  * Since a 16-bit flow_table index is used, flows that cannot be indexed by a
167  * 16-bit value (beyond the first 2^16 dp_netdev_flow entries) always miss in SMC.
168 *
169 *
170 * Thread-safety
171 * =============
172 *
173 * Each pmd_thread has its own private exact match cache.
174 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
175 */
176
177 #define EM_FLOW_HASH_SHIFT 13
178 #define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
179 #define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
180 #define EM_FLOW_HASH_SEGS 2
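/* Worked example (added for illustration) of the hash splitting described
 * above: with EM_FLOW_HASH_SHIFT of 13 and EM_FLOW_HASH_SEGS of 2, a packet
 * hash of 0x00ABCDEF yields the candidate indices
 *
 *     0x00ABCDEF & EM_FLOW_HASH_MASK                         == 0xDEF
 *     (0x00ABCDEF >> EM_FLOW_HASH_SHIFT) & EM_FLOW_HASH_MASK == 0x55E
 *
 * The remaining upper bits of the hash are thrown away. */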
181
182 /* SMC uses a set-associative design. A bucket contains a set of entries that
183 * a flow item can occupy. For now, it uses one hash function rather than two
184 * as for the EMC design. */
185 #define SMC_ENTRY_PER_BUCKET 4
186 #define SMC_ENTRIES (1u << 20)
187 #define SMC_BUCKET_CNT (SMC_ENTRIES / SMC_ENTRY_PER_BUCKET)
188 #define SMC_MASK (SMC_BUCKET_CNT - 1)
189
190 /* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB */
191 #define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
192 #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
193 DEFAULT_EM_FLOW_INSERT_INV_PROB)
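/* Worked arithmetic (illustrative): with DEFAULT_EM_FLOW_INSERT_INV_PROB of
 * 100, DEFAULT_EM_FLOW_INSERT_MIN is roughly UINT32_MAX / 100.  Comparing a
 * uniformly distributed 32-bit value against this threshold succeeds about
 * 1 time in 100, which is how the insertion path elsewhere in this file
 * realizes a 1% insertion probability.  An inverse probability of 1 yields a
 * threshold of UINT32_MAX, i.e. insert on every miss. */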
194
195 struct emc_entry {
196 struct dp_netdev_flow *flow;
197 struct netdev_flow_key key; /* key.hash used for emc hash value. */
198 };
199
200 struct emc_cache {
201 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
202 int sweep_idx; /* For emc_cache_slow_sweep(). */
203 };
204
205 struct smc_bucket {
206 uint16_t sig[SMC_ENTRY_PER_BUCKET];
207 uint16_t flow_idx[SMC_ENTRY_PER_BUCKET];
208 };
209
210 /* Signature match cache, as distinct from the EMC. */
211 struct smc_cache {
212 struct smc_bucket buckets[SMC_BUCKET_CNT];
213 };
214
215 struct dfc_cache {
216 struct emc_cache emc_cache;
217 struct smc_cache smc_cache;
218 };
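/* Size note (derived from the definitions above, assuming no structure
 * padding): the EMC holds EM_FLOW_HASH_ENTRIES == 8192 entries, while the SMC
 * holds SMC_ENTRIES == 2^20 signatures in SMC_BUCKET_CNT == 262144 buckets of
 * SMC_ENTRY_PER_BUCKET == 4; each bucket is 4 * (2 + 2) == 16 bytes, so the
 * SMC part of each per-PMD dfc_cache is about 4 MiB. */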
219
220 /* Iterate through every entry in the exact match cache that might contain a
221  * miniflow with hash 'HASH'. */
222 #define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
223 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
224 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
225 i__ < EM_FLOW_HASH_SEGS; \
226 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
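/* Minimal usage sketch (illustrative, not part of the build): an EMC lookup
 * probes the EM_FLOW_HASH_SEGS candidate entries for the packet's hash and
 * stops at a live, matching entry, roughly:
 *
 *     struct emc_entry *ce;
 *     EMC_FOR_EACH_POS_WITH_HASH (cache, ce, key->hash) {
 *         if (ce->key.hash == key->hash && emc_entry_alive(ce)
 *             && <miniflow keys match>) {
 *             return ce->flow;
 *         }
 *     }
 *
 * The actual lookup and insertion helpers later in this file follow this
 * pattern. */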
227 \f
228 /* Simple non-wildcarding single-priority classifier. */
229
230 /* Time in microseconds between successive optimizations of the dpcls
231 * subtable vector */
232 #define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
233
234 /* Time in microseconds of the interval in which rxq processing cycles used
235  * in rxq to pmd assignment are measured and stored. */
236 #define PMD_RXQ_INTERVAL_LEN 10000000LL
237
238 /* Number of intervals for which cycles are stored
239 * and used during rxq to pmd assignment. */
240 #define PMD_RXQ_INTERVAL_MAX 6
241
242 struct dpcls {
243 struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */
244 odp_port_t in_port;
245 struct cmap subtables_map;
246 struct pvector subtables;
247 };
248
249 /* Data structure to keep packet order until fast-path processing. */
250 struct dp_packet_flow_map {
251 struct dp_packet *packet;
252 struct dp_netdev_flow *flow;
253 uint16_t tcp_flags;
254 };
255
256 static void dpcls_init(struct dpcls *);
257 static void dpcls_destroy(struct dpcls *);
258 static void dpcls_sort_subtable_vector(struct dpcls *);
259 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
260 const struct netdev_flow_key *mask);
261 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
262 static bool dpcls_lookup(struct dpcls *cls,
263 const struct netdev_flow_key *keys[],
264 struct dpcls_rule **rules, size_t cnt,
265 int *num_lookups_p);
266
267 /* Set of supported meter flags */
268 #define DP_SUPPORTED_METER_FLAGS_MASK \
269 (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
270
271 /* Set of supported meter band types */
272 #define DP_SUPPORTED_METER_BAND_TYPES \
273 ( 1 << OFPMBT13_DROP )
274
275 struct dp_meter_band {
276 struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
277 uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
278 uint64_t packet_count;
279 uint64_t byte_count;
280 };
281
282 struct dp_meter {
283 uint16_t flags;
284 uint16_t n_bands;
285 uint32_t max_delta_t;
286 uint64_t used;
287 uint64_t packet_count;
288 uint64_t byte_count;
289 struct dp_meter_band bands[];
290 };
291
292 struct pmd_auto_lb {
293 bool auto_lb_requested; /* Auto load balancing requested by user. */
294 bool is_enabled; /* Current status of Auto load balancing. */
295 uint64_t rebalance_intvl;
296 uint64_t rebalance_poll_timer;
297 };
298
299 /* Datapath based on the network device interface from netdev.h.
300 *
301 *
302 * Thread-safety
303 * =============
304 *
305 * Some members, marked 'const', are immutable. Accessing other members
306 * requires synchronization, as noted in more detail below.
307 *
308 * Acquisition order is, from outermost to innermost:
309 *
310 * dp_netdev_mutex (global)
311 * port_mutex
312 * non_pmd_mutex
313 */
314 struct dp_netdev {
315 const struct dpif_class *const class;
316 const char *const name;
317 struct ovs_refcount ref_cnt;
318 atomic_flag destroyed;
319
320 /* Ports.
321 *
322 * Any lookup into 'ports' or any access to the dp_netdev_ports found
323 * through 'ports' requires taking 'port_mutex'. */
324 struct ovs_mutex port_mutex;
325 struct hmap ports;
326 struct seq *port_seq; /* Incremented whenever a port changes. */
327
328 /* The time that a packet can wait in the output batch before being sent. */
329 atomic_uint32_t tx_flush_interval;
330
331 /* Meters. */
332 struct ovs_mutex meter_locks[N_METER_LOCKS];
333 struct dp_meter *meters[MAX_METERS]; /* Meter entries. */
334
335 /* Probability of EMC insertion is a function of 'emc_insert_min'. */
336 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
337 /* Enable collection of PMD performance metrics. */
338 atomic_bool pmd_perf_metrics;
339 /* Enable the SMC cache from ovsdb config */
340 atomic_bool smc_enable_db;
341
342 /* Protects access to ofproto-dpif-upcall interface during revalidator
343 * thread synchronization. */
344 struct fat_rwlock upcall_rwlock;
345 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
346 void *upcall_aux;
347
348 /* Callback function for notifying the purging of dp flows (during
349  * pmd reset or deletion). */
350 dp_purge_callback *dp_purge_cb;
351 void *dp_purge_aux;
352
353 /* Stores all 'struct dp_netdev_pmd_thread's. */
354 struct cmap poll_threads;
355 /* id pool for per thread static_tx_qid. */
356 struct id_pool *tx_qid_pool;
357 struct ovs_mutex tx_qid_pool_mutex;
358 /* Use measured cycles for rxq to pmd assignment. */
359 bool pmd_rxq_assign_cyc;
360
361 /* Protects the access of the 'struct dp_netdev_pmd_thread'
362 * instance for non-pmd thread. */
363 struct ovs_mutex non_pmd_mutex;
364
365 /* Each pmd thread will store its pointer to
366 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
367 ovsthread_key_t per_pmd_key;
368
369 struct seq *reconfigure_seq;
370 uint64_t last_reconfigure_seq;
371
372 /* Cpu mask for pin of pmd threads. */
373 char *pmd_cmask;
374
375 uint64_t last_tnl_conf_seq;
376
377 struct conntrack *conntrack;
378 struct pmd_auto_lb pmd_alb;
379 };
380
381 static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
382 OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
383 {
384 ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
385 }
386
387 static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
388 OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
389 {
390 ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
391 }
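/* Any access to dp->meters[] is bracketed by these helpers; for example,
 * dp_netdev_free() below deletes every meter as:
 *
 *     meter_lock(dp, i);
 *     dp_delete_meter(dp, i);
 *     meter_unlock(dp, i);
 */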
392
393
394 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
395 odp_port_t)
396 OVS_REQUIRES(dp->port_mutex);
397
398 enum rxq_cycles_counter_type {
399 RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and
400 processing packets during the current
401 interval. */
402 RXQ_CYCLES_PROC_HIST, /* Total cycles of all intervals that are used
403 during rxq to pmd assignment. */
404 RXQ_N_CYCLES
405 };
406
407 enum {
408 DP_NETDEV_FLOW_OFFLOAD_OP_ADD,
409 DP_NETDEV_FLOW_OFFLOAD_OP_MOD,
410 DP_NETDEV_FLOW_OFFLOAD_OP_DEL,
411 };
412
413 struct dp_flow_offload_item {
414 struct dp_netdev_pmd_thread *pmd;
415 struct dp_netdev_flow *flow;
416 int op;
417 struct match match;
418 struct nlattr *actions;
419 size_t actions_len;
420
421 struct ovs_list node;
422 };
423
424 struct dp_flow_offload {
425 struct ovs_mutex mutex;
426 struct ovs_list list;
427 pthread_cond_t cond;
428 };
429
430 static struct dp_flow_offload dp_flow_offload = {
431 .mutex = OVS_MUTEX_INITIALIZER,
432 .list = OVS_LIST_INITIALIZER(&dp_flow_offload.list),
433 };
434
435 static struct ovsthread_once offload_thread_once
436 = OVSTHREAD_ONCE_INITIALIZER;
437
438 #define XPS_TIMEOUT 500000LL /* In microseconds. */
439
440 /* Contained by struct dp_netdev_port's 'rxqs' member. */
441 struct dp_netdev_rxq {
442 struct dp_netdev_port *port;
443 struct netdev_rxq *rx;
444 unsigned core_id; /* Core to which this queue should be
445 pinned. OVS_CORE_UNSPEC if the
446 queue doesn't need to be pinned to a
447 particular core. */
448 unsigned intrvl_idx; /* Write index for 'cycles_intrvl'. */
449 struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */
450 bool is_vhost; /* Is rxq of a vhost port. */
451
452 /* Counters of cycles spent successfully polling and processing pkts. */
453 atomic_ullong cycles[RXQ_N_CYCLES];
454 /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
455 sum them to yield the cycles used for an rxq. */
456 atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
457 };
458
459 /* A port in a netdev-based datapath. */
460 struct dp_netdev_port {
461 odp_port_t port_no;
462 bool dynamic_txqs; /* If true XPS will be used. */
463 bool need_reconfigure; /* True if we should reconfigure netdev. */
464 struct netdev *netdev;
465 struct hmap_node node; /* Node in dp_netdev's 'ports'. */
466 struct netdev_saved_flags *sf;
467 struct dp_netdev_rxq *rxqs;
468 unsigned n_rxq; /* Number of elements in 'rxqs' */
469 unsigned *txq_used; /* Number of threads that use each tx queue. */
470 struct ovs_mutex txq_used_mutex;
471 bool emc_enabled; /* If true EMC will be used. */
472 char *type; /* Port type as requested by user. */
473 char *rxq_affinity_list; /* Requested affinity of rx queues. */
474 };
475
476 /* Contained by struct dp_netdev_flow's 'stats' member. */
477 struct dp_netdev_flow_stats {
478 atomic_llong used; /* Last used time, in monotonic msecs. */
479 atomic_ullong packet_count; /* Number of packets matched. */
480 atomic_ullong byte_count; /* Number of bytes matched. */
481 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
482 };
483
484 /* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
485 *
486 *
487 * Thread-safety
488 * =============
489 *
490 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
491 * its pmd thread's classifier. The text below calls this classifier 'cls'.
492 *
493 * Motivation
494 * ----------
495 *
496 * The thread safety rules described here for "struct dp_netdev_flow" are
497 * motivated by two goals:
498 *
499 * - Prevent threads that read members of "struct dp_netdev_flow" from
500 * reading bad data due to changes by some thread concurrently modifying
501 * those members.
502 *
503 * - Prevent two threads making changes to members of a given "struct
504 * dp_netdev_flow" from interfering with each other.
505 *
506 *
507 * Rules
508 * -----
509 *
510 * A flow 'flow' may be accessed without a risk of being freed during an RCU
511 * grace period. Code that needs to hold onto a flow for a while
512 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
513 *
514 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
515 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
516 * from modification.
517 *
518 * Some members, marked 'const', are immutable. Accessing other members
519 * requires synchronization, as noted in more detail below.
520 */
521 struct dp_netdev_flow {
522 const struct flow flow; /* Unmasked flow that created this entry. */
523 /* Hash table node, indexed by the unmasked flow. */
524 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
525 /* 'flow_table'. */
526 const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
527 const ovs_u128 ufid; /* Unique flow identifier. */
528 const ovs_u128 mega_ufid; /* Unique mega flow identifier. */
529 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
530 /* flow. */
531
532 /* Number of references.
533 * The classifier owns one reference.
534 * Any thread trying to keep a rule from being freed should hold its own
535 * reference. */
536 struct ovs_refcount ref_cnt;
537
538 bool dead;
539 uint32_t mark; /* Unique flow mark assigned to a flow */
540
541 /* Statistics. */
542 struct dp_netdev_flow_stats stats;
543
544 /* Actions. */
545 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
546
547 /* While processing a group of input packets, the datapath uses the next
548 * member to store a pointer to the output batch for the flow. It is
549 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
550 * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
551 struct packet_batch_per_flow *batch;
552
553 /* Packet classification. */
554 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
555 /* 'cr' must be the last member. */
556 };
557
558 static void dp_netdev_flow_unref(struct dp_netdev_flow *);
559 static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
560 static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
561 struct flow *, bool);
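/* Illustrative sketch of the reference rules described above (assuming
 * dp_netdev_flow_ref() returns false once the flow is already being
 * destroyed): a reader that wants to keep a flow beyond the current RCU
 * grace period would do roughly
 *
 *     if (dp_netdev_flow_ref(flow)) {
 *         ...use 'flow'...
 *         dp_netdev_flow_unref(flow);
 *     }
 */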
562
563 /* A set of datapath actions within a "struct dp_netdev_flow".
564 *
565 *
566 * Thread-safety
567 * =============
568 *
569 * A struct dp_netdev_actions 'actions' is protected with RCU. */
570 struct dp_netdev_actions {
571 /* These members are immutable: they do not change during the struct's
572 * lifetime. */
573 unsigned int size; /* Size of 'actions', in bytes. */
574 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
575 };
576
577 struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
578 size_t);
579 struct dp_netdev_actions *dp_netdev_flow_get_actions(
580 const struct dp_netdev_flow *);
581 static void dp_netdev_actions_free(struct dp_netdev_actions *);
582
583 struct polled_queue {
584 struct dp_netdev_rxq *rxq;
585 odp_port_t port_no;
586 bool emc_enabled;
587 bool rxq_enabled;
588 uint64_t change_seq;
589 };
590
591 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
592 struct rxq_poll {
593 struct dp_netdev_rxq *rxq;
594 struct hmap_node node;
595 };
596
597 /* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
598 * 'tnl_port_cache' or 'tx_ports'. */
599 struct tx_port {
600 struct dp_netdev_port *port;
601 int qid;
602 long long last_used;
603 struct hmap_node node;
604 long long flush_time;
605 struct dp_packet_batch output_pkts;
606 struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
607 };
608
609 /* A set of properties for the current processing loop that is not directly
610 * associated with the pmd thread itself, but with the packets being
611 * processed or the short-term system configuration (for example, time).
612 * Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
613 struct dp_netdev_pmd_thread_ctx {
614 /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
615 long long now;
616 /* RX queue from which last packet was received. */
617 struct dp_netdev_rxq *last_rxq;
618 /* EMC insertion probability context for the current processing cycle. */
619 uint32_t emc_insert_min;
620 };
621
622 /* PMD: Poll mode driver.  A PMD accesses devices via polling to eliminate
623  * the performance overhead of interrupt processing.  Therefore netdev
624  * cannot implement rx-wait for these devices; dpif-netdev needs to poll
625  * them to check their receive buffers.  A pmd thread polls the devices
626  * assigned to it.
627  *
628  * DPDK uses PMDs for accessing NICs.
629  *
630  * Note, the instance with cpu core id NON_PMD_CORE_ID is reserved for
631  * I/O of all non-pmd threads.  No actual thread is created for that
632  * instance.
633  *
634  * Each struct has its own flow cache and classifier per managed ingress port.
635  * For packets received on an ingress port, a lookup is done in the
636  * corresponding PMD thread's flow cache and, in case of a miss, in the
637  * corresponding classifier for that port.  In either case, packets are
638  * executed with the actions that were found.
639  */
640 struct dp_netdev_pmd_thread {
641 struct dp_netdev *dp;
642 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
643 struct cmap_node node; /* In 'dp->poll_threads'. */
644
645 /* Per-thread exact-match cache.  Note, the instance for cpu core
646  * NON_PMD_CORE_ID can be accessed by multiple threads, and thus
647  * needs to be protected by 'non_pmd_mutex'.  Every other instance
648  * will only be accessed by its own pmd thread. */
649 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct dfc_cache flow_cache;
650
651 /* Flow-Table and classifiers
652 *
653 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
654 * changes to 'classifiers' must be made while still holding the
655 * 'flow_mutex'.
656 */
657 struct ovs_mutex flow_mutex;
658 struct cmap flow_table OVS_GUARDED; /* Flow table. */
659
660 /* One classifier per in_port polled by the pmd */
661 struct cmap classifiers;
662 /* Periodically sort subtable vectors according to hit frequencies */
663 long long int next_optimization;
664 /* End of the next time interval for which processing cycles
665 are stored for each polled rxq. */
666 long long int rxq_next_cycle_store;
667
668 /* Last interval timestamp. */
669 uint64_t intrvl_tsc_prev;
670 /* Last interval cycles. */
671 atomic_ullong intrvl_cycles;
672
673 /* Current context of the PMD thread. */
674 struct dp_netdev_pmd_thread_ctx ctx;
675
676 struct seq *reload_seq;
677 uint64_t last_reload_seq;
678
679 /* These are atomic variables used as synchronization and configuration
680 * points for thread reload/exit.
681 *
682 * 'reload' atomic is the main one and it's used as a memory
683 * synchronization point for all other knobs and data.
684 *
685 * For a thread that requests PMD reload:
686 *
687 * * All changes that should be visible to the PMD thread must be made
688 * before setting the 'reload'. These changes could use any memory
689 * ordering model including 'relaxed'.
690 * * Setting the 'reload' atomic should occur in the same thread where
691 * all other PMD configuration options are updated.
692 * * Setting the 'reload' atomic should be done with 'release' memory
693 * ordering model or stricter. This will guarantee that all previous
694 * changes (including non-atomic and 'relaxed') will be visible to
695 * the PMD thread.
696 * * To check that the reload is done, the thread should poll the 'reload'
697 * atomic until it becomes 'false'.  Polling should use the 'acquire' memory
698 * ordering model or stricter.  This ensures that the PMD thread has
699 * completed the reload process.
700 *
701 * For the PMD thread:
702 *
703 * * PMD thread should read 'reload' atomic with 'acquire' memory
704 * ordering model or stricter. This will guarantee that all changes
705 * made before setting the 'reload' in the requesting thread will be
706 * visible to the PMD thread.
707 * * All other configuration data could be read with any memory
708 * ordering model (including non-atomic and 'relaxed') but *only after*
709 * reading the 'reload' atomic set to 'true'.
710 * * When the PMD reload is done, the PMD should (optionally) reset all the
711 * below knobs except 'reload' to their default ('false') values and
712 * then, as a mandatory last step, set 'reload' to 'false' using the
713 * 'release' memory ordering model or stricter.  This informs the
714 * requesting thread that the PMD has completed a reload cycle.
715 */
716 atomic_bool reload; /* Do we need to reload ports? */
717 atomic_bool wait_for_reload; /* Can we busy wait for the next reload? */
718 atomic_bool reload_tx_qid; /* Do we need to reload static_tx_qid? */
719 atomic_bool exit; /* For terminating the pmd thread. */
720
721 pthread_t thread;
722 unsigned core_id; /* CPU core id of this pmd thread. */
723 int numa_id; /* numa node id of this pmd thread. */
724 bool isolated;
725
726 /* Queue id used by this pmd thread to send packets on a netdev if
727 * XPS is disabled for that netdev.  All static_tx_qid's are unique and
728 * less than 'cmap_count(dp->poll_threads)'. */
729 uint32_t static_tx_qid;
730
731 /* Number of filled output batches. */
732 int n_output_batches;
733
734 struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
735 /* List of rx queues to poll. */
736 struct hmap poll_list OVS_GUARDED;
737 /* Map of 'tx_port's used for transmission. Written by the main thread,
738 * read by the pmd thread. */
739 struct hmap tx_ports OVS_GUARDED;
740
741 /* These are thread-local copies of 'tx_ports'. One contains only tunnel
742 * ports (that support push_tunnel/pop_tunnel), the other contains ports
743 * with at least one txq (that support send). A port can be in both.
744 *
745 * There are two separate maps to make sure that we don't try to execute
746 * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
747 *
748 * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
749 * threads, and thus need to be protected by 'non_pmd_mutex'.  Every
750 * other instance will only be accessed by its own pmd thread. */
751 struct hmap tnl_port_cache;
752 struct hmap send_port_cache;
753
754 /* Keep track of detailed PMD performance statistics. */
755 struct pmd_perf_stats perf_stats;
756
757 /* Stats from previous iteration used by automatic pmd
758 * load balance logic. */
759 uint64_t prev_stats[PMD_N_STATS];
760 atomic_count pmd_overloaded;
761
762 /* Set to true if the pmd thread needs to be reloaded. */
763 bool need_reload;
764 };
765
766 /* Interface to netdev-based datapath. */
767 struct dpif_netdev {
768 struct dpif dpif;
769 struct dp_netdev *dp;
770 uint64_t last_port_seq;
771 };
772
773 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
774 struct dp_netdev_port **portp)
775 OVS_REQUIRES(dp->port_mutex);
776 static int get_port_by_name(struct dp_netdev *dp, const char *devname,
777 struct dp_netdev_port **portp)
778 OVS_REQUIRES(dp->port_mutex);
779 static void dp_netdev_free(struct dp_netdev *)
780 OVS_REQUIRES(dp_netdev_mutex);
781 static int do_add_port(struct dp_netdev *dp, const char *devname,
782 const char *type, odp_port_t port_no)
783 OVS_REQUIRES(dp->port_mutex);
784 static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
785 OVS_REQUIRES(dp->port_mutex);
786 static int dpif_netdev_open(const struct dpif_class *, const char *name,
787 bool create, struct dpif **);
788 static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
789 struct dp_packet_batch *,
790 bool should_steal,
791 const struct flow *flow,
792 const struct nlattr *actions,
793 size_t actions_len);
794 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
795 struct dp_packet_batch *, odp_port_t port_no);
796 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
797 struct dp_packet_batch *);
798
799 static void dp_netdev_disable_upcall(struct dp_netdev *);
800 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
801 static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
802 struct dp_netdev *dp, unsigned core_id,
803 int numa_id);
804 static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
805 static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
806 OVS_REQUIRES(dp->port_mutex);
807
808 static void *pmd_thread_main(void *);
809 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
810 unsigned core_id);
811 static struct dp_netdev_pmd_thread *
812 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
813 static void dp_netdev_del_pmd(struct dp_netdev *dp,
814 struct dp_netdev_pmd_thread *pmd);
815 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
816 static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
817 static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
818 struct dp_netdev_port *port)
819 OVS_REQUIRES(pmd->port_mutex);
820 static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
821 struct tx_port *tx)
822 OVS_REQUIRES(pmd->port_mutex);
823 static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
824 struct dp_netdev_rxq *rxq)
825 OVS_REQUIRES(pmd->port_mutex);
826 static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
827 struct rxq_poll *poll)
828 OVS_REQUIRES(pmd->port_mutex);
829 static int
830 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
831 bool force);
832
833 static void reconfigure_datapath(struct dp_netdev *dp)
834 OVS_REQUIRES(dp->port_mutex);
835 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
836 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
837 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
838 static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
839 OVS_REQUIRES(pmd->port_mutex);
840 static inline void
841 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
842 struct polled_queue *poll_list, int poll_cnt);
843 static void
844 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
845 enum rxq_cycles_counter_type type,
846 unsigned long long cycles);
847 static uint64_t
848 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
849 enum rxq_cycles_counter_type type);
850 static void
851 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
852 unsigned long long cycles);
853 static uint64_t
854 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
855 static void
856 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
857 bool purge);
858 static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
859 struct tx_port *tx);
860
861 static inline bool emc_entry_alive(struct emc_entry *ce);
862 static void emc_clear_entry(struct emc_entry *ce);
863 static void smc_clear_entry(struct smc_bucket *b, int idx);
864
865 static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
866 static inline bool
867 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
868 static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
869 struct dp_netdev_flow *flow);
870
871 static void
872 emc_cache_init(struct emc_cache *flow_cache)
873 {
874 int i;
875
876 flow_cache->sweep_idx = 0;
877 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
878 flow_cache->entries[i].flow = NULL;
879 flow_cache->entries[i].key.hash = 0;
880 flow_cache->entries[i].key.len = sizeof(struct miniflow);
881 flowmap_init(&flow_cache->entries[i].key.mf.map);
882 }
883 }
884
885 static void
886 smc_cache_init(struct smc_cache *smc_cache)
887 {
888 int i, j;
889 for (i = 0; i < SMC_BUCKET_CNT; i++) {
890 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
891 smc_cache->buckets[i].flow_idx[j] = UINT16_MAX;
892 }
893 }
894 }
895
896 static void
897 dfc_cache_init(struct dfc_cache *flow_cache)
898 {
899 emc_cache_init(&flow_cache->emc_cache);
900 smc_cache_init(&flow_cache->smc_cache);
901 }
902
903 static void
904 emc_cache_uninit(struct emc_cache *flow_cache)
905 {
906 int i;
907
908 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
909 emc_clear_entry(&flow_cache->entries[i]);
910 }
911 }
912
913 static void
914 smc_cache_uninit(struct smc_cache *smc)
915 {
916 int i, j;
917
918 for (i = 0; i < SMC_BUCKET_CNT; i++) {
919 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
920 smc_clear_entry(&(smc->buckets[i]), j);
921 }
922 }
923 }
924
925 static void
926 dfc_cache_uninit(struct dfc_cache *flow_cache)
927 {
928 smc_cache_uninit(&flow_cache->smc_cache);
929 emc_cache_uninit(&flow_cache->emc_cache);
930 }
931
932 /* Check and clear dead flow references slowly (one entry at each
933 * invocation). */
934 static void
935 emc_cache_slow_sweep(struct emc_cache *flow_cache)
936 {
937 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
938
939 if (!emc_entry_alive(entry)) {
940 emc_clear_entry(entry);
941 }
942 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
943 }
944
945 /* Updates the time in the PMD thread's context and should be called in three cases:
946 *
947 * 1. PMD structure initialization:
948 * - dp_netdev_configure_pmd()
949 *
950 * 2. Before processing of the new packet batch:
951 * - dpif_netdev_execute()
952 * - dp_netdev_process_rxq_port()
953 *
954 * 3. At least once per polling iteration in main polling threads if no
955 *    packets were received in the current iteration:
956 * - dpif_netdev_run()
957 * - pmd_thread_main()
958 *
959 * 'pmd->ctx.now' should be used without update in all other cases if possible.
960 */
961 static inline void
962 pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
963 {
964 pmd->ctx.now = time_usec();
965 }
966
967 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
968 bool
969 dpif_is_netdev(const struct dpif *dpif)
970 {
971 return dpif->dpif_class->open == dpif_netdev_open;
972 }
973
974 static struct dpif_netdev *
975 dpif_netdev_cast(const struct dpif *dpif)
976 {
977 ovs_assert(dpif_is_netdev(dpif));
978 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
979 }
980
981 static struct dp_netdev *
982 get_dp_netdev(const struct dpif *dpif)
983 {
984 return dpif_netdev_cast(dpif)->dp;
985 }
986 \f
987 enum pmd_info_type {
988 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
989 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
990 PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */
991 PMD_INFO_PERF_SHOW, /* Show pmd performance details. */
992 };
993
994 static void
995 format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
996 {
997 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
998 ? "main thread" : "pmd thread");
999 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
1000 ds_put_format(reply, " numa_id %d", pmd->numa_id);
1001 }
1002 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
1003 ds_put_format(reply, " core_id %u", pmd->core_id);
1004 }
1005 ds_put_cstr(reply, ":\n");
1006 }
1007
1008 static void
1009 pmd_info_show_stats(struct ds *reply,
1010 struct dp_netdev_pmd_thread *pmd)
1011 {
1012 uint64_t stats[PMD_N_STATS];
1013 uint64_t total_cycles, total_packets;
1014 double passes_per_pkt = 0;
1015 double lookups_per_hit = 0;
1016 double packets_per_batch = 0;
1017
1018 pmd_perf_read_counters(&pmd->perf_stats, stats);
1019 total_cycles = stats[PMD_CYCLES_ITER_IDLE]
1020 + stats[PMD_CYCLES_ITER_BUSY];
1021 total_packets = stats[PMD_STAT_RECV];
1022
1023 format_pmd_thread(reply, pmd);
1024
1025 if (total_packets > 0) {
1026 passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
1027 / (double) total_packets;
1028 }
1029 if (stats[PMD_STAT_MASKED_HIT] > 0) {
1030 lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
1031 / (double) stats[PMD_STAT_MASKED_HIT];
1032 }
1033 if (stats[PMD_STAT_SENT_BATCHES] > 0) {
1034 packets_per_batch = stats[PMD_STAT_SENT_PKTS]
1035 / (double) stats[PMD_STAT_SENT_BATCHES];
1036 }
1037
1038 ds_put_format(reply,
1039 " packets received: %"PRIu64"\n"
1040 " packet recirculations: %"PRIu64"\n"
1041 " avg. datapath passes per packet: %.02f\n"
1042 " emc hits: %"PRIu64"\n"
1043 " smc hits: %"PRIu64"\n"
1044 " megaflow hits: %"PRIu64"\n"
1045 " avg. subtable lookups per megaflow hit: %.02f\n"
1046 " miss with success upcall: %"PRIu64"\n"
1047 " miss with failed upcall: %"PRIu64"\n"
1048 " avg. packets per output batch: %.02f\n",
1049 total_packets, stats[PMD_STAT_RECIRC],
1050 passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
1051 stats[PMD_STAT_SMC_HIT],
1052 stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
1053 stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
1054 packets_per_batch);
1055
1056 if (total_cycles == 0) {
1057 return;
1058 }
1059
1060 ds_put_format(reply,
1061 " idle cycles: %"PRIu64" (%.02f%%)\n"
1062 " processing cycles: %"PRIu64" (%.02f%%)\n",
1063 stats[PMD_CYCLES_ITER_IDLE],
1064 stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
1065 stats[PMD_CYCLES_ITER_BUSY],
1066 stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
1067
1068 if (total_packets == 0) {
1069 return;
1070 }
1071
1072 ds_put_format(reply,
1073 " avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
1074 total_cycles / (double) total_packets,
1075 total_cycles, total_packets);
1076
1077 ds_put_format(reply,
1078 " avg processing cycles per packet: "
1079 "%.02f (%"PRIu64"/%"PRIu64")\n",
1080 stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
1081 stats[PMD_CYCLES_ITER_BUSY], total_packets);
1082 }
1083
1084 static void
1085 pmd_info_show_perf(struct ds *reply,
1086 struct dp_netdev_pmd_thread *pmd,
1087 struct pmd_perf_params *par)
1088 {
1089 if (pmd->core_id != NON_PMD_CORE_ID) {
1090 char *time_str =
1091 xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
1092 long long now = time_msec();
1093 double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
1094
1095 ds_put_cstr(reply, "\n");
1096 ds_put_format(reply, "Time: %s\n", time_str);
1097 ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
1098 ds_put_cstr(reply, "\n");
1099 format_pmd_thread(reply, pmd);
1100 ds_put_cstr(reply, "\n");
1101 pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
1102 if (pmd_perf_metrics_enabled(pmd)) {
1103 /* Prevent parallel clearing of perf metrics. */
1104 ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
1105 if (par->histograms) {
1106 ds_put_cstr(reply, "\n");
1107 pmd_perf_format_histograms(reply, &pmd->perf_stats);
1108 }
1109 if (par->iter_hist_len > 0) {
1110 ds_put_cstr(reply, "\n");
1111 pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
1112 par->iter_hist_len);
1113 }
1114 if (par->ms_hist_len > 0) {
1115 ds_put_cstr(reply, "\n");
1116 pmd_perf_format_ms_history(reply, &pmd->perf_stats,
1117 par->ms_hist_len);
1118 }
1119 ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
1120 }
1121 free(time_str);
1122 }
1123 }
1124
1125 static int
1126 compare_poll_list(const void *a_, const void *b_)
1127 {
1128 const struct rxq_poll *a = a_;
1129 const struct rxq_poll *b = b_;
1130
1131 const char *namea = netdev_rxq_get_name(a->rxq->rx);
1132 const char *nameb = netdev_rxq_get_name(b->rxq->rx);
1133
1134 int cmp = strcmp(namea, nameb);
1135 if (!cmp) {
1136 return netdev_rxq_get_queue_id(a->rxq->rx)
1137 - netdev_rxq_get_queue_id(b->rxq->rx);
1138 } else {
1139 return cmp;
1140 }
1141 }
1142
1143 static void
1144 sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
1145 size_t *n)
1146 OVS_REQUIRES(pmd->port_mutex)
1147 {
1148 struct rxq_poll *ret, *poll;
1149 size_t i;
1150
1151 *n = hmap_count(&pmd->poll_list);
1152 if (!*n) {
1153 ret = NULL;
1154 } else {
1155 ret = xcalloc(*n, sizeof *ret);
1156 i = 0;
1157 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
1158 ret[i] = *poll;
1159 i++;
1160 }
1161 ovs_assert(i == *n);
1162 qsort(ret, *n, sizeof *ret, compare_poll_list);
1163 }
1164
1165 *list = ret;
1166 }
1167
1168 static void
1169 pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
1170 {
1171 if (pmd->core_id != NON_PMD_CORE_ID) {
1172 struct rxq_poll *list;
1173 size_t n_rxq;
1174 uint64_t total_cycles = 0;
1175
1176 ds_put_format(reply,
1177 "pmd thread numa_id %d core_id %u:\n isolated : %s\n",
1178 pmd->numa_id, pmd->core_id, (pmd->isolated)
1179 ? "true" : "false");
1180
1181 ovs_mutex_lock(&pmd->port_mutex);
1182 sorted_poll_list(pmd, &list, &n_rxq);
1183
1184 /* Get the total pmd cycles for an interval. */
1185 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
1186 /* Estimate the cycles to cover all intervals. */
1187 total_cycles *= PMD_RXQ_INTERVAL_MAX;
1188
1189 for (int i = 0; i < n_rxq; i++) {
1190 struct dp_netdev_rxq *rxq = list[i].rxq;
1191 const char *name = netdev_rxq_get_name(rxq->rx);
1192 uint64_t proc_cycles = 0;
1193
1194 for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
1195 proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
1196 }
1197 ds_put_format(reply, " port: %-16s queue-id: %2d", name,
1198 netdev_rxq_get_queue_id(list[i].rxq->rx));
1199 ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
1200 ? "(enabled) " : "(disabled)");
1201 ds_put_format(reply, " pmd usage: ");
1202 if (total_cycles) {
1203 ds_put_format(reply, "%2"PRIu64"",
1204 proc_cycles * 100 / total_cycles);
1205 ds_put_cstr(reply, " %");
1206 } else {
1207 ds_put_format(reply, "%s", "NOT AVAIL");
1208 }
1209 ds_put_cstr(reply, "\n");
1210 }
1211 ovs_mutex_unlock(&pmd->port_mutex);
1212 free(list);
1213 }
1214 }
1215
1216 static int
1217 compare_poll_thread_list(const void *a_, const void *b_)
1218 {
1219 const struct dp_netdev_pmd_thread *a, *b;
1220
1221 a = *(struct dp_netdev_pmd_thread **)a_;
1222 b = *(struct dp_netdev_pmd_thread **)b_;
1223
1224 if (a->core_id < b->core_id) {
1225 return -1;
1226 }
1227 if (a->core_id > b->core_id) {
1228 return 1;
1229 }
1230 return 0;
1231 }
1232
1233 /* Create a sorted list of pmds from the dp->poll_threads cmap.  We can use
1234 * this list as long as we do not enter a quiescent state. */
1235 static void
1236 sorted_poll_thread_list(struct dp_netdev *dp,
1237 struct dp_netdev_pmd_thread ***list,
1238 size_t *n)
1239 {
1240 struct dp_netdev_pmd_thread *pmd;
1241 struct dp_netdev_pmd_thread **pmd_list;
1242 size_t k = 0, n_pmds;
1243
1244 n_pmds = cmap_count(&dp->poll_threads);
1245 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
1246
1247 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1248 if (k >= n_pmds) {
1249 break;
1250 }
1251 pmd_list[k++] = pmd;
1252 }
1253
1254 qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
1255
1256 *list = pmd_list;
1257 *n = k;
1258 }
1259
1260 static void
1261 dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1262 const char *argv[], void *aux OVS_UNUSED)
1263 {
1264 struct ds reply = DS_EMPTY_INITIALIZER;
1265 struct dp_netdev *dp = NULL;
1266
1267 ovs_mutex_lock(&dp_netdev_mutex);
1268
1269 if (argc == 2) {
1270 dp = shash_find_data(&dp_netdevs, argv[1]);
1271 } else if (shash_count(&dp_netdevs) == 1) {
1272 /* There's only one datapath */
1273 dp = shash_first(&dp_netdevs)->data;
1274 }
1275
1276 if (!dp) {
1277 ovs_mutex_unlock(&dp_netdev_mutex);
1278 unixctl_command_reply_error(conn,
1279 "please specify an existing datapath");
1280 return;
1281 }
1282
1283 dp_netdev_request_reconfigure(dp);
1284 ovs_mutex_unlock(&dp_netdev_mutex);
1285 ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1286 unixctl_command_reply(conn, ds_cstr(&reply));
1287 ds_destroy(&reply);
1288 }
1289
1290 static void
1291 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1292 void *aux)
1293 {
1294 struct ds reply = DS_EMPTY_INITIALIZER;
1295 struct dp_netdev_pmd_thread **pmd_list;
1296 struct dp_netdev *dp = NULL;
1297 enum pmd_info_type type = *(enum pmd_info_type *) aux;
1298 unsigned int core_id;
1299 bool filter_on_pmd = false;
1300 size_t n;
1301
1302 ovs_mutex_lock(&dp_netdev_mutex);
1303
1304 while (argc > 1) {
1305 if (!strcmp(argv[1], "-pmd") && argc > 2) {
1306 if (str_to_uint(argv[2], 10, &core_id)) {
1307 filter_on_pmd = true;
1308 }
1309 argc -= 2;
1310 argv += 2;
1311 } else {
1312 dp = shash_find_data(&dp_netdevs, argv[1]);
1313 argc -= 1;
1314 argv += 1;
1315 }
1316 }
1317
1318 if (!dp) {
1319 if (shash_count(&dp_netdevs) == 1) {
1320 /* There's only one datapath */
1321 dp = shash_first(&dp_netdevs)->data;
1322 } else {
1323 ovs_mutex_unlock(&dp_netdev_mutex);
1324 unixctl_command_reply_error(conn,
1325 "please specify an existing datapath");
1326 return;
1327 }
1328 }
1329
1330 sorted_poll_thread_list(dp, &pmd_list, &n);
1331 for (size_t i = 0; i < n; i++) {
1332 struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1333 if (!pmd) {
1334 break;
1335 }
1336 if (filter_on_pmd && pmd->core_id != core_id) {
1337 continue;
1338 }
1339 if (type == PMD_INFO_SHOW_RXQ) {
1340 pmd_info_show_rxq(&reply, pmd);
1341 } else if (type == PMD_INFO_CLEAR_STATS) {
1342 pmd_perf_stats_clear(&pmd->perf_stats);
1343 } else if (type == PMD_INFO_SHOW_STATS) {
1344 pmd_info_show_stats(&reply, pmd);
1345 } else if (type == PMD_INFO_PERF_SHOW) {
1346 pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
1347 }
1348 }
1349 free(pmd_list);
1350
1351 ovs_mutex_unlock(&dp_netdev_mutex);
1352
1353 unixctl_command_reply(conn, ds_cstr(&reply));
1354 ds_destroy(&reply);
1355 }
1356
1357 static void
1358 pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1359 const char *argv[],
1360 void *aux OVS_UNUSED)
1361 {
1362 struct pmd_perf_params par;
1363 long int it_hist = 0, ms_hist = 0;
1364 par.histograms = true;
1365
1366 while (argc > 1) {
1367 if (!strcmp(argv[1], "-nh")) {
1368 par.histograms = false;
1369 argc -= 1;
1370 argv += 1;
1371 } else if (!strcmp(argv[1], "-it") && argc > 2) {
1372 it_hist = strtol(argv[2], NULL, 10);
1373 if (it_hist < 0) {
1374 it_hist = 0;
1375 } else if (it_hist > HISTORY_LEN) {
1376 it_hist = HISTORY_LEN;
1377 }
1378 argc -= 2;
1379 argv += 2;
1380 } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1381 ms_hist = strtol(argv[2], NULL, 10);
1382 if (ms_hist < 0) {
1383 ms_hist = 0;
1384 } else if (ms_hist > HISTORY_LEN) {
1385 ms_hist = HISTORY_LEN;
1386 }
1387 argc -= 2;
1388 argv += 2;
1389 } else {
1390 break;
1391 }
1392 }
1393 par.iter_hist_len = it_hist;
1394 par.ms_hist_len = ms_hist;
1395 par.command_type = PMD_INFO_PERF_SHOW;
1396 dpif_netdev_pmd_info(conn, argc, argv, &par);
1397 }
1398 \f
1399 static int
1400 dpif_netdev_init(void)
1401 {
1402 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
1403 clear_aux = PMD_INFO_CLEAR_STATS,
1404 poll_aux = PMD_INFO_SHOW_RXQ;
1405
1406 unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1407 0, 3, dpif_netdev_pmd_info,
1408 (void *)&show_aux);
1409 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1410 0, 3, dpif_netdev_pmd_info,
1411 (void *)&clear_aux);
1412 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
1413 0, 3, dpif_netdev_pmd_info,
1414 (void *)&poll_aux);
1415 unixctl_command_register("dpif-netdev/pmd-perf-show",
1416 "[-nh] [-it iter-history-len]"
1417 " [-ms ms-history-len]"
1418 " [-pmd core] [dp]",
1419 0, 8, pmd_perf_show_cmd,
1420 NULL);
1421 unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1422 0, 1, dpif_netdev_pmd_rebalance,
1423 NULL);
1424 unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1425 "on|off [-b before] [-a after] [-e|-ne] "
1426 "[-us usec] [-q qlen]",
1427 0, 10, pmd_perf_log_set_cmd,
1428 NULL);
1429 return 0;
1430 }
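/* Example invocations of the commands registered above (illustrative):
 *
 *     ovs-appctl dpif-netdev/pmd-stats-show
 *     ovs-appctl dpif-netdev/pmd-rxq-show -pmd 3
 *     ovs-appctl dpif-netdev/pmd-perf-show -nh -it 10 -ms 5
 *     ovs-appctl dpif-netdev/pmd-rxq-rebalance
 *
 * Arguments follow the usage strings passed to unixctl_command_register(). */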
1431
1432 static int
1433 dpif_netdev_enumerate(struct sset *all_dps,
1434 const struct dpif_class *dpif_class)
1435 {
1436 struct shash_node *node;
1437
1438 ovs_mutex_lock(&dp_netdev_mutex);
1439 SHASH_FOR_EACH(node, &dp_netdevs) {
1440 struct dp_netdev *dp = node->data;
1441 if (dpif_class != dp->class) {
1442 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1443 * If the class doesn't match, skip this dpif. */
1444 continue;
1445 }
1446 sset_add(all_dps, node->name);
1447 }
1448 ovs_mutex_unlock(&dp_netdev_mutex);
1449
1450 return 0;
1451 }
1452
1453 static bool
1454 dpif_netdev_class_is_dummy(const struct dpif_class *class)
1455 {
1456 return class != &dpif_netdev_class;
1457 }
1458
1459 static const char *
1460 dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
1461 {
1462 return strcmp(type, "internal") ? type
1463 : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
1464 : "tap";
1465 }
1466
1467 static struct dpif *
1468 create_dpif_netdev(struct dp_netdev *dp)
1469 {
1470 uint16_t netflow_id = hash_string(dp->name, 0);
1471 struct dpif_netdev *dpif;
1472
1473 ovs_refcount_ref(&dp->ref_cnt);
1474
1475 dpif = xmalloc(sizeof *dpif);
1476 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1477 dpif->dp = dp;
1478 dpif->last_port_seq = seq_read(dp->port_seq);
1479
1480 return &dpif->dpif;
1481 }
1482
1483 /* Choose an unused, non-zero port number and return it on success.
1484 * Return ODPP_NONE on failure. */
1485 static odp_port_t
1486 choose_port(struct dp_netdev *dp, const char *name)
1487 OVS_REQUIRES(dp->port_mutex)
1488 {
1489 uint32_t port_no;
1490
1491 if (dp->class != &dpif_netdev_class) {
1492 const char *p;
1493 int start_no = 0;
1494
1495 /* If the port name begins with "br", start the number search at
1496 * 100 to make writing tests easier. */
1497 if (!strncmp(name, "br", 2)) {
1498 start_no = 100;
1499 }
1500
1501 /* If the port name contains a number, try to assign that port number.
1502 * This can make writing unit tests easier because port numbers are
1503 * predictable. */
1504 for (p = name; *p != '\0'; p++) {
1505 if (isdigit((unsigned char) *p)) {
1506 port_no = start_no + strtol(p, NULL, 10);
1507 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1508 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1509 return u32_to_odp(port_no);
1510 }
1511 break;
1512 }
1513 }
1514 }
1515
1516 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1517 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1518 return u32_to_odp(port_no);
1519 }
1520 }
1521
1522 return ODPP_NONE;
1523 }
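/* Worked example (dummy datapaths only, since the name-based search above
 * runs when dp->class != &dpif_netdev_class): a port named "eth12" first
 * tries port number 12, while "br1" tries 101 because the "br" prefix starts
 * the search at 100.  If that number is unavailable, the final loop falls
 * back to the lowest free number starting from 1. */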
1524
1525 static int
1526 create_dp_netdev(const char *name, const struct dpif_class *class,
1527 struct dp_netdev **dpp)
1528 OVS_REQUIRES(dp_netdev_mutex)
1529 {
1530 static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER;
1531 struct dp_netdev *dp;
1532 int error;
1533
1534 /* Avoid estimating TSC frequency for dummy datapath to not slow down
1535 * unit tests. */
1536 if (!dpif_netdev_class_is_dummy(class)
1537 && ovsthread_once_start(&tsc_freq_check)) {
1538 pmd_perf_estimate_tsc_frequency();
1539 ovsthread_once_done(&tsc_freq_check);
1540 }
1541
1542 dp = xzalloc(sizeof *dp);
1543 shash_add(&dp_netdevs, name, dp);
1544
1545 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1546 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1547 ovs_refcount_init(&dp->ref_cnt);
1548 atomic_flag_clear(&dp->destroyed);
1549
1550 ovs_mutex_init_recursive(&dp->port_mutex);
1551 hmap_init(&dp->ports);
1552 dp->port_seq = seq_create();
1553 fat_rwlock_init(&dp->upcall_rwlock);
1554
1555 dp->reconfigure_seq = seq_create();
1556 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1557
1558 for (int i = 0; i < N_METER_LOCKS; ++i) {
1559 ovs_mutex_init_adaptive(&dp->meter_locks[i]);
1560 }
1561
1562 /* Disable upcalls by default. */
1563 dp_netdev_disable_upcall(dp);
1564 dp->upcall_aux = NULL;
1565 dp->upcall_cb = NULL;
1566
1567 dp->conntrack = conntrack_init();
1568
1569 atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1570 atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
1571
1572 cmap_init(&dp->poll_threads);
1573 dp->pmd_rxq_assign_cyc = true;
1574
1575 ovs_mutex_init(&dp->tx_qid_pool_mutex);
1576 /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1577 dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1578
1579 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1580 ovsthread_key_create(&dp->per_pmd_key, NULL);
1581
1582 ovs_mutex_lock(&dp->port_mutex);
1583 /* non-PMD will be created before all other threads and will
1584 * allocate static_tx_qid = 0. */
1585 dp_netdev_set_nonpmd(dp);
1586
1587 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1588 "internal"),
1589 ODPP_LOCAL);
1590 ovs_mutex_unlock(&dp->port_mutex);
1591 if (error) {
1592 dp_netdev_free(dp);
1593 return error;
1594 }
1595
1596 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1597 *dpp = dp;
1598 return 0;
1599 }
1600
1601 static void
1602 dp_netdev_request_reconfigure(struct dp_netdev *dp)
1603 {
1604 seq_change(dp->reconfigure_seq);
1605 }
1606
1607 static bool
1608 dp_netdev_is_reconf_required(struct dp_netdev *dp)
1609 {
1610 return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1611 }
1612
1613 static int
1614 dpif_netdev_open(const struct dpif_class *class, const char *name,
1615 bool create, struct dpif **dpifp)
1616 {
1617 struct dp_netdev *dp;
1618 int error;
1619
1620 ovs_mutex_lock(&dp_netdev_mutex);
1621 dp = shash_find_data(&dp_netdevs, name);
1622 if (!dp) {
1623 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1624 } else {
1625 error = (dp->class != class ? EINVAL
1626 : create ? EEXIST
1627 : 0);
1628 }
1629 if (!error) {
1630 *dpifp = create_dpif_netdev(dp);
1631 }
1632 ovs_mutex_unlock(&dp_netdev_mutex);
1633
1634 return error;
1635 }
1636
1637 static void
1638 dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1639 OVS_NO_THREAD_SAFETY_ANALYSIS
1640 {
1641 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1642 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1643
1644 /* Before freeing a lock we should release it */
1645 fat_rwlock_unlock(&dp->upcall_rwlock);
1646 fat_rwlock_destroy(&dp->upcall_rwlock);
1647 }
1648
1649 static void
1650 dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
1651 OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
1652 {
1653 if (dp->meters[meter_id]) {
1654 free(dp->meters[meter_id]);
1655 dp->meters[meter_id] = NULL;
1656 }
1657 }
1658
1659 /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1660 * through the 'dp_netdevs' shash while freeing 'dp'. */
1661 static void
1662 dp_netdev_free(struct dp_netdev *dp)
1663 OVS_REQUIRES(dp_netdev_mutex)
1664 {
1665 struct dp_netdev_port *port, *next;
1666
1667 shash_find_and_delete(&dp_netdevs, dp->name);
1668
1669 ovs_mutex_lock(&dp->port_mutex);
1670 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
1671 do_del_port(dp, port);
1672 }
1673 ovs_mutex_unlock(&dp->port_mutex);
1674
1675 dp_netdev_destroy_all_pmds(dp, true);
1676 cmap_destroy(&dp->poll_threads);
1677
1678 ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1679 id_pool_destroy(dp->tx_qid_pool);
1680
1681 ovs_mutex_destroy(&dp->non_pmd_mutex);
1682 ovsthread_key_delete(dp->per_pmd_key);
1683
1684 conntrack_destroy(dp->conntrack);
1685
1686
1687 seq_destroy(dp->reconfigure_seq);
1688
1689 seq_destroy(dp->port_seq);
1690 hmap_destroy(&dp->ports);
1691 ovs_mutex_destroy(&dp->port_mutex);
1692
1693 /* Upcalls must be disabled at this point */
1694 dp_netdev_destroy_upcall_lock(dp);
1695
1696 int i;
1697
1698 for (i = 0; i < MAX_METERS; ++i) {
1699 meter_lock(dp, i);
1700 dp_delete_meter(dp, i);
1701 meter_unlock(dp, i);
1702 }
1703 for (i = 0; i < N_METER_LOCKS; ++i) {
1704 ovs_mutex_destroy(&dp->meter_locks[i]);
1705 }
1706
1707 free(dp->pmd_cmask);
1708 free(CONST_CAST(char *, dp->name));
1709 free(dp);
1710 }
1711
1712 static void
1713 dp_netdev_unref(struct dp_netdev *dp)
1714 {
1715 if (dp) {
1716 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1717 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1718 ovs_mutex_lock(&dp_netdev_mutex);
1719 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1720 dp_netdev_free(dp);
1721 }
1722 ovs_mutex_unlock(&dp_netdev_mutex);
1723 }
1724 }
1725
1726 static void
1727 dpif_netdev_close(struct dpif *dpif)
1728 {
1729 struct dp_netdev *dp = get_dp_netdev(dpif);
1730
1731 dp_netdev_unref(dp);
1732 free(dpif);
1733 }
1734
1735 static int
1736 dpif_netdev_destroy(struct dpif *dpif)
1737 {
1738 struct dp_netdev *dp = get_dp_netdev(dpif);
1739
1740 if (!atomic_flag_test_and_set(&dp->destroyed)) {
1741 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1742 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1743 OVS_NOT_REACHED();
1744 }
1745 }
1746
1747 return 0;
1748 }
1749
1750 /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1751 * load/store semantics. While the increment is not atomic, the load and
1752 * store operations are, making it impossible to read inconsistent values.
1753 *
1754 * This is used to update thread local stats counters. */
1755 static void
1756 non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1757 {
1758 unsigned long long tmp;
1759
1760 atomic_read_relaxed(var, &tmp);
1761 tmp += n;
1762 atomic_store_relaxed(var, tmp);
1763 }
1764
1765 static int
1766 dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
1767 {
1768 struct dp_netdev *dp = get_dp_netdev(dpif);
1769 struct dp_netdev_pmd_thread *pmd;
1770 uint64_t pmd_stats[PMD_N_STATS];
1771
1772 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1773 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1774 stats->n_flows += cmap_count(&pmd->flow_table);
1775 pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
1776 stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
1777 stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
1778 stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
1779 stats->n_missed += pmd_stats[PMD_STAT_MISS];
1780 stats->n_lost += pmd_stats[PMD_STAT_LOST];
1781 }
1782 stats->n_masks = UINT32_MAX;
1783 stats->n_mask_hit = UINT64_MAX;
1784
1785 return 0;
1786 }
1787
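/* Requests a reload of 'pmd'. For the non-PMD thread the cached ports are
 * reloaded immediately under the proper mutexes; for PMD threads the reload
 * sequence is changed and the 'reload' flag is set so that the thread picks
 * up the change in its polling loop. */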
1788 static void
1789 dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
1790 {
1791 if (pmd->core_id == NON_PMD_CORE_ID) {
1792 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1793 ovs_mutex_lock(&pmd->port_mutex);
1794 pmd_load_cached_ports(pmd);
1795 ovs_mutex_unlock(&pmd->port_mutex);
1796 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
1797 return;
1798 }
1799
1800 seq_change(pmd->reload_seq);
1801 atomic_store_explicit(&pmd->reload, true, memory_order_release);
1802 }
1803
1804 static uint32_t
1805 hash_port_no(odp_port_t port_no)
1806 {
1807 return hash_int(odp_to_u32(port_no), 0);
1808 }
1809
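/* Opens netdev 'devname' of the given 'type' and, on success, stores a newly
 * allocated dp_netdev_port in '*portp'. Loopback devices are rejected with
 * EINVAL. The new port is flagged as needing reconfiguration. */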
1810 static int
1811 port_create(const char *devname, const char *type,
1812 odp_port_t port_no, struct dp_netdev_port **portp)
1813 {
1814 struct dp_netdev_port *port;
1815 enum netdev_flags flags;
1816 struct netdev *netdev;
1817 int error;
1818
1819 *portp = NULL;
1820
1821 /* Open and validate network device. */
1822 error = netdev_open(devname, type, &netdev);
1823 if (error) {
1824 return error;
1825 }
1826 /* XXX reject non-Ethernet devices */
1827
1828 netdev_get_flags(netdev, &flags);
1829 if (flags & NETDEV_LOOPBACK) {
1830 VLOG_ERR("%s: cannot add a loopback device", devname);
1831 error = EINVAL;
1832 goto out;
1833 }
1834
1835 port = xzalloc(sizeof *port);
1836 port->port_no = port_no;
1837 port->netdev = netdev;
1838 port->type = xstrdup(type);
1839 port->sf = NULL;
1840 port->emc_enabled = true;
1841 port->need_reconfigure = true;
1842 ovs_mutex_init(&port->txq_used_mutex);
1843
1844 *portp = port;
1845
1846 return 0;
1847
1848 out:
1849 netdev_close(netdev);
1850 return error;
1851 }
1852
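/* Creates a port for 'devname' with port number 'port_no', adds it to 'dp',
 * reconfigures the datapath and finally turns on promiscuous mode. Returns
 * EEXIST if 'dp' already has a port with that name. */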
1853 static int
1854 do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1855 odp_port_t port_no)
1856 OVS_REQUIRES(dp->port_mutex)
1857 {
1858 struct netdev_saved_flags *sf;
1859 struct dp_netdev_port *port;
1860 int error;
1861
1862 /* Reject devices already in 'dp'. */
1863 if (!get_port_by_name(dp, devname, &port)) {
1864 return EEXIST;
1865 }
1866
1867 error = port_create(devname, type, port_no, &port);
1868 if (error) {
1869 return error;
1870 }
1871
1872 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1873 seq_change(dp->port_seq);
1874
1875 reconfigure_datapath(dp);
1876
1877 /* Check that port was successfully configured. */
1878 if (!dp_netdev_lookup_port(dp, port_no)) {
1879 return EINVAL;
1880 }
1881
1882 /* Updating device flags triggers an if_notifier, which triggers a bridge
1883 * reconfiguration and another attempt to add this port, leading to an
1884 * infinite loop if the device is configured incorrectly and cannot be
1885 * added. So we set promiscuous mode only after a successful reconfiguration,
1886 * when we already know that the device is properly configured. */
1887 error = netdev_turn_flags_on(port->netdev, NETDEV_PROMISC, &sf);
1888 if (error) {
1889 VLOG_ERR("%s: cannot set promisc flag", devname);
1890 do_del_port(dp, port);
1891 return error;
1892 }
1893 port->sf = sf;
1894
1895 return 0;
1896 }
1897
1898 static int
1899 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
1900 odp_port_t *port_nop)
1901 {
1902 struct dp_netdev *dp = get_dp_netdev(dpif);
1903 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1904 const char *dpif_port;
1905 odp_port_t port_no;
1906 int error;
1907
1908 ovs_mutex_lock(&dp->port_mutex);
1909 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
1910 if (*port_nop != ODPP_NONE) {
1911 port_no = *port_nop;
1912 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
1913 } else {
1914 port_no = choose_port(dp, dpif_port);
1915 error = port_no == ODPP_NONE ? EFBIG : 0;
1916 }
1917 if (!error) {
1918 *port_nop = port_no;
1919 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
1920 }
1921 ovs_mutex_unlock(&dp->port_mutex);
1922
1923 return error;
1924 }
1925
1926 static int
1927 dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
1928 {
1929 struct dp_netdev *dp = get_dp_netdev(dpif);
1930 int error;
1931
1932 ovs_mutex_lock(&dp->port_mutex);
1933 if (port_no == ODPP_LOCAL) {
1934 error = EINVAL;
1935 } else {
1936 struct dp_netdev_port *port;
1937
1938 error = get_port_by_number(dp, port_no, &port);
1939 if (!error) {
1940 do_del_port(dp, port);
1941 }
1942 }
1943 ovs_mutex_unlock(&dp->port_mutex);
1944
1945 return error;
1946 }
1947
1948 static bool
1949 is_valid_port_number(odp_port_t port_no)
1950 {
1951 return port_no != ODPP_NONE;
1952 }
1953
1954 static struct dp_netdev_port *
1955 dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
1956 OVS_REQUIRES(dp->port_mutex)
1957 {
1958 struct dp_netdev_port *port;
1959
1960 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
1961 if (port->port_no == port_no) {
1962 return port;
1963 }
1964 }
1965 return NULL;
1966 }
1967
1968 static int
1969 get_port_by_number(struct dp_netdev *dp,
1970 odp_port_t port_no, struct dp_netdev_port **portp)
1971 OVS_REQUIRES(dp->port_mutex)
1972 {
1973 if (!is_valid_port_number(port_no)) {
1974 *portp = NULL;
1975 return EINVAL;
1976 } else {
1977 *portp = dp_netdev_lookup_port(dp, port_no);
1978 return *portp ? 0 : ENODEV;
1979 }
1980 }
1981
1982 static void
1983 port_destroy(struct dp_netdev_port *port)
1984 {
1985 if (!port) {
1986 return;
1987 }
1988
1989 netdev_close(port->netdev);
1990 netdev_restore_flags(port->sf);
1991
1992 for (unsigned i = 0; i < port->n_rxq; i++) {
1993 netdev_rxq_close(port->rxqs[i].rx);
1994 }
1995 ovs_mutex_destroy(&port->txq_used_mutex);
1996 free(port->rxq_affinity_list);
1997 free(port->txq_used);
1998 free(port->rxqs);
1999 free(port->type);
2000 free(port);
2001 }
2002
2003 static int
2004 get_port_by_name(struct dp_netdev *dp,
2005 const char *devname, struct dp_netdev_port **portp)
2006 OVS_REQUIRES(dp->port_mutex)
2007 {
2008 struct dp_netdev_port *port;
2009
2010 HMAP_FOR_EACH (port, node, &dp->ports) {
2011 if (!strcmp(netdev_get_name(port->netdev), devname)) {
2012 *portp = port;
2013 return 0;
2014 }
2015 }
2016
2017 /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
2018 * non-existent port. */
2019 return ENODEV;
2020 }
2021
2022 /* Returns 'true' if there is a port with pmd netdev. */
2023 static bool
2024 has_pmd_port(struct dp_netdev *dp)
2025 OVS_REQUIRES(dp->port_mutex)
2026 {
2027 struct dp_netdev_port *port;
2028
2029 HMAP_FOR_EACH (port, node, &dp->ports) {
2030 if (netdev_is_pmd(port->netdev)) {
2031 return true;
2032 }
2033 }
2034
2035 return false;
2036 }
2037
2038 static void
2039 do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
2040 OVS_REQUIRES(dp->port_mutex)
2041 {
2042 hmap_remove(&dp->ports, &port->node);
2043 seq_change(dp->port_seq);
2044
2045 reconfigure_datapath(dp);
2046
2047 port_destroy(port);
2048 }
2049
2050 static void
2051 answer_port_query(const struct dp_netdev_port *port,
2052 struct dpif_port *dpif_port)
2053 {
2054 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
2055 dpif_port->type = xstrdup(port->type);
2056 dpif_port->port_no = port->port_no;
2057 }
2058
2059 static int
2060 dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
2061 struct dpif_port *dpif_port)
2062 {
2063 struct dp_netdev *dp = get_dp_netdev(dpif);
2064 struct dp_netdev_port *port;
2065 int error;
2066
2067 ovs_mutex_lock(&dp->port_mutex);
2068 error = get_port_by_number(dp, port_no, &port);
2069 if (!error && dpif_port) {
2070 answer_port_query(port, dpif_port);
2071 }
2072 ovs_mutex_unlock(&dp->port_mutex);
2073
2074 return error;
2075 }
2076
2077 static int
2078 dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
2079 struct dpif_port *dpif_port)
2080 {
2081 struct dp_netdev *dp = get_dp_netdev(dpif);
2082 struct dp_netdev_port *port;
2083 int error;
2084
2085 ovs_mutex_lock(&dp->port_mutex);
2086 error = get_port_by_name(dp, devname, &port);
2087 if (!error && dpif_port) {
2088 answer_port_query(port, dpif_port);
2089 }
2090 ovs_mutex_unlock(&dp->port_mutex);
2091
2092 return error;
2093 }
2094
2095 static void
2096 dp_netdev_flow_free(struct dp_netdev_flow *flow)
2097 {
2098 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
2099 free(flow);
2100 }
2101
2102 static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
2103 {
2104 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
2105 ovsrcu_postpone(dp_netdev_flow_free, flow);
2106 }
2107 }
2108
2109 static uint32_t
2110 dp_netdev_flow_hash(const ovs_u128 *ufid)
2111 {
2112 return ufid->u32[0];
2113 }
2114
2115 static inline struct dpcls *
2116 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
2117 odp_port_t in_port)
2118 {
2119 struct dpcls *cls;
2120 uint32_t hash = hash_port_no(in_port);
2121 CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
2122 if (cls->in_port == in_port) {
2123 /* Port classifier exists already */
2124 return cls;
2125 }
2126 }
2127 return NULL;
2128 }
2129
2130 static inline struct dpcls *
2131 dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
2132 odp_port_t in_port)
2133 OVS_REQUIRES(pmd->flow_mutex)
2134 {
2135 struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2136 uint32_t hash = hash_port_no(in_port);
2137
2138 if (!cls) {
2139 /* Create new classifier for in_port */
2140 cls = xmalloc(sizeof(*cls));
2141 dpcls_init(cls);
2142 cls->in_port = in_port;
2143 cmap_insert(&pmd->classifiers, &cls->node, hash);
2144 VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
2145 }
2146 return cls;
2147 }
2148
2149 #define MAX_FLOW_MARK (UINT32_MAX - 1)
2150 #define INVALID_FLOW_MARK (UINT32_MAX)
2151
2152 struct megaflow_to_mark_data {
2153 const struct cmap_node node;
2154 ovs_u128 mega_ufid;
2155 uint32_t mark;
2156 };
2157
2158 struct flow_mark {
2159 struct cmap megaflow_to_mark;
2160 struct cmap mark_to_flow;
2161 struct id_pool *pool;
2162 };
2163
2164 static struct flow_mark flow_mark = {
2165 .megaflow_to_mark = CMAP_INITIALIZER,
2166 .mark_to_flow = CMAP_INITIALIZER,
2167 };
2168
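/* Allocates a flow mark from the global id pool, creating the pool on first
 * use. Returns INVALID_FLOW_MARK if no free mark is available. */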
2169 static uint32_t
2170 flow_mark_alloc(void)
2171 {
2172 uint32_t mark;
2173
2174 if (!flow_mark.pool) {
2175 /* Not initialized yet; do it here. */
2176 flow_mark.pool = id_pool_create(0, MAX_FLOW_MARK);
2177 }
2178
2179 if (id_pool_alloc_id(flow_mark.pool, &mark)) {
2180 return mark;
2181 }
2182
2183 return INVALID_FLOW_MARK;
2184 }
2185
2186 static void
2187 flow_mark_free(uint32_t mark)
2188 {
2189 id_pool_free_id(flow_mark.pool, mark);
2190 }
2191
2192 /* Associates a megaflow with a mark; this is a 1:1 mapping. */
2193 static void
2194 megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
2195 {
2196 size_t hash = dp_netdev_flow_hash(mega_ufid);
2197 struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
2198
2199 data->mega_ufid = *mega_ufid;
2200 data->mark = mark;
2201
2202 cmap_insert(&flow_mark.megaflow_to_mark,
2203 CONST_CAST(struct cmap_node *, &data->node), hash);
2204 }
2205
2206 /* Disassociates a megaflow from its mark. */
2207 static void
2208 megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
2209 {
2210 size_t hash = dp_netdev_flow_hash(mega_ufid);
2211 struct megaflow_to_mark_data *data;
2212
2213 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2214 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2215 cmap_remove(&flow_mark.megaflow_to_mark,
2216 CONST_CAST(struct cmap_node *, &data->node), hash);
2217 ovsrcu_postpone(free, data);
2218 return;
2219 }
2220 }
2221
2222 VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
2223 UUID_ARGS((struct uuid *)mega_ufid));
2224 }
2225
2226 static inline uint32_t
2227 megaflow_to_mark_find(const ovs_u128 *mega_ufid)
2228 {
2229 size_t hash = dp_netdev_flow_hash(mega_ufid);
2230 struct megaflow_to_mark_data *data;
2231
2232 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2233 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2234 return data->mark;
2235 }
2236 }
2237
2238 VLOG_DBG("Mark id for ufid "UUID_FMT" was not found\n",
2239 UUID_ARGS((struct uuid *)mega_ufid));
2240 return INVALID_FLOW_MARK;
2241 }
2242
2243 /* Associates a mark with a flow; this is a 1:N mapping. */
2244 static void
2245 mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
2246 {
2247 dp_netdev_flow_ref(flow);
2248
2249 cmap_insert(&flow_mark.mark_to_flow,
2250 CONST_CAST(struct cmap_node *, &flow->mark_node),
2251 hash_int(mark, 0));
2252 flow->mark = mark;
2253
2254 VLOG_DBG("Associated dp_netdev flow %p with mark %u\n", flow, mark);
2255 }
2256
2257 static bool
2258 flow_mark_has_no_ref(uint32_t mark)
2259 {
2260 struct dp_netdev_flow *flow;
2261
2262 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2263 &flow_mark.mark_to_flow) {
2264 if (flow->mark == mark) {
2265 return false;
2266 }
2267 }
2268
2269 return true;
2270 }
2271
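/* Breaks the mark <-> flow association for 'flow'. If no other flow still
 * references the mark, the offloaded flow is also removed from the hardware
 * via the flow's input port and the mark is freed. */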
2272 static int
2273 mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd,
2274 struct dp_netdev_flow *flow)
2275 {
2276 int ret = 0;
2277 uint32_t mark = flow->mark;
2278 struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
2279 &flow->mark_node);
2280
2281 cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0));
2282 flow->mark = INVALID_FLOW_MARK;
2283
2284 /*
2285 * If no flow references the mark any more, remove the flow
2286 * from hardware and free the mark.
2287 */
2288 if (flow_mark_has_no_ref(mark)) {
2289 struct netdev *port;
2290 odp_port_t in_port = flow->flow.in_port.odp_port;
2291
2292 port = netdev_ports_get(in_port, pmd->dp->class);
2293 if (port) {
2294 /* Taking a global 'port_mutex' to fulfill thread safety
2295 * restrictions for the netdev-offload-dpdk module. */
2296 ovs_mutex_lock(&pmd->dp->port_mutex);
2297 ret = netdev_flow_del(port, &flow->mega_ufid, NULL);
2298 ovs_mutex_unlock(&pmd->dp->port_mutex);
2299 netdev_close(port);
2300 }
2301
2302 flow_mark_free(mark);
2303 VLOG_DBG("Freed flow mark %u\n", mark);
2304
2305 megaflow_to_mark_disassociate(&flow->mega_ufid);
2306 }
2307 dp_netdev_flow_unref(flow);
2308
2309 return ret;
2310 }
2311
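/* Queues deletion of the hardware offloads of all flows installed by 'pmd'. */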
2312 static void
2313 flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
2314 {
2315 struct dp_netdev_flow *flow;
2316
2317 CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) {
2318 if (flow->pmd_id == pmd->core_id) {
2319 queue_netdev_flow_del(pmd, flow);
2320 }
2321 }
2322 }
2323
2324 static struct dp_netdev_flow *
2325 mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
2326 const uint32_t mark)
2327 {
2328 struct dp_netdev_flow *flow;
2329
2330 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2331 &flow_mark.mark_to_flow) {
2332 if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
2333 flow->dead == false) {
2334 return flow;
2335 }
2336 }
2337
2338 return NULL;
2339 }
2340
2341 static struct dp_flow_offload_item *
2342 dp_netdev_alloc_flow_offload(struct dp_netdev_pmd_thread *pmd,
2343 struct dp_netdev_flow *flow,
2344 int op)
2345 {
2346 struct dp_flow_offload_item *offload;
2347
2348 offload = xzalloc(sizeof(*offload));
2349 offload->pmd = pmd;
2350 offload->flow = flow;
2351 offload->op = op;
2352
2353 dp_netdev_flow_ref(flow);
2354 dp_netdev_pmd_try_ref(pmd);
2355
2356 return offload;
2357 }
2358
2359 static void
2360 dp_netdev_free_flow_offload(struct dp_flow_offload_item *offload)
2361 {
2362 dp_netdev_pmd_unref(offload->pmd);
2363 dp_netdev_flow_unref(offload->flow);
2364
2365 free(offload->actions);
2366 free(offload);
2367 }
2368
2369 static void
2370 dp_netdev_append_flow_offload(struct dp_flow_offload_item *offload)
2371 {
2372 ovs_mutex_lock(&dp_flow_offload.mutex);
2373 ovs_list_push_back(&dp_flow_offload.list, &offload->node);
2374 xpthread_cond_signal(&dp_flow_offload.cond);
2375 ovs_mutex_unlock(&dp_flow_offload.mutex);
2376 }
2377
2378 static int
2379 dp_netdev_flow_offload_del(struct dp_flow_offload_item *offload)
2380 {
2381 return mark_to_flow_disassociate(offload->pmd, offload->flow);
2382 }
2383
2384 /*
2385 * There are two flow offload operations here: addition and modification.
2386 *
2387 * For flow addition, this function does:
2388 * - allocate a new flow mark id
2389 * - perform hardware flow offload
2390 * - associate the flow mark with flow and mega flow
2391 *
2392 * For flow modification, the flow mark and its associations are still
2393 * valid, so only the second step (the hardware flow offload) is needed.
2394 */
2395 static int
2396 dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
2397 {
2398 struct dp_netdev_pmd_thread *pmd = offload->pmd;
2399 const struct dpif_class *dpif_class = pmd->dp->class;
2400 struct dp_netdev_flow *flow = offload->flow;
2401 odp_port_t in_port = flow->flow.in_port.odp_port;
2402 bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2403 struct offload_info info;
2404 struct netdev *port;
2405 uint32_t mark;
2406 int ret;
2407
2408 if (flow->dead) {
2409 return -1;
2410 }
2411
2412 if (modification) {
2413 mark = flow->mark;
2414 ovs_assert(mark != INVALID_FLOW_MARK);
2415 } else {
2416 /*
2417 * If a mega flow has already been offloaded (from other PMD
2418 * instances), do not offload it again.
2419 */
2420 mark = megaflow_to_mark_find(&flow->mega_ufid);
2421 if (mark != INVALID_FLOW_MARK) {
2422 VLOG_DBG("Flow has already been offloaded with mark %u\n", mark);
2423 if (flow->mark != INVALID_FLOW_MARK) {
2424 ovs_assert(flow->mark == mark);
2425 } else {
2426 mark_to_flow_associate(mark, flow);
2427 }
2428 return 0;
2429 }
2430
2431 mark = flow_mark_alloc();
2432 if (mark == INVALID_FLOW_MARK) {
2433 VLOG_ERR("Failed to allocate flow mark!\n");
return -1;
2434 }
2435 }
2436 info.flow_mark = mark;
2437 info.dpif_class = dpif_class;
2438
2439 port = netdev_ports_get(in_port, pmd->dp->class);
2440 if (!port || netdev_vport_is_vport_class(port->netdev_class)) {
2441 netdev_close(port);
2442 goto err_free;
2443 }
2444 /* Taking a global 'port_mutex' to fulfill thread safety restrictions for
2445 * the netdev-offload-dpdk module. */
2446 ovs_mutex_lock(&pmd->dp->port_mutex);
2447 ret = netdev_flow_put(port, &offload->match,
2448 CONST_CAST(struct nlattr *, offload->actions),
2449 offload->actions_len, &flow->mega_ufid, &info,
2450 NULL);
2451 ovs_mutex_unlock(&pmd->dp->port_mutex);
2452 netdev_close(port);
2453
2454 if (ret) {
2455 goto err_free;
2456 }
2457
2458 if (!modification) {
2459 megaflow_to_mark_associate(&flow->mega_ufid, mark);
2460 mark_to_flow_associate(mark, flow);
2461 }
2462 return 0;
2463
2464 err_free:
2465 if (!modification) {
2466 flow_mark_free(mark);
2467 } else {
2468 mark_to_flow_disassociate(pmd, flow);
2469 }
2470 return -1;
2471 }
2472
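/* Body of the flow offload thread: waits for requests queued on
 * 'dp_flow_offload.list' and services them (add, modify or delete) outside of
 * the packet processing path. */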
2473 static void *
2474 dp_netdev_flow_offload_main(void *data OVS_UNUSED)
2475 {
2476 struct dp_flow_offload_item *offload;
2477 struct ovs_list *list;
2478 const char *op;
2479 int ret;
2480
2481 for (;;) {
2482 ovs_mutex_lock(&dp_flow_offload.mutex);
2483 if (ovs_list_is_empty(&dp_flow_offload.list)) {
2484 ovsrcu_quiesce_start();
2485 ovs_mutex_cond_wait(&dp_flow_offload.cond,
2486 &dp_flow_offload.mutex);
2487 ovsrcu_quiesce_end();
2488 }
2489 list = ovs_list_pop_front(&dp_flow_offload.list);
2490 offload = CONTAINER_OF(list, struct dp_flow_offload_item, node);
2491 ovs_mutex_unlock(&dp_flow_offload.mutex);
2492
2493 switch (offload->op) {
2494 case DP_NETDEV_FLOW_OFFLOAD_OP_ADD:
2495 op = "add";
2496 ret = dp_netdev_flow_offload_put(offload);
2497 break;
2498 case DP_NETDEV_FLOW_OFFLOAD_OP_MOD:
2499 op = "modify";
2500 ret = dp_netdev_flow_offload_put(offload);
2501 break;
2502 case DP_NETDEV_FLOW_OFFLOAD_OP_DEL:
2503 op = "delete";
2504 ret = dp_netdev_flow_offload_del(offload);
2505 break;
2506 default:
2507 OVS_NOT_REACHED();
2508 }
2509
2510 VLOG_DBG("%s to %s netdev flow\n",
2511 ret == 0 ? "succeeded" : "failed", op);
2512 dp_netdev_free_flow_offload(offload);
2513 }
2514
2515 return NULL;
2516 }
2517
2518 static void
2519 queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
2520 struct dp_netdev_flow *flow)
2521 {
2522 struct dp_flow_offload_item *offload;
2523
2524 if (ovsthread_once_start(&offload_thread_once)) {
2525 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2526 ovs_thread_create("dp_netdev_flow_offload",
2527 dp_netdev_flow_offload_main, NULL);
2528 ovsthread_once_done(&offload_thread_once);
2529 }
2530
2531 offload = dp_netdev_alloc_flow_offload(pmd, flow,
2532 DP_NETDEV_FLOW_OFFLOAD_OP_DEL);
2533 dp_netdev_append_flow_offload(offload);
2534 }
2535
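/* Queues a hardware offload request for 'flow': a modification if the flow
 * already carries a valid mark, otherwise an addition. Does nothing when the
 * flow API is disabled. The offload thread is started on first use. */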
2536 static void
2537 queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd,
2538 struct dp_netdev_flow *flow, struct match *match,
2539 const struct nlattr *actions, size_t actions_len)
2540 {
2541 struct dp_flow_offload_item *offload;
2542 int op;
2543
2544 if (!netdev_is_flow_api_enabled()) {
2545 return;
2546 }
2547
2548 if (ovsthread_once_start(&offload_thread_once)) {
2549 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2550 ovs_thread_create("dp_netdev_flow_offload",
2551 dp_netdev_flow_offload_main, NULL);
2552 ovsthread_once_done(&offload_thread_once);
2553 }
2554
2555 if (flow->mark != INVALID_FLOW_MARK) {
2556 op = DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2557 } else {
2558 op = DP_NETDEV_FLOW_OFFLOAD_OP_ADD;
2559 }
2560 offload = dp_netdev_alloc_flow_offload(pmd, flow, op);
2561 offload->match = *match;
2562 offload->actions = xmalloc(actions_len);
2563 memcpy(offload->actions, actions, actions_len);
2564 offload->actions_len = actions_len;
2565
2566 dp_netdev_append_flow_offload(offload);
2567 }
2568
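/* Removes 'flow' from the pmd's classifier and flow table, queues removal of
 * its hardware offload (if any), marks it dead and drops the pmd's reference
 * to it. */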
2569 static void
2570 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
2571 struct dp_netdev_flow *flow)
2572 OVS_REQUIRES(pmd->flow_mutex)
2573 {
2574 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2575 struct dpcls *cls;
2576 odp_port_t in_port = flow->flow.in_port.odp_port;
2577
2578 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2579 ovs_assert(cls != NULL);
2580 dpcls_remove(cls, &flow->cr);
2581 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
2582 if (flow->mark != INVALID_FLOW_MARK) {
2583 queue_netdev_flow_del(pmd, flow);
2584 }
2585 flow->dead = true;
2586
2587 dp_netdev_flow_unref(flow);
2588 }
2589
2590 static void
2591 dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
2592 {
2593 struct dp_netdev_flow *netdev_flow;
2594
2595 ovs_mutex_lock(&pmd->flow_mutex);
2596 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2597 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2598 }
2599 ovs_mutex_unlock(&pmd->flow_mutex);
2600 }
2601
2602 static int
2603 dpif_netdev_flow_flush(struct dpif *dpif)
2604 {
2605 struct dp_netdev *dp = get_dp_netdev(dpif);
2606 struct dp_netdev_pmd_thread *pmd;
2607
2608 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2609 dp_netdev_pmd_flow_flush(pmd);
2610 }
2611
2612 return 0;
2613 }
2614
2615 struct dp_netdev_port_state {
2616 struct hmap_position position;
2617 char *name;
2618 };
2619
2620 static int
2621 dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2622 {
2623 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2624 return 0;
2625 }
2626
2627 static int
2628 dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
2629 struct dpif_port *dpif_port)
2630 {
2631 struct dp_netdev_port_state *state = state_;
2632 struct dp_netdev *dp = get_dp_netdev(dpif);
2633 struct hmap_node *node;
2634 int retval;
2635
2636 ovs_mutex_lock(&dp->port_mutex);
2637 node = hmap_at_position(&dp->ports, &state->position);
2638 if (node) {
2639 struct dp_netdev_port *port;
2640
2641 port = CONTAINER_OF(node, struct dp_netdev_port, node);
2642
2643 free(state->name);
2644 state->name = xstrdup(netdev_get_name(port->netdev));
2645 dpif_port->name = state->name;
2646 dpif_port->type = port->type;
2647 dpif_port->port_no = port->port_no;
2648
2649 retval = 0;
2650 } else {
2651 retval = EOF;
2652 }
2653 ovs_mutex_unlock(&dp->port_mutex);
2654
2655 return retval;
2656 }
2657
2658 static int
2659 dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
2660 {
2661 struct dp_netdev_port_state *state = state_;
2662 free(state->name);
2663 free(state);
2664 return 0;
2665 }
2666
2667 static int
2668 dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
2669 {
2670 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2671 uint64_t new_port_seq;
2672 int error;
2673
2674 new_port_seq = seq_read(dpif->dp->port_seq);
2675 if (dpif->last_port_seq != new_port_seq) {
2676 dpif->last_port_seq = new_port_seq;
2677 error = ENOBUFS;
2678 } else {
2679 error = EAGAIN;
2680 }
2681
2682 return error;
2683 }
2684
2685 static void
2686 dpif_netdev_port_poll_wait(const struct dpif *dpif_)
2687 {
2688 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2689
2690 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
2691 }
2692
2693 static struct dp_netdev_flow *
2694 dp_netdev_flow_cast(const struct dpcls_rule *cr)
2695 {
2696 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
2697 }
2698
2699 static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
2700 {
2701 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
2702 }
2703
2704 /* netdev_flow_key utilities.
2705 *
2706 * netdev_flow_key is basically a miniflow. We use these functions
2707 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2708 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2709 *
2710 * - Since we are dealing exclusively with miniflows created by
2711 * miniflow_extract(), if the map is different the miniflow is different.
2712 * Therefore we can be faster by comparing the map and the miniflow in a
2713 * single memcmp().
2714 * - These functions can be inlined by the compiler. */
2715
2716 /* Given the number of bits set in miniflow's maps, returns the size of the
2717 * 'netdev_flow_key.mf' */
2718 static inline size_t
2719 netdev_flow_key_size(size_t flow_u64s)
2720 {
2721 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
2722 }
2723
2724 static inline bool
2725 netdev_flow_key_equal(const struct netdev_flow_key *a,
2726 const struct netdev_flow_key *b)
2727 {
2728 /* 'b->len' may not be set yet. */
2729 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
2730 }
2731
2732 /* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
2733 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
2734 * generated by miniflow_extract. */
2735 static inline bool
2736 netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
2737 const struct miniflow *mf)
2738 {
2739 return !memcmp(&key->mf, mf, key->len);
2740 }
2741
2742 static inline void
2743 netdev_flow_key_clone(struct netdev_flow_key *dst,
2744 const struct netdev_flow_key *src)
2745 {
2746 memcpy(dst, src,
2747 offsetof(struct netdev_flow_key, mf) + src->len);
2748 }
2749
2750 /* Initialize a netdev_flow_key 'mask' from 'match'. */
2751 static inline void
2752 netdev_flow_mask_init(struct netdev_flow_key *mask,
2753 const struct match *match)
2754 {
2755 uint64_t *dst = miniflow_values(&mask->mf);
2756 struct flowmap fmap;
2757 uint32_t hash = 0;
2758 size_t idx;
2759
2760 /* Only check masks that make sense for the flow. */
2761 flow_wc_map(&match->flow, &fmap);
2762 flowmap_init(&mask->mf.map);
2763
2764 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
2765 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
2766
2767 if (mask_u64) {
2768 flowmap_set(&mask->mf.map, idx, 1);
2769 *dst++ = mask_u64;
2770 hash = hash_add64(hash, mask_u64);
2771 }
2772 }
2773
2774 map_t map;
2775
2776 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
2777 hash = hash_add64(hash, map);
2778 }
2779
2780 size_t n = dst - miniflow_get_values(&mask->mf);
2781
2782 mask->hash = hash_finish(hash, n * 8);
2783 mask->len = netdev_flow_key_size(n);
2784 }
2785
2786 /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
2787 static inline void
2788 netdev_flow_key_init_masked(struct netdev_flow_key *dst,
2789 const struct flow *flow,
2790 const struct netdev_flow_key *mask)
2791 {
2792 uint64_t *dst_u64 = miniflow_values(&dst->mf);
2793 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
2794 uint32_t hash = 0;
2795 uint64_t value;
2796
2797 dst->len = mask->len;
2798 dst->mf = mask->mf; /* Copy maps. */
2799
2800 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
2801 *dst_u64 = value & *mask_u64++;
2802 hash = hash_add64(hash, *dst_u64++);
2803 }
2804 dst->hash = hash_finish(hash,
2805 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
2806 }
2807
2808 static inline bool
2809 emc_entry_alive(struct emc_entry *ce)
2810 {
2811 return ce->flow && !ce->flow->dead;
2812 }
2813
2814 static void
2815 emc_clear_entry(struct emc_entry *ce)
2816 {
2817 if (ce->flow) {
2818 dp_netdev_flow_unref(ce->flow);
2819 ce->flow = NULL;
2820 }
2821 }
2822
2823 static inline void
2824 emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
2825 const struct netdev_flow_key *key)
2826 {
2827 if (ce->flow != flow) {
2828 if (ce->flow) {
2829 dp_netdev_flow_unref(ce->flow);
2830 }
2831
2832 if (dp_netdev_flow_ref(flow)) {
2833 ce->flow = flow;
2834 } else {
2835 ce->flow = NULL;
2836 }
2837 }
2838 if (key) {
2839 netdev_flow_key_clone(&ce->key, key);
2840 }
2841 }
2842
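/* Inserts or refreshes an EMC entry mapping 'key' to 'flow'. If the key is
 * already cached only the flow pointer is updated; otherwise a victim entry
 * is chosen according to the replacement policy described below. */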
2843 static inline void
2844 emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
2845 struct dp_netdev_flow *flow)
2846 {
2847 struct emc_entry *to_be_replaced = NULL;
2848 struct emc_entry *current_entry;
2849
2850 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2851 if (netdev_flow_key_equal(&current_entry->key, key)) {
2852 /* We found the entry with the 'mf' miniflow */
2853 emc_change_entry(current_entry, flow, NULL);
2854 return;
2855 }
2856
2857 /* Replacement policy: put the flow in an empty (not alive) entry, or,
2858 * failing that, in the candidate entry with the smallest key hash. */
2859 if (!to_be_replaced
2860 || (emc_entry_alive(to_be_replaced)
2861 && !emc_entry_alive(current_entry))
2862 || current_entry->key.hash < to_be_replaced->key.hash) {
2863 to_be_replaced = current_entry;
2864 }
2865 }
2866 /* We didn't find the miniflow in the cache.
2867 * The 'to_be_replaced' entry is where the new flow will be stored */
2868
2869 emc_change_entry(to_be_replaced, flow, key);
2870 }
2871
2872 static inline void
2873 emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2874 const struct netdev_flow_key *key,
2875 struct dp_netdev_flow *flow)
2876 {
2877 /* Inserts an entry into the EMC based on the probability value 'min'. By
2878 * default the value is UINT32_MAX / 100, which yields an insertion
2879 * probability of 1/100, i.e. 1%. */
2880
2881 uint32_t min = pmd->ctx.emc_insert_min;
2882
2883 if (min && random_uint32() <= min) {
2884 emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
2885 }
2886 }
2887
2888 static inline struct dp_netdev_flow *
2889 emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
2890 {
2891 struct emc_entry *current_entry;
2892
2893 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2894 if (current_entry->key.hash == key->hash
2895 && emc_entry_alive(current_entry)
2896 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
2897
2898 /* We found the entry with the 'key->mf' miniflow */
2899 return current_entry->flow;
2900 }
2901 }
2902
2903 return NULL;
2904 }
2905
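/* Looks up 'hash' in the signature match cache. If a bucket entry with a
 * matching 16-bit signature is found, returns the corresponding node of the
 * pmd's flow table; otherwise returns NULL. */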
2906 static inline const struct cmap_node *
2907 smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
2908 {
2909 struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
2910 struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
2911 uint16_t sig = hash >> 16;
2912 uint16_t index = UINT16_MAX;
2913
2914 for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2915 if (bucket->sig[i] == sig) {
2916 index = bucket->flow_idx[i];
2917 break;
2918 }
2919 }
2920 if (index != UINT16_MAX) {
2921 return cmap_find_by_index(&pmd->flow_table, index);
2922 }
2923 return NULL;
2924 }
2925
2926 static void
2927 smc_clear_entry(struct smc_bucket *b, int idx)
2928 {
2929 b->flow_idx[idx] = UINT16_MAX;
2930 }
2931
2932 /* Inserts the flow_table index into the SMC. Insertion may fail when 1) the
2933 * SMC is turned off or 2) the flow_table index is larger than a uint16_t can
2934 * hold. If there is already an SMC entry with the same signature, its index
2935 * is updated. Otherwise, if an empty entry is available, that entry is taken.
2936 * If there is neither an empty entry nor one with the same signature, a
2937 * random entry from the hashed bucket is picked. */
2938 static inline void
2939 smc_insert(struct dp_netdev_pmd_thread *pmd,
2940 const struct netdev_flow_key *key,
2941 uint32_t hash)
2942 {
2943 struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
2944 struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
2945 uint16_t index;
2946 uint32_t cmap_index;
2947 bool smc_enable_db;
2948 int i;
2949
2950 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
2951 if (!smc_enable_db) {
2952 return;
2953 }
2954
2955 cmap_index = cmap_find_index(&pmd->flow_table, hash);
2956 index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
2957
2958 /* If the index is larger than SMC can handle (uint16_t), we don't
2959 * insert */
2960 if (index == UINT16_MAX) {
2961 return;
2962 }
2963
2964 /* If an entry with same signature already exists, update the index */
2965 uint16_t sig = key->hash >> 16;
2966 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2967 if (bucket->sig[i] == sig) {
2968 bucket->flow_idx[i] = index;
2969 return;
2970 }
2971 }
2972 /* If there is an empty entry, occupy it. */
2973 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2974 if (bucket->flow_idx[i] == UINT16_MAX) {
2975 bucket->sig[i] = sig;
2976 bucket->flow_idx[i] = index;
2977 return;
2978 }
2979 }
2980 /* Otherwise, pick a random entry. */
2981 i = random_uint32() % SMC_ENTRY_PER_BUCKET;
2982 bucket->sig[i] = sig;
2983 bucket->flow_idx[i] = index;
2984 }
2985
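/* Looks up 'key' in the dpcls of its input port. Returns the matching
 * megaflow or NULL. If 'lookup_num_p' is nonnull, the number of subtable
 * lookups performed is reported through it. */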
2986 static struct dp_netdev_flow *
2987 dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
2988 const struct netdev_flow_key *key,
2989 int *lookup_num_p)
2990 {
2991 struct dpcls *cls;
2992 struct dpcls_rule *rule;
2993 odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
2994 in_port.odp_port));
2995 struct dp_netdev_flow *netdev_flow = NULL;
2996
2997 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2998 if (OVS_LIKELY(cls)) {
2999 dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
3000 netdev_flow = dp_netdev_flow_cast(rule);
3001 }
3002 return netdev_flow;
3003 }
3004
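/* Finds a flow in the pmd's flow table by UFID. If no UFID is provided, one
 * is derived from the netlink flow key. Returns NULL if no flow matches. */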
3005 static struct dp_netdev_flow *
3006 dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
3007 const ovs_u128 *ufidp, const struct nlattr *key,
3008 size_t key_len)
3009 {
3010 struct dp_netdev_flow *netdev_flow;
3011 struct flow flow;
3012 ovs_u128 ufid;
3013
3014 /* If a UFID is not provided, determine one based on the key. */
3015 if (!ufidp && key && key_len
3016 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
3017 odp_flow_key_hash(&flow, sizeof flow, &ufid);
3018 ufidp = &ufid;
3019 }
3020
3021 if (ufidp) {
3022 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
3023 &pmd->flow_table) {
3024 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
3025 return netdev_flow;
3026 }
3027 }
3028 }
3029
3030 return NULL;
3031 }
3032
3033 static bool
3034 dpif_netdev_get_flow_offload_status(const struct dp_netdev *dp,
3035 const struct dp_netdev_flow *netdev_flow,
3036 struct dpif_flow_stats *stats,
3037 struct dpif_flow_attrs *attrs)
3038 {
3039 uint64_t act_buf[1024 / 8];
3040 struct nlattr *actions;
3041 struct netdev *netdev;
3042 struct match match;
3043 struct ofpbuf buf;
3044
3045 int ret = 0;
3046
3047 if (!netdev_is_flow_api_enabled()) {
3048 return false;
3049 }
3050
3051 netdev = netdev_ports_get(netdev_flow->flow.in_port.odp_port, dp->class);
3052 if (!netdev) {
3053 return false;
3054 }
3055 ofpbuf_use_stack(&buf, &act_buf, sizeof act_buf);
3056 /* Taking a global 'port_mutex' to fulfill thread safety
3057 * restrictions for the netdev-offload-dpdk module. */
3058 ovs_mutex_lock(&dp->port_mutex);
3059 ret = netdev_flow_get(netdev, &match, &actions, &netdev_flow->mega_ufid,
3060 stats, attrs, &buf);
3061 ovs_mutex_unlock(&dp->port_mutex);
3062 netdev_close(netdev);
3063 if (ret) {
3064 return false;
3065 }
3066
3067 return true;
3068 }
3069
3070 static void
3071 get_dpif_flow_status(const struct dp_netdev *dp,
3072 const struct dp_netdev_flow *netdev_flow_,
3073 struct dpif_flow_stats *stats,
3074 struct dpif_flow_attrs *attrs)
3075 {
3076 struct dpif_flow_stats offload_stats;
3077 struct dpif_flow_attrs offload_attrs;
3078 struct dp_netdev_flow *netdev_flow;
3079 unsigned long long n;
3080 long long used;
3081 uint16_t flags;
3082
3083 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
3084
3085 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
3086 stats->n_packets = n;
3087 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
3088 stats->n_bytes = n;
3089 atomic_read_relaxed(&netdev_flow->stats.used, &used);
3090 stats->used = used;
3091 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3092 stats->tcp_flags = flags;
3093
3094 if (dpif_netdev_get_flow_offload_status(dp, netdev_flow,
3095 &offload_stats, &offload_attrs)) {
3096 stats->n_packets += offload_stats.n_packets;
3097 stats->n_bytes += offload_stats.n_bytes;
3098 stats->used = MAX(stats->used, offload_stats.used);
3099 stats->tcp_flags |= offload_stats.tcp_flags;
3100 if (attrs) {
3101 attrs->offloaded = offload_attrs.offloaded;
3102 attrs->dp_layer = offload_attrs.dp_layer;
3103 }
3104 } else if (attrs) {
3105 attrs->offloaded = false;
3106 attrs->dp_layer = "ovs";
3107 }
3108 }
3109
3110 /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
3111 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
3112 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
3113 * protect them. */
3114 static void
3115 dp_netdev_flow_to_dpif_flow(const struct dp_netdev *dp,
3116 const struct dp_netdev_flow *netdev_flow,
3117 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
3118 struct dpif_flow *flow, bool terse)
3119 {
3120 if (terse) {
3121 memset(flow, 0, sizeof *flow);
3122 } else {
3123 struct flow_wildcards wc;
3124 struct dp_netdev_actions *actions;
3125 size_t offset;
3126 struct odp_flow_key_parms odp_parms = {
3127 .flow = &netdev_flow->flow,
3128 .mask = &wc.masks,
3129 .support = dp_netdev_support,
3130 };
3131
3132 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
3133 /* in_port is exact matched, but we have left it out of the mask for
3134 * optimization reasons. Add in_port back to the mask. */
3135 wc.masks.in_port.odp_port = ODPP_NONE;
3136
3137 /* Key */
3138 offset = key_buf->size;
3139 flow->key = ofpbuf_tail(key_buf);
3140 odp_flow_key_from_flow(&odp_parms, key_buf);
3141 flow->key_len = key_buf->size - offset;
3142
3143 /* Mask */
3144 offset = mask_buf->size;
3145 flow->mask = ofpbuf_tail(mask_buf);
3146 odp_parms.key_buf = key_buf;
3147 odp_flow_key_from_mask(&odp_parms, mask_buf);
3148 flow->mask_len = mask_buf->size - offset;
3149
3150 /* Actions */
3151 actions = dp_netdev_flow_get_actions(netdev_flow);
3152 flow->actions = actions->actions;
3153 flow->actions_len = actions->size;
3154 }
3155
3156 flow->ufid = netdev_flow->ufid;
3157 flow->ufid_present = true;
3158 flow->pmd_id = netdev_flow->pmd_id;
3159
3160 get_dpif_flow_status(dp, netdev_flow, &flow->stats, &flow->attrs);
3161 }
3162
3163 static int
3164 dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3165 const struct nlattr *mask_key,
3166 uint32_t mask_key_len, const struct flow *flow,
3167 struct flow_wildcards *wc, bool probe)
3168 {
3169 enum odp_key_fitness fitness;
3170
3171 fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL);
3172 if (fitness) {
3173 if (!probe) {
3174 /* This should not happen: it indicates that
3175 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
3176 * disagree on the acceptable form of a mask. Log the problem
3177 * as an error, with enough details to enable debugging. */
3178 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3179
3180 if (!VLOG_DROP_ERR(&rl)) {
3181 struct ds s;
3182
3183 ds_init(&s);
3184 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
3185 true);
3186 VLOG_ERR("internal error parsing flow mask %s (%s)",
3187 ds_cstr(&s), odp_key_fitness_to_string(fitness));
3188 ds_destroy(&s);
3189 }
3190 }
3191
3192 return EINVAL;
3193 }
3194
3195 return 0;
3196 }
3197
3198 static int
3199 dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3200 struct flow *flow, bool probe)
3201 {
3202 if (odp_flow_key_to_flow(key, key_len, flow, NULL)) {
3203 if (!probe) {
3204 /* This should not happen: it indicates that
3205 * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
3206 * the acceptable form of a flow. Log the problem as an error,
3207 * with enough details to enable debugging. */
3208 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3209
3210 if (!VLOG_DROP_ERR(&rl)) {
3211 struct ds s;
3212
3213 ds_init(&s);
3214 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
3215 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
3216 ds_destroy(&s);
3217 }
3218 }
3219
3220 return EINVAL;
3221 }
3222
3223 if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
3224 return EINVAL;
3225 }
3226
3227 return 0;
3228 }
3229
3230 static int
3231 dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
3232 {
3233 struct dp_netdev *dp = get_dp_netdev(dpif);
3234 struct dp_netdev_flow *netdev_flow;
3235 struct dp_netdev_pmd_thread *pmd;
3236 struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
3237 struct hmapx_node *node;
3238 int error = EINVAL;
3239
3240 if (get->pmd_id == PMD_ID_NULL) {
3241 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3242 if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
3243 dp_netdev_pmd_unref(pmd);
3244 }
3245 }
3246 } else {
3247 pmd = dp_netdev_get_pmd(dp, get->pmd_id);
3248 if (!pmd) {
3249 goto out;
3250 }
3251 hmapx_add(&to_find, pmd);
3252 }
3253
3254 if (!hmapx_count(&to_find)) {
3255 goto out;
3256 }
3257
3258 HMAPX_FOR_EACH (node, &to_find) {
3259 pmd = (struct dp_netdev_pmd_thread *) node->data;
3260 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
3261 get->key_len);
3262 if (netdev_flow) {
3263 dp_netdev_flow_to_dpif_flow(dp, netdev_flow, get->buffer,
3264 get->buffer, get->flow, false);
3265 error = 0;
3266 break;
3267 } else {
3268 error = ENOENT;
3269 }
3270 }
3271
3272 HMAPX_FOR_EACH (node, &to_find) {
3273 pmd = (struct dp_netdev_pmd_thread *) node->data;
3274 dp_netdev_pmd_unref(pmd);
3275 }
3276 out:
3277 hmapx_destroy(&to_find);
3278 return error;
3279 }
3280
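/* Computes the megaflow UFID for 'match' by hashing the flow masked with its
 * wildcards, so that all flows covered by the same megaflow share the same
 * UFID. */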
3281 static void
3282 dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
3283 {
3284 struct flow masked_flow;
3285 size_t i;
3286
3287 for (i = 0; i < sizeof(struct flow); i++) {
3288 ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
3289 ((uint8_t *)&match->wc)[i];
3290 }
3291 odp_flow_key_hash(&masked_flow, sizeof masked_flow, mega_ufid);
3292 }
3293
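/* Allocates a new dp_netdev_flow for 'match' with the given 'ufid' and
 * 'actions', inserts it into the per-port dpcls and the pmd's flow table, and
 * queues a hardware offload request for it. 'match' must have in_port as an
 * exact match. */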
3294 static struct dp_netdev_flow *
3295 dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
3296 struct match *match, const ovs_u128 *ufid,
3297 const struct nlattr *actions, size_t actions_len)
3298 OVS_REQUIRES(pmd->flow_mutex)
3299 {
3300 struct dp_netdev_flow *flow;
3301 struct netdev_flow_key mask;
3302 struct dpcls *cls;
3303
3304 /* Make sure in_port is exact matched before we read it. */
3305 ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
3306 odp_port_t in_port = match->flow.in_port.odp_port;
3307
3308 /* As we select the dpcls based on the port number, each netdev flow
3309 * belonging to the same dpcls will have the same odp_port value.
3310 * For performance reasons we wildcard odp_port here in the mask. In the
3311 * typical case dp_hash is also wildcarded, and the resulting 8-byte
3312 * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
3313 * will not be part of the subtable mask.
3314 * This will speed up the hash computation during dpcls_lookup() because
3315 * there is one less call to hash_add64() in this case. */
3316 match->wc.masks.in_port.odp_port = 0;
3317 netdev_flow_mask_init(&mask, match);
3318 match->wc.masks.in_port.odp_port = ODPP_NONE;
3319
3320 /* Make sure wc does not have metadata. */
3321 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
3322 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
3323
3324 /* Do not allocate extra space. */
3325 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
3326 memset(&flow->stats, 0, sizeof flow->stats);
3327 flow->dead = false;
3328 flow->batch = NULL;
3329 flow->mark = INVALID_FLOW_MARK;
3330 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
3331 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
3332 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
3333 ovs_refcount_init(&flow->ref_cnt);
3334 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
3335
3336 dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
3337 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
3338
3339 /* Select dpcls for in_port. Relies on in_port to be exact match. */
3340 cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
3341 dpcls_insert(cls, &flow->cr, &mask);
3342
3343 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
3344 dp_netdev_flow_hash(&flow->ufid));
3345
3346 queue_netdev_flow_put(pmd, flow, match, actions, actions_len);
3347
3348 if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
3349 struct ds ds = DS_EMPTY_INITIALIZER;
3350 struct ofpbuf key_buf, mask_buf;
3351 struct odp_flow_key_parms odp_parms = {
3352 .flow = &match->flow,
3353 .mask = &match->wc.masks,
3354 .support = dp_netdev_support,
3355 };
3356
3357 ofpbuf_init(&key_buf, 0);
3358 ofpbuf_init(&mask_buf, 0);
3359
3360 odp_flow_key_from_flow(&odp_parms, &key_buf);
3361 odp_parms.key_buf = &key_buf;
3362 odp_flow_key_from_mask(&odp_parms, &mask_buf);
3363
3364 ds_put_cstr(&ds, "flow_add: ");
3365 odp_format_ufid(ufid, &ds);
3366 ds_put_cstr(&ds, " ");
3367 odp_flow_format(key_buf.data, key_buf.size,
3368 mask_buf.data, mask_buf.size,
3369 NULL, &ds, false);
3370 ds_put_cstr(&ds, ", actions:");
3371 format_odp_actions(&ds, actions, actions_len, NULL);
3372
3373 VLOG_DBG("%s", ds_cstr(&ds));
3374
3375 ofpbuf_uninit(&key_buf);
3376 ofpbuf_uninit(&mask_buf);
3377
3378 /* Add a printout of the actual match installed. */
3379 struct match m;
3380 ds_clear(&ds);
3381 ds_put_cstr(&ds, "flow match: ");
3382 miniflow_expand(&flow->cr.flow.mf, &m.flow);
3383 miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
3384 memset(&m.tun_md, 0, sizeof m.tun_md);
3385 match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
3386
3387 VLOG_DBG("%s", ds_cstr(&ds));
3388
3389 ds_destroy(&ds);
3390 }
3391
3392 return flow;
3393 }
3394
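/* Performs a flow put on a single pmd thread: creates the flow if it does not
 * exist and DPIF_FP_CREATE is set, or replaces its actions if DPIF_FP_MODIFY
 * is set. Returns 0 on success or a positive errno value. */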
3395 static int
3396 flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
3397 struct netdev_flow_key *key,
3398 struct match *match,
3399 ovs_u128 *ufid,
3400 const struct dpif_flow_put *put,
3401 struct dpif_flow_stats *stats)
3402 {
3403 struct dp_netdev_flow *netdev_flow;
3404 int error = 0;
3405
3406 if (stats) {
3407 memset(stats, 0, sizeof *stats);
3408 }
3409
3410 ovs_mutex_lock(&pmd->flow_mutex);
3411 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
3412 if (!netdev_flow) {
3413 if (put->flags & DPIF_FP_CREATE) {
3414 if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
3415 dp_netdev_flow_add(pmd, match, ufid, put->actions,
3416 put->actions_len);
3417 error = 0;
3418 } else {
3419 error = EFBIG;
3420 }
3421 } else {
3422 error = ENOENT;
3423 }
3424 } else {
3425 if (put->flags & DPIF_FP_MODIFY) {
3426 struct dp_netdev_actions *new_actions;
3427 struct dp_netdev_actions *old_actions;
3428
3429 new_actions = dp_netdev_actions_create(put->actions,
3430 put->actions_len);
3431
3432 old_actions = dp_netdev_flow_get_actions(netdev_flow);
3433 ovsrcu_set(&netdev_flow->actions, new_actions);
3434
3435 queue_netdev_flow_put(pmd, netdev_flow, match,
3436 put->actions, put->actions_len);
3437
3438 if (stats) {
3439 get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
3440 }
3441 if (put->flags & DPIF_FP_ZERO_STATS) {
3442 /* XXX: The userspace datapath uses thread local statistics
3443 * (for flows), which should be updated only by the owning
3444 * thread. Since we cannot write to the stats memory here,
3445 * we choose not to support this flag. Please note:
3446 * - This feature is currently used only by dpctl commands with
3447 * option --clear.
3448 * - Should the need arise, this operation can be implemented
3449 * by keeping a base value (to be updated here) for each
3450 * counter, and subtracting it before outputting the stats. */
3451 error = EOPNOTSUPP;
3452 }
3453
3454 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
3455 } else if (put->flags & DPIF_FP_CREATE) {
3456 error = EEXIST;
3457 } else {
3458 /* Overlapping flow. */
3459 error = EINVAL;
3460 }
3461 }
3462 ovs_mutex_unlock(&pmd->flow_mutex);
3463 return error;
3464 }
3465
3466 static int
3467 dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
3468 {
3469 struct dp_netdev *dp = get_dp_netdev(dpif);
3470 struct netdev_flow_key key, mask;
3471 struct dp_netdev_pmd_thread *pmd;
3472 struct match match;
3473 ovs_u128 ufid;
3474 int error;
3475 bool probe = put->flags & DPIF_FP_PROBE;
3476
3477 if (put->stats) {
3478 memset(put->stats, 0, sizeof *put->stats);
3479 }
3480 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
3481 probe);
3482 if (error) {
3483 return error;
3484 }
3485 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
3486 put->mask, put->mask_len,
3487 &match.flow, &match.wc, probe);
3488 if (error) {
3489 return error;
3490 }
3491
3492 if (put->ufid) {
3493 ufid = *put->ufid;
3494 } else {
3495 odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
3496 }
3497
3498 /* The Netlink encoding of datapath flow keys cannot express
3499 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3500 * tag is interpreted as exact match on the fact that there is no
3501 * VLAN. Unless we refactor a lot of code that translates between
3502 * Netlink and struct flow representations, we have to do the same
3503 * here. This must be in sync with 'match' in handle_packet_upcall(). */
3504 if (!match.wc.masks.vlans[0].tci) {
3505 match.wc.masks.vlans[0].tci = htons(0xffff);
3506 }
3507
3508 /* Must produce a netdev_flow_key for lookup.
3509 * Use the same method as employed to create the key when adding
3510 * the flow to the dpcls to make sure they match. */
3511 netdev_flow_mask_init(&mask, &match);
3512 netdev_flow_key_init_masked(&key, &match.flow, &mask);
3513
3514 if (put->pmd_id == PMD_ID_NULL) {
3515 if (cmap_count(&dp->poll_threads) == 0) {
3516 return EINVAL;
3517 }
3518 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3519 struct dpif_flow_stats pmd_stats;
3520 int pmd_error;
3521
3522 pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
3523 &pmd_stats);
3524 if (pmd_error) {
3525 error = pmd_error;
3526 } else if (put->stats) {
3527 put->stats->n_packets += pmd_stats.n_packets;
3528 put->stats->n_bytes += pmd_stats.n_bytes;
3529 put->stats->used = MAX(put->stats->used, pmd_stats.used);
3530 put->stats->tcp_flags |= pmd_stats.tcp_flags;
3531 }
3532 }
3533 } else {
3534 pmd = dp_netdev_get_pmd(dp, put->pmd_id);
3535 if (!pmd) {
3536 return EINVAL;
3537 }
3538 error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
3539 dp_netdev_pmd_unref(pmd);
3540 }
3541
3542 return error;
3543 }
3544
3545 static int
3546 flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3547 struct dpif_flow_stats *stats,
3548 const struct dpif_flow_del *del)
3549 {
3550 struct dp_netdev_flow *netdev_flow;
3551 int error = 0;
3552
3553 ovs_mutex_lock(&pmd->flow_mutex);
3554 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3555 del->key_len);
3556 if (netdev_flow) {
3557 if (stats) {
3558 get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
3559 }
3560 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
3561 } else {
3562 error = ENOENT;
3563 }
3564 ovs_mutex_unlock(&pmd->flow_mutex);
3565
3566 return error;
3567 }
3568
3569 static int
3570 dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
3571 {
3572 struct dp_netdev *dp = get_dp_netdev(dpif);
3573 struct dp_netdev_pmd_thread *pmd;
3574 int error = 0;
3575
3576 if (del->stats) {
3577 memset(del->stats, 0, sizeof *del->stats);
3578 }
3579
3580 if (del->pmd_id == PMD_ID_NULL) {
3581 if (cmap_count(&dp->poll_threads) == 0) {
3582 return EINVAL;
3583 }
3584 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3585 struct dpif_flow_stats pmd_stats;
3586 int pmd_error;
3587
3588 pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
3589 if (pmd_error) {
3590 error = pmd_error;
3591 } else if (del->stats) {
3592 del->stats->n_packets += pmd_stats.n_packets;
3593 del->stats->n_bytes += pmd_stats.n_bytes;
3594 del->stats->used = MAX(del->stats->used, pmd_stats.used);
3595 del->stats->tcp_flags |= pmd_stats.tcp_flags;
3596 }
3597 }
3598 } else {
3599 pmd = dp_netdev_get_pmd(dp, del->pmd_id);
3600 if (!pmd) {
3601 return EINVAL;
3602 }
3603 error = flow_del_on_pmd(pmd, del->stats, del);
3604 dp_netdev_pmd_unref(pmd);
3605 }
3606
3607
3608 return error;
3609 }
3610
3611 struct dpif_netdev_flow_dump {
3612 struct dpif_flow_dump up;
3613 struct cmap_position poll_thread_pos;
3614 struct cmap_position flow_pos;
3615 struct dp_netdev_pmd_thread *cur_pmd;
3616 int status;
3617 struct ovs_mutex mutex;
3618 };
3619
3620 static struct dpif_netdev_flow_dump *
3621 dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
3622 {
3623 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
3624 }
3625
3626 static struct dpif_flow_dump *
3627 dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
3628 struct dpif_flow_dump_types *types OVS_UNUSED)
3629 {
3630 struct dpif_netdev_flow_dump *dump;
3631
3632 dump = xzalloc(sizeof *dump);
3633 dpif_flow_dump_init(&dump->up, dpif_);
3634 dump->up.terse = terse;
3635 ovs_mutex_init(&dump->mutex);
3636
3637 return &dump->up;
3638 }
3639
3640 static int
3641 dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
3642 {
3643 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3644
3645 ovs_mutex_destroy(&dump->mutex);
3646 free(dump);
3647 return 0;
3648 }
3649
3650 struct dpif_netdev_flow_dump_thread {
3651 struct dpif_flow_dump_thread up;
3652 struct dpif_netdev_flow_dump *dump;
3653 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
3654 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
3655 };
3656
3657 static struct dpif_netdev_flow_dump_thread *
3658 dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
3659 {
3660 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
3661 }
3662
3663 static struct dpif_flow_dump_thread *
3664 dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
3665 {
3666 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3667 struct dpif_netdev_flow_dump_thread *thread;
3668
3669 thread = xmalloc(sizeof *thread);
3670 dpif_flow_dump_thread_init(&thread->up, &dump->up);
3671 thread->dump = dump;
3672 return &thread->up;
3673 }
3674
3675 static void
3676 dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
3677 {
3678 struct dpif_netdev_flow_dump_thread *thread
3679 = dpif_netdev_flow_dump_thread_cast(thread_);
3680
3681 free(thread);
3682 }
3683
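/* Fills 'flows' with up to 'max_flows' flows, walking the flow tables of the
 * pmd threads one at a time.  The shared iteration position lives in 'dump'
 * and is protected by its mutex.  Returns the number of flows stored. */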
3684 static int
3685 dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
3686 struct dpif_flow *flows, int max_flows)
3687 {
3688 struct dpif_netdev_flow_dump_thread *thread
3689 = dpif_netdev_flow_dump_thread_cast(thread_);
3690 struct dpif_netdev_flow_dump *dump = thread->dump;
3691 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
3692 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
3693 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
3694 int n_flows = 0;
3695 int i;
3696
3697 ovs_mutex_lock(&dump->mutex);
3698 if (!dump->status) {
3699 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
3700 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
3701
3702 /* The first call to dump_next() extracts the first pmd thread.
3703 * If there is no pmd thread, returns immediately. */
3704 if (!pmd) {
3705 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3706 if (!pmd) {
3707 ovs_mutex_unlock(&dump->mutex);
3708 return n_flows;
3709
3710 }
3711 }
3712
3713 do {
3714 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
3715 struct cmap_node *node;
3716
3717 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
3718 if (!node) {
3719 break;
3720 }
3721 netdev_flows[n_flows] = CONTAINER_OF(node,
3722 struct dp_netdev_flow,
3723 node);
3724 }
3725 /* When done dumping the current pmd thread, moves on to
3726 * the next one. */
3727 if (n_flows < flow_limit) {
3728 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
3729 dp_netdev_pmd_unref(pmd);
3730 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3731 if (!pmd) {
3732 dump->status = EOF;
3733 break;
3734 }
3735 }
3736 /* Keeps the reference for the next call. */
3737 dump->cur_pmd = pmd;
3738
3739 /* If the current dump is empty, do not exit the loop, since the
3740 * remaining pmds could have flows to be dumped. Just dumps again
3741 * on the new 'pmd'. */
3742 } while (!n_flows);
3743 }
3744 ovs_mutex_unlock(&dump->mutex);
3745
3746 for (i = 0; i < n_flows; i++) {
3747 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
3748 struct odputil_keybuf *keybuf = &thread->keybuf[i];
3749 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
3750 struct dpif_flow *f = &flows[i];
3751 struct ofpbuf key, mask;
3752
3753 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
3754 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
3755 dp_netdev_flow_to_dpif_flow(dp, netdev_flow, &key, &mask, f,
3756 dump->up.terse);
3757 }
3758
3759 return n_flows;
3760 }
3761
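/* Executes 'execute->actions' on 'execute->packet'.  If the caller is not a
 * pmd thread, the actions are run in the context of the non-pmd thread under
 * 'non_pmd_mutex'. */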
3762 static int
3763 dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
3764 OVS_NO_THREAD_SAFETY_ANALYSIS
3765 {
3766 struct dp_netdev *dp = get_dp_netdev(dpif);
3767 struct dp_netdev_pmd_thread *pmd;
3768 struct dp_packet_batch pp;
3769
3770 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
3771 dp_packet_size(execute->packet) > UINT16_MAX) {
3772 return EINVAL;
3773 }
3774
3775 /* Tries finding the 'pmd'. If NULL is returned, that means
3776 * the current thread is a non-pmd thread and should use
3777 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
3778 pmd = ovsthread_getspecific(dp->per_pmd_key);
3779 if (!pmd) {
3780 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
3781 if (!pmd) {
3782 return EBUSY;
3783 }
3784 }
3785
3786 if (execute->probe) {
3787 /* If this is part of a probe, drop the packet, since executing
3788 * the action may actually cause spurious packets to be sent into
3789 * the network. */
3790 if (pmd->core_id == NON_PMD_CORE_ID) {
3791 dp_netdev_pmd_unref(pmd);
3792 }
3793 return 0;
3794 }
3795
3796 /* If the current thread is a non-pmd thread, acquires
3797 * the 'non_pmd_mutex'. */
3798 if (pmd->core_id == NON_PMD_CORE_ID) {
3799 ovs_mutex_lock(&dp->non_pmd_mutex);
3800 }
3801
3802 /* Update current time in PMD context. We don't care about EMC insertion
3803 * probability, because we are on a slow path. */
3804 pmd_thread_ctx_time_update(pmd);
3805
3806 /* The action processing expects the RSS hash to be valid, because
3807 * it's always initialized at the beginning of datapath processing.
3808 * In this case, though, 'execute->packet' may not have gone through
3809 * the datapath at all, it may have been generated by the upper layer
3810 * (OpenFlow packet-out, BFD frame, ...). */
3811 if (!dp_packet_rss_valid(execute->packet)) {
3812 dp_packet_set_rss_hash(execute->packet,
3813 flow_hash_5tuple(execute->flow, 0));
3814 }
3815
3816 dp_packet_batch_init_packet(&pp, execute->packet);
3817 pp.do_not_steal = true;
3818 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
3819 execute->actions, execute->actions_len);
3820 dp_netdev_pmd_flush_output_packets(pmd, true);
3821
3822 if (pmd->core_id == NON_PMD_CORE_ID) {
3823 ovs_mutex_unlock(&dp->non_pmd_mutex);
3824 dp_netdev_pmd_unref(pmd);
3825 }
3826
3827 return 0;
3828 }
3829
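/* Executes a batch of flow put/del/get and packet execute operations,
 * storing the result of each operation in 'op->error'. */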
3830 static void
3831 dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
3832 enum dpif_offload_type offload_type OVS_UNUSED)
3833 {
3834 size_t i;
3835
3836 for (i = 0; i < n_ops; i++) {
3837 struct dpif_op *op = ops[i];
3838
3839 switch (op->type) {
3840 case DPIF_OP_FLOW_PUT:
3841 op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
3842 break;
3843
3844 case DPIF_OP_FLOW_DEL:
3845 op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
3846 break;
3847
3848 case DPIF_OP_EXECUTE:
3849 op->error = dpif_netdev_execute(dpif, &op->execute);
3850 break;
3851
3852 case DPIF_OP_FLOW_GET:
3853 op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
3854 break;
3855 }
3856 }
3857 }
3858
3859 /* Enable or disable PMD auto load balancing. */
3860 static void
3861 set_pmd_auto_lb(struct dp_netdev *dp)
3862 {
3863 unsigned int cnt = 0;
3864 struct dp_netdev_pmd_thread *pmd;
3865 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
3866
3867 bool enable_alb = false;
3868 bool multi_rxq = false;
3869 bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
3870
3871 /* Ensure that there are at least 2 non-isolated PMDs and that
3872 * one of them is polling more than one rxq. */
3873 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3874 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
3875 continue;
3876 }
3877
3878 if (hmap_count(&pmd->poll_list) > 1) {
3879 multi_rxq = true;
3880 }
3881 if (cnt && multi_rxq) {
3882 enable_alb = true;
3883 break;
3884 }
3885 cnt++;
3886 }
3887
3888 /* Enable auto LB if it is requested and cycle-based assignment is enabled. */
3889 enable_alb = enable_alb && pmd_rxq_assign_cyc &&
3890 pmd_alb->auto_lb_requested;
3891
3892 if (pmd_alb->is_enabled != enable_alb) {
3893 pmd_alb->is_enabled = enable_alb;
3894 if (pmd_alb->is_enabled) {
3895 VLOG_INFO("PMD auto load balance is enabled "
3896 "(with rebalance interval:%"PRIu64" msec)",
3897 pmd_alb->rebalance_intvl);
3898 } else {
3899 pmd_alb->rebalance_poll_timer = 0;
3900 VLOG_INFO("PMD auto load balance is disabled");
3901 }
3902 }
3903
3904 }
3905
3906 /* Applies datapath configuration from the database. Some of the changes are
3907 * actually applied in dpif_netdev_run(). */
3908 static int
3909 dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
3910 {
3911 struct dp_netdev *dp = get_dp_netdev(dpif);
3912 const char *cmask = smap_get(other_config, "pmd-cpu-mask");
3913 const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
3914 "cycles");
3915 unsigned long long insert_prob =
3916 smap_get_ullong(other_config, "emc-insert-inv-prob",
3917 DEFAULT_EM_FLOW_INSERT_INV_PROB);
3918 uint32_t insert_min, cur_min;
3919 uint32_t tx_flush_interval, cur_tx_flush_interval;
3920 uint64_t rebalance_intvl;
3921
3922 tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
3923 DEFAULT_TX_FLUSH_INTERVAL);
3924 atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
3925 if (tx_flush_interval != cur_tx_flush_interval) {
3926 atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
3927 VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
3928 tx_flush_interval);
3929 }
3930
3931 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
3932 free(dp->pmd_cmask);
3933 dp->pmd_cmask = nullable_xstrdup(cmask);
3934 dp_netdev_request_reconfigure(dp);
3935 }
3936
3937 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
3938 if (insert_prob <= UINT32_MAX) {
3939 insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
3940 } else {
3941 insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
3942 insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
3943 }
3944
3945 if (insert_min != cur_min) {
3946 atomic_store_relaxed(&dp->emc_insert_min, insert_min);
3947 if (insert_min == 0) {
3948 VLOG_INFO("EMC insertion probability changed to zero");
3949 } else {
3950 VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
3951 insert_prob, (100 / (float)insert_prob));
3952 }
3953 }
3954
3955 bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
3956 bool cur_perf_enabled;
3957 atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
3958 if (perf_enabled != cur_perf_enabled) {
3959 atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
3960 if (perf_enabled) {
3961 VLOG_INFO("PMD performance metrics collection enabled");
3962 } else {
3963 VLOG_INFO("PMD performance metrics collection disabled");
3964 }
3965 }
3966
3967 bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
3968 bool cur_smc;
3969 atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
3970 if (smc_enable != cur_smc) {
3971 atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
3972 if (smc_enable) {
3973 VLOG_INFO("SMC cache is enabled");
3974 } else {
3975 VLOG_INFO("SMC cache is disabled");
3976 }
3977 }
3978
3979 bool pmd_rxq_assign_cyc = !strcmp(pmd_rxq_assign, "cycles");
3980 if (!pmd_rxq_assign_cyc && strcmp(pmd_rxq_assign, "roundrobin")) {
3981 VLOG_WARN("Unsupported Rxq to PMD assignment mode in pmd-rxq-assign. "
3982 "Defaulting to 'cycles'.");
3983 pmd_rxq_assign_cyc = true;
3984 pmd_rxq_assign = "cycles";
3985 }
3986 if (dp->pmd_rxq_assign_cyc != pmd_rxq_assign_cyc) {
3987 dp->pmd_rxq_assign_cyc = pmd_rxq_assign_cyc;
3988 VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
3989 pmd_rxq_assign);
3990 dp_netdev_request_reconfigure(dp);
3991 }
3992
3993 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
3994 pmd_alb->auto_lb_requested = smap_get_bool(other_config, "pmd-auto-lb",
3995 false);
3996
3997 rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebal-interval",
3998 ALB_PMD_REBALANCE_POLL_INTERVAL);
3999
4000 /* Input is in min, convert it to msec. */
4001 rebalance_intvl =
4002 rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
4003
4004 if (pmd_alb->rebalance_intvl != rebalance_intvl) {
4005 pmd_alb->rebalance_intvl = rebalance_intvl;
4006 }
4007
4008 set_pmd_auto_lb(dp);
4009 return 0;
4010 }
4011
4012 /* Parses affinity list and returns result in 'core_ids'. */
4013 static int
4014 parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
4015 {
4016 unsigned i;
4017 char *list, *copy, *key, *value;
4018 int error = 0;
4019
4020 for (i = 0; i < n_rxq; i++) {
4021 core_ids[i] = OVS_CORE_UNSPEC;
4022 }
4023
4024 if (!affinity_list) {
4025 return 0;
4026 }
4027
4028 list = copy = xstrdup(affinity_list);
4029
4030 while (ofputil_parse_key_value(&list, &key, &value)) {
4031 int rxq_id, core_id;
4032
4033 if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
4034 || !str_to_int(value, 0, &core_id) || core_id < 0) {
4035 error = EINVAL;
4036 break;
4037 }
4038
4039 if (rxq_id < n_rxq) {
4040 core_ids[rxq_id] = core_id;
4041 }
4042 }
4043
4044 free(copy);
4045 return error;
4046 }
4047
4048 /* Parses 'affinity_list' and applies configuration if it is valid. */
4049 static int
4050 dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
4051 const char *affinity_list)
4052 {
4053 unsigned *core_ids, i;
4054 int error = 0;
4055
4056 core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
4057 if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
4058 error = EINVAL;
4059 goto exit;
4060 }
4061
4062 for (i = 0; i < port->n_rxq; i++) {
4063 port->rxqs[i].core_id = core_ids[i];
4064 }
4065
4066 exit:
4067 free(core_ids);
4068 return error;
4069 }
4070
4071 /* Returns 'true' if one of the 'port's RX queues exists in the 'poll_list'
4072 * of the given PMD thread. */
4073 static bool
4074 dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
4075 struct dp_netdev_port *port)
4076 OVS_EXCLUDED(pmd->port_mutex)
4077 {
4078 struct rxq_poll *poll;
4079 bool found = false;
4080
4081 ovs_mutex_lock(&pmd->port_mutex);
4082 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4083 if (port == poll->rxq->port) {
4084 found = true;
4085 break;
4086 }
4087 }
4088 ovs_mutex_unlock(&pmd->port_mutex);
4089 return found;
4090 }
4091
4092 /* Updates port configuration from the database. The changes are actually
4093 * applied in dpif_netdev_run(). */
4094 static int
4095 dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
4096 const struct smap *cfg)
4097 {
4098 struct dp_netdev *dp = get_dp_netdev(dpif);
4099 struct dp_netdev_port *port;
4100 int error = 0;
4101 const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
4102 bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
4103
4104 ovs_mutex_lock(&dp->port_mutex);
4105 error = get_port_by_number(dp, port_no, &port);
4106 if (error) {
4107 goto unlock;
4108 }
4109
4110 if (emc_enabled != port->emc_enabled) {
4111 struct dp_netdev_pmd_thread *pmd;
4112 struct ds ds = DS_EMPTY_INITIALIZER;
4113 uint32_t cur_min, insert_prob;
4114
4115 port->emc_enabled = emc_enabled;
4116 /* Mark for reload all the threads that poll this port and request
4117 * a reconfiguration so that the threads are actually reloaded. */
4118 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4119 if (dpif_netdev_pmd_polls_port(pmd, port)) {
4120 pmd->need_reload = true;
4121 }
4122 }
4123 dp_netdev_request_reconfigure(dp);
4124
4125 ds_put_format(&ds, "%s: EMC has been %s.",
4126 netdev_get_name(port->netdev),
4127 (emc_enabled) ? "enabled" : "disabled");
4128 if (emc_enabled) {
4129 ds_put_cstr(&ds, " Current insertion probability is ");
4130 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4131 if (!cur_min) {
4132 ds_put_cstr(&ds, "zero.");
4133 } else {
4134 insert_prob = UINT32_MAX / cur_min;
4135 ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
4136 insert_prob, 100 / (float) insert_prob);
4137 }
4138 }
4139 VLOG_INFO("%s", ds_cstr(&ds));
4140 ds_destroy(&ds);
4141 }
4142
4143 /* Check for Rxq affinity changes. */
4144 if (!netdev_is_pmd(port->netdev)
4145 || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
4146 goto unlock;
4147 }
4148
4149 error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
4150 if (error) {
4151 goto unlock;
4152 }
4153 free(port->rxq_affinity_list);
4154 port->rxq_affinity_list = nullable_xstrdup(affinity_list);
4155
4156 dp_netdev_request_reconfigure(dp);
4157 unlock:
4158 ovs_mutex_unlock(&dp->port_mutex);
4159 return error;
4160 }
4161
4162 static int
4163 dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
4164 uint32_t queue_id, uint32_t *priority)
4165 {
4166 *priority = queue_id;
4167 return 0;
4168 }
4169
4170 \f
4171 /* Creates and returns a new 'struct dp_netdev_actions', whose actions are
4172 * a copy of the 'size' bytes of the 'actions' input parameter. */
4173 struct dp_netdev_actions *
4174 dp_netdev_actions_create(const struct nlattr *actions, size_t size)
4175 {
4176 struct dp_netdev_actions *netdev_actions;
4177
4178 netdev_actions = xmalloc(sizeof *netdev_actions + size);
4179 memcpy(netdev_actions->actions, actions, size);
4180 netdev_actions->size = size;
4181
4182 return netdev_actions;
4183 }
4184
4185 struct dp_netdev_actions *
4186 dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
4187 {
4188 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
4189 }
4190
4191 static void
4192 dp_netdev_actions_free(struct dp_netdev_actions *actions)
4193 {
4194 free(actions);
4195 }
4196 \f
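/* Accessors for the per-rxq processing cycle counters and the circular
 * buffer of per-interval cycles used by the rxq scheduling code. */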
4197 static void
4198 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
4199 enum rxq_cycles_counter_type type,
4200 unsigned long long cycles)
4201 {
4202 atomic_store_relaxed(&rx->cycles[type], cycles);
4203 }
4204
4205 static void
4206 dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
4207 enum rxq_cycles_counter_type type,
4208 unsigned long long cycles)
4209 {
4210 non_atomic_ullong_add(&rx->cycles[type], cycles);
4211 }
4212
4213 static uint64_t
4214 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
4215 enum rxq_cycles_counter_type type)
4216 {
4217 unsigned long long processing_cycles;
4218 atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
4219 return processing_cycles;
4220 }
4221
4222 static void
4223 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
4224 unsigned long long cycles)
4225 {
4226 unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
4227 atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
4228 }
4229
4230 static uint64_t
4231 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
4232 {
4233 unsigned long long processing_cycles;
4234 atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
4235 return processing_cycles;
4236 }
4237
4238 #if ATOMIC_ALWAYS_LOCK_FREE_8B
4239 static inline bool
4240 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
4241 {
4242 bool pmd_perf_enabled;
4243 atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
4244 return pmd_perf_enabled;
4245 }
4246 #else
4247 /* If stores and reads of 64-bit integers are not atomic, the full PMD
4248 * performance metrics are not available as locked access to 64 bit
4249 * integers would be prohibitively expensive. */
4250 static inline bool
4251 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
4252 {
4253 return false;
4254 }
4255 #endif
4256
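/* Sends the packets queued on tx port 'p', charges the spent cycles back to
 * the rx queues that produced them and returns the number of packets sent. */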
4257 static int
4258 dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
4259 struct tx_port *p)
4260 {
4261 int i;
4262 int tx_qid;
4263 int output_cnt;
4264 bool dynamic_txqs;
4265 struct cycle_timer timer;
4266 uint64_t cycles;
4267 uint32_t tx_flush_interval;
4268
4269 cycle_timer_start(&pmd->perf_stats, &timer);
4270
4271 dynamic_txqs = p->port->dynamic_txqs;
4272 if (dynamic_txqs) {
4273 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
4274 } else {
4275 tx_qid = pmd->static_tx_qid;
4276 }
4277
4278 output_cnt = dp_packet_batch_size(&p->output_pkts);
4279 ovs_assert(output_cnt > 0);
4280
4281 netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
4282 dp_packet_batch_init(&p->output_pkts);
4283
4284 /* Update time of the next flush. */
4285 atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
4286 p->flush_time = pmd->ctx.now + tx_flush_interval;
4287
4288 ovs_assert(pmd->n_output_batches > 0);
4289 pmd->n_output_batches--;
4290
4291 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
4292 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
4293
4294 /* Distribute send cycles evenly among transmitted packets and assign to
4295 * their respective rx queues. */
4296 cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
4297 for (i = 0; i < output_cnt; i++) {
4298 if (p->output_pkts_rxqs[i]) {
4299 dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
4300 RXQ_CYCLES_PROC_CURR, cycles);
4301 }
4302 }
4303
4304 return output_cnt;
4305 }
4306
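/* Flushes the queued output packets on all tx ports cached by 'pmd'.  If
 * 'force' is false, only ports whose flush time has passed are flushed.
 * Returns the number of packets sent. */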
4307 static int
4308 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
4309 bool force)
4310 {
4311 struct tx_port *p;
4312 int output_cnt = 0;
4313
4314 if (!pmd->n_output_batches) {
4315 return 0;
4316 }
4317
4318 HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
4319 if (!dp_packet_batch_is_empty(&p->output_pkts)
4320 && (force || pmd->ctx.now >= p->flush_time)) {
4321 output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
4322 }
4323 }
4324 return output_cnt;
4325 }
4326
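/* Receives one batch from 'rxq', processes it and accounts the polling and
 * processing cycles to the queue.  Returns the number of packets received,
 * or zero if nothing was received. */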
4327 static int
4328 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
4329 struct dp_netdev_rxq *rxq,
4330 odp_port_t port_no)
4331 {
4332 struct pmd_perf_stats *s = &pmd->perf_stats;
4333 struct dp_packet_batch batch;
4334 struct cycle_timer timer;
4335 int error;
4336 int batch_cnt = 0;
4337 int rem_qlen = 0, *qlen_p = NULL;
4338 uint64_t cycles;
4339
4340 /* Measure duration for polling and processing rx burst. */
4341 cycle_timer_start(&pmd->perf_stats, &timer);
4342
4343 pmd->ctx.last_rxq = rxq;
4344 dp_packet_batch_init(&batch);
4345
4346 /* Fetch the rx queue length only for vhostuser ports. */
4347 if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
4348 qlen_p = &rem_qlen;
4349 }
4350
4351 error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
4352 if (!error) {
4353 /* At least one packet received. */
4354 *recirc_depth_get() = 0;
4355 pmd_thread_ctx_time_update(pmd);
4356 batch_cnt = dp_packet_batch_size(&batch);
4357 if (pmd_perf_metrics_enabled(pmd)) {
4358 /* Update batch histogram. */
4359 s->current.batches++;
4360 histogram_add_sample(&s->pkts_per_batch, batch_cnt);
4361 /* Update the maximum vhost rx queue fill level. */
4362 if (rxq->is_vhost && rem_qlen >= 0) {
4363 uint32_t qfill = batch_cnt + rem_qlen;
4364 if (qfill > s->current.max_vhost_qfill) {
4365 s->current.max_vhost_qfill = qfill;
4366 }
4367 }
4368 }
4369 /* Process packet batch. */
4370 dp_netdev_input(pmd, &batch, port_no);
4371
4372 /* Assign processing cycles to rx queue. */
4373 cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
4374 dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
4375
4376 dp_netdev_pmd_flush_output_packets(pmd, false);
4377 } else {
4378 /* Discard cycles. */
4379 cycle_timer_stop(&pmd->perf_stats, &timer);
4380 if (error != EAGAIN && error != EOPNOTSUPP) {
4381 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
4382
4383 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
4384 netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
4385 }
4386 }
4387
4388 pmd->ctx.last_rxq = NULL;
4389
4390 return batch_cnt;
4391 }
4392
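/* Returns the tx_port with the given 'port_no' in 'hmap', or NULL if it is
 * not present. */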
4393 static struct tx_port *
4394 tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
4395 {
4396 struct tx_port *tx;
4397
4398 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
4399 if (tx->port->port_no == port_no) {
4400 return tx;
4401 }
4402 }
4403
4404 return NULL;
4405 }
4406
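/* Closes the rx queues of 'port', applies any pending netdev configuration
 * and reopens the queues.  Returns 0 on success or a positive errno value. */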
4407 static int
4408 port_reconfigure(struct dp_netdev_port *port)
4409 {
4410 struct netdev *netdev = port->netdev;
4411 int i, err;
4412
4413 /* Closes the existing 'rxq's. */
4414 for (i = 0; i < port->n_rxq; i++) {
4415 netdev_rxq_close(port->rxqs[i].rx);
4416 port->rxqs[i].rx = NULL;
4417 }
4418 unsigned last_nrxq = port->n_rxq;
4419 port->n_rxq = 0;
4420
4421 /* Allows 'netdev' to apply the pending configuration changes. */
4422 if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
4423 err = netdev_reconfigure(netdev);
4424 if (err && (err != EOPNOTSUPP)) {
4425 VLOG_ERR("Failed to set interface %s new configuration",
4426 netdev_get_name(netdev));
4427 return err;
4428 }
4429 }
4430 /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
4431 port->rxqs = xrealloc(port->rxqs,
4432 sizeof *port->rxqs * netdev_n_rxq(netdev));
4433 /* Realloc 'used' counters for tx queues. */
4434 free(port->txq_used);
4435 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
4436
4437 for (i = 0; i < netdev_n_rxq(netdev); i++) {
4438 bool new_queue = i >= last_nrxq;
4439 if (new_queue) {
4440 memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
4441 }
4442
4443 port->rxqs[i].port = port;
4444 port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
4445
4446 err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
4447 if (err) {
4448 return err;
4449 }
4450 port->n_rxq++;
4451 }
4452
4453 /* Parse affinity list to apply configuration for new queues. */
4454 dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
4455
4456 /* If reconfiguration was successful, mark it as such, so we can use it. */
4457 port->need_reconfigure = false;
4458
4459 return 0;
4460 }
4461
4462 struct rr_numa_list {
4463 struct hmap numas; /* Contains 'struct rr_numa' */
4464 };
4465
4466 struct rr_numa {
4467 struct hmap_node node;
4468
4469 int numa_id;
4470
4471 /* Non-isolated pmds on numa node 'numa_id'. */
4472 struct dp_netdev_pmd_thread **pmds;
4473 int n_pmds;
4474
4475 int cur_index;
4476 bool idx_inc;
4477 };
4478
4479 static struct rr_numa *
4480 rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
4481 {
4482 struct rr_numa *numa;
4483
4484 HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
4485 if (numa->numa_id == numa_id) {
4486 return numa;
4487 }
4488 }
4489
4490 return NULL;
4491 }
4492
4493 /* Returns the next node in the numa list following 'numa' in round-robin
4494 * fashion.  Returns the first node if 'numa' is a null pointer or the last
4495 * node in 'rr'.  Returns NULL if the 'rr' numa list is empty. */
4496 static struct rr_numa *
4497 rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
4498 {
4499 struct hmap_node *node = NULL;
4500
4501 if (numa) {
4502 node = hmap_next(&rr->numas, &numa->node);
4503 }
4504 if (!node) {
4505 node = hmap_first(&rr->numas);
4506 }
4507
4508 return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
4509 }
4510
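/* Populates 'rr' with the NUMA nodes that have at least one non-isolated
 * pmd thread, recording the pmds available on each node. */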
4511 static void
4512 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
4513 {
4514 struct dp_netdev_pmd_thread *pmd;
4515 struct rr_numa *numa;
4516
4517 hmap_init(&rr->numas);
4518
4519 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4520 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
4521 continue;
4522 }
4523
4524 numa = rr_numa_list_lookup(rr, pmd->numa_id);
4525 if (!numa) {
4526 numa = xzalloc(sizeof *numa);
4527 numa->numa_id = pmd->numa_id;
4528 hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
4529 }
4530 numa->n_pmds++;
4531 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
4532 numa->pmds[numa->n_pmds - 1] = pmd;
4533 /* At least one pmd, so initialise cur_index and idx_inc. */
4534 numa->cur_index = 0;
4535 numa->idx_inc = true;
4536 }
4537 }
4538
4539 /*
4540 * Returns the next pmd from the numa node.
4541 *
4542 * If 'updown' is 'true' it will alternate between selecting the next pmd in
4543 * either an up or down walk, switching between up/down when the first or last
4544 * core is reached. e.g. 1,2,3,3,2,1,1,2...
4545 *
4546 * If 'updown' is 'false' it will select the next pmd wrapping around when last
4547 * core reached. e.g. 1,2,3,1,2,3,1,2...
4548 */
4549 static struct dp_netdev_pmd_thread *
4550 rr_numa_get_pmd(struct rr_numa *numa, bool updown)
4551 {
4552 int numa_idx = numa->cur_index;
4553
4554 if (numa->idx_inc == true) {
4555 /* Incrementing through list of pmds. */
4556 if (numa->cur_index == numa->n_pmds-1) {
4557 /* Reached the last pmd. */
4558 if (updown) {
4559 numa->idx_inc = false;
4560 } else {
4561 numa->cur_index = 0;
4562 }
4563 } else {
4564 numa->cur_index++;
4565 }
4566 } else {
4567 /* Decrementing through list of pmds. */
4568 if (numa->cur_index == 0) {
4569 /* Reached the first pmd. */
4570 numa->idx_inc = true;
4571 } else {
4572 numa->cur_index--;
4573 }
4574 }
4575 return numa->pmds[numa_idx];
4576 }
4577
4578 static void
4579 rr_numa_list_destroy(struct rr_numa_list *rr)
4580 {
4581 struct rr_numa *numa;
4582
4583 HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
4584 free(numa->pmds);
4585 free(numa);
4586 }
4587 hmap_destroy(&rr->numas);
4588 }
4589
4590 /* Sort Rx Queues by the processing cycles they are consuming. */
4591 static int
4592 compare_rxq_cycles(const void *a, const void *b)
4593 {
4594 struct dp_netdev_rxq *qa;
4595 struct dp_netdev_rxq *qb;
4596 uint64_t cycles_qa, cycles_qb;
4597
4598 qa = *(struct dp_netdev_rxq **) a;
4599 qb = *(struct dp_netdev_rxq **) b;
4600
4601 cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
4602 cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
4603
4604 if (cycles_qa != cycles_qb) {
4605 return (cycles_qa < cycles_qb) ? 1 : -1;
4606 } else {
4607 /* Cycles are the same so tiebreak on port/queue id.
4608 * Tiebreaking (as opposed to return 0) ensures consistent
4609 * sort results across multiple OS's. */
4610 uint32_t port_qa = odp_to_u32(qa->port->port_no);
4611 uint32_t port_qb = odp_to_u32(qb->port->port_no);
4612 if (port_qa != port_qb) {
4613 return port_qa > port_qb ? 1 : -1;
4614 } else {
4615 return netdev_rxq_get_queue_id(qa->rx)
4616 - netdev_rxq_get_queue_id(qb->rx);
4617 }
4618 }
4619 }
4620
4621 /* Assigns pmds to queues.  If 'pinned' is true, assigns pmds to pinned
4622 * queues and marks the pmds as isolated.  Otherwise, assigns non-isolated
4623 * pmds to unpinned queues.
4624 *
4625 * The function doesn't touch the pmd threads, it just stores the assignment
4626 * in the 'pmd' member of each rxq. */
4627 static void
4628 rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
4629 {
4630 struct dp_netdev_port *port;
4631 struct rr_numa_list rr;
4632 struct rr_numa *non_local_numa = NULL;
4633 struct dp_netdev_rxq ** rxqs = NULL;
4634 int n_rxqs = 0;
4635 struct rr_numa *numa = NULL;
4636 int numa_id;
4637 bool assign_cyc = dp->pmd_rxq_assign_cyc;
4638
4639 HMAP_FOR_EACH (port, node, &dp->ports) {
4640 if (!netdev_is_pmd(port->netdev)) {
4641 continue;
4642 }
4643
4644 for (int qid = 0; qid < port->n_rxq; qid++) {
4645 struct dp_netdev_rxq *q = &port->rxqs[qid];
4646
4647 if (pinned && q->core_id != OVS_CORE_UNSPEC) {
4648 struct dp_netdev_pmd_thread *pmd;
4649
4650 pmd = dp_netdev_get_pmd(dp, q->core_id);
4651 if (!pmd) {
4652 VLOG_WARN("There is no PMD thread on core %d. Queue "
4653 "%d on port \'%s\' will not be polled.",
4654 q->core_id, qid, netdev_get_name(port->netdev));
4655 } else {
4656 q->pmd = pmd;
4657 pmd->isolated = true;
4658 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4659 "rx queue %d.", pmd->core_id, pmd->numa_id,
4660 netdev_rxq_get_name(q->rx),
4661 netdev_rxq_get_queue_id(q->rx));
4662 dp_netdev_pmd_unref(pmd);
4663 }
4664 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
4665 uint64_t cycle_hist = 0;
4666
4667 if (n_rxqs == 0) {
4668 rxqs = xmalloc(sizeof *rxqs);
4669 } else {
4670 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
4671 }
4672
4673 if (assign_cyc) {
4674 /* Sum the queue intervals and store the cycle history. */
4675 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
4676 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
4677 }
4678 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
4679 cycle_hist);
4680 }
4681 /* Store the queue. */
4682 rxqs[n_rxqs++] = q;
4683 }
4684 }
4685 }
4686
4687 if (n_rxqs > 1 && assign_cyc) {
4688 /* Sort the queues in order of the processing cycles
4689 * they consumed during their last pmd interval. */
4690 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
4691 }
4692
4693 rr_numa_list_populate(dp, &rr);
4694 /* Assign the sorted queues to pmds in round robin. */
4695 for (int i = 0; i < n_rxqs; i++) {
4696 numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
4697 numa = rr_numa_list_lookup(&rr, numa_id);
4698 if (!numa) {
4699 /* There are no pmds on the queue's local NUMA node.
4700 * Round robin on the NUMA nodes that do have pmds. */
4701 non_local_numa = rr_numa_list_next(&rr, non_local_numa);
4702 if (!non_local_numa) {
4703 VLOG_ERR("There is no available (non-isolated) pmd "
4704 "thread for port \'%s\' queue %d. This queue "
4705 "will not be polled. Is pmd-cpu-mask set to "
4706 "zero? Or are all PMDs isolated to other "
4707 "queues?", netdev_rxq_get_name(rxqs[i]->rx),
4708 netdev_rxq_get_queue_id(rxqs[i]->rx));
4709 continue;
4710 }
4711 rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa, assign_cyc);
4712 VLOG_WARN("There's no available (non-isolated) pmd thread "
4713 "on numa node %d. Queue %d on port \'%s\' will "
4714 "be assigned to the pmd on core %d "
4715 "(numa node %d). Expect reduced performance.",
4716 numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
4717 netdev_rxq_get_name(rxqs[i]->rx),
4718 rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
4719 } else {
4720 rxqs[i]->pmd = rr_numa_get_pmd(numa, assign_cyc);
4721 if (assign_cyc) {
4722 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4723 "rx queue %d "
4724 "(measured processing cycles %"PRIu64").",
4725 rxqs[i]->pmd->core_id, numa_id,
4726 netdev_rxq_get_name(rxqs[i]->rx),
4727 netdev_rxq_get_queue_id(rxqs[i]->rx),
4728 dp_netdev_rxq_get_cycles(rxqs[i],
4729 RXQ_CYCLES_PROC_HIST));
4730 } else {
4731 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4732 "rx queue %d.", rxqs[i]->pmd->core_id, numa_id,
4733 netdev_rxq_get_name(rxqs[i]->rx),
4734 netdev_rxq_get_queue_id(rxqs[i]->rx));
4735 }
4736 }
4737 }
4738
4739 rr_numa_list_destroy(&rr);
4740 free(rxqs);
4741 }
4742
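/* Reloads every pmd thread marked with 'need_reload', flushing its flow
 * marks first, and waits until each reload has completed. */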
4743 static void
4744 reload_affected_pmds(struct dp_netdev *dp)
4745 {
4746 struct dp_netdev_pmd_thread *pmd;
4747
4748 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4749 if (pmd->need_reload) {
4750 flow_mark_flush(pmd);
4751 dp_netdev_reload_pmd__(pmd);
4752 }
4753 }
4754
4755 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4756 if (pmd->need_reload) {
4757 if (pmd->core_id != NON_PMD_CORE_ID) {
4758 bool reload;
4759
4760 do {
4761 atomic_read_explicit(&pmd->reload, &reload,
4762 memory_order_acquire);
4763 } while (reload);
4764 }
4765 pmd->need_reload = false;
4766 }
4767 }
4768 }
4769
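/* Creates and destroys pmd threads so that the set of running threads
 * matches the configured pmd-cpu-mask and the cores available on the
 * system. */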
4770 static void
4771 reconfigure_pmd_threads(struct dp_netdev *dp)
4772 OVS_REQUIRES(dp->port_mutex)
4773 {
4774 struct dp_netdev_pmd_thread *pmd;
4775 struct ovs_numa_dump *pmd_cores;
4776 struct ovs_numa_info_core *core;
4777 struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
4778 struct hmapx_node *node;
4779 bool changed = false;
4780 bool need_to_adjust_static_tx_qids = false;
4781
4782 /* The pmd threads should be started only if there's a pmd port in the
4783 * datapath. If the user didn't provide any "pmd-cpu-mask", we start
4784 * NR_PMD_THREADS per numa node. */
4785 if (!has_pmd_port(dp)) {
4786 pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
4787 } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
4788 pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
4789 } else {
4790 pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
4791 }
4792
4793 /* We need to adjust 'static_tx_qid's only if we're reducing number of
4794 * PMD threads. Otherwise, new threads will allocate all the freed ids. */
4795 if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
4796 /* Adjustment is required to keep 'static_tx_qid's sequential and
4797 * avoid possible issues, for example, imbalanced tx queue usage
4798 * and unnecessary locking caused by remapping on netdev level. */
4799 need_to_adjust_static_tx_qids = true;
4800 }
4801
4802 /* Check for unwanted pmd threads */
4803 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4804 if (pmd->core_id == NON_PMD_CORE_ID) {
4805 continue;
4806 }
4807 if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
4808 pmd->core_id)) {
4809 hmapx_add(&to_delete, pmd);
4810 } else if (need_to_adjust_static_tx_qids) {
4811 atomic_store_relaxed(&pmd->reload_tx_qid, true);
4812 pmd->need_reload = true;
4813 }
4814 }
4815
4816 HMAPX_FOR_EACH (node, &to_delete) {
4817 pmd = (struct dp_netdev_pmd_thread *) node->data;
4818 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
4819 pmd->numa_id, pmd->core_id);
4820 dp_netdev_del_pmd(dp, pmd);
4821 }
4822 changed = !hmapx_is_empty(&to_delete);
4823 hmapx_destroy(&to_delete);
4824
4825 if (need_to_adjust_static_tx_qids) {
4826 /* 'static_tx_qid's are not sequential now.
4827 * Reload remaining threads to fix this. */
4828 reload_affected_pmds(dp);
4829 }
4830
4831 /* Check for required new pmd threads */
4832 FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
4833 pmd = dp_netdev_get_pmd(dp, core->core_id);
4834 if (!pmd) {
4835 struct ds name = DS_EMPTY_INITIALIZER;
4836
4837 pmd = xzalloc(sizeof *pmd);
4838 dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
4839
4840 ds_put_format(&name, "pmd-c%02d/id:", core->core_id);
4841 pmd->thread = ovs_thread_create(ds_cstr(&name),
4842 pmd_thread_main, pmd);
4843 ds_destroy(&name);
4844
4845 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
4846 pmd->numa_id, pmd->core_id);
4847 changed = true;
4848 } else {
4849 dp_netdev_pmd_unref(pmd);
4850 }
4851 }
4852
4853 if (changed) {
4854 struct ovs_numa_info_numa *numa;
4855
4856 /* Log the number of pmd threads per numa node. */
4857 FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
4858 VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
4859 numa->n_cores, numa->numa_id);
4860 }
4861 }
4862
4863 ovs_numa_dump_destroy(pmd_cores);
4864 }
4865
4866 static void
4867 pmd_remove_stale_ports(struct dp_netdev *dp,
4868 struct dp_netdev_pmd_thread *pmd)
4869 OVS_EXCLUDED(pmd->port_mutex)
4870 OVS_REQUIRES(dp->port_mutex)
4871 {
4872 struct rxq_poll *poll, *poll_next;
4873 struct tx_port *tx, *tx_next;
4874
4875 ovs_mutex_lock(&pmd->port_mutex);
4876 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
4877 struct dp_netdev_port *port = poll->rxq->port;
4878
4879 if (port->need_reconfigure
4880 || !hmap_contains(&dp->ports, &port->node)) {
4881 dp_netdev_del_rxq_from_pmd(pmd, poll);
4882 }
4883 }
4884 HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
4885 struct dp_netdev_port *port = tx->port;
4886
4887 if (port->need_reconfigure
4888 || !hmap_contains(&dp->ports, &port->node)) {
4889 dp_netdev_del_port_tx_from_pmd(pmd, tx);
4890 }
4891 }
4892 ovs_mutex_unlock(&pmd->port_mutex);
4893 }
4894
4895 /* Must be called each time a port is added/removed or the cmask changes.
4896 * This creates and destroys pmd threads, reconfigures ports, opens their
4897 * rxqs and assigns all rxqs/txqs to pmd threads. */
4898 static void
4899 reconfigure_datapath(struct dp_netdev *dp)
4900 OVS_REQUIRES(dp->port_mutex)
4901 {
4902 struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads);
4903 struct dp_netdev_pmd_thread *pmd;
4904 struct dp_netdev_port *port;
4905 int wanted_txqs;
4906
4907 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
4908
4909 /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
4910 * on the system and the user configuration. */
4911 reconfigure_pmd_threads(dp);
4912
4913 wanted_txqs = cmap_count(&dp->poll_threads);
4914
4915 /* The number of pmd threads might have changed, or a port can be new:
4916 * adjust the txqs. */
4917 HMAP_FOR_EACH (port, node, &dp->ports) {
4918 netdev_set_tx_multiq(port->netdev, wanted_txqs);
4919 }
4920
4921 /* Step 2: Remove from the pmd threads ports that have been removed or
4922 * need reconfiguration. */
4923
4924 /* Check for all the ports that need reconfiguration. We cache this in
4925 * 'port->need_reconfigure', because netdev_is_reconf_required() can
4926 * change at any time. */
4927 HMAP_FOR_EACH (port, node, &dp->ports) {
4928 if (netdev_is_reconf_required(port->netdev)) {
4929 port->need_reconfigure = true;
4930 }
4931 }
4932
4933 /* Remove from the pmd threads all the ports that have been deleted or
4934 * need reconfiguration. */
4935 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4936 pmd_remove_stale_ports(dp, pmd);
4937 }
4938
4939 /* Reload affected pmd threads. We must wait for the pmd threads before
4940 * reconfiguring the ports, because a port cannot be reconfigured while
4941 * it's being used. */
4942 reload_affected_pmds(dp);
4943
4944 /* Step 3: Reconfigure ports. */
4945
4946 /* We only reconfigure the ports that we determined above, because they're
4947 * not being used by any pmd thread at the moment. If a port fails to
4948 * reconfigure we remove it from the datapath. */
4949 struct dp_netdev_port *next_port;
4950 HMAP_FOR_EACH_SAFE (port, next_port, node, &dp->ports) {
4951 int err;
4952
4953 if (!port->need_reconfigure) {
4954 continue;
4955 }
4956
4957 err = port_reconfigure(port);
4958 if (err) {
4959 hmap_remove(&dp->ports, &port->node);
4960 seq_change(dp->port_seq);
4961 port_destroy(port);
4962 } else {
4963 port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
4964 }
4965 }
4966
4967 /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads
4968 * for now, we just update the 'pmd' pointer in each rxq to point to the
4969 * wanted thread according to the scheduling policy. */
4970
4971 /* Reset all the pmd threads to non isolated. */
4972 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4973 pmd->isolated = false;
4974 }
4975
4976 /* Reset all the queues to unassigned */
4977 HMAP_FOR_EACH (port, node, &dp->ports) {
4978 for (int i = 0; i < port->n_rxq; i++) {
4979 port->rxqs[i].pmd = NULL;
4980 }
4981 }
4982
4983 /* Add pinned queues and mark pmd threads isolated. */
4984 rxq_scheduling(dp, true);
4985
4986 /* Add non-pinned queues. */
4987 rxq_scheduling(dp, false);
4988
4989 /* Step 5: Remove queues not compliant with new scheduling. */
4990
4991 /* Count all the threads that will have at least one queue to poll. */
4992 HMAP_FOR_EACH (port, node, &dp->ports) {
4993 for (int qid = 0; qid < port->n_rxq; qid++) {
4994 struct dp_netdev_rxq *q = &port->rxqs[qid];
4995
4996 if (q->pmd) {
4997 hmapx_add(&busy_threads, q->pmd);
4998 }
4999 }
5000 }
5001
5002 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5003 struct rxq_poll *poll, *poll_next;
5004
5005 ovs_mutex_lock(&pmd->port_mutex);
5006 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
5007 if (poll->rxq->pmd != pmd) {
5008 dp_netdev_del_rxq_from_pmd(pmd, poll);
5009
5010 /* This pmd might sleep after this step if it has no rxq
5011 * remaining. Tell it to busy wait for new assignment if it
5012 * has at least one scheduled queue. */
5013 if (hmap_count(&pmd->poll_list) == 0 &&
5014 hmapx_contains(&busy_threads, pmd)) {
5015 atomic_store_relaxed(&pmd->wait_for_reload, true);
5016 }
5017 }
5018 }
5019 ovs_mutex_unlock(&pmd->port_mutex);
5020 }
5021
5022 hmapx_destroy(&busy_threads);
5023
5024 /* Reload affected pmd threads. We must wait for the pmd threads to remove
5025 * the old queues before readding them, otherwise a queue can be polled by
5026 * two threads at the same time. */
5027 reload_affected_pmds(dp);
5028
5029 /* Step 6: Add queues from scheduling, if they're not there already. */
5030 HMAP_FOR_EACH (port, node, &dp->ports) {
5031 if (!netdev_is_pmd(port->netdev)) {
5032 continue;
5033 }
5034
5035 for (int qid = 0; qid < port->n_rxq; qid++) {
5036 struct dp_netdev_rxq *q = &port->rxqs[qid];
5037
5038 if (q->pmd) {
5039 ovs_mutex_lock(&q->pmd->port_mutex);
5040 dp_netdev_add_rxq_to_pmd(q->pmd, q);
5041 ovs_mutex_unlock(&q->pmd->port_mutex);
5042 }
5043 }
5044 }
5045
5046 /* Add every port to the tx cache of every pmd thread, if it's not
5047 * there already and if this pmd has at least one rxq to poll. */
5048 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5049 ovs_mutex_lock(&pmd->port_mutex);
5050 if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
5051 HMAP_FOR_EACH (port, node, &dp->ports) {
5052 dp_netdev_add_port_tx_to_pmd(pmd, port);
5053 }
5054 }
5055 ovs_mutex_unlock(&pmd->port_mutex);
5056 }
5057
5058 /* Reload affected pmd threads. */
5059 reload_affected_pmds(dp);
5060
5061 /* Check if PMD Auto LB is to be enabled */
5062 set_pmd_auto_lb(dp);
5063 }
5064
5065 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
5066 static bool
5067 ports_require_restart(const struct dp_netdev *dp)
5068 OVS_REQUIRES(dp->port_mutex)
5069 {
5070 struct dp_netdev_port *port;
5071
5072 HMAP_FOR_EACH (port, node, &dp->ports) {
5073 if (netdev_is_reconf_required(port->netdev)) {
5074 return true;
5075 }
5076 }
5077
5078 return false;
5079 }
5080
5081 /* Calculates the variance of the values stored in array 'a'.  'n' is the
5082 * number of elements in the array to be considered when calculating the
5083 * variance.  Usage example: data array 'a' contains the processing load of
5084 * each pmd and 'n' is the number of PMDs.  It returns the variance in the
5085 * processing load of the PMDs. */
5086 static uint64_t
5087 variance(uint64_t a[], int n)
5088 {
5089 /* Compute mean (average of elements). */
5090 uint64_t sum = 0;
5091 uint64_t mean = 0;
5092 uint64_t sqDiff = 0;
5093
5094 if (!n) {
5095 return 0;
5096 }
5097
5098 for (int i = 0; i < n; i++) {
5099 sum += a[i];
5100 }
5101
5102 if (sum) {
5103 mean = sum / n;
5104
5105 /* Compute sum squared differences with mean. */
5106 for (int i = 0; i < n; i++) {
5107 sqDiff += (a[i] - mean)*(a[i] - mean);
5108 }
5109 }
5110 return (sqDiff ? (sqDiff / n) : 0);
5111 }
5112
5113
5114 /* Returns the variance in the PMDs' usage as part of a dry run of rxq
5115 * assignment to PMDs. */
5116 static bool
5117 get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
5118 uint32_t num_pmds, uint64_t *predicted_variance)
5119 OVS_REQUIRES(dp->port_mutex)
5120 {
5121 struct dp_netdev_port *port;
5122 struct dp_netdev_pmd_thread *pmd;
5123 struct dp_netdev_rxq **rxqs = NULL;
5124 struct rr_numa *numa = NULL;
5125 struct rr_numa_list rr;
5126 int n_rxqs = 0;
5127 bool ret = false;
5128 uint64_t *pmd_usage;
5129
5130 if (!predicted_variance) {
5131 return ret;
5132 }
5133
5134 pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5135
5136 HMAP_FOR_EACH (port, node, &dp->ports) {
5137 if (!netdev_is_pmd(port->netdev)) {
5138 continue;
5139 }
5140
5141 for (int qid = 0; qid < port->n_rxq; qid++) {
5142 struct dp_netdev_rxq *q = &port->rxqs[qid];
5143 uint64_t cycle_hist = 0;
5144
5145 if (q->pmd->isolated) {
5146 continue;
5147 }
5148
5149 if (n_rxqs == 0) {
5150 rxqs = xmalloc(sizeof *rxqs);
5151 } else {
5152 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
5153 }
5154
5155 /* Sum the queue intervals and store the cycle history. */
5156 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5157 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
5158 }
5159 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
5160 cycle_hist);
5161 /* Store the queue. */
5162 rxqs[n_rxqs++] = q;
5163 }
5164 }
5165 if (n_rxqs > 1) {
5166 /* Sort the queues in order of the processing cycles
5167 * they consumed during their last pmd interval. */
5168 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
5169 }
5170 rr_numa_list_populate(dp, &rr);
5171
5172 for (int i = 0; i < n_rxqs; i++) {
5173 int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
5174 numa = rr_numa_list_lookup(&rr, numa_id);
5175 if (!numa) {
5176 /* Abort if cross NUMA polling. */
5177 VLOG_DBG("PMD auto lb dry run."
5178 " Aborting due to cross-numa polling.");
5179 goto cleanup;
5180 }
5181
5182 pmd = rr_numa_get_pmd(numa, true);
5183 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d on numa node %d "
5184 "to be assigned port \'%s\' rx queue %d "
5185 "(measured processing cycles %"PRIu64").",
5186 pmd->core_id, numa_id,
5187 netdev_rxq_get_name(rxqs[i]->rx),
5188 netdev_rxq_get_queue_id(rxqs[i]->rx),
5189 dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
5190
5191 for (int id = 0; id < num_pmds; id++) {
5192 if (pmd->core_id == core_list[id]) {
5193 /* Add the processing cycles of rxq to pmd polling it. */
5194 pmd_usage[id] += dp_netdev_rxq_get_cycles(rxqs[i],
5195 RXQ_CYCLES_PROC_HIST);
5196 }
5197 }
5198 }
5199
5200 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5201 uint64_t total_cycles = 0;
5202
5203 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5204 continue;
5205 }
5206
5207 /* Get the total pmd cycles for an interval. */
5208 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5209 /* Estimate the cycles to cover all intervals. */
5210 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5211 for (int id = 0; id < num_pmds; id++) {
5212 if (pmd->core_id == core_list[id]) {
5213 if (pmd_usage[id]) {
5214 pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
5215 }
5216 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d, "
5217 "usage %"PRIu64"", pmd->core_id, pmd_usage[id]);
5218 }
5219 }
5220 }
5221 *predicted_variance = variance(pmd_usage, num_pmds);
5222 ret = true;
5223
5224 cleanup:
5225 rr_numa_list_destroy(&rr);
5226 free(rxqs);
5227 free(pmd_usage);
5228 return ret;
5229 }
5230
5231 /* Does a dry run of Rxq assignment to PMDs and returns true if it gives
5232 * a better distribution of load across the PMDs. */
5233 static bool
5234 pmd_rebalance_dry_run(struct dp_netdev *dp)
5235 OVS_REQUIRES(dp->port_mutex)
5236 {
5237 struct dp_netdev_pmd_thread *pmd;
5238 uint64_t *curr_pmd_usage;
5239
5240 uint64_t curr_variance;
5241 uint64_t new_variance;
5242 uint64_t improvement = 0;
5243 uint32_t num_pmds;
5244 uint32_t *pmd_corelist;
5245 struct rxq_poll *poll;
5246 bool ret;
5247
5248 num_pmds = cmap_count(&dp->poll_threads);
5249
5250 if (num_pmds > 1) {
5251 curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5252 pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
5253 } else {
5254 return false;
5255 }
5256
5257 num_pmds = 0;
5258 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5259 uint64_t total_cycles = 0;
5260 uint64_t total_proc = 0;
5261
5262 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5263 continue;
5264 }
5265
5266 /* Get the total pmd cycles for an interval. */
5267 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5268 /* Estimate the cycles to cover all intervals. */
5269 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5270
5271 ovs_mutex_lock(&pmd->port_mutex);
5272 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5273 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5274 total_proc += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
5275 }
5276 }
5277 ovs_mutex_unlock(&pmd->port_mutex);
5278
5279 if (total_proc) {
5280 curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
5281 }
5282
5283 VLOG_DBG("PMD auto lb dry run. Current: Core %d, usage %"PRIu64"",
5284 pmd->core_id, curr_pmd_usage[num_pmds]);
5285
5286 if (atomic_count_get(&pmd->pmd_overloaded)) {
5287 atomic_count_set(&pmd->pmd_overloaded, 0);
5288 }
5289
5290 pmd_corelist[num_pmds] = pmd->core_id;
5291 num_pmds++;
5292 }
5293
5294 curr_variance = variance(curr_pmd_usage, num_pmds);
5295 ret = get_dry_run_variance(dp, pmd_corelist, num_pmds, &new_variance);
5296
5297 if (ret) {
5298 VLOG_DBG("PMD auto lb dry run. Current PMD variance: %"PRIu64","
5299 " Predicted PMD variance: %"PRIu64"",
5300 curr_variance, new_variance);
5301
5302 if (new_variance < curr_variance) {
5303 improvement =
5304 ((curr_variance - new_variance) * 100) / curr_variance;
5305 }
5306 if (improvement < ALB_ACCEPTABLE_IMPROVEMENT) {
5307 ret = false;
5308 }
5309 }
5310
5311 free(curr_pmd_usage);
5312 free(pmd_corelist);
5313 return ret;
5314 }
5315
5316
5317 /* Return true if needs to revalidate datapath flows. */
5318 static bool
5319 dpif_netdev_run(struct dpif *dpif)
5320 {
5321 struct dp_netdev_port *port;
5322 struct dp_netdev *dp = get_dp_netdev(dpif);
5323 struct dp_netdev_pmd_thread *non_pmd;
5324 uint64_t new_tnl_seq;
5325 bool need_to_flush = true;
5326 bool pmd_rebalance = false;
5327 long long int now = time_msec();
5328 struct dp_netdev_pmd_thread *pmd;
5329
5330 ovs_mutex_lock(&dp->port_mutex);
5331 non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
5332 if (non_pmd) {
5333 ovs_mutex_lock(&dp->non_pmd_mutex);
5334 HMAP_FOR_EACH (port, node, &dp->ports) {
5335 if (!netdev_is_pmd(port->netdev)) {
5336 int i;
5337
5338 if (port->emc_enabled) {
5339 atomic_read_relaxed(&dp->emc_insert_min,
5340 &non_pmd->ctx.emc_insert_min);
5341 } else {
5342 non_pmd->ctx.emc_insert_min = 0;
5343 }
5344
5345 for (i = 0; i < port->n_rxq; i++) {
5346
5347 if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
5348 continue;
5349 }
5350
5351 if (dp_netdev_process_rxq_port(non_pmd,
5352 &port->rxqs[i],
5353 port->port_no)) {
5354 need_to_flush = false;
5355 }
5356 }
5357 }
5358 }
5359 if (need_to_flush) {
5360 /* We didn't receive anything in the process loop.
5361 * Check if we need to send something.
5362 * There were no time updates on the current iteration. */
5363 pmd_thread_ctx_time_update(non_pmd);
5364 dp_netdev_pmd_flush_output_packets(non_pmd, false);
5365 }
5366
5367 dpif_netdev_xps_revalidate_pmd(non_pmd, false);
5368 ovs_mutex_unlock(&dp->non_pmd_mutex);
5369
5370 dp_netdev_pmd_unref(non_pmd);
5371 }
5372
5373 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
5374 if (pmd_alb->is_enabled) {
5375 if (!pmd_alb->rebalance_poll_timer) {
5376 pmd_alb->rebalance_poll_timer = now;
5377 } else if ((pmd_alb->rebalance_poll_timer +
5378 pmd_alb->rebalance_intvl) < now) {
5379 pmd_alb->rebalance_poll_timer = now;
5380 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5381 if (atomic_count_get(&pmd->pmd_overloaded) >=
5382 PMD_RXQ_INTERVAL_MAX) {
5383 pmd_rebalance = true;
5384 break;
5385 }
5386 }
5387
5388 if (pmd_rebalance &&
5389 !dp_netdev_is_reconf_required(dp) &&
5390 !ports_require_restart(dp) &&
5391 pmd_rebalance_dry_run(dp)) {
5392 VLOG_INFO("PMD auto lb dry run."
5393 " requesting datapath reconfigure.");
5394 dp_netdev_request_reconfigure(dp);
5395 }
5396 }
5397 }
5398
5399 if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
5400 reconfigure_datapath(dp);
5401 }
5402 ovs_mutex_unlock(&dp->port_mutex);
5403
5404 tnl_neigh_cache_run();
5405 tnl_port_map_run();
5406 new_tnl_seq = seq_read(tnl_conf_seq);
5407
5408 if (dp->last_tnl_conf_seq != new_tnl_seq) {
5409 dp->last_tnl_conf_seq = new_tnl_seq;
5410 return true;
5411 }
5412 return false;
5413 }
5414
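/* Arranges for the poll loop to wake up when a non-pmd rx queue becomes
 * readable, a port requires reconfiguration or the tunnel configuration
 * changes. */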
5415 static void
5416 dpif_netdev_wait(struct dpif *dpif)
5417 {
5418 struct dp_netdev_port *port;
5419 struct dp_netdev *dp = get_dp_netdev(dpif);
5420
5421 ovs_mutex_lock(&dp_netdev_mutex);
5422 ovs_mutex_lock(&dp->port_mutex);
5423 HMAP_FOR_EACH (port, node, &dp->ports) {
5424 netdev_wait_reconf_required(port->netdev);
5425 if (!netdev_is_pmd(port->netdev)) {
5426 int i;
5427
5428 for (i = 0; i < port->n_rxq; i++) {
5429 netdev_rxq_wait(port->rxqs[i].rx);
5430 }
5431 }
5432 }
5433 ovs_mutex_unlock(&dp->port_mutex);
5434 ovs_mutex_unlock(&dp_netdev_mutex);
5435 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
5436 }
5437
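/* Flushes any remaining queued output packets and frees the thread-local
 * tunnel and send port caches of 'pmd'. */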
5438 static void
5439 pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
5440 {
5441 struct tx_port *tx_port_cached;
5442
5443 /* Flush all the queued packets. */
5444 dp_netdev_pmd_flush_output_packets(pmd, true);
5445 /* Free all used tx queue ids. */
5446 dpif_netdev_xps_revalidate_pmd(pmd, true);
5447
5448 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
5449 free(tx_port_cached);
5450 }
5451 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
5452 free(tx_port_cached);
5453 }
5454 }
5455
5456 /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
5457 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
5458 * device, otherwise to 'pmd->send_port_cache' if the port has at least
5459 * one txq. */
5460 static void
5461 pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
5462 OVS_REQUIRES(pmd->port_mutex)
5463 {
5464 struct tx_port *tx_port, *tx_port_cached;
5465
5466 pmd_free_cached_ports(pmd);
5467 hmap_shrink(&pmd->send_port_cache);
5468 hmap_shrink(&pmd->tnl_port_cache);
5469
5470 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
5471 if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
5472 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5473 hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
5474 hash_port_no(tx_port_cached->port->port_no));
5475 }
5476
5477 if (netdev_n_txq(tx_port->port->netdev)) {
5478 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5479 hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
5480 hash_port_no(tx_port_cached->port->port_no));
5481 }
5482 }
5483 }
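
/* Illustrative note (not compiled): the caches filled above are consumed
 * later in this file by pmd_tnl_port_cache_lookup() and
 * pmd_send_port_cache_lookup(), e.g.:
 *
 *     struct tx_port *p = pmd_send_port_cache_lookup(pmd, port_no);
 *     if (p) {
 *         ... append packets to 'p->output_pkts' ...
 *     }
 *
 * Because these are thread-local copies, the PMD thread can perform such
 * lookups on the fast path without taking 'pmd->port_mutex'. */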
5484
5485 static void
5486 pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5487 {
5488 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5489 if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
5490 VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
5491 ", numa_id %d.", pmd->core_id, pmd->numa_id);
5492 }
5493 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5494
5495 VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
5496 ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
5497 }
5498
5499 static void
5500 pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5501 {
5502 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5503 id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
5504 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5505 }
5506
5507 static int
5508 pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
5509 struct polled_queue **ppoll_list)
5510 {
5511 struct polled_queue *poll_list = *ppoll_list;
5512 struct rxq_poll *poll;
5513 int i;
5514
5515 ovs_mutex_lock(&pmd->port_mutex);
5516 poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
5517 * sizeof *poll_list);
5518
5519 i = 0;
5520 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5521 poll_list[i].rxq = poll->rxq;
5522 poll_list[i].port_no = poll->rxq->port->port_no;
5523 poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
5524 poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
5525 poll_list[i].change_seq =
5526 netdev_get_change_seq(poll->rxq->port->netdev);
5527 i++;
5528 }
5529
5530 pmd_load_cached_ports(pmd);
5531
5532 ovs_mutex_unlock(&pmd->port_mutex);
5533
5534 *ppoll_list = poll_list;
5535 return i;
5536 }
5537
5538 static void *
5539 pmd_thread_main(void *f_)
5540 {
5541 struct dp_netdev_pmd_thread *pmd = f_;
5542 struct pmd_perf_stats *s = &pmd->perf_stats;
5543 unsigned int lc = 0;
5544 struct polled_queue *poll_list;
5545 bool wait_for_reload = false;
5546 bool reload_tx_qid;
5547 bool exiting;
5548 bool reload;
5549 int poll_cnt;
5550 int i;
5551 int process_packets = 0;
5552
5553 poll_list = NULL;
5554
5555 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
5556 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
5557 ovs_numa_thread_setaffinity_core(pmd->core_id);
5558 dpdk_set_lcore_id(pmd->core_id);
5559 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
5560 dfc_cache_init(&pmd->flow_cache);
5561 pmd_alloc_static_tx_qid(pmd);
5562
5563 reload:
5564 atomic_count_init(&pmd->pmd_overloaded, 0);
5565
5566 /* List port/core affinity */
5567 for (i = 0; i < poll_cnt; i++) {
5568 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
5569 pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
5570 netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
5571 /* Reset the rxq current cycles counter. */
5572 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
5573 }
5574
5575 if (!poll_cnt) {
5576 if (wait_for_reload) {
5577 /* Don't sleep, control thread will ask for a reload shortly. */
5578 do {
5579 atomic_read_explicit(&pmd->reload, &reload,
5580 memory_order_acquire);
5581 } while (!reload);
5582 } else {
5583 while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
5584 seq_wait(pmd->reload_seq, pmd->last_reload_seq);
5585 poll_block();
5586 }
5587 }
5588 }
5589
5590 pmd->intrvl_tsc_prev = 0;
5591 atomic_store_relaxed(&pmd->intrvl_cycles, 0);
5592 cycles_counter_update(s);
5593 /* Protect pmd stats from external clearing while polling. */
5594 ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
5595 for (;;) {
5596 uint64_t rx_packets = 0, tx_packets = 0;
5597
5598 pmd_perf_start_iteration(s);
5599
5600 for (i = 0; i < poll_cnt; i++) {
5601
5602 if (!poll_list[i].rxq_enabled) {
5603 continue;
5604 }
5605
5606 if (poll_list[i].emc_enabled) {
5607 atomic_read_relaxed(&pmd->dp->emc_insert_min,
5608 &pmd->ctx.emc_insert_min);
5609 } else {
5610 pmd->ctx.emc_insert_min = 0;
5611 }
5612
5613 process_packets =
5614 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
5615 poll_list[i].port_no);
5616 rx_packets += process_packets;
5617 }
5618
5619 if (!rx_packets) {
5620 /* We didn't receive anything in the process loop.
5621 * Check if we need to send something.
5622 * There were no time updates in the current iteration. */
5623 pmd_thread_ctx_time_update(pmd);
5624 tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
5625 }
5626
5627 if (lc++ > 1024) {
5628 lc = 0;
5629
5630 coverage_try_clear();
5631 dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
5632 if (!ovsrcu_try_quiesce()) {
5633 emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
5634 }
5635
5636 for (i = 0; i < poll_cnt; i++) {
5637 uint64_t current_seq =
5638 netdev_get_change_seq(poll_list[i].rxq->port->netdev);
5639 if (poll_list[i].change_seq != current_seq) {
5640 poll_list[i].change_seq = current_seq;
5641 poll_list[i].rxq_enabled =
5642 netdev_rxq_enabled(poll_list[i].rxq->rx);
5643 }
5644 }
5645 }
5646
5647 atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
5648 if (OVS_UNLIKELY(reload)) {
5649 break;
5650 }
5651
5652 pmd_perf_end_iteration(s, rx_packets, tx_packets,
5653 pmd_perf_metrics_enabled(pmd));
5654 }
5655 ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
5656
5657 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
5658 atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
5659 atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
5660 atomic_read_relaxed(&pmd->exit, &exiting);
5661 /* Signal here to make sure the pmd finishes
5662 * reloading the updated configuration. */
5663 dp_netdev_pmd_reload_done(pmd);
5664
5665 if (reload_tx_qid) {
5666 pmd_free_static_tx_qid(pmd);
5667 pmd_alloc_static_tx_qid(pmd);
5668 }
5669
5670 if (!exiting) {
5671 goto reload;
5672 }
5673
5674 pmd_free_static_tx_qid(pmd);
5675 dfc_cache_uninit(&pmd->flow_cache);
5676 free(poll_list);
5677 pmd_free_cached_ports(pmd);
5678 return NULL;
5679 }
5680
5681 static void
5682 dp_netdev_disable_upcall(struct dp_netdev *dp)
5683 OVS_ACQUIRES(dp->upcall_rwlock)
5684 {
5685 fat_rwlock_wrlock(&dp->upcall_rwlock);
5686 }
5687
5688 \f
5689 /* Meters */
5690 static void
5691 dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
5692 struct ofputil_meter_features *features)
5693 {
5694 features->max_meters = MAX_METERS;
5695 features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
5696 features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
5697 features->max_bands = MAX_BANDS;
5698 features->max_color = 0;
5699 }
5700
5701 /* Applies the meter identified by 'meter_id' to 'packets_'. Packets
5702 * that exceed a band are dropped in-place. */
5703 static void
5704 dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
5705 uint32_t meter_id, long long int now)
5706 {
5707 struct dp_meter *meter;
5708 struct dp_meter_band *band;
5709 struct dp_packet *packet;
5710 long long int long_delta_t; /* msec */
5711 uint32_t delta_t; /* msec */
5712 const size_t cnt = dp_packet_batch_size(packets_);
5713 uint32_t bytes, volume;
5714 int exceeded_band[NETDEV_MAX_BURST];
5715 uint32_t exceeded_rate[NETDEV_MAX_BURST];
5716 int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
5717
5718 if (meter_id >= MAX_METERS) {
5719 return;
5720 }
5721
5722 meter_lock(dp, meter_id);
5723 meter = dp->meters[meter_id];
5724 if (!meter) {
5725 goto out;
5726 }
5727
5728 /* Initialize as negative values. */
5729 memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
5730 /* Initialize as zeroes. */
5731 memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
5732
5733 /* All packets will hit the meter at the same time. */
5734 long_delta_t = now / 1000 - meter->used / 1000; /* msec */
5735
5736 if (long_delta_t < 0) {
5737 /* This condition means that we have several threads fighting for a
5738 meter lock, and the one that received the packets a bit later wins.
5739 Assume that all racing threads received packets at the same time
5740 to avoid overflow. */
5741 long_delta_t = 0;
5742 }
5743
5744 /* Make sure delta_t will not be too large, so that the bucket will not
5745 * wrap around below. */
5746 delta_t = (long_delta_t > (long long int)meter->max_delta_t)
5747 ? meter->max_delta_t : (uint32_t)long_delta_t;
5748
5749 /* Update meter stats. */
5750 meter->used = now;
5751 meter->packet_count += cnt;
5752 bytes = 0;
5753 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5754 bytes += dp_packet_size(packet);
5755 }
5756 meter->byte_count += bytes;
5757
5758 /* Meters can operate in terms of packets per second or kilobits per
5759 * second. */
5760 if (meter->flags & OFPMF13_PKTPS) {
5761 /* Rate in packets/second, bucket 1/1000 packets. */
5762 /* msec * packets/sec = 1/1000 packets. */
5763 volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
5764 } else {
5765 /* Rate in kbps, bucket in bits. */
5766 /* msec * kbps = bits */
5767 volume = bytes * 8;
5768 }
5769
5770 /* Update all bands and find the one hit with the highest rate for each
5771 * packet (if any). */
5772 for (int m = 0; m < meter->n_bands; ++m) {
5773 band = &meter->bands[m];
5774
5775 /* Update band's bucket. */
5776 band->bucket += delta_t * band->up.rate;
5777 if (band->bucket > band->up.burst_size) {
5778 band->bucket = band->up.burst_size;
5779 }
5780
5781 /* Drain the bucket for all the packets, if possible. */
5782 if (band->bucket >= volume) {
5783 band->bucket -= volume;
5784 } else {
5785 int band_exceeded_pkt;
5786
5787 /* Band limit hit, must process packet-by-packet. */
5788 if (meter->flags & OFPMF13_PKTPS) {
5789 band_exceeded_pkt = band->bucket / 1000;
5790 band->bucket %= 1000; /* Remainder stays in bucket. */
5791
5792 /* Update the exceeding band for each exceeding packet.
5793 * (Only one band will be fired by a packet, and that
5794 * can be different for each packet.) */
5795 for (int i = band_exceeded_pkt; i < cnt; i++) {
5796 if (band->up.rate > exceeded_rate[i]) {
5797 exceeded_rate[i] = band->up.rate;
5798 exceeded_band[i] = m;
5799 }
5800 }
5801 } else {
5802 /* Packet sizes differ, must process one-by-one. */
5803 band_exceeded_pkt = cnt;
5804 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5805 uint32_t bits = dp_packet_size(packet) * 8;
5806
5807 if (band->bucket >= bits) {
5808 band->bucket -= bits;
5809 } else {
5810 if (i < band_exceeded_pkt) {
5811 band_exceeded_pkt = i;
5812 }
5813 /* Update the exceeding band for the exceeding packet.
5814 * (Only one band will be fired by a packet, and that
5815 * can be different for each packet.) */
5816 if (band->up.rate > exceeded_rate[i]) {
5817 exceeded_rate[i] = band->up.rate;
5818 exceeded_band[i] = m;
5819 }
5820 }
5821 }
5822 }
5823 /* Remember the first exceeding packet. */
5824 if (exceeded_pkt > band_exceeded_pkt) {
5825 exceeded_pkt = band_exceeded_pkt;
5826 }
5827 }
5828 }
5829
5830 /* Fire the highest rate band exceeded by each packet, and drop
5831 * packets if needed. */
5832 size_t j;
5833 DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
5834 if (exceeded_band[j] >= 0) {
5835 /* Meter drop packet. */
5836 band = &meter->bands[exceeded_band[j]];
5837 band->packet_count += 1;
5838 band->byte_count += dp_packet_size(packet);
5839 COVERAGE_INC(datapath_drop_meter);
5840 dp_packet_delete(packet);
5841 } else {
5842 /* Meter accepts packet. */
5843 dp_packet_batch_refill(packets_, packet, j);
5844 }
5845 }
5846 out:
5847 meter_unlock(dp, meter_id);
5848 }
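
/* Worked example for the bucket arithmetic above (illustrative only),
 * assuming a single kbps band with rate = 1000 kbps and a converted
 * burst_size of 500000 bits:
 *
 *   - After delta_t = 100 msec of idle time the bucket gains
 *     100 * 1000 = 100000 bits (capped at burst_size).
 *   - A batch of 4 packets of 1500 bytes needs volume = 4 * 1500 * 8
 *     = 48000 bits, so the whole batch passes if the bucket holds at
 *     least that much; otherwise the band is applied packet-by-packet.
 *
 * For OFPMF13_PKTPS bands the same arithmetic is done with the bucket in
 * 1/1000 packets: each packet drains 1000 units and the bucket refills at
 * 'rate' units per msec. */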
5849
5850 /* Meter set/get/del processing is still single-threaded. */
5851 static int
5852 dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
5853 struct ofputil_meter_config *config)
5854 {
5855 struct dp_netdev *dp = get_dp_netdev(dpif);
5856 uint32_t mid = meter_id.uint32;
5857 struct dp_meter *meter;
5858 int i;
5859
5860 if (mid >= MAX_METERS) {
5861 return EFBIG; /* Meter_id out of range. */
5862 }
5863
5864 if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
5865 return EBADF; /* Unsupported flags set */
5866 }
5867
5868 if (config->n_bands > MAX_BANDS) {
5869 return EINVAL;
5870 }
5871
5872 for (i = 0; i < config->n_bands; ++i) {
5873 switch (config->bands[i].type) {
5874 case OFPMBT13_DROP:
5875 break;
5876 default:
5877 return ENODEV; /* Unsupported band type */
5878 }
5879 }
5880
5881 /* Allocate meter */
5882 meter = xzalloc(sizeof *meter
5883 + config->n_bands * sizeof(struct dp_meter_band));
5884
5885 meter->flags = config->flags;
5886 meter->n_bands = config->n_bands;
5887 meter->max_delta_t = 0;
5888 meter->used = time_usec();
5889
5890 /* set up bands */
5891 for (i = 0; i < config->n_bands; ++i) {
5892 uint32_t band_max_delta_t;
5893
5894 /* Set burst size to a workable value if none specified. */
5895 if (config->bands[i].burst_size == 0) {
5896 config->bands[i].burst_size = config->bands[i].rate;
5897 }
5898
5899 meter->bands[i].up = config->bands[i];
5900 /* Convert burst size to the bucket units: */
5901 /* pkts => 1/1000 packets, kilobits => bits. */
5902 meter->bands[i].up.burst_size *= 1000;
5903 /* Initialize bucket to empty. */
5904 meter->bands[i].bucket = 0;
5905
5906 /* Figure out max delta_t that is enough to fill any bucket. */
5907 band_max_delta_t
5908 = meter->bands[i].up.burst_size / meter->bands[i].up.rate;
5909 if (band_max_delta_t > meter->max_delta_t) {
5910 meter->max_delta_t = band_max_delta_t;
5911 }
5912 }
5913
5914 meter_lock(dp, mid);
5915 dp_delete_meter(dp, mid); /* Free existing meter, if any */
5916 dp->meters[mid] = meter;
5917 meter_unlock(dp, mid);
5918
5919 return 0;
5920 }
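
/* Worked example for the conversions above (illustrative only): a kbps band
 * configured with rate = 1000 and burst_size = 500 is stored with
 * burst_size = 500 * 1000 = 500000 bits, and its band_max_delta_t is
 * 500000 / 1000 = 500 msec. After 500 msec of idle time the bucket is full,
 * so dp_netdev_run_meter() clamps larger time deltas to 'max_delta_t'. */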
5921
5922 static int
5923 dpif_netdev_meter_get(const struct dpif *dpif,
5924 ofproto_meter_id meter_id_,
5925 struct ofputil_meter_stats *stats, uint16_t n_bands)
5926 {
5927 const struct dp_netdev *dp = get_dp_netdev(dpif);
5928 uint32_t meter_id = meter_id_.uint32;
5929 int retval = 0;
5930
5931 if (meter_id >= MAX_METERS) {
5932 return EFBIG;
5933 }
5934
5935 meter_lock(dp, meter_id);
5936 const struct dp_meter *meter = dp->meters[meter_id];
5937 if (!meter) {
5938 retval = ENOENT;
5939 goto done;
5940 }
5941 if (stats) {
5942 int i = 0;
5943
5944 stats->packet_in_count = meter->packet_count;
5945 stats->byte_in_count = meter->byte_count;
5946
5947 for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
5948 stats->bands[i].packet_count = meter->bands[i].packet_count;
5949 stats->bands[i].byte_count = meter->bands[i].byte_count;
5950 }
5951
5952 stats->n_bands = i;
5953 }
5954
5955 done:
5956 meter_unlock(dp, meter_id);
5957 return retval;
5958 }
5959
5960 static int
5961 dpif_netdev_meter_del(struct dpif *dpif,
5962 ofproto_meter_id meter_id_,
5963 struct ofputil_meter_stats *stats, uint16_t n_bands)
5964 {
5965 struct dp_netdev *dp = get_dp_netdev(dpif);
5966 int error;
5967
5968 error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
5969 if (!error) {
5970 uint32_t meter_id = meter_id_.uint32;
5971
5972 meter_lock(dp, meter_id);
5973 dp_delete_meter(dp, meter_id);
5974 meter_unlock(dp, meter_id);
5975 }
5976 return error;
5977 }
5978
5979 \f
5980 static void
5981 dpif_netdev_disable_upcall(struct dpif *dpif)
5982 OVS_NO_THREAD_SAFETY_ANALYSIS
5983 {
5984 struct dp_netdev *dp = get_dp_netdev(dpif);
5985 dp_netdev_disable_upcall(dp);
5986 }
5987
5988 static void
5989 dp_netdev_enable_upcall(struct dp_netdev *dp)
5990 OVS_RELEASES(dp->upcall_rwlock)
5991 {
5992 fat_rwlock_unlock(&dp->upcall_rwlock);
5993 }
5994
5995 static void
5996 dpif_netdev_enable_upcall(struct dpif *dpif)
5997 OVS_NO_THREAD_SAFETY_ANALYSIS
5998 {
5999 struct dp_netdev *dp = get_dp_netdev(dpif);
6000 dp_netdev_enable_upcall(dp);
6001 }
6002
6003 static void
6004 dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
6005 {
6006 atomic_store_relaxed(&pmd->wait_for_reload, false);
6007 atomic_store_relaxed(&pmd->reload_tx_qid, false);
6008 pmd->last_reload_seq = seq_read(pmd->reload_seq);
6009 atomic_store_explicit(&pmd->reload, false, memory_order_release);
6010 }
6011
6012 /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
6013 * the pointer on success, otherwise NULL (it can return NULL even if
6014 * 'core_id' is NON_PMD_CORE_ID).
6015 *
6016 * The caller must unref the returned reference. */
6017 static struct dp_netdev_pmd_thread *
6018 dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
6019 {
6020 struct dp_netdev_pmd_thread *pmd;
6021 const struct cmap_node *pnode;
6022
6023 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
6024 if (!pnode) {
6025 return NULL;
6026 }
6027 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
6028
6029 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
6030 }
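
/* Typical caller pattern for the lookup above (sketch, not compiled):
 *
 *     struct dp_netdev_pmd_thread *pmd = dp_netdev_get_pmd(dp, core_id);
 *     if (pmd) {
 *         ... use 'pmd' ...
 *         dp_netdev_pmd_unref(pmd);
 *     }
 *
 * The ref/unref pair keeps the pmd alive until the RCU-deferred
 * dp_netdev_destroy_pmd() (see dp_netdev_pmd_unref() below). */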
6031
6032 /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
6033 static void
6034 dp_netdev_set_nonpmd(struct dp_netdev *dp)
6035 OVS_REQUIRES(dp->port_mutex)
6036 {
6037 struct dp_netdev_pmd_thread *non_pmd;
6038
6039 non_pmd = xzalloc(sizeof *non_pmd);
6040 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
6041 }
6042
6043 /* Caller must have a valid pointer to 'pmd'. */
6044 static bool
6045 dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
6046 {
6047 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
6048 }
6049
6050 static void
6051 dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
6052 {
6053 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
6054 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
6055 }
6056 }
6057
6058 /* Given cmap position 'pos', tries to ref the next node. If try_ref()
6059 * fails, keeps checking for the next node until reaching the end of the cmap.
6060 *
6061 * The caller must unref the returned reference. */
6062 static struct dp_netdev_pmd_thread *
6063 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
6064 {
6065 struct dp_netdev_pmd_thread *next;
6066
6067 do {
6068 struct cmap_node *node;
6069
6070 node = cmap_next_position(&dp->poll_threads, pos);
6071 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
6072 : NULL;
6073 } while (next && !dp_netdev_pmd_try_ref(next));
6074
6075 return next;
6076 }
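
/* Iteration pattern built on the helper above (sketch, not compiled):
 *
 *     struct cmap_position pos;
 *     struct dp_netdev_pmd_thread *pmd;
 *
 *     memset(&pos, 0, sizeof pos);
 *     while ((pmd = dp_netdev_pmd_get_next(dp, &pos))) {
 *         ... inspect 'pmd' ...
 *         dp_netdev_pmd_unref(pmd);
 *     }
 */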
6077
6078 /* Configures the 'pmd' based on the input argument. */
6079 static void
6080 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
6081 unsigned core_id, int numa_id)
6082 {
6083 pmd->dp = dp;
6084 pmd->core_id = core_id;
6085 pmd->numa_id = numa_id;
6086 pmd->need_reload = false;
6087 pmd->n_output_batches = 0;
6088
6089 ovs_refcount_init(&pmd->ref_cnt);
6090 atomic_init(&pmd->exit, false);
6091 pmd->reload_seq = seq_create();
6092 pmd->last_reload_seq = seq_read(pmd->reload_seq);
6093 atomic_init(&pmd->reload, false);
6094 ovs_mutex_init(&pmd->flow_mutex);
6095 ovs_mutex_init(&pmd->port_mutex);
6096 cmap_init(&pmd->flow_table);
6097 cmap_init(&pmd->classifiers);
6098 pmd->ctx.last_rxq = NULL;
6099 pmd_thread_ctx_time_update(pmd);
6100 pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
6101 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
6102 hmap_init(&pmd->poll_list);
6103 hmap_init(&pmd->tx_ports);
6104 hmap_init(&pmd->tnl_port_cache);
6105 hmap_init(&pmd->send_port_cache);
6106 /* Initialize the 'flow_cache' since there is no
6107 * actual thread created for NON_PMD_CORE_ID. */
6108 if (core_id == NON_PMD_CORE_ID) {
6109 dfc_cache_init(&pmd->flow_cache);
6110 pmd_alloc_static_tx_qid(pmd);
6111 }
6112 pmd_perf_stats_init(&pmd->perf_stats);
6113 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
6114 hash_int(core_id, 0));
6115 }
6116
6117 static void
6118 dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
6119 {
6120 struct dpcls *cls;
6121
6122 dp_netdev_pmd_flow_flush(pmd);
6123 hmap_destroy(&pmd->send_port_cache);
6124 hmap_destroy(&pmd->tnl_port_cache);
6125 hmap_destroy(&pmd->tx_ports);
6126 hmap_destroy(&pmd->poll_list);
6127 /* All flows (including their dpcls_rules) have been deleted already */
6128 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
6129 dpcls_destroy(cls);
6130 ovsrcu_postpone(free, cls);
6131 }
6132 cmap_destroy(&pmd->classifiers);
6133 cmap_destroy(&pmd->flow_table);
6134 ovs_mutex_destroy(&pmd->flow_mutex);
6135 seq_destroy(pmd->reload_seq);
6136 ovs_mutex_destroy(&pmd->port_mutex);
6137 free(pmd);
6138 }
6139
6140 /* Stops the pmd thread, removes it from the 'dp->poll_threads',
6141 * and unrefs the struct. */
6142 static void
6143 dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
6144 {
6145 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
6146 * but extra cleanup is necessary */
6147 if (pmd->core_id == NON_PMD_CORE_ID) {
6148 ovs_mutex_lock(&dp->non_pmd_mutex);
6149 dfc_cache_uninit(&pmd->flow_cache);
6150 pmd_free_cached_ports(pmd);
6151 pmd_free_static_tx_qid(pmd);
6152 ovs_mutex_unlock(&dp->non_pmd_mutex);
6153 } else {
6154 atomic_store_relaxed(&pmd->exit, true);
6155 dp_netdev_reload_pmd__(pmd);
6156 xpthread_join(pmd->thread, NULL);
6157 }
6158
6159 dp_netdev_pmd_clear_ports(pmd);
6160
6161 /* Purges the 'pmd''s flows after stopping the thread, but before
6162 * destroying the flows, so that the flow stats can be collected. */
6163 if (dp->dp_purge_cb) {
6164 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
6165 }
6166 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
6167 dp_netdev_pmd_unref(pmd);
6168 }
6169
6170 /* Destroys all pmd threads. If 'non_pmd' is true, it also destroys the
6171 * non-pmd thread. */
6172 static void
6173 dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
6174 {
6175 struct dp_netdev_pmd_thread *pmd;
6176 struct dp_netdev_pmd_thread **pmd_list;
6177 size_t k = 0, n_pmds;
6178
6179 n_pmds = cmap_count(&dp->poll_threads);
6180 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
6181
6182 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6183 if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
6184 continue;
6185 }
6186 /* We cannot call dp_netdev_del_pmd(), since it alters
6187 * 'dp->poll_threads' (while we're iterating it) and it
6188 * might quiesce. */
6189 ovs_assert(k < n_pmds);
6190 pmd_list[k++] = pmd;
6191 }
6192
6193 for (size_t i = 0; i < k; i++) {
6194 dp_netdev_del_pmd(dp, pmd_list[i]);
6195 }
6196 free(pmd_list);
6197 }
6198
6199 /* Deletes all rx queues from pmd->poll_list and all the ports from
6200 * pmd->tx_ports. */
6201 static void
6202 dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
6203 {
6204 struct rxq_poll *poll;
6205 struct tx_port *port;
6206
6207 ovs_mutex_lock(&pmd->port_mutex);
6208 HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
6209 free(poll);
6210 }
6211 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
6212 free(port);
6213 }
6214 ovs_mutex_unlock(&pmd->port_mutex);
6215 }
6216
6217 /* Adds rx queue to poll_list of PMD thread, if it's not there already. */
6218 static void
6219 dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
6220 struct dp_netdev_rxq *rxq)
6221 OVS_REQUIRES(pmd->port_mutex)
6222 {
6223 int qid = netdev_rxq_get_queue_id(rxq->rx);
6224 uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
6225 struct rxq_poll *poll;
6226
6227 HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
6228 if (poll->rxq == rxq) {
6229 /* 'rxq' is already polled by this thread. Do nothing. */
6230 return;
6231 }
6232 }
6233
6234 poll = xmalloc(sizeof *poll);
6235 poll->rxq = rxq;
6236 hmap_insert(&pmd->poll_list, &poll->node, hash);
6237
6238 pmd->need_reload = true;
6239 }
6240
6241 /* Delete 'poll' from poll_list of PMD thread. */
6242 static void
6243 dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
6244 struct rxq_poll *poll)
6245 OVS_REQUIRES(pmd->port_mutex)
6246 {
6247 hmap_remove(&pmd->poll_list, &poll->node);
6248 free(poll);
6249
6250 pmd->need_reload = true;
6251 }
6252
6253 /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
6254 * changes to take effect. */
6255 static void
6256 dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6257 struct dp_netdev_port *port)
6258 OVS_REQUIRES(pmd->port_mutex)
6259 {
6260 struct tx_port *tx;
6261
6262 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
6263 if (tx) {
6264 /* 'port' is already in this thread's tx cache. Do nothing. */
6265 return;
6266 }
6267
6268 tx = xzalloc(sizeof *tx);
6269
6270 tx->port = port;
6271 tx->qid = -1;
6272 tx->flush_time = 0LL;
6273 dp_packet_batch_init(&tx->output_pkts);
6274
6275 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
6276 pmd->need_reload = true;
6277 }
6278
6279 /* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
6280 * changes to take effect. */
6281 static void
6282 dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6283 struct tx_port *tx)
6284 OVS_REQUIRES(pmd->port_mutex)
6285 {
6286 hmap_remove(&pmd->tx_ports, &tx->node);
6287 free(tx);
6288 pmd->need_reload = true;
6289 }
6290 \f
6291 static char *
6292 dpif_netdev_get_datapath_version(void)
6293 {
6294 return xstrdup("<built-in>");
6295 }
6296
6297 static void
6298 dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
6299 uint16_t tcp_flags, long long now)
6300 {
6301 uint16_t flags;
6302
6303 atomic_store_relaxed(&netdev_flow->stats.used, now);
6304 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
6305 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
6306 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
6307 flags |= tcp_flags;
6308 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
6309 }
6310
6311 static int
6312 dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
6313 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
6314 enum dpif_upcall_type type, const struct nlattr *userdata,
6315 struct ofpbuf *actions, struct ofpbuf *put_actions)
6316 {
6317 struct dp_netdev *dp = pmd->dp;
6318
6319 if (OVS_UNLIKELY(!dp->upcall_cb)) {
6320 return ENODEV;
6321 }
6322
6323 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
6324 struct ds ds = DS_EMPTY_INITIALIZER;
6325 char *packet_str;
6326 struct ofpbuf key;
6327 struct odp_flow_key_parms odp_parms = {
6328 .flow = flow,
6329 .mask = wc ? &wc->masks : NULL,
6330 .support = dp_netdev_support,
6331 };
6332
6333 ofpbuf_init(&key, 0);
6334 odp_flow_key_from_flow(&odp_parms, &key);
6335 packet_str = ofp_dp_packet_to_string(packet_);
6336
6337 odp_flow_key_format(key.data, key.size, &ds);
6338
6339 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
6340 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
6341
6342 ofpbuf_uninit(&key);
6343 free(packet_str);
6344
6345 ds_destroy(&ds);
6346 }
6347
6348 return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
6349 actions, wc, put_actions, dp->upcall_aux);
6350 }
6351
6352 static inline uint32_t
6353 dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet,
6354 const struct miniflow *mf)
6355 {
6356 uint32_t hash;
6357
6358 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6359 hash = dp_packet_get_rss_hash(packet);
6360 } else {
6361 hash = miniflow_hash_5tuple(mf, 0);
6362 dp_packet_set_rss_hash(packet, hash);
6363 }
6364
6365 return hash;
6366 }
6367
6368 static inline uint32_t
6369 dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
6370 const struct miniflow *mf)
6371 {
6372 uint32_t hash, recirc_depth;
6373
6374 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6375 hash = dp_packet_get_rss_hash(packet);
6376 } else {
6377 hash = miniflow_hash_5tuple(mf, 0);
6378 dp_packet_set_rss_hash(packet, hash);
6379 }
6380
6381 /* The RSS hash must account for the recirculation depth to avoid
6382 * collisions in the exact match cache */
6383 recirc_depth = *recirc_depth_get_unsafe();
6384 if (OVS_UNLIKELY(recirc_depth)) {
6385 hash = hash_finish(hash, recirc_depth);
6386 }
6387 return hash;
6388 }
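
/* Illustrative note: a packet whose original RSS hash is H is looked up
 * with hash_finish(H, 1) after one recirculation and hash_finish(H, 2)
 * after two, so the same 5-tuple occupies distinct EMC entries at each
 * recirculation depth instead of colliding with its depth-0 entry. */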
6389
6390 struct packet_batch_per_flow {
6391 unsigned int byte_count;
6392 uint16_t tcp_flags;
6393 struct dp_netdev_flow *flow;
6394
6395 struct dp_packet_batch array;
6396 };
6397
6398 static inline void
6399 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
6400 struct dp_packet *packet,
6401 uint16_t tcp_flags)
6402 {
6403 batch->byte_count += dp_packet_size(packet);
6404 batch->tcp_flags |= tcp_flags;
6405 dp_packet_batch_add(&batch->array, packet);
6406 }
6407
6408 static inline void
6409 packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
6410 struct dp_netdev_flow *flow)
6411 {
6412 flow->batch = batch;
6413
6414 batch->flow = flow;
6415 dp_packet_batch_init(&batch->array);
6416 batch->byte_count = 0;
6417 batch->tcp_flags = 0;
6418 }
6419
6420 static inline void
6421 packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
6422 struct dp_netdev_pmd_thread *pmd)
6423 {
6424 struct dp_netdev_actions *actions;
6425 struct dp_netdev_flow *flow = batch->flow;
6426
6427 dp_netdev_flow_used(flow, dp_packet_batch_size(&batch->array),
6428 batch->byte_count,
6429 batch->tcp_flags, pmd->ctx.now / 1000);
6430
6431 actions = dp_netdev_flow_get_actions(flow);
6432
6433 dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
6434 actions->actions, actions->size);
6435 }
6436
6437 static inline void
6438 dp_netdev_queue_batches(struct dp_packet *pkt,
6439 struct dp_netdev_flow *flow, uint16_t tcp_flags,
6440 struct packet_batch_per_flow *batches,
6441 size_t *n_batches)
6442 {
6443 struct packet_batch_per_flow *batch = flow->batch;
6444
6445 if (OVS_UNLIKELY(!batch)) {
6446 batch = &batches[(*n_batches)++];
6447 packet_batch_per_flow_init(batch, flow);
6448 }
6449
6450 packet_batch_per_flow_update(batch, pkt, tcp_flags);
6451 }
6452
6453 static inline void
6454 packet_enqueue_to_flow_map(struct dp_packet *packet,
6455 struct dp_netdev_flow *flow,
6456 uint16_t tcp_flags,
6457 struct dp_packet_flow_map *flow_map,
6458 size_t index)
6459 {
6460 struct dp_packet_flow_map *map = &flow_map[index];
6461 map->flow = flow;
6462 map->packet = packet;
6463 map->tcp_flags = tcp_flags;
6464 }
6465
6466 /* SMC lookup function for a batch of packets.
6467 * By batching the SMC lookups, we can use prefetching
6468 * to hide memory access latency.
6469 */
6470 static inline void
6471 smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
6472 struct netdev_flow_key *keys,
6473 struct netdev_flow_key **missed_keys,
6474 struct dp_packet_batch *packets_,
6475 const int cnt,
6476 struct dp_packet_flow_map *flow_map,
6477 uint8_t *index_map)
6478 {
6479 int i;
6480 struct dp_packet *packet;
6481 size_t n_smc_hit = 0, n_missed = 0;
6482 struct dfc_cache *cache = &pmd->flow_cache;
6483 struct smc_cache *smc_cache = &cache->smc_cache;
6484 const struct cmap_node *flow_node;
6485 int recv_idx;
6486 uint16_t tcp_flags;
6487
6488 /* Prefetch buckets for all packets */
6489 for (i = 0; i < cnt; i++) {
6490 OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
6491 }
6492
6493 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
6494 struct dp_netdev_flow *flow = NULL;
6495 flow_node = smc_entry_get(pmd, keys[i].hash);
6496 bool hit = false;
6497 /* Get the original order of this packet in received batch. */
6498 recv_idx = index_map[i];
6499
6500 if (OVS_LIKELY(flow_node != NULL)) {
6501 CMAP_NODE_FOR_EACH (flow, node, flow_node) {
6502 /* Since we don't have a per-port megaflow to check the port
6503 * number, we need to verify that the input ports match. */
6504 if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
6505 flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
6506 tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
6507
6508 /* SMC hit and EMC miss; insert into the EMC. */
6509 keys[i].len =
6510 netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
6511 emc_probabilistic_insert(pmd, &keys[i], flow);
6512 /* Add these packets into the flow map in the same order
6513 * as received.
6514 */
6515 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6516 flow_map, recv_idx);
6517 n_smc_hit++;
6518 hit = true;
6519 break;
6520 }
6521 }
6522 if (hit) {
6523 continue;
6524 }
6525 }
6526
6527 /* SMC missed. Group missed packets together at
6528 * the beginning of the 'packets' array. */
6529 dp_packet_batch_refill(packets_, packet, i);
6530
6531 /* Preserve the order of packet for flow batching. */
6532 index_map[n_missed] = recv_idx;
6533
6534 /* Put missed keys into the pointer array returned to the caller. */
6535 missed_keys[n_missed++] = &keys[i];
6536 }
6537
6538 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
6539 }
6540
6541 /* Tries to process all 'cnt' packets in 'packets_' using only the datapath
6542 * flow cache 'pmd->flow_cache'. If a flow is not found for a packet
6543 * 'packets[i]', the miniflow is copied into 'keys' and the packet pointer is
6544 * moved to the beginning of the 'packets' array. Pointers to the missed keys
6545 * are put in the 'missed_keys' pointer array for further processing.
6546 *
6547 * The function returns the number of packets that need to be processed in the
6548 * 'packets' array (they have been moved to the beginning of the vector).
6549 *
6550 * For performance reasons a caller may choose not to initialize the metadata
6551 * in 'packets_'. If 'md_is_valid' is false, the metadata in 'packets'
6552 * is not valid and must be initialized by this function using 'port_no'.
6553 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
6554 * will be ignored.
6555 */
6556 static inline size_t
6557 dfc_processing(struct dp_netdev_pmd_thread *pmd,
6558 struct dp_packet_batch *packets_,
6559 struct netdev_flow_key *keys,
6560 struct netdev_flow_key **missed_keys,
6561 struct packet_batch_per_flow batches[], size_t *n_batches,
6562 struct dp_packet_flow_map *flow_map,
6563 size_t *n_flows, uint8_t *index_map,
6564 bool md_is_valid, odp_port_t port_no)
6565 {
6566 struct netdev_flow_key *key = &keys[0];
6567 size_t n_missed = 0, n_emc_hit = 0;
6568 struct dfc_cache *cache = &pmd->flow_cache;
6569 struct dp_packet *packet;
6570 const size_t cnt = dp_packet_batch_size(packets_);
6571 uint32_t cur_min = pmd->ctx.emc_insert_min;
6572 int i;
6573 uint16_t tcp_flags;
6574 bool smc_enable_db;
6575 size_t map_cnt = 0;
6576 bool batch_enable = true;
6577
6578 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
6579 pmd_perf_update_counter(&pmd->perf_stats,
6580 md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
6581 cnt);
6582
6583 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
6584 struct dp_netdev_flow *flow;
6585 uint32_t mark;
6586
6587 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
6588 dp_packet_delete(packet);
6589 COVERAGE_INC(datapath_drop_rx_invalid_packet);
6590 continue;
6591 }
6592
6593 if (i != cnt - 1) {
6594 struct dp_packet **packets = packets_->packets;
6595 /* Prefetch next packet data and metadata. */
6596 OVS_PREFETCH(dp_packet_data(packets[i+1]));
6597 pkt_metadata_prefetch_init(&packets[i+1]->md);
6598 }
6599
6600 if (!md_is_valid) {
6601 pkt_metadata_init(&packet->md, port_no);
6602 }
6603
6604 if ((*recirc_depth_get() == 0) &&
6605 dp_packet_has_flow_mark(packet, &mark)) {
6606 flow = mark_to_flow_find(pmd, mark);
6607 if (OVS_LIKELY(flow)) {
6608 tcp_flags = parse_tcp_flags(packet);
6609 if (OVS_LIKELY(batch_enable)) {
6610 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6611 n_batches);
6612 } else {
6613 /* Flow batching should be performed only after fast-path
6614 * processing is also completed for packets with an EMC miss,
6615 * or else it will result in reordering of packets with the
6616 * same datapath flow. */
6617 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6618 flow_map, map_cnt++);
6619 }
6620 continue;
6621 }
6622 }
6623
6624 miniflow_extract(packet, &key->mf);
6625 key->len = 0; /* Not computed yet. */
6626 key->hash =
6627 (md_is_valid == false)
6628 ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
6629 : dpif_netdev_packet_get_rss_hash(packet, &key->mf);
6630
6631 /* If EMC is disabled, skip emc_lookup(). */
6632 flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
6633 if (OVS_LIKELY(flow)) {
6634 tcp_flags = miniflow_get_tcp_flags(&key->mf);
6635 n_emc_hit++;
6636 if (OVS_LIKELY(batch_enable)) {
6637 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6638 n_batches);
6639 } else {
6640 /* Flow batching should be performed only after fast-path
6641 * processing is also completed for packets with an EMC miss,
6642 * or else it will result in reordering of packets with the
6643 * same datapath flow. */
6644 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6645 flow_map, map_cnt++);
6646 }
6647 } else {
6648 /* Exact match cache missed. Group missed packets together at
6649 * the beginning of the 'packets' array. */
6650 dp_packet_batch_refill(packets_, packet, i);
6651
6652 /* Preserve the order of packet for flow batching. */
6653 index_map[n_missed] = map_cnt;
6654 flow_map[map_cnt++].flow = NULL;
6655
6656 /* 'keys[n_missed]' contains the key of the current packet and it
6657 * will be passed to the SMC lookup. The next key should be extracted
6658 * to 'keys[n_missed + 1]'.
6659 * We also maintain a pointer array to the keys that missed both SMC and
6660 * EMC, which will be returned to the caller for further processing. */
6661 missed_keys[n_missed] = key;
6662 key = &keys[++n_missed];
6663
6664 /* Skip batching for subsequent packets to avoid reordering. */
6665 batch_enable = false;
6666 }
6667 }
6668 /* Count of packets which are not flow batched. */
6669 *n_flows = map_cnt;
6670
6671 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
6672
6673 if (!smc_enable_db) {
6674 return dp_packet_batch_size(packets_);
6675 }
6676
6677 /* Packets that missed the EMC will do a batch lookup in the SMC, if enabled. */
6678 smc_lookup_batch(pmd, keys, missed_keys, packets_,
6679 n_missed, flow_map, index_map);
6680
6681 return dp_packet_batch_size(packets_);
6682 }
6683
6684 static inline int
6685 handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
6686 struct dp_packet *packet,
6687 const struct netdev_flow_key *key,
6688 struct ofpbuf *actions, struct ofpbuf *put_actions)
6689 {
6690 struct ofpbuf *add_actions;
6691 struct dp_packet_batch b;
6692 struct match match;
6693 ovs_u128 ufid;
6694 int error;
6695 uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
6696
6697 match.tun_md.valid = false;
6698 miniflow_expand(&key->mf, &match.flow);
6699 memset(&match.wc, 0, sizeof match.wc);
6700
6701 ofpbuf_clear(actions);
6702 ofpbuf_clear(put_actions);
6703
6704 odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
6705 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
6706 &ufid, DPIF_UC_MISS, NULL, actions,
6707 put_actions);
6708 if (OVS_UNLIKELY(error && error != ENOSPC)) {
6709 dp_packet_delete(packet);
6710 COVERAGE_INC(datapath_drop_upcall_error);
6711 return error;
6712 }
6713
6714 /* The Netlink encoding of datapath flow keys cannot express
6715 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
6716 * tag is interpreted as exact match on the fact that there is no
6717 * VLAN. Unless we refactor a lot of code that translates between
6718 * Netlink and struct flow representations, we have to do the same
6719 * here. This must be in sync with 'match' in dpif_netdev_flow_put(). */
6720 if (!match.wc.masks.vlans[0].tci) {
6721 match.wc.masks.vlans[0].tci = htons(0xffff);
6722 }
6723
6724 /* We can't allow the packet batching in the next loop to execute
6725 * the actions. Otherwise, if there are any slow path actions,
6726 * we'll send the packet up twice. */
6727 dp_packet_batch_init_packet(&b, packet);
6728 dp_netdev_execute_actions(pmd, &b, true, &match.flow,
6729 actions->data, actions->size);
6730
6731 add_actions = put_actions->size ? put_actions : actions;
6732 if (OVS_LIKELY(error != ENOSPC)) {
6733 struct dp_netdev_flow *netdev_flow;
6734
6735 /* XXX: There's a race window where a flow covering this packet
6736 * could have already been installed since we last did the flow
6737 * lookup before upcall. This could be solved by moving the
6738 * mutex lock outside the loop, but that's an awful long time
6739 * to be locking revalidators out of making flow modifications. */
6740 ovs_mutex_lock(&pmd->flow_mutex);
6741 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
6742 if (OVS_LIKELY(!netdev_flow)) {
6743 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
6744 add_actions->data,
6745 add_actions->size);
6746 }
6747 ovs_mutex_unlock(&pmd->flow_mutex);
6748 uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
6749 smc_insert(pmd, key, hash);
6750 emc_probabilistic_insert(pmd, key, netdev_flow);
6751 }
6752 if (pmd_perf_metrics_enabled(pmd)) {
6753 /* Update upcall stats. */
6754 cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
6755 struct pmd_perf_stats *s = &pmd->perf_stats;
6756 s->current.upcalls++;
6757 s->current.upcall_cycles += cycles;
6758 histogram_add_sample(&s->cycles_per_upcall, cycles);
6759 }
6760 return error;
6761 }
6762
6763 static inline void
6764 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
6765 struct dp_packet_batch *packets_,
6766 struct netdev_flow_key **keys,
6767 struct dp_packet_flow_map *flow_map,
6768 uint8_t *index_map,
6769 odp_port_t in_port)
6770 {
6771 const size_t cnt = dp_packet_batch_size(packets_);
6772 #if !defined(__CHECKER__) && !defined(_WIN32)
6773 const size_t PKT_ARRAY_SIZE = cnt;
6774 #else
6775 /* Sparse or MSVC doesn't like variable length array. */
6776 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
6777 #endif
6778 struct dp_packet *packet;
6779 struct dpcls *cls;
6780 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
6781 struct dp_netdev *dp = pmd->dp;
6782 int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
6783 int lookup_cnt = 0, add_lookup_cnt;
6784 bool any_miss;
6785
6786 for (size_t i = 0; i < cnt; i++) {
6787 /* Key length is needed in all the cases, hash computed on demand. */
6788 keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
6789 }
6790 /* Get the classifier for the in_port */
6791 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
6792 if (OVS_LIKELY(cls)) {
6793 any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
6794 rules, cnt, &lookup_cnt);
6795 } else {
6796 any_miss = true;
6797 memset(rules, 0, sizeof(rules));
6798 }
6799 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
6800 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
6801 struct ofpbuf actions, put_actions;
6802
6803 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
6804 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
6805
6806 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6807 struct dp_netdev_flow *netdev_flow;
6808
6809 if (OVS_LIKELY(rules[i])) {
6810 continue;
6811 }
6812
6813 /* It's possible that an earlier slow path execution installed
6814 * a rule covering this flow. In this case, it's a lot cheaper
6815 * to catch it here than execute a miss. */
6816 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
6817 &add_lookup_cnt);
6818 if (netdev_flow) {
6819 lookup_cnt += add_lookup_cnt;
6820 rules[i] = &netdev_flow->cr;
6821 continue;
6822 }
6823
6824 int error = handle_packet_upcall(pmd, packet, keys[i],
6825 &actions, &put_actions);
6826
6827 if (OVS_UNLIKELY(error)) {
6828 upcall_fail_cnt++;
6829 } else {
6830 upcall_ok_cnt++;
6831 }
6832 }
6833
6834 ofpbuf_uninit(&actions);
6835 ofpbuf_uninit(&put_actions);
6836 fat_rwlock_unlock(&dp->upcall_rwlock);
6837 } else if (OVS_UNLIKELY(any_miss)) {
6838 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6839 if (OVS_UNLIKELY(!rules[i])) {
6840 dp_packet_delete(packet);
6841 COVERAGE_INC(datapath_drop_lock_error);
6842 upcall_fail_cnt++;
6843 }
6844 }
6845 }
6846
6847 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6848 struct dp_netdev_flow *flow;
6849 /* Get the original order of this packet in received batch. */
6850 int recv_idx = index_map[i];
6851 uint16_t tcp_flags;
6852
6853 if (OVS_UNLIKELY(!rules[i])) {
6854 continue;
6855 }
6856
6857 flow = dp_netdev_flow_cast(rules[i]);
6858 uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
6859 smc_insert(pmd, keys[i], hash);
6860
6861 emc_probabilistic_insert(pmd, keys[i], flow);
6862 /* Add these packets into the flow map in the same order
6863 * as received.
6864 */
6865 tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
6866 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6867 flow_map, recv_idx);
6868 }
6869
6870 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
6871 cnt - upcall_ok_cnt - upcall_fail_cnt);
6872 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
6873 lookup_cnt);
6874 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
6875 upcall_ok_cnt);
6876 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
6877 upcall_fail_cnt);
6878 }
6879
6880 /* Packets enter the datapath from a port (or from recirculation) here.
6881 *
6882 * When 'md_is_valid' is true, the metadata in 'packets' is already valid.
6883 * When false, the metadata in 'packets' needs to be initialized. */
6884 static void
6885 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
6886 struct dp_packet_batch *packets,
6887 bool md_is_valid, odp_port_t port_no)
6888 {
6889 #if !defined(__CHECKER__) && !defined(_WIN32)
6890 const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
6891 #else
6892 /* Sparse or MSVC doesn't like variable length array. */
6893 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
6894 #endif
6895 OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
6896 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
6897 struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
6898 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
6899 size_t n_batches;
6900 struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
6901 uint8_t index_map[PKT_ARRAY_SIZE];
6902 size_t n_flows, i;
6903
6904 odp_port_t in_port;
6905
6906 n_batches = 0;
6907 dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
6908 flow_map, &n_flows, index_map, md_is_valid, port_no);
6909
6910 if (!dp_packet_batch_is_empty(packets)) {
6911 /* Get ingress port from first packet's metadata. */
6912 in_port = packets->packets[0]->md.in_port.odp_port;
6913 fast_path_processing(pmd, packets, missed_keys,
6914 flow_map, index_map, in_port);
6915 }
6916
6917 /* Batch the rest of the packets, which are in the flow map. */
6918 for (i = 0; i < n_flows; i++) {
6919 struct dp_packet_flow_map *map = &flow_map[i];
6920
6921 if (OVS_UNLIKELY(!map->flow)) {
6922 continue;
6923 }
6924 dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
6925 batches, &n_batches);
6926 }
6927
6928 /* All the flow batches need to be reset before any call to
6929 * packet_batch_per_flow_execute() as it could potentially trigger
6930 * recirculation. When a packet matching flow 'j' happens to be
6931 * recirculated, the nested call to dp_netdev_input__() could potentially
6932 * classify the packet as matching another flow - say 'k'. It could happen
6933 * that in the previous call to dp_netdev_input__() that same flow 'k'
6934 * already had its own batches[k] still waiting to be served. So if its
6935 * 'batch' member is not reset, the recirculated packet would be wrongly
6936 * appended to batches[k] of the 1st call to dp_netdev_input__(). */
6937 for (i = 0; i < n_batches; i++) {
6938 batches[i].flow->batch = NULL;
6939 }
6940
6941 for (i = 0; i < n_batches; i++) {
6942 packet_batch_per_flow_execute(&batches[i], pmd);
6943 }
6944 }
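
/* Summary of the per-batch pipeline above (illustrative, simplified):
 *
 *     dfc_processing()        EMC/SMC lookups; hits are batched or queued
 *                             in 'flow_map', misses are compacted to the
 *                             front of 'packets'.
 *     fast_path_processing()  dpcls lookup for the remaining packets, with
 *                             an upcall (and possible flow install) for
 *                             true misses.
 *     flow_map replay         re-batches packets in their received order.
 *     per-flow execution      packet_batch_per_flow_execute() runs the
 *                             datapath actions for each batch. */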
6945
6946 static void
6947 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
6948 struct dp_packet_batch *packets,
6949 odp_port_t port_no)
6950 {
6951 dp_netdev_input__(pmd, packets, false, port_no);
6952 }
6953
6954 static void
6955 dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
6956 struct dp_packet_batch *packets)
6957 {
6958 dp_netdev_input__(pmd, packets, true, 0);
6959 }
6960
6961 struct dp_netdev_execute_aux {
6962 struct dp_netdev_pmd_thread *pmd;
6963 const struct flow *flow;
6964 };
6965
6966 static void
6967 dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
6968 void *aux)
6969 {
6970 struct dp_netdev *dp = get_dp_netdev(dpif);
6971 dp->dp_purge_aux = aux;
6972 dp->dp_purge_cb = cb;
6973 }
6974
6975 static void
6976 dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
6977 void *aux)
6978 {
6979 struct dp_netdev *dp = get_dp_netdev(dpif);
6980 dp->upcall_aux = aux;
6981 dp->upcall_cb = cb;
6982 }
6983
6984 static void
6985 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
6986 bool purge)
6987 {
6988 struct tx_port *tx;
6989 struct dp_netdev_port *port;
6990 long long interval;
6991
6992 HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
6993 if (!tx->port->dynamic_txqs) {
6994 continue;
6995 }
6996 interval = pmd->ctx.now - tx->last_used;
6997 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
6998 port = tx->port;
6999 ovs_mutex_lock(&port->txq_used_mutex);
7000 port->txq_used[tx->qid]--;
7001 ovs_mutex_unlock(&port->txq_used_mutex);
7002 tx->qid = -1;
7003 }
7004 }
7005 }
7006
7007 static int
7008 dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
7009 struct tx_port *tx)
7010 {
7011 struct dp_netdev_port *port;
7012 long long interval;
7013 int i, min_cnt, min_qid;
7014
7015 interval = pmd->ctx.now - tx->last_used;
7016 tx->last_used = pmd->ctx.now;
7017
7018 if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
7019 return tx->qid;
7020 }
7021
7022 port = tx->port;
7023
7024 ovs_mutex_lock(&port->txq_used_mutex);
7025 if (tx->qid >= 0) {
7026 port->txq_used[tx->qid]--;
7027 tx->qid = -1;
7028 }
7029
7030 min_cnt = -1;
7031 min_qid = 0;
7032 for (i = 0; i < netdev_n_txq(port->netdev); i++) {
7033 if (port->txq_used[i] < min_cnt || min_cnt == -1) {
7034 min_cnt = port->txq_used[i];
7035 min_qid = i;
7036 }
7037 }
7038
7039 port->txq_used[min_qid]++;
7040 tx->qid = min_qid;
7041
7042 ovs_mutex_unlock(&port->txq_used_mutex);
7043
7044 dpif_netdev_xps_revalidate_pmd(pmd, false);
7045
7046 VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
7047 pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
7048 return min_qid;
7049 }
7050
7051 static struct tx_port *
7052 pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7053 odp_port_t port_no)
7054 {
7055 return tx_port_lookup(&pmd->tnl_port_cache, port_no);
7056 }
7057
7058 static struct tx_port *
7059 pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7060 odp_port_t port_no)
7061 {
7062 return tx_port_lookup(&pmd->send_port_cache, port_no);
7063 }
7064
7065 static int
7066 push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
7067 const struct nlattr *attr,
7068 struct dp_packet_batch *batch)
7069 {
7070 struct tx_port *tun_port;
7071 const struct ovs_action_push_tnl *data;
7072 int err;
7073
7074 data = nl_attr_get(attr);
7075
7076 tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
7077 if (!tun_port) {
7078 err = -EINVAL;
7079 goto error;
7080 }
7081 err = netdev_push_header(tun_port->port->netdev, batch, data);
7082 if (!err) {
7083 return 0;
7084 }
7085 error:
7086 dp_packet_delete_batch(batch, true);
7087 return err;
7088 }
7089
7090 static void
7091 dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
7092 struct dp_packet *packet, bool should_steal,
7093 struct flow *flow, ovs_u128 *ufid,
7094 struct ofpbuf *actions,
7095 const struct nlattr *userdata)
7096 {
7097 struct dp_packet_batch b;
7098 int error;
7099
7100 ofpbuf_clear(actions);
7101
7102 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
7103 DPIF_UC_ACTION, userdata, actions,
7104 NULL);
7105 if (!error || error == ENOSPC) {
7106 dp_packet_batch_init_packet(&b, packet);
7107 dp_netdev_execute_actions(pmd, &b, should_steal, flow,
7108 actions->data, actions->size);
7109 } else if (should_steal) {
7110 dp_packet_delete(packet);
7111 COVERAGE_INC(datapath_drop_userspace_action_error);
7112 }
7113 }
7114
7115 static void
7116 dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
7117 const struct nlattr *a, bool should_steal)
7118 OVS_NO_THREAD_SAFETY_ANALYSIS
7119 {
7120 struct dp_netdev_execute_aux *aux = aux_;
7121 uint32_t *depth = recirc_depth_get();
7122 struct dp_netdev_pmd_thread *pmd = aux->pmd;
7123 struct dp_netdev *dp = pmd->dp;
7124 int type = nl_attr_type(a);
7125 struct tx_port *p;
7126 uint32_t packet_count, packets_dropped;
7127
7128 switch ((enum ovs_action_attr)type) {
7129 case OVS_ACTION_ATTR_OUTPUT:
7130 p = pmd_send_port_cache_lookup(pmd, nl_attr_get_odp_port(a));
7131 if (OVS_LIKELY(p)) {
7132 struct dp_packet *packet;
7133 struct dp_packet_batch out;
7134
7135 if (!should_steal) {
7136 dp_packet_batch_clone(&out, packets_);
7137 dp_packet_batch_reset_cutlen(packets_);
7138 packets_ = &out;
7139 }
7140 dp_packet_batch_apply_cutlen(packets_);
7141
7142 #ifdef DPDK_NETDEV
7143 if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
7144 && packets_->packets[0]->source
7145 != p->output_pkts.packets[0]->source)) {
7146 /* XXX: netdev-dpdk assumes that all packets in a single
7147 * output batch have the same source. Flush here to
7148 * avoid memory access issues. */
7149 dp_netdev_pmd_flush_output_on_port(pmd, p);
7150 }
7151 #endif
7152 if (dp_packet_batch_size(&p->output_pkts)
7153 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
7154 /* Flush here to avoid overflow. */
7155 dp_netdev_pmd_flush_output_on_port(pmd, p);
7156 }
7157
7158 if (dp_packet_batch_is_empty(&p->output_pkts)) {
7159 pmd->n_output_batches++;
7160 }
7161
7162 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7163 p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
7164 pmd->ctx.last_rxq;
7165 dp_packet_batch_add(&p->output_pkts, packet);
7166 }
7167 return;
7168 } else {
7169 COVERAGE_ADD(datapath_drop_invalid_port,
7170 dp_packet_batch_size(packets_));
7171 }
7172 break;
7173
7174 case OVS_ACTION_ATTR_TUNNEL_PUSH:
7175 if (should_steal) {
7176 /* We're requested to push the tunnel header, but we also need to take
7177 * ownership of these packets. Thus, we can avoid performing
7178 * the action, because the caller will not use the result anyway.
7179 * Just break to free the batch. */
7180 break;
7181 }
7182 dp_packet_batch_apply_cutlen(packets_);
7183 packet_count = dp_packet_batch_size(packets_);
7184 if (push_tnl_action(pmd, a, packets_)) {
7185 COVERAGE_ADD(datapath_drop_tunnel_push_error,
7186 packet_count);
7187 }
7188 return;
7189
7190 case OVS_ACTION_ATTR_TUNNEL_POP:
7191 if (*depth < MAX_RECIRC_DEPTH) {
7192 struct dp_packet_batch *orig_packets_ = packets_;
7193 odp_port_t portno = nl_attr_get_odp_port(a);
7194
7195 p = pmd_tnl_port_cache_lookup(pmd, portno);
7196 if (p) {
7197 struct dp_packet_batch tnl_pkt;
7198
7199 if (!should_steal) {
7200 dp_packet_batch_clone(&tnl_pkt, packets_);
7201 packets_ = &tnl_pkt;
7202 dp_packet_batch_reset_cutlen(orig_packets_);
7203 }
7204
7205 dp_packet_batch_apply_cutlen(packets_);
7206
7207 packet_count = dp_packet_batch_size(packets_);
7208 netdev_pop_header(p->port->netdev, packets_);
7209 packets_dropped =
7210 packet_count - dp_packet_batch_size(packets_);
7211 if (packets_dropped) {
7212 COVERAGE_ADD(datapath_drop_tunnel_pop_error,
7213 packets_dropped);
7214 }
7215 if (dp_packet_batch_is_empty(packets_)) {
7216 return;
7217 }
7218
7219 struct dp_packet *packet;
7220 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7221 packet->md.in_port.odp_port = portno;
7222 }
7223
7224 (*depth)++;
7225 dp_netdev_recirculate(pmd, packets_);
7226 (*depth)--;
7227 return;
7228 }
7229 COVERAGE_ADD(datapath_drop_invalid_tnl_port,
7230 dp_packet_batch_size(packets_));
7231 } else {
7232 COVERAGE_ADD(datapath_drop_recirc_error,
7233 dp_packet_batch_size(packets_));
7234 }
7235 break;
7236
7237 case OVS_ACTION_ATTR_USERSPACE:
7238 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
7239 struct dp_packet_batch *orig_packets_ = packets_;
7240 const struct nlattr *userdata;
7241 struct dp_packet_batch usr_pkt;
7242 struct ofpbuf actions;
7243 struct flow flow;
7244 ovs_u128 ufid;
7245 bool clone = false;
7246
7247 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
7248 ofpbuf_init(&actions, 0);
7249
7250 if (packets_->trunc) {
7251 if (!should_steal) {
7252 dp_packet_batch_clone(&usr_pkt, packets_);
7253 packets_ = &usr_pkt;
7254 clone = true;
7255 dp_packet_batch_reset_cutlen(orig_packets_);
7256 }
7257
7258 dp_packet_batch_apply_cutlen(packets_);
7259 }
7260
7261 struct dp_packet *packet;
7262 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7263 flow_extract(packet, &flow);
7264 odp_flow_key_hash(&flow, sizeof flow, &ufid);
7265 dp_execute_userspace_action(pmd, packet, should_steal, &flow,
7266 &ufid, &actions, userdata);
7267 }
7268
7269 if (clone) {
7270 dp_packet_delete_batch(packets_, true);
7271 }
7272
7273 ofpbuf_uninit(&actions);
7274 fat_rwlock_unlock(&dp->upcall_rwlock);
7275
7276 return;
7277 }
7278 COVERAGE_ADD(datapath_drop_lock_error,
7279 dp_packet_batch_size(packets_));
7280 break;
7281
7282 case OVS_ACTION_ATTR_RECIRC:
7283 if (*depth < MAX_RECIRC_DEPTH) {
7284 struct dp_packet_batch recirc_pkts;
7285
7286 if (!should_steal) {
7287 dp_packet_batch_clone(&recirc_pkts, packets_);
7288 packets_ = &recirc_pkts;
7289 }
7290
7291 struct dp_packet *packet;
7292 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7293 packet->md.recirc_id = nl_attr_get_u32(a);
7294 }
7295
7296 (*depth)++;
7297 dp_netdev_recirculate(pmd, packets_);
7298 (*depth)--;
7299
7300 return;
7301 }
7302
7303 COVERAGE_ADD(datapath_drop_recirc_error,
7304 dp_packet_batch_size(packets_));
7305 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
7306 break;
7307
7308 case OVS_ACTION_ATTR_CT: {
7309 const struct nlattr *b;
7310 bool force = false;
7311 bool commit = false;
7312 unsigned int left;
7313 uint16_t zone = 0;
7314 const char *helper = NULL;
7315 const uint32_t *setmark = NULL;
7316 const struct ovs_key_ct_labels *setlabel = NULL;
7317 struct nat_action_info_t nat_action_info;
7318 struct nat_action_info_t *nat_action_info_ref = NULL;
7319 bool nat_config = false;
7320
7321 NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
7322 nl_attr_get_size(a)) {
7323 enum ovs_ct_attr sub_type = nl_attr_type(b);
7324
7325 switch (sub_type) {
7326 case OVS_CT_ATTR_FORCE_COMMIT:
7327 force = true;
7328 /* fall through. */
7329 case OVS_CT_ATTR_COMMIT:
7330 commit = true;
7331 break;
7332 case OVS_CT_ATTR_ZONE:
7333 zone = nl_attr_get_u16(b);
7334 break;
7335 case OVS_CT_ATTR_HELPER:
7336 helper = nl_attr_get_string(b);
7337 break;
7338 case OVS_CT_ATTR_MARK:
7339 setmark = nl_attr_get(b);
7340 break;
7341 case OVS_CT_ATTR_LABELS:
7342 setlabel = nl_attr_get(b);
7343 break;
7344 case OVS_CT_ATTR_EVENTMASK:
7345 /* Silently ignored, as userspace datapath does not generate
7346 * netlink events. */
7347 break;
7348 case OVS_CT_ATTR_TIMEOUT:
7349 /* The userspace datapath does not yet support customized
7350  * timeout policies. */
7351 break;
7352 case OVS_CT_ATTR_NAT: {
7353 const struct nlattr *b_nest;
7354 unsigned int left_nest;
7355 bool ip_min_specified = false;
7356 bool proto_num_min_specified = false;
7357 bool ip_max_specified = false;
7358 bool proto_num_max_specified = false;
7359 memset(&nat_action_info, 0, sizeof nat_action_info);
7360 nat_action_info_ref = &nat_action_info;
7361
7362 NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
7363 enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
7364
7365 switch (sub_type_nest) {
7366 case OVS_NAT_ATTR_SRC:
7367 case OVS_NAT_ATTR_DST:
7368 nat_config = true;
7369 nat_action_info.nat_action |=
7370 ((sub_type_nest == OVS_NAT_ATTR_SRC)
7371 ? NAT_ACTION_SRC : NAT_ACTION_DST);
7372 break;
7373 case OVS_NAT_ATTR_IP_MIN:
7374 memcpy(&nat_action_info.min_addr,
7375 nl_attr_get(b_nest),
7376 nl_attr_get_size(b_nest));
7377 ip_min_specified = true;
7378 break;
7379 case OVS_NAT_ATTR_IP_MAX:
7380 memcpy(&nat_action_info.max_addr,
7381 nl_attr_get(b_nest),
7382 nl_attr_get_size(b_nest));
7383 ip_max_specified = true;
7384 break;
7385 case OVS_NAT_ATTR_PROTO_MIN:
7386 nat_action_info.min_port =
7387 nl_attr_get_u16(b_nest);
7388 proto_num_min_specified = true;
7389 break;
7390 case OVS_NAT_ATTR_PROTO_MAX:
7391 nat_action_info.max_port =
7392 nl_attr_get_u16(b_nest);
7393 proto_num_max_specified = true;
7394 break;
7395 case OVS_NAT_ATTR_PERSISTENT:
7396 case OVS_NAT_ATTR_PROTO_HASH:
7397 case OVS_NAT_ATTR_PROTO_RANDOM:
7398 break;
7399 case OVS_NAT_ATTR_UNSPEC:
7400 case __OVS_NAT_ATTR_MAX:
7401 OVS_NOT_REACHED();
7402 }
7403 }
7404
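/* A NAT range that specifies only a minimum address or port is treated
 * as a single-element range: the maximum defaults to the minimum (for
 * example, an address of 10.0.0.1 alone means 10.0.0.1-10.0.0.1). */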
7405 if (ip_min_specified && !ip_max_specified) {
7406 nat_action_info.max_addr = nat_action_info.min_addr;
7407 }
7408 if (proto_num_min_specified && !proto_num_max_specified) {
7409 nat_action_info.max_port = nat_action_info.min_port;
7410 }
7411 if (proto_num_min_specified || proto_num_max_specified) {
7412 if (nat_action_info.nat_action & NAT_ACTION_SRC) {
7413 nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
7414 } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
7415 nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
7416 }
7417 }
7418 break;
7419 }
7420 case OVS_CT_ATTR_UNSPEC:
7421 case __OVS_CT_ATTR_MAX:
7422 OVS_NOT_REACHED();
7423 }
7424 }
7425
7426 /* NAT without a commit cannot work properly, so complain
7427  * loudly. */
7428 if (nat_config && !commit) {
7429 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
7430 VLOG_WARN_RL(&rl, "NAT specified without commit.");
7431 }
7432
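/* pmd->ctx.now is a microsecond timestamp; dividing by 1000 below hands
 * conntrack a millisecond value. */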
7433 conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
7434 commit, zone, setmark, setlabel, aux->flow->tp_src,
7435 aux->flow->tp_dst, helper, nat_action_info_ref,
7436 pmd->ctx.now / 1000);
7437 break;
7438 }
7439
7440 case OVS_ACTION_ATTR_METER:
7441 dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
7442 pmd->ctx.now);
7443 break;
7444
7445 case OVS_ACTION_ATTR_PUSH_VLAN:
7446 case OVS_ACTION_ATTR_POP_VLAN:
7447 case OVS_ACTION_ATTR_PUSH_MPLS:
7448 case OVS_ACTION_ATTR_POP_MPLS:
7449 case OVS_ACTION_ATTR_SET:
7450 case OVS_ACTION_ATTR_SET_MASKED:
7451 case OVS_ACTION_ATTR_SAMPLE:
7452 case OVS_ACTION_ATTR_HASH:
7453 case OVS_ACTION_ATTR_UNSPEC:
7454 case OVS_ACTION_ATTR_TRUNC:
7455 case OVS_ACTION_ATTR_PUSH_ETH:
7456 case OVS_ACTION_ATTR_POP_ETH:
7457 case OVS_ACTION_ATTR_CLONE:
7458 case OVS_ACTION_ATTR_PUSH_NSH:
7459 case OVS_ACTION_ATTR_POP_NSH:
7460 case OVS_ACTION_ATTR_CT_CLEAR:
7461 case OVS_ACTION_ATTR_CHECK_PKT_LEN:
7462 case OVS_ACTION_ATTR_DROP:
7463 case __OVS_ACTION_ATTR_MAX:
7464 OVS_NOT_REACHED();
7465 }
7466
7467 dp_packet_delete_batch(packets_, should_steal);
7468 }
7469
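/* Executes 'actions' on 'packets' by delegating to odp_execute_actions(),
 * with dp_execute_cb() above handling the datapath-specific actions. */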
7470 static void
7471 dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
7472 struct dp_packet_batch *packets,
7473 bool should_steal, const struct flow *flow,
7474 const struct nlattr *actions, size_t actions_len)
7475 {
7476 struct dp_netdev_execute_aux aux = { pmd, flow };
7477
7478 odp_execute_actions(&aux, packets, should_steal, actions,
7479 actions_len, dp_execute_cb);
7480 }
7481
7482 struct dp_netdev_ct_dump {
7483 struct ct_dpif_dump_state up;
7484 struct conntrack_dump dump;
7485 struct conntrack *ct;
7486 struct dp_netdev *dp;
7487 };
7488
7489 static int
7490 dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
7491 const uint16_t *pzone, int *ptot_bkts)
7492 {
7493 struct dp_netdev *dp = get_dp_netdev(dpif);
7494 struct dp_netdev_ct_dump *dump;
7495
7496 dump = xzalloc(sizeof *dump);
7497 dump->dp = dp;
7498 dump->ct = dp->conntrack;
7499
7500 conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
7501
7502 *dump_ = &dump->up;
7503
7504 return 0;
7505 }
7506
7507 static int
7508 dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
7509 struct ct_dpif_dump_state *dump_,
7510 struct ct_dpif_entry *entry)
7511 {
7512 struct dp_netdev_ct_dump *dump;
7513
7514 INIT_CONTAINER(dump, dump_, up);
7515
7516 return conntrack_dump_next(&dump->dump, entry);
7517 }
7518
7519 static int
7520 dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
7521 struct ct_dpif_dump_state *dump_)
7522 {
7523 struct dp_netdev_ct_dump *dump;
7524 int err;
7525
7526 INIT_CONTAINER(dump, dump_, up);
7527
7528 err = conntrack_dump_done(&dump->dump);
7529
7530 free(dump);
7531
7532 return err;
7533 }
7534
7535 static int
7536 dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
7537 const struct ct_dpif_tuple *tuple)
7538 {
7539 struct dp_netdev *dp = get_dp_netdev(dpif);
7540
7541 if (tuple) {
7542 return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
7543 }
7544 return conntrack_flush(dp->conntrack, zone);
7545 }
7546
7547 static int
7548 dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
7549 {
7550 struct dp_netdev *dp = get_dp_netdev(dpif);
7551
7552 return conntrack_set_maxconns(dp->conntrack, maxconns);
7553 }
7554
7555 static int
7556 dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
7557 {
7558 struct dp_netdev *dp = get_dp_netdev(dpif);
7559
7560 return conntrack_get_maxconns(dp->conntrack, maxconns);
7561 }
7562
7563 static int
7564 dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
7565 {
7566 struct dp_netdev *dp = get_dp_netdev(dpif);
7567
7568 return conntrack_get_nconns(dp->conntrack, nconns);
7569 }
7570
7571 static int
7572 dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled)
7573 {
7574 struct dp_netdev *dp = get_dp_netdev(dpif);
7575
7576 return conntrack_set_tcp_seq_chk(dp->conntrack, enabled);
7577 }
7578
7579 static int
7580 dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled)
7581 {
7582 struct dp_netdev *dp = get_dp_netdev(dpif);
7583 *enabled = conntrack_get_tcp_seq_chk(dp->conntrack);
7584 return 0;
7585 }
7586
7587 static int
7588 dpif_netdev_ct_set_limits(struct dpif *dpif OVS_UNUSED,
7589 const uint32_t *default_limits,
7590 const struct ovs_list *zone_limits)
7591 {
7592 int err = 0;
7593 struct dp_netdev *dp = get_dp_netdev(dpif);
7594 if (default_limits) {
7595 err = zone_limit_update(dp->conntrack, DEFAULT_ZONE, *default_limits);
7596 if (err != 0) {
7597 return err;
7598 }
7599 }
7600
7601 struct ct_dpif_zone_limit *zone_limit;
7602 LIST_FOR_EACH (zone_limit, node, zone_limits) {
7603 err = zone_limit_update(dp->conntrack, zone_limit->zone,
7604 zone_limit->limit);
7605 if (err != 0) {
7606 break;
7607 }
7608 }
7609 return err;
7610 }
7611
7612 static int
7613 dpif_netdev_ct_get_limits(struct dpif *dpif OVS_UNUSED,
7614 uint32_t *default_limit,
7615 const struct ovs_list *zone_limits_request,
7616 struct ovs_list *zone_limits_reply)
7617 {
7618 struct dp_netdev *dp = get_dp_netdev(dpif);
7619 struct conntrack_zone_limit czl;
7620
7621 czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE);
7622 if (czl.zone == DEFAULT_ZONE) {
7623 *default_limit = czl.limit;
7624 } else {
7625 return EINVAL;
7626 }
7627
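/* If specific zones were requested, report just those; otherwise walk the
 * whole zone range and report every zone that has a limit configured. */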
7628 if (!ovs_list_is_empty(zone_limits_request)) {
7629 struct ct_dpif_zone_limit *zone_limit;
7630 LIST_FOR_EACH (zone_limit, node, zone_limits_request) {
7631 czl = zone_limit_get(dp->conntrack, zone_limit->zone);
7632 if (czl.zone == zone_limit->zone || czl.zone == DEFAULT_ZONE) {
7633 ct_dpif_push_zone_limit(zone_limits_reply, zone_limit->zone,
7634 czl.limit, czl.count);
7635 } else {
7636 return EINVAL;
7637 }
7638 }
7639 } else {
7640 for (int z = MIN_ZONE; z <= MAX_ZONE; z++) {
7641 czl = zone_limit_get(dp->conntrack, z);
7642 if (czl.zone == z) {
7643 ct_dpif_push_zone_limit(zone_limits_reply, z, czl.limit,
7644 czl.count);
7645 }
7646 }
7647 }
7648
7649 return 0;
7650 }
7651
7652 static int
7653 dpif_netdev_ct_del_limits(struct dpif *dpif OVS_UNUSED,
7654 const struct ovs_list *zone_limits)
7655 {
7656 int err = 0;
7657 struct dp_netdev *dp = get_dp_netdev(dpif);
7658 struct ct_dpif_zone_limit *zone_limit;
7659 LIST_FOR_EACH (zone_limit, node, zone_limits) {
7660 err = zone_limit_delete(dp->conntrack, zone_limit->zone);
7661 if (err != 0) {
7662 break;
7663 }
7664 }
7665
7666 return err;
7667 }
7668
7669 static int
7670 dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
7671 {
7672 struct dp_netdev *dp = get_dp_netdev(dpif);
7673 return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
7674 }
7675
7676 static int
7677 dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
7678 {
7679 struct dp_netdev *dp = get_dp_netdev(dpif);
7680 return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
7681 }
7682
7683 static int
7684 dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
7685 {
7686 struct dp_netdev *dp = get_dp_netdev(dpif);
7687 return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
7688 }
7689
7690 /* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
7691 * diverge. */
7692 static int
7693 dpif_netdev_ipf_get_status(struct dpif *dpif,
7694 struct dpif_ipf_status *dpif_ipf_status)
7695 {
7696 struct dp_netdev *dp = get_dp_netdev(dpif);
7697 ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
7698 (struct ipf_status *) dpif_ipf_status);
7699 return 0;
7700 }
7701
7702 static int
7703 dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
7704 struct ipf_dump_ctx **ipf_dump_ctx)
7705 {
7706 return ipf_dump_start(ipf_dump_ctx);
7707 }
7708
7709 static int
7710 dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
7711 {
7712 struct dp_netdev *dp = get_dp_netdev(dpif);
7713 return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
7714 dump);
7715 }
7716
7717 static int
7718 dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
7719 {
7720 return ipf_dump_done(ipf_dump_ctx);
7722 }
7723
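/* The dpif provider vtable for the userspace ("netdev") datapath.  NULL
 * entries mark operations this datapath does not implement, such as the
 * kernel-style upcall receive path or configurable CT timeout policies. */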
7724 const struct dpif_class dpif_netdev_class = {
7725 "netdev",
7726 true, /* cleanup_required */
7727 dpif_netdev_init,
7728 dpif_netdev_enumerate,
7729 dpif_netdev_port_open_type,
7730 dpif_netdev_open,
7731 dpif_netdev_close,
7732 dpif_netdev_destroy,
7733 dpif_netdev_run,
7734 dpif_netdev_wait,
7735 dpif_netdev_get_stats,
7736 NULL, /* set_features */
7737 dpif_netdev_port_add,
7738 dpif_netdev_port_del,
7739 dpif_netdev_port_set_config,
7740 dpif_netdev_port_query_by_number,
7741 dpif_netdev_port_query_by_name,
7742 NULL, /* port_get_pid */
7743 dpif_netdev_port_dump_start,
7744 dpif_netdev_port_dump_next,
7745 dpif_netdev_port_dump_done,
7746 dpif_netdev_port_poll,
7747 dpif_netdev_port_poll_wait,
7748 dpif_netdev_flow_flush,
7749 dpif_netdev_flow_dump_create,
7750 dpif_netdev_flow_dump_destroy,
7751 dpif_netdev_flow_dump_thread_create,
7752 dpif_netdev_flow_dump_thread_destroy,
7753 dpif_netdev_flow_dump_next,
7754 dpif_netdev_operate,
7755 NULL, /* recv_set */
7756 NULL, /* handlers_set */
7757 dpif_netdev_set_config,
7758 dpif_netdev_queue_to_priority,
7759 NULL, /* recv */
7760 NULL, /* recv_wait */
7761 NULL, /* recv_purge */
7762 dpif_netdev_register_dp_purge_cb,
7763 dpif_netdev_register_upcall_cb,
7764 dpif_netdev_enable_upcall,
7765 dpif_netdev_disable_upcall,
7766 dpif_netdev_get_datapath_version,
7767 dpif_netdev_ct_dump_start,
7768 dpif_netdev_ct_dump_next,
7769 dpif_netdev_ct_dump_done,
7770 dpif_netdev_ct_flush,
7771 dpif_netdev_ct_set_maxconns,
7772 dpif_netdev_ct_get_maxconns,
7773 dpif_netdev_ct_get_nconns,
7774 dpif_netdev_ct_set_tcp_seq_chk,
7775 dpif_netdev_ct_get_tcp_seq_chk,
7776 dpif_netdev_ct_set_limits,
7777 dpif_netdev_ct_get_limits,
7778 dpif_netdev_ct_del_limits,
7779 NULL, /* ct_set_timeout_policy */
7780 NULL, /* ct_get_timeout_policy */
7781 NULL, /* ct_del_timeout_policy */
7782 NULL, /* ct_timeout_policy_dump_start */
7783 NULL, /* ct_timeout_policy_dump_next */
7784 NULL, /* ct_timeout_policy_dump_done */
7785 NULL, /* ct_get_timeout_policy_name */
7786 dpif_netdev_ipf_set_enabled,
7787 dpif_netdev_ipf_set_min_frag,
7788 dpif_netdev_ipf_set_max_nfrags,
7789 dpif_netdev_ipf_get_status,
7790 dpif_netdev_ipf_dump_start,
7791 dpif_netdev_ipf_dump_next,
7792 dpif_netdev_ipf_dump_done,
7793 dpif_netdev_meter_get_features,
7794 dpif_netdev_meter_set,
7795 dpif_netdev_meter_get,
7796 dpif_netdev_meter_del,
7797 };
7798
7799 static void
7800 dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
7801 const char *argv[], void *aux OVS_UNUSED)
7802 {
7803 struct dp_netdev_port *port;
7804 struct dp_netdev *dp;
7805 odp_port_t port_no;
7806
7807 ovs_mutex_lock(&dp_netdev_mutex);
7808 dp = shash_find_data(&dp_netdevs, argv[1]);
7809 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
7810 ovs_mutex_unlock(&dp_netdev_mutex);
7811 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
7812 return;
7813 }
7814 ovs_refcount_ref(&dp->ref_cnt);
7815 ovs_mutex_unlock(&dp_netdev_mutex);
7816
7817 ovs_mutex_lock(&dp->port_mutex);
7818 if (get_port_by_name(dp, argv[2], &port)) {
7819 unixctl_command_reply_error(conn, "unknown port");
7820 goto exit;
7821 }
7822
7823 port_no = u32_to_odp(atoi(argv[3]));
7824 if (!port_no || port_no == ODPP_NONE) {
7825 unixctl_command_reply_error(conn, "bad port number");
7826 goto exit;
7827 }
7828 if (dp_netdev_lookup_port(dp, port_no)) {
7829 unixctl_command_reply_error(conn, "port number already in use");
7830 goto exit;
7831 }
7832
7833 /* Remove port. */
7834 hmap_remove(&dp->ports, &port->node);
7835 reconfigure_datapath(dp);
7836
7837 /* Reinsert with new port number. */
7838 port->port_no = port_no;
7839 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
7840 reconfigure_datapath(dp);
7841
7842 seq_change(dp->port_seq);
7843 unixctl_command_reply(conn, NULL);
7844
7845 exit:
7846 ovs_mutex_unlock(&dp->port_mutex);
7847 dp_netdev_unref(dp);
7848 }
7849
7850 static void
7851 dpif_dummy_register__(const char *type)
7852 {
7853 struct dpif_class *class;
7854
7855 class = xmalloc(sizeof *class);
7856 *class = dpif_netdev_class;
7857 class->type = xstrdup(type);
7858 dp_register_provider(class);
7859 }
7860
7861 static void
7862 dpif_dummy_override(const char *type)
7863 {
7864 int error;
7865
7866 /*
7867  * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
7868  * a userland-only build.  This is useful for the testsuite.
7869  */
7870 error = dp_unregister_provider(type);
7871 if (error == 0 || error == EAFNOSUPPORT) {
7872 dpif_dummy_register__(type);
7873 }
7874 }
7875
7876 void
7877 dpif_dummy_register(enum dummy_level level)
7878 {
7879 if (level == DUMMY_OVERRIDE_ALL) {
7880 struct sset types;
7881 const char *type;
7882
7883 sset_init(&types);
7884 dp_enumerate_types(&types);
7885 SSET_FOR_EACH (type, &types) {
7886 dpif_dummy_override(type);
7887 }
7888 sset_destroy(&types);
7889 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
7890 dpif_dummy_override("system");
7891 }
7892
7893 dpif_dummy_register__("dummy");
7894
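/* The command registered below takes exactly three arguments: the datapath
 * name, an existing port name, and the new port number to assign to it (see
 * dpif_dummy_change_port_number() above). */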
7895 unixctl_command_register("dpif-dummy/change-port-number",
7896 "dp port new-number",
7897 3, 3, dpif_dummy_change_port_number, NULL);
7898 }
7899 \f
7900 /* Datapath Classifier. */
7901
7902 static void
7903 dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
7904 {
7905 cmap_destroy(&subtable->rules);
7906 ovsrcu_postpone(free, subtable->mf_masks);
7907 ovsrcu_postpone(free, subtable);
7908 }
7909
7910 /* Initializes 'cls' as a classifier that initially contains no classification
7911 * rules. */
7912 static void
7913 dpcls_init(struct dpcls *cls)
7914 {
7915 cmap_init(&cls->subtables_map);
7916 pvector_init(&cls->subtables);
7917 }
7918
7919 static void
7920 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
7921 {
7922 VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
7923 pvector_remove(&cls->subtables, subtable);
7924 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
7925 subtable->mask.hash);
7926 ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable);
7927 }
7928
7929 /* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
7930 * caller's responsibility.
7931 * May only be called after all the readers have been terminated. */
7932 static void
7933 dpcls_destroy(struct dpcls *cls)
7934 {
7935 if (cls) {
7936 struct dpcls_subtable *subtable;
7937
7938 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
7939 ovs_assert(cmap_count(&subtable->rules) == 0);
7940 dpcls_destroy_subtable(cls, subtable);
7941 }
7942 cmap_destroy(&cls->subtables_map);
7943 pvector_destroy(&cls->subtables);
7944 }
7945 }
7946
7947 static struct dpcls_subtable *
7948 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
7949 {
7950 struct dpcls_subtable *subtable;
7951
7952 /* Need to add one. */
7953 subtable = xmalloc(sizeof *subtable
7954 - sizeof subtable->mask.mf + mask->len);
7955 cmap_init(&subtable->rules);
7956 subtable->hit_cnt = 0;
7957 netdev_flow_key_clone(&subtable->mask, mask);
7958
7959 /* The count of bits set in the mask determines the space required for the
7960  * per-block masks.  Call netdev_flow_key_gen_masks() to precompute them,
7961  * avoiding the cost of calculating them at runtime. */
7962 uint32_t unit0 = count_1bits(mask->mf.map.bits[0]);
7963 uint32_t unit1 = count_1bits(mask->mf.map.bits[1]);
7964 subtable->mf_bits_set_unit0 = unit0;
7965 subtable->mf_bits_set_unit1 = unit1;
7966 subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
7967 netdev_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
7968
7969 /* Probe for a specialized generic lookup function. */
7970 subtable->lookup_func = dpcls_subtable_generic_probe(unit0, unit1);
7971
7972 /* If not set, fall back to the generic lookup, which works for any miniflow. */
7973 if (!subtable->lookup_func) {
7974 subtable->lookup_func = dpcls_subtable_lookup_generic;
7975 }
7976
7977 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
7978 /* Add the new subtable at the end of the pvector (with no hits yet) */
7979 pvector_insert(&cls->subtables, subtable, 0);
7980 VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
7981 cmap_count(&cls->subtables_map), subtable, cls->in_port);
7982 pvector_publish(&cls->subtables);
7983
7984 return subtable;
7985 }
7986
7987 static inline struct dpcls_subtable *
7988 dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
7989 {
7990 struct dpcls_subtable *subtable;
7991
7992 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
7993 &cls->subtables_map) {
7994 if (netdev_flow_key_equal(&subtable->mask, mask)) {
7995 return subtable;
7996 }
7997 }
7998 return dpcls_create_subtable(cls, mask);
7999 }
8000
8001
8002 /* Periodically sort the dpcls subtable vectors according to hit counts */
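/* Each subtable's hit count from the previous interval becomes its pvector
 * priority, so frequently hit subtables are probed first; the counter is then
 * reset for the next interval. */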
8003 static void
8004 dpcls_sort_subtable_vector(struct dpcls *cls)
8005 {
8006 struct pvector *pvec = &cls->subtables;
8007 struct dpcls_subtable *subtable;
8008
8009 PVECTOR_FOR_EACH (subtable, pvec) {
8010 pvector_change_priority(pvec, subtable, subtable->hit_cnt);
8011 subtable->hit_cnt = 0;
8012 }
8013 pvector_publish(pvec);
8014 }
8015
8016 static inline void
8017 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
8018 struct polled_queue *poll_list, int poll_cnt)
8019 {
8020 struct dpcls *cls;
8021 uint64_t tot_idle = 0, tot_proc = 0;
8022 unsigned int pmd_load = 0;
8023
8024 if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
8025 uint64_t curr_tsc;
8026 struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
8027 if (pmd_alb->is_enabled && !pmd->isolated
8028 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
8029 pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
8030 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
8031 pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
8032 {
8033 tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
8034 pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
8035 tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
8036 pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
8037
8038 if (tot_proc) {
8039 pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
8040 }
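/* For example, 950 busy cycles and 50 idle cycles over the interval give
 * a load of 95%, which meets ALB_PMD_LOAD_THRESHOLD and marks this
 * interval as overloaded below. */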
8041
8042 if (pmd_load >= ALB_PMD_LOAD_THRESHOLD) {
8043 atomic_count_inc(&pmd->pmd_overloaded);
8044 } else {
8045 atomic_count_set(&pmd->pmd_overloaded, 0);
8046 }
8047 }
8048
8049 pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
8050 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
8051 pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
8052 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
8053
8054 /* Get the cycles that were used to process each queue and store them. */
8055 for (unsigned i = 0; i < poll_cnt; i++) {
8056 uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
8057 RXQ_CYCLES_PROC_CURR);
8058 dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
8059 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
8060 0);
8061 }
8062 curr_tsc = cycles_counter_update(&pmd->perf_stats);
8063 if (pmd->intrvl_tsc_prev) {
8064 /* There is a prev timestamp, store a new intrvl cycle count. */
8065 atomic_store_relaxed(&pmd->intrvl_cycles,
8066 curr_tsc - pmd->intrvl_tsc_prev);
8067 }
8068 pmd->intrvl_tsc_prev = curr_tsc;
8069 /* Start new measuring interval */
8070 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
8071 }
8072
8073 if (pmd->ctx.now > pmd->next_optimization) {
8074 /* Try to obtain the flow lock to block out revalidator threads.
8075 * If not possible, just try next time. */
8076 if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
8077 /* Optimize each classifier */
8078 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
8079 dpcls_sort_subtable_vector(cls);
8080 }
8081 ovs_mutex_unlock(&pmd->flow_mutex);
8082 /* Start new measuring interval */
8083 pmd->next_optimization = pmd->ctx.now
8084 + DPCLS_OPTIMIZATION_INTERVAL;
8085 }
8086 }
8087 }
8088
8089 /* Insert 'rule' into 'cls'. */
8090 static void
8091 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
8092 const struct netdev_flow_key *mask)
8093 {
8094 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
8095
8096 /* Refer to subtable's mask, also for later removal. */
8097 rule->mask = &subtable->mask;
8098 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
8099 }
8100
8101 /* Removes 'rule' from 'cls', also destructing the 'rule'. */
8102 static void
8103 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
8104 {
8105 struct dpcls_subtable *subtable;
8106
8107 ovs_assert(rule->mask);
8108
8109 /* Get subtable from reference in rule->mask. */
8110 INIT_CONTAINER(subtable, rule->mask, mask);
8111 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
8112 == 0) {
8113 /* Delete empty subtable. */
8114 dpcls_destroy_subtable(cls, subtable);
8115 pvector_publish(&cls->subtables);
8116 }
8117 }
8118
8119 /* Inner loop for mask generation of a unit, see netdev_flow_key_gen_masks. */
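/* For example, iter = 0b1010 with count = 2 yields mf_masks[0] = 0b0001 and
 * mf_masks[1] = 0b0111, i.e. for each set bit of 'iter' a mask of all the
 * lower bits. */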
8120 static inline void
8121 netdev_flow_key_gen_mask_unit(uint64_t iter,
8122 const uint64_t count,
8123 uint64_t *mf_masks)
8124 {
8125 int i;
8126 for (i = 0; i < count; i++) {
8127 uint64_t lowest_bit = (iter & -iter);
8128 iter &= ~lowest_bit;
8129 mf_masks[i] = (lowest_bit - 1);
8130 }
8131 /* Check that 'count' covered all bits set in the 'iter' bitmap. */
8132 ovs_assert(iter == 0);
8133 }
8134
8135 /* Generates a mask for each block in the miniflow, based on the bits set.
8136  * The precomputed array lets the lookup code mask packets directly instead
8137  * of recalculating the masks at runtime.
8138  * @param tbl        The flow key to generate the mf_masks for.
8139  * @param mf_masks   Pointer to a uint64_t array with room for every set bit.
8140  * @param mf_bits_u0 Number of bits set in unit0 of the miniflow.
8141  * @param mf_bits_u1 Number of bits set in unit1 of the miniflow.
8142  */
8143 void
8144 netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
8145 uint64_t *mf_masks,
8146 const uint32_t mf_bits_u0,
8147 const uint32_t mf_bits_u1)
8148 {
8149 uint64_t iter_u0 = tbl->mf.map.bits[0];
8150 uint64_t iter_u1 = tbl->mf.map.bits[1];
8151
8152 netdev_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
8153 netdev_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
8154 }
8155
8156 /* Returns true if 'target' satisfies 'rule', that is, for each 1-bit in the
8157  * rule's mask, the values in the rule's flow and in 'target' are the same. */
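/* Only the blocks present in the rule's miniflow are visited, so a rule that
 * masks just two 64-bit blocks compares exactly two values per call. */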
8158 bool
8159 dpcls_rule_matches_key(const struct dpcls_rule *rule,
8160 const struct netdev_flow_key *target)
8161 {
8162 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
8163 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
8164 uint64_t value;
8165
8166 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
8167 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
8168 return false;
8169 }
8170 }
8171 return true;
8172 }
8173
8174 /* For each miniflow in 'keys', performs a classifier lookup, writing the
8175  * result into the corresponding slot in 'rules'.  If a particular entry in
8176  * 'keys' is NULL, it is skipped.
8177 *
8178 * This function is optimized for use in the userspace datapath and therefore
8179 * does not implement a lot of features available in the standard
8180 * classifier_lookup() function. Specifically, it does not implement
8181 * priorities, instead returning any rule which matches the flow.
8182 *
8183 * Returns true if all miniflows found a corresponding rule. */
8184 static bool
8185 dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
8186 struct dpcls_rule **rules, const size_t cnt,
8187 int *num_lookups_p)
8188 {
8189 /* The received 'cnt' miniflows are the search-keys that will be processed
8190  * to find a matching entry in the available subtables.
8191  * The 'keys_map' bitmap below is MAP_BITS wide, at least NETDEV_MAX_BURST. */
8192 #define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
8193 BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
8194
8195 struct dpcls_subtable *subtable;
8196 uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
8197
8198 if (cnt != MAP_BITS) {
8199 keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
8200 }
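/* E.g., with cnt == 3, keys_map is now 0x7: one bit per search-key that
 * still needs a match. */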
8201 memset(rules, 0, cnt * sizeof *rules);
8202
8203 int lookups_match = 0, subtable_pos = 1;
8204 uint32_t found_map;
8205
8206 /* The Datapath classifier - aka dpcls - is composed of subtables.
8207 * Subtables are dynamically created as needed when new rules are inserted.
8208 * Each subtable collects rules with matches on a specific subset of packet
8209 * fields as defined by the subtable's mask. We proceed to process every
8210 * search-key against each subtable, but when a match is found for a
8211 * search-key, the search for that key can stop because the rules are
8212 * non-overlapping. */
8213 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
8214 /* Call the subtable specific lookup function. */
8215 found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
8216
8217 /* Count the number of subtables searched for this packet match. This
8218 * estimates the "spread" of subtables looked at per matched packet. */
8219 uint32_t pkts_matched = count_1bits(found_map);
8220 lookups_match += pkts_matched * subtable_pos;
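/* A packet that matches in the N-th subtable probed contributes N, so the
 * total approximates the number of subtables searched per matched packet. */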
8221
8222 /* Clear the keys that found a rule, and return early if all packets match. */
8223 keys_map &= ~found_map;
8224 if (!keys_map) {
8225 if (num_lookups_p) {
8226 *num_lookups_p = lookups_match;
8227 }
8228 return true;
8229 }
8230 subtable_pos++;
8231 }
8232
8233 if (num_lookups_p) {
8234 *num_lookups_p = lookups_match;
8235 }
8236 return false;
8237 }