1 /*
2 * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "dpif-netdev.h"
19 #include "dpif-netdev-private.h"
20
21 #include <ctype.h>
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <inttypes.h>
25 #include <net/if.h>
26 #include <sys/types.h>
27 #include <netinet/in.h>
28 #include <stdint.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <sys/ioctl.h>
32 #include <sys/socket.h>
33 #include <sys/stat.h>
34 #include <unistd.h>
35
36 #include "bitmap.h"
37 #include "cmap.h"
38 #include "conntrack.h"
39 #include "conntrack-tp.h"
40 #include "coverage.h"
41 #include "ct-dpif.h"
42 #include "csum.h"
43 #include "dp-packet.h"
44 #include "dpif.h"
45 #include "dpif-netdev-lookup.h"
46 #include "dpif-netdev-perf.h"
47 #include "dpif-provider.h"
48 #include "dummy.h"
49 #include "fat-rwlock.h"
50 #include "flow.h"
51 #include "hmapx.h"
52 #include "id-pool.h"
53 #include "ipf.h"
54 #include "netdev.h"
55 #include "netdev-offload.h"
56 #include "netdev-provider.h"
57 #include "netdev-vport.h"
58 #include "netlink.h"
59 #include "odp-execute.h"
60 #include "odp-util.h"
61 #include "openvswitch/dynamic-string.h"
62 #include "openvswitch/list.h"
63 #include "openvswitch/match.h"
64 #include "openvswitch/ofp-parse.h"
65 #include "openvswitch/ofp-print.h"
66 #include "openvswitch/ofpbuf.h"
67 #include "openvswitch/shash.h"
68 #include "openvswitch/vlog.h"
69 #include "ovs-numa.h"
70 #include "ovs-rcu.h"
71 #include "packets.h"
72 #include "openvswitch/poll-loop.h"
73 #include "pvector.h"
74 #include "random.h"
75 #include "seq.h"
76 #include "smap.h"
77 #include "sset.h"
78 #include "timeval.h"
79 #include "tnl-neigh-cache.h"
80 #include "tnl-ports.h"
81 #include "unixctl.h"
82 #include "util.h"
83 #include "uuid.h"
84
85 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
86
87 /* Auto Load Balancing Defaults */
88 #define ALB_ACCEPTABLE_IMPROVEMENT 25
89 #define ALB_PMD_LOAD_THRESHOLD 95
90 #define ALB_PMD_REBALANCE_POLL_INTERVAL 1 /* 1 Min */
91 #define MIN_TO_MSEC 60000
92
93 #define FLOW_DUMP_MAX_BATCH 50
94 /* Use per thread recirc_depth to prevent recirculation loop. */
95 #define MAX_RECIRC_DEPTH 6
96 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
97
98 /* Use instant packet send by default. */
99 #define DEFAULT_TX_FLUSH_INTERVAL 0
100
101 /* Configuration parameters. */
102 enum { MAX_METERS = 65536 }; /* Maximum number of meters. */
103 enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */
104 enum { N_METER_LOCKS = 64 };    /* Number of meter locks. */
105
106 COVERAGE_DEFINE(datapath_drop_meter);
107 COVERAGE_DEFINE(datapath_drop_upcall_error);
108 COVERAGE_DEFINE(datapath_drop_lock_error);
109 COVERAGE_DEFINE(datapath_drop_userspace_action_error);
110 COVERAGE_DEFINE(datapath_drop_tunnel_push_error);
111 COVERAGE_DEFINE(datapath_drop_tunnel_pop_error);
112 COVERAGE_DEFINE(datapath_drop_recirc_error);
113 COVERAGE_DEFINE(datapath_drop_invalid_port);
114 COVERAGE_DEFINE(datapath_drop_invalid_bond);
115 COVERAGE_DEFINE(datapath_drop_invalid_tnl_port);
116 COVERAGE_DEFINE(datapath_drop_rx_invalid_packet);
117
118 /* Protects against changes to 'dp_netdevs'. */
119 static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
120
121 /* Contains all 'struct dp_netdev's. */
122 static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
123 = SHASH_INITIALIZER(&dp_netdevs);
124
125 static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
126
127 #define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
128 | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
129 | CS_SRC_NAT | CS_DST_NAT)
130 #define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
131
132 static struct odp_support dp_netdev_support = {
133 .max_vlan_headers = SIZE_MAX,
134 .max_mpls_depth = SIZE_MAX,
135 .recirc = true,
136 .ct_state = true,
137 .ct_zone = true,
138 .ct_mark = true,
139 .ct_label = true,
140 .ct_state_nat = true,
141 .ct_orig_tuple = true,
142 .ct_orig_tuple6 = true,
143 };
144
145 /* EMC cache and SMC cache compose the datapath flow cache (DFC)
146 *
147 * Exact match cache for frequently used flows
148 *
149 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
150 * search its entries for a miniflow that matches exactly the miniflow of the
151 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
152 *
153 * A cache entry holds a reference to its 'dp_netdev_flow'.
154 *
155 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
156 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
157 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
158 * value is the index of a cache entry where the miniflow could be.
159 *
160 *
161 * Signature match cache (SMC)
162 *
163  * This cache stores a 16-bit signature for each flow without storing keys, and
164  * stores the corresponding 16-bit flow_table index of the 'dp_netdev_flow'.
165  * Each flow thus occupies 32 bits, which is much more memory efficient than
166  * the EMC.  The SMC uses a set-associative design in which each bucket
167  * contains SMC_ENTRY_PER_BUCKET entries.
168  * Since a 16-bit flow_table index is used, any dp_netdev_flow beyond the
169  * first 2^16 entries cannot be indexed and will always miss in the SMC.
170 *
171 *
172 * Thread-safety
173 * =============
174 *
175 * Each pmd_thread has its own private exact match cache.
176 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
177 */
178
179 #define EM_FLOW_HASH_SHIFT 13
180 #define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
181 #define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
182 #define EM_FLOW_HASH_SEGS 2
183
184 /* SMC uses a set-associative design. A bucket contains a set of entries that
185 * a flow item can occupy. For now, it uses one hash function rather than two
186 * as for the EMC design. */
187 #define SMC_ENTRY_PER_BUCKET 4
188 #define SMC_ENTRIES (1u << 20)
189 #define SMC_BUCKET_CNT (SMC_ENTRIES / SMC_ENTRY_PER_BUCKET)
190 #define SMC_MASK (SMC_BUCKET_CNT - 1)
191
192 /* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB */
193 #define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
194 #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
195 DEFAULT_EM_FLOW_INSERT_INV_PROB)
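
/* A minimal illustration of how the inverse probability above translates
 * into an insertion decision (a hypothetical helper, not used by the
 * datapath itself): with 'emc_insert_min' set to UINT32_MAX / N, comparing
 * a uniformly random 32-bit value against it accepts roughly 1 out of every
 * N candidate flows for EMC insertion, and a threshold of 0 disables
 * insertion altogether. */
static inline bool
emc_insert_roll_sketch(uint32_t emc_insert_min)
{
    return emc_insert_min && random_uint32() <= emc_insert_min;
}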
196
197 struct emc_entry {
198 struct dp_netdev_flow *flow;
199 struct netdev_flow_key key; /* key.hash used for emc hash value. */
200 };
201
202 struct emc_cache {
203 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
204 int sweep_idx; /* For emc_cache_slow_sweep(). */
205 };
206
207 struct smc_bucket {
208 uint16_t sig[SMC_ENTRY_PER_BUCKET];
209 uint16_t flow_idx[SMC_ENTRY_PER_BUCKET];
210 };
211
212 /* Signature match cache, as distinct from the EMC cache. */
213 struct smc_cache {
214 struct smc_bucket buckets[SMC_BUCKET_CNT];
215 };
216
217 struct dfc_cache {
218 struct emc_cache emc_cache;
219 struct smc_cache smc_cache;
220 };
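
/* Illustrative sketch of how a 32-bit packet hash maps onto the SMC (a
 * hypothetical helper that only restates the comment above, not part of the
 * lookup code): the low bits select one of SMC_BUCKET_CNT buckets and the
 * upper 16 bits form the signature compared against that bucket's entries. */
static inline uint16_t
smc_hash_split_sketch(const struct smc_cache *smc, uint32_t hash,
                      const struct smc_bucket **bucket)
{
    *bucket = &smc->buckets[hash & SMC_MASK];
    return hash >> 16;          /* Signature to match within the bucket. */
}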
221
222 /* Iterate in the exact match cache through every entry that might contain a
223 * miniflow with hash 'HASH'. */
224 #define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
225 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
226 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
227 i__ < EM_FLOW_HASH_SEGS; \
228 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
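
/* Sketch of how the macro above is typically used (a hypothetical helper
 * for illustration only; the real emc_lookup() further down also checks
 * entry liveness and compares the full miniflow before trusting a hash
 * match). */
static inline struct dp_netdev_flow *
emc_probe_sketch(struct emc_cache *cache, uint32_t hash)
{
    struct emc_entry *current_entry;

    EMC_FOR_EACH_POS_WITH_HASH (cache, current_entry, hash) {
        if (current_entry->key.hash == hash && current_entry->flow) {
            return current_entry->flow;     /* Candidate entry for 'hash'. */
        }
    }
    return NULL;
}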
229 \f
230 /* Simple non-wildcarding single-priority classifier. */
231
232 /* Time in microseconds between successive optimizations of the dpcls
233 * subtable vector */
234 #define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
235
236 /* Duration in microseconds of the interval over which the rxq processing
237  * cycles used for rxq to pmd assignment are measured and stored. */
238 #define PMD_RXQ_INTERVAL_LEN 10000000LL
239
240 /* Number of intervals for which cycles are stored
241 * and used during rxq to pmd assignment. */
242 #define PMD_RXQ_INTERVAL_MAX 6
243
244 /* Time in microseconds to try RCU quiescing. */
245 #define PMD_RCU_QUIESCE_INTERVAL 10000LL
246
247 struct dpcls {
248 struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */
249 odp_port_t in_port;
250 struct cmap subtables_map;
251 struct pvector subtables;
252 };
253
254 /* Data structure to keep packet order till fastpath processing. */
255 struct dp_packet_flow_map {
256 struct dp_packet *packet;
257 struct dp_netdev_flow *flow;
258 uint16_t tcp_flags;
259 };
260
261 static void dpcls_init(struct dpcls *);
262 static void dpcls_destroy(struct dpcls *);
263 static void dpcls_sort_subtable_vector(struct dpcls *);
264 static uint32_t dpcls_subtable_lookup_reprobe(struct dpcls *cls);
265 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
266 const struct netdev_flow_key *mask);
267 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
268 static bool dpcls_lookup(struct dpcls *cls,
269 const struct netdev_flow_key *keys[],
270 struct dpcls_rule **rules, size_t cnt,
271 int *num_lookups_p);
272
273 /* Set of supported meter flags */
274 #define DP_SUPPORTED_METER_FLAGS_MASK \
275 (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
276
277 /* Set of supported meter band types */
278 #define DP_SUPPORTED_METER_BAND_TYPES \
279 ( 1 << OFPMBT13_DROP )
280
281 struct dp_meter_band {
282 struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
283 uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
284 uint64_t packet_count;
285 uint64_t byte_count;
286 };
287
288 struct dp_meter {
289 uint16_t flags;
290 uint16_t n_bands;
291 uint32_t max_delta_t;
292 uint64_t used;
293 uint64_t packet_count;
294 uint64_t byte_count;
295 struct dp_meter_band bands[];
296 };
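
/* Generic token-bucket refill arithmetic, shown only to illustrate the
 * 'bucket' field above (an assumption for clarity, not the exact logic of
 * the meter implementation later in this file): the bucket gains 'rate'
 * worth of credit per unit of elapsed time and is capped at the configured
 * burst size. */
static inline uint64_t
token_bucket_refill_sketch(uint64_t bucket, uint64_t rate_per_ms,
                           uint64_t burst, uint64_t delta_ms)
{
    uint64_t refilled = bucket + rate_per_ms * delta_ms;

    return MIN(refilled, burst);
}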
297
298 struct pmd_auto_lb {
299 bool auto_lb_requested; /* Auto load balancing requested by user. */
300 bool is_enabled; /* Current status of Auto load balancing. */
301 uint64_t rebalance_intvl;
302 uint64_t rebalance_poll_timer;
303 };
304
305 /* Datapath based on the network device interface from netdev.h.
306 *
307 *
308 * Thread-safety
309 * =============
310 *
311 * Some members, marked 'const', are immutable. Accessing other members
312 * requires synchronization, as noted in more detail below.
313 *
314 * Acquisition order is, from outermost to innermost:
315 *
316 * dp_netdev_mutex (global)
317 * port_mutex
318 * bond_mutex
319 * non_pmd_mutex
320 */
321 struct dp_netdev {
322 const struct dpif_class *const class;
323 const char *const name;
324 struct ovs_refcount ref_cnt;
325 atomic_flag destroyed;
326
327 /* Ports.
328 *
329 * Any lookup into 'ports' or any access to the dp_netdev_ports found
330 * through 'ports' requires taking 'port_mutex'. */
331 struct ovs_mutex port_mutex;
332 struct hmap ports;
333 struct seq *port_seq; /* Incremented whenever a port changes. */
334
335 /* The time that a packet can wait in output batch for sending. */
336 atomic_uint32_t tx_flush_interval;
337
338 /* Meters. */
339 struct ovs_mutex meter_locks[N_METER_LOCKS];
340     struct dp_meter *meters[MAX_METERS]; /* Meters, each with its bands. */
341
342     /* Probability of EMC insertion is determined by 'emc_insert_min'. */
343 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
344 /* Enable collection of PMD performance metrics. */
345 atomic_bool pmd_perf_metrics;
346 /* Enable the SMC cache from ovsdb config */
347 atomic_bool smc_enable_db;
348
349 /* Protects access to ofproto-dpif-upcall interface during revalidator
350 * thread synchronization. */
351 struct fat_rwlock upcall_rwlock;
352 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
353 void *upcall_aux;
354
355     /* Callback function for notifying the purging of dp flows (during
356      * pmd reset or deletion). */
357 dp_purge_callback *dp_purge_cb;
358 void *dp_purge_aux;
359
360 /* Stores all 'struct dp_netdev_pmd_thread's. */
361 struct cmap poll_threads;
362 /* id pool for per thread static_tx_qid. */
363 struct id_pool *tx_qid_pool;
364 struct ovs_mutex tx_qid_pool_mutex;
365 /* Use measured cycles for rxq to pmd assignment. */
366 bool pmd_rxq_assign_cyc;
367
368 /* Protects the access of the 'struct dp_netdev_pmd_thread'
369 * instance for non-pmd thread. */
370 struct ovs_mutex non_pmd_mutex;
371
372 /* Each pmd thread will store its pointer to
373 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
374 ovsthread_key_t per_pmd_key;
375
376 struct seq *reconfigure_seq;
377 uint64_t last_reconfigure_seq;
378
379 /* Cpu mask for pin of pmd threads. */
380 char *pmd_cmask;
381
382 uint64_t last_tnl_conf_seq;
383
384 struct conntrack *conntrack;
385 struct pmd_auto_lb pmd_alb;
386
387 /* Bonds. */
388 struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
389 struct cmap tx_bonds; /* Contains 'struct tx_bond'. */
390 };
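
/* Illustrative sketch of the acquisition order documented above (a
 * hypothetical helper, not used elsewhere in this file): nested locking
 * must always follow dp_netdev_mutex -> port_mutex -> bond_mutex to avoid
 * deadlocks. */
static inline void
dp_netdev_lock_order_sketch(struct dp_netdev *dp)
{
    ovs_mutex_lock(&dp_netdev_mutex);
    ovs_mutex_lock(&dp->port_mutex);
    ovs_mutex_lock(&dp->bond_mutex);
    /* ... work that needs all three locks would go here ... */
    ovs_mutex_unlock(&dp->bond_mutex);
    ovs_mutex_unlock(&dp->port_mutex);
    ovs_mutex_unlock(&dp_netdev_mutex);
}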
391
392 static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
393 OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
394 {
395 ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
396 }
397
398 static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
399 OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
400 {
401 ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
402 }
403
404
405 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
406 odp_port_t)
407 OVS_REQUIRES(dp->port_mutex);
408
409 enum rxq_cycles_counter_type {
410 RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and
411 processing packets during the current
412 interval. */
413 RXQ_CYCLES_PROC_HIST, /* Total cycles of all intervals that are used
414 during rxq to pmd assignment. */
415 RXQ_N_CYCLES
416 };
417
418 enum {
419 DP_NETDEV_FLOW_OFFLOAD_OP_ADD,
420 DP_NETDEV_FLOW_OFFLOAD_OP_MOD,
421 DP_NETDEV_FLOW_OFFLOAD_OP_DEL,
422 };
423
424 struct dp_flow_offload_item {
425 struct dp_netdev_pmd_thread *pmd;
426 struct dp_netdev_flow *flow;
427 int op;
428 struct match match;
429 struct nlattr *actions;
430 size_t actions_len;
431
432 struct ovs_list node;
433 };
434
435 struct dp_flow_offload {
436 struct ovs_mutex mutex;
437 struct ovs_list list;
438 pthread_cond_t cond;
439 };
440
441 static struct dp_flow_offload dp_flow_offload = {
442 .mutex = OVS_MUTEX_INITIALIZER,
443 .list = OVS_LIST_INITIALIZER(&dp_flow_offload.list),
444 };
445
446 static struct ovsthread_once offload_thread_once
447 = OVSTHREAD_ONCE_INITIALIZER;
448
449 #define XPS_TIMEOUT 500000LL /* In microseconds. */
450
451 /* Contained by struct dp_netdev_port's 'rxqs' member. */
452 struct dp_netdev_rxq {
453 struct dp_netdev_port *port;
454 struct netdev_rxq *rx;
455 unsigned core_id; /* Core to which this queue should be
456 pinned. OVS_CORE_UNSPEC if the
457 queue doesn't need to be pinned to a
458 particular core. */
459 unsigned intrvl_idx; /* Write index for 'cycles_intrvl'. */
460 struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */
461 bool is_vhost; /* Is rxq of a vhost port. */
462
463 /* Counters of cycles spent successfully polling and processing pkts. */
464 atomic_ullong cycles[RXQ_N_CYCLES];
465 /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
466 sum them to yield the cycles used for an rxq. */
467 atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
468 };
469
470 /* A port in a netdev-based datapath. */
471 struct dp_netdev_port {
472 odp_port_t port_no;
473 bool dynamic_txqs; /* If true XPS will be used. */
474 bool need_reconfigure; /* True if we should reconfigure netdev. */
475 struct netdev *netdev;
476 struct hmap_node node; /* Node in dp_netdev's 'ports'. */
477 struct netdev_saved_flags *sf;
478 struct dp_netdev_rxq *rxqs;
479 unsigned n_rxq; /* Number of elements in 'rxqs' */
480 unsigned *txq_used; /* Number of threads that use each tx queue. */
481 struct ovs_mutex txq_used_mutex;
482 bool emc_enabled; /* If true EMC will be used. */
483 char *type; /* Port type as requested by user. */
484 char *rxq_affinity_list; /* Requested affinity of rx queues. */
485 };
486
487 /* Contained by struct dp_netdev_flow's 'stats' member. */
488 struct dp_netdev_flow_stats {
489 atomic_llong used; /* Last used time, in monotonic msecs. */
490 atomic_ullong packet_count; /* Number of packets matched. */
491 atomic_ullong byte_count; /* Number of bytes matched. */
492 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
493 };
494
495 /* Contained by struct dp_netdev_flow's 'last_attrs' member. */
496 struct dp_netdev_flow_attrs {
497 atomic_bool offloaded; /* True if flow is offloaded to HW. */
498 ATOMIC(const char *) dp_layer; /* DP layer the flow is handled in. */
499 };
500
501 /* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
502 *
503 *
504 * Thread-safety
505 * =============
506 *
507 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
508 * its pmd thread's classifier. The text below calls this classifier 'cls'.
509 *
510 * Motivation
511 * ----------
512 *
513 * The thread safety rules described here for "struct dp_netdev_flow" are
514 * motivated by two goals:
515 *
516 * - Prevent threads that read members of "struct dp_netdev_flow" from
517 * reading bad data due to changes by some thread concurrently modifying
518 * those members.
519 *
520 * - Prevent two threads making changes to members of a given "struct
521 * dp_netdev_flow" from interfering with each other.
522 *
523 *
524 * Rules
525 * -----
526 *
527 * A flow 'flow' may be accessed without a risk of being freed during an RCU
528 * grace period. Code that needs to hold onto a flow for a while
529 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
530 *
531 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
532 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
533 * from modification.
534 *
535 * Some members, marked 'const', are immutable. Accessing other members
536 * requires synchronization, as noted in more detail below.
537 */
538 struct dp_netdev_flow {
539 const struct flow flow; /* Unmasked flow that created this entry. */
540 /* Hash table index by unmasked flow. */
541 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
542 /* 'flow_table'. */
543 const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
544 const ovs_u128 ufid; /* Unique flow identifier. */
545 const ovs_u128 mega_ufid; /* Unique mega flow identifier. */
546 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
547 /* flow. */
548
549 /* Number of references.
550 * The classifier owns one reference.
551 * Any thread trying to keep a rule from being freed should hold its own
552 * reference. */
553 struct ovs_refcount ref_cnt;
554
555 bool dead;
556 uint32_t mark; /* Unique flow mark assigned to a flow */
557
558 /* Statistics. */
559 struct dp_netdev_flow_stats stats;
560
561 /* Statistics and attributes received from the netdev offload provider. */
562 atomic_int netdev_flow_get_result;
563 struct dp_netdev_flow_stats last_stats;
564 struct dp_netdev_flow_attrs last_attrs;
565
566 /* Actions. */
567 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
568
569 /* While processing a group of input packets, the datapath uses the next
570 * member to store a pointer to the output batch for the flow. It is
571 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
572 * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
573 struct packet_batch_per_flow *batch;
574
575 /* Packet classification. */
576 char *dp_extra_info; /* String to return in a flow dump/get. */
577 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
578 /* 'cr' must be the last member. */
579 };
580
581 static void dp_netdev_flow_unref(struct dp_netdev_flow *);
582 static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
583 static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
584 struct flow *, bool);
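
/* Sketch of the reference-counting rule described above 'struct
 * dp_netdev_flow' (a hypothetical helper for illustration; callers in this
 * file use dp_netdev_flow_ref() directly): a flow found under RCU may be
 * used in place, but keeping it past the grace period requires taking a
 * reference and releasing it with dp_netdev_flow_unref() when done. */
static inline struct dp_netdev_flow *
dp_netdev_flow_hold_sketch(struct dp_netdev_flow *flow)
{
    return flow && dp_netdev_flow_ref(flow) ? flow : NULL;
}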
585
586 /* A set of datapath actions within a "struct dp_netdev_flow".
587 *
588 *
589 * Thread-safety
590 * =============
591 *
592 * A struct dp_netdev_actions 'actions' is protected with RCU. */
593 struct dp_netdev_actions {
594 /* These members are immutable: they do not change during the struct's
595 * lifetime. */
596 unsigned int size; /* Size of 'actions', in bytes. */
597 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
598 };
599
600 struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
601 size_t);
602 struct dp_netdev_actions *dp_netdev_flow_get_actions(
603 const struct dp_netdev_flow *);
604 static void dp_netdev_actions_free(struct dp_netdev_actions *);
605
606 struct polled_queue {
607 struct dp_netdev_rxq *rxq;
608 odp_port_t port_no;
609 bool emc_enabled;
610 bool rxq_enabled;
611 uint64_t change_seq;
612 };
613
614 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
615 struct rxq_poll {
616 struct dp_netdev_rxq *rxq;
617 struct hmap_node node;
618 };
619
620 /* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
621 * 'tnl_port_cache' or 'tx_ports'. */
622 struct tx_port {
623 struct dp_netdev_port *port;
624 int qid;
625 long long last_used;
626 struct hmap_node node;
627 long long flush_time;
628 struct dp_packet_batch output_pkts;
629 struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
630 };
631
632 /* Contained by struct tx_bond 'member_buckets'. */
633 struct member_entry {
634 odp_port_t member_id;
635 atomic_ullong n_packets;
636 atomic_ullong n_bytes;
637 };
638
639 /* Contained by struct dp_netdev_pmd_thread's 'tx_bonds'. */
640 struct tx_bond {
641 struct cmap_node node;
642 uint32_t bond_id;
643 struct member_entry member_buckets[BOND_BUCKETS];
644 };
645
646 /* A set of properties for the current processing loop that is not directly
647 * associated with the pmd thread itself, but with the packets being
648 * processed or the short-term system configuration (for example, time).
649 * Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
650 struct dp_netdev_pmd_thread_ctx {
651 /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
652 long long now;
653 /* RX queue from which last packet was received. */
654 struct dp_netdev_rxq *last_rxq;
655 /* EMC insertion probability context for the current processing cycle. */
656 uint32_t emc_insert_min;
657 };
658
659 /* PMD: Poll mode drivers.  A PMD accesses devices via polling to eliminate
660  * the performance overhead of interrupt processing.  Therefore netdev cannot
661  * implement rx-wait for these devices; dpif-netdev needs to poll these
662  * devices to check for received packets.  A pmd thread polls the devices
663  * assigned to it.
664  *
665  * DPDK uses PMDs for accessing NICs.
666 *
667 * Note, instance with cpu core id NON_PMD_CORE_ID will be reserved for
668 * I/O of all non-pmd threads. There will be no actual thread created
669 * for the instance.
670 *
671 * Each struct has its own flow cache and classifier per managed ingress port.
672  * For packets received on an ingress port, a lookup is done in the
673  * corresponding PMD thread's flow cache and, in case of a miss, in the
674  * classifier of that port.  In either case the packets are executed with the
675  * actions that were found.
676  */
677 struct dp_netdev_pmd_thread {
678 struct dp_netdev *dp;
679 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
680 struct cmap_node node; /* In 'dp->poll_threads'. */
681
682 /* Per thread exact-match cache. Note, the instance for cpu core
683      * NON_PMD_CORE_ID can be accessed by multiple threads and thus needs
684      * to be protected by 'non_pmd_mutex'.  Every other instance
685 * will only be accessed by its own pmd thread. */
686 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct dfc_cache flow_cache;
687
688 /* Flow-Table and classifiers
689 *
690 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
691 * changes to 'classifiers' must be made while still holding the
692 * 'flow_mutex'.
693 */
694 struct ovs_mutex flow_mutex;
695 struct cmap flow_table OVS_GUARDED; /* Flow table. */
696
697 /* One classifier per in_port polled by the pmd */
698 struct cmap classifiers;
699 /* Periodically sort subtable vectors according to hit frequencies */
700 long long int next_optimization;
701 /* End of the next time interval for which processing cycles
702 are stored for each polled rxq. */
703 long long int rxq_next_cycle_store;
704
705 /* Last interval timestamp. */
706 uint64_t intrvl_tsc_prev;
707 /* Last interval cycles. */
708 atomic_ullong intrvl_cycles;
709
710 /* Current context of the PMD thread. */
711 struct dp_netdev_pmd_thread_ctx ctx;
712
713 struct seq *reload_seq;
714 uint64_t last_reload_seq;
715
716 /* These are atomic variables used as a synchronization and configuration
717 * points for thread reload/exit.
718 *
719 * 'reload' atomic is the main one and it's used as a memory
720 * synchronization point for all other knobs and data.
721 *
722 * For a thread that requests PMD reload:
723 *
724 * * All changes that should be visible to the PMD thread must be made
725 * before setting the 'reload'. These changes could use any memory
726 * ordering model including 'relaxed'.
727 * * Setting the 'reload' atomic should occur in the same thread where
728      *      all other PMD configuration options are updated.
729 * * Setting the 'reload' atomic should be done with 'release' memory
730 * ordering model or stricter. This will guarantee that all previous
731 * changes (including non-atomic and 'relaxed') will be visible to
732 * the PMD thread.
733      *    * To check that the reload is done, the thread should poll the
734      *      'reload' atomic until it becomes 'false'.  Polling should be done
735      *      with 'acquire' memory ordering model or stricter.  This ensures
736      *      that the PMD thread has completed the reload process.
737 *
738 * For the PMD thread:
739 *
740 * * PMD thread should read 'reload' atomic with 'acquire' memory
741 * ordering model or stricter. This will guarantee that all changes
742 * made before setting the 'reload' in the requesting thread will be
743 * visible to the PMD thread.
744 * * All other configuration data could be read with any memory
745 * ordering model (including non-atomic and 'relaxed') but *only after*
746 * reading the 'reload' atomic set to 'true'.
747      *    * When the PMD reload is done, the PMD should (optionally) set all
748      *      the below knobs except the 'reload' to their default ('false')
749      *      values and, as the mandatory last step, set the 'reload' to 'false'
750      *      using 'release' memory ordering model or stricter.  This will
751      *      inform the requesting thread that the PMD has completed a reload
752      *      cycle.
752 */
753 atomic_bool reload; /* Do we need to reload ports? */
754 atomic_bool wait_for_reload; /* Can we busy wait for the next reload? */
755 atomic_bool reload_tx_qid; /* Do we need to reload static_tx_qid? */
756 atomic_bool exit; /* For terminating the pmd thread. */
757
758 pthread_t thread;
759 unsigned core_id; /* CPU core id of this pmd thread. */
760 int numa_id; /* numa node id of this pmd thread. */
761 bool isolated;
762
763 /* Queue id used by this pmd thread to send packets on all netdevs if
764      * XPS is disabled for the netdev.  All static_tx_qid's are unique and less
765 * than 'cmap_count(dp->poll_threads)'. */
766 uint32_t static_tx_qid;
767
768 /* Number of filled output batches. */
769 int n_output_batches;
770
771 struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
772 /* List of rx queues to poll. */
773 struct hmap poll_list OVS_GUARDED;
774 /* Map of 'tx_port's used for transmission. Written by the main thread,
775 * read by the pmd thread. */
776 struct hmap tx_ports OVS_GUARDED;
777
778 struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
779 /* Map of 'tx_bond's used for transmission. Written by the main thread
780 * and read by the pmd thread. */
781 struct cmap tx_bonds;
782
783 /* These are thread-local copies of 'tx_ports'. One contains only tunnel
784 * ports (that support push_tunnel/pop_tunnel), the other contains ports
785 * with at least one txq (that support send). A port can be in both.
786 *
787 * There are two separate maps to make sure that we don't try to execute
788 * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
789 *
790 * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
791      * threads, and thus need to be protected by 'non_pmd_mutex'.  Every
792 * other instance will only be accessed by its own pmd thread. */
793 struct hmap tnl_port_cache;
794 struct hmap send_port_cache;
795
796 /* Keep track of detailed PMD performance statistics. */
797 struct pmd_perf_stats perf_stats;
798
799 /* Stats from previous iteration used by automatic pmd
800 * load balance logic. */
801 uint64_t prev_stats[PMD_N_STATS];
802 atomic_count pmd_overloaded;
803
804 /* Set to true if the pmd thread needs to be reloaded. */
805 bool need_reload;
806
807 /* Next time when PMD should try RCU quiescing. */
808 long long next_rcu_quiesce;
809 };
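
/* Requester-side sketch of the reload protocol documented inside 'struct
 * dp_netdev_pmd_thread' above (a hypothetical helper for illustration; the
 * real request and wait helpers appear later in this file): configuration
 * is published first, 'reload' is then set with release ordering, and
 * completion is detected by polling it back to 'false' with acquire
 * ordering. */
static inline void
pmd_reload_protocol_sketch(struct dp_netdev_pmd_thread *pmd)
{
    bool reloading;

    /* All configuration changes must be stored before this point. */
    atomic_store_explicit(&pmd->reload, true, memory_order_release);

    do {
        atomic_read_explicit(&pmd->reload, &reloading, memory_order_acquire);
    } while (reloading);
}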
810
811 /* Interface to netdev-based datapath. */
812 struct dpif_netdev {
813 struct dpif dpif;
814 struct dp_netdev *dp;
815 uint64_t last_port_seq;
816 };
817
818 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
819 struct dp_netdev_port **portp)
820 OVS_REQUIRES(dp->port_mutex);
821 static int get_port_by_name(struct dp_netdev *dp, const char *devname,
822 struct dp_netdev_port **portp)
823 OVS_REQUIRES(dp->port_mutex);
824 static void dp_netdev_free(struct dp_netdev *)
825 OVS_REQUIRES(dp_netdev_mutex);
826 static int do_add_port(struct dp_netdev *dp, const char *devname,
827 const char *type, odp_port_t port_no)
828 OVS_REQUIRES(dp->port_mutex);
829 static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
830 OVS_REQUIRES(dp->port_mutex);
831 static int dpif_netdev_open(const struct dpif_class *, const char *name,
832 bool create, struct dpif **);
833 static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
834 struct dp_packet_batch *,
835 bool should_steal,
836 const struct flow *flow,
837 const struct nlattr *actions,
838 size_t actions_len);
839 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
840 struct dp_packet_batch *, odp_port_t port_no);
841 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
842 struct dp_packet_batch *);
843
844 static void dp_netdev_disable_upcall(struct dp_netdev *);
845 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
846 static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
847 struct dp_netdev *dp, unsigned core_id,
848 int numa_id);
849 static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
850 static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
851 OVS_REQUIRES(dp->port_mutex);
852
853 static void *pmd_thread_main(void *);
854 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
855 unsigned core_id);
856 static struct dp_netdev_pmd_thread *
857 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
858 static void dp_netdev_del_pmd(struct dp_netdev *dp,
859 struct dp_netdev_pmd_thread *pmd);
860 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
861 static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
862 static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
863 struct dp_netdev_port *port)
864 OVS_REQUIRES(pmd->port_mutex);
865 static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
866 struct tx_port *tx)
867 OVS_REQUIRES(pmd->port_mutex);
868 static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
869 struct dp_netdev_rxq *rxq)
870 OVS_REQUIRES(pmd->port_mutex);
871 static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
872 struct rxq_poll *poll)
873 OVS_REQUIRES(pmd->port_mutex);
874 static int
875 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
876 bool force);
877 static void dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
878 struct tx_bond *bond, bool update)
879 OVS_EXCLUDED(pmd->bond_mutex);
880 static void dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
881 uint32_t bond_id)
882 OVS_EXCLUDED(pmd->bond_mutex);
883
884 static void reconfigure_datapath(struct dp_netdev *dp)
885 OVS_REQUIRES(dp->port_mutex);
886 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
887 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
888 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
889 static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
890 OVS_REQUIRES(pmd->port_mutex);
891 static inline void
892 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
893 struct polled_queue *poll_list, int poll_cnt);
894 static void
895 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
896 enum rxq_cycles_counter_type type,
897 unsigned long long cycles);
898 static uint64_t
899 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
900 enum rxq_cycles_counter_type type);
901 static void
902 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
903 unsigned long long cycles);
904 static uint64_t
905 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
906 static void
907 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
908 bool purge);
909 static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
910 struct tx_port *tx);
911 static inline struct dpcls *
912 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
913 odp_port_t in_port);
914
915 static inline bool emc_entry_alive(struct emc_entry *ce);
916 static void emc_clear_entry(struct emc_entry *ce);
917 static void smc_clear_entry(struct smc_bucket *b, int idx);
918
919 static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
920 static inline bool
921 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
922 static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
923 struct dp_netdev_flow *flow);
924
925 static void
926 emc_cache_init(struct emc_cache *flow_cache)
927 {
928 int i;
929
930 flow_cache->sweep_idx = 0;
931 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
932 flow_cache->entries[i].flow = NULL;
933 flow_cache->entries[i].key.hash = 0;
934 flow_cache->entries[i].key.len = sizeof(struct miniflow);
935 flowmap_init(&flow_cache->entries[i].key.mf.map);
936 }
937 }
938
939 static void
940 smc_cache_init(struct smc_cache *smc_cache)
941 {
942 int i, j;
943 for (i = 0; i < SMC_BUCKET_CNT; i++) {
944 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
945 smc_cache->buckets[i].flow_idx[j] = UINT16_MAX;
946 }
947 }
948 }
949
950 static void
951 dfc_cache_init(struct dfc_cache *flow_cache)
952 {
953 emc_cache_init(&flow_cache->emc_cache);
954 smc_cache_init(&flow_cache->smc_cache);
955 }
956
957 static void
958 emc_cache_uninit(struct emc_cache *flow_cache)
959 {
960 int i;
961
962 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
963 emc_clear_entry(&flow_cache->entries[i]);
964 }
965 }
966
967 static void
968 smc_cache_uninit(struct smc_cache *smc)
969 {
970 int i, j;
971
972 for (i = 0; i < SMC_BUCKET_CNT; i++) {
973 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
974 smc_clear_entry(&(smc->buckets[i]), j);
975 }
976 }
977 }
978
979 static void
980 dfc_cache_uninit(struct dfc_cache *flow_cache)
981 {
982 smc_cache_uninit(&flow_cache->smc_cache);
983 emc_cache_uninit(&flow_cache->emc_cache);
984 }
985
986 /* Check and clear dead flow references slowly (one entry at each
987 * invocation). */
988 static void
989 emc_cache_slow_sweep(struct emc_cache *flow_cache)
990 {
991 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
992
993 if (!emc_entry_alive(entry)) {
994 emc_clear_entry(entry);
995 }
996 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
997 }
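
/* Illustrative only (a hypothetical helper): the pmd main loop calls the
 * sweep periodically, one entry per call, so it takes EM_FLOW_HASH_ENTRIES
 * invocations to visit the whole cache once. */
static inline void
emc_cache_sweep_full_pass_sketch(struct emc_cache *flow_cache)
{
    for (uint32_t i = 0; i < EM_FLOW_HASH_ENTRIES; i++) {
        emc_cache_slow_sweep(flow_cache);
    }
}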
998
999 /* Updates the time in the PMD thread's context; it should be called in three cases:
1000 *
1001 * 1. PMD structure initialization:
1002 * - dp_netdev_configure_pmd()
1003 *
1004 * 2. Before processing of the new packet batch:
1005 * - dpif_netdev_execute()
1006 * - dp_netdev_process_rxq_port()
1007 *
1008 * 3. At least once per polling iteration in main polling threads if no
1009 * packets received on current iteration:
1010 * - dpif_netdev_run()
1011 * - pmd_thread_main()
1012 *
1013 * 'pmd->ctx.now' should be used without update in all other cases if possible.
1014 */
1015 static inline void
1016 pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
1017 {
1018 pmd->ctx.now = time_usec();
1019 }
1020
1021 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
1022 bool
1023 dpif_is_netdev(const struct dpif *dpif)
1024 {
1025 return dpif->dpif_class->open == dpif_netdev_open;
1026 }
1027
1028 static struct dpif_netdev *
1029 dpif_netdev_cast(const struct dpif *dpif)
1030 {
1031 ovs_assert(dpif_is_netdev(dpif));
1032 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
1033 }
1034
1035 static struct dp_netdev *
1036 get_dp_netdev(const struct dpif *dpif)
1037 {
1038 return dpif_netdev_cast(dpif)->dp;
1039 }
1040 \f
1041 enum pmd_info_type {
1042 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
1043 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
1044 PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */
1045 PMD_INFO_PERF_SHOW, /* Show pmd performance details. */
1046 };
1047
1048 static void
1049 format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
1050 {
1051 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
1052 ? "main thread" : "pmd thread");
1053 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
1054 ds_put_format(reply, " numa_id %d", pmd->numa_id);
1055 }
1056 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
1057 ds_put_format(reply, " core_id %u", pmd->core_id);
1058 }
1059 ds_put_cstr(reply, ":\n");
1060 }
1061
1062 static void
1063 pmd_info_show_stats(struct ds *reply,
1064 struct dp_netdev_pmd_thread *pmd)
1065 {
1066 uint64_t stats[PMD_N_STATS];
1067 uint64_t total_cycles, total_packets;
1068 double passes_per_pkt = 0;
1069 double lookups_per_hit = 0;
1070 double packets_per_batch = 0;
1071
1072 pmd_perf_read_counters(&pmd->perf_stats, stats);
1073 total_cycles = stats[PMD_CYCLES_ITER_IDLE]
1074 + stats[PMD_CYCLES_ITER_BUSY];
1075 total_packets = stats[PMD_STAT_RECV];
1076
1077 format_pmd_thread(reply, pmd);
1078
1079 if (total_packets > 0) {
1080 passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
1081 / (double) total_packets;
1082 }
1083 if (stats[PMD_STAT_MASKED_HIT] > 0) {
1084 lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
1085 / (double) stats[PMD_STAT_MASKED_HIT];
1086 }
1087 if (stats[PMD_STAT_SENT_BATCHES] > 0) {
1088 packets_per_batch = stats[PMD_STAT_SENT_PKTS]
1089 / (double) stats[PMD_STAT_SENT_BATCHES];
1090 }
1091
1092 ds_put_format(reply,
1093 " packets received: %"PRIu64"\n"
1094 " packet recirculations: %"PRIu64"\n"
1095 " avg. datapath passes per packet: %.02f\n"
1096 " emc hits: %"PRIu64"\n"
1097 " smc hits: %"PRIu64"\n"
1098 " megaflow hits: %"PRIu64"\n"
1099 " avg. subtable lookups per megaflow hit: %.02f\n"
1100 " miss with success upcall: %"PRIu64"\n"
1101 " miss with failed upcall: %"PRIu64"\n"
1102 " avg. packets per output batch: %.02f\n",
1103 total_packets, stats[PMD_STAT_RECIRC],
1104 passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
1105 stats[PMD_STAT_SMC_HIT],
1106 stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
1107 stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
1108 packets_per_batch);
1109
1110 if (total_cycles == 0) {
1111 return;
1112 }
1113
1114 ds_put_format(reply,
1115 " idle cycles: %"PRIu64" (%.02f%%)\n"
1116 " processing cycles: %"PRIu64" (%.02f%%)\n",
1117 stats[PMD_CYCLES_ITER_IDLE],
1118 stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
1119 stats[PMD_CYCLES_ITER_BUSY],
1120 stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
1121
1122 if (total_packets == 0) {
1123 return;
1124 }
1125
1126 ds_put_format(reply,
1127 " avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
1128 total_cycles / (double) total_packets,
1129 total_cycles, total_packets);
1130
1131 ds_put_format(reply,
1132 " avg processing cycles per packet: "
1133 "%.02f (%"PRIu64"/%"PRIu64")\n",
1134 stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
1135 stats[PMD_CYCLES_ITER_BUSY], total_packets);
1136 }
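
/* Worked example of the derived values above (the numbers are hypothetical):
 * with 1000 received packets, 250 recirculations, 400 masked lookups and
 * 200 masked hits, the report shows
 *     avg. datapath passes per packet        = (1000 + 250) / 1000 = 1.25
 *     avg. subtable lookups per megaflow hit = 400 / 200           = 2.00
 */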
1137
1138 static void
1139 pmd_info_show_perf(struct ds *reply,
1140 struct dp_netdev_pmd_thread *pmd,
1141 struct pmd_perf_params *par)
1142 {
1143 if (pmd->core_id != NON_PMD_CORE_ID) {
1144 char *time_str =
1145 xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
1146 long long now = time_msec();
1147 double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
1148
1149 ds_put_cstr(reply, "\n");
1150 ds_put_format(reply, "Time: %s\n", time_str);
1151 ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
1152 ds_put_cstr(reply, "\n");
1153 format_pmd_thread(reply, pmd);
1154 ds_put_cstr(reply, "\n");
1155 pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
1156 if (pmd_perf_metrics_enabled(pmd)) {
1157 /* Prevent parallel clearing of perf metrics. */
1158 ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
1159 if (par->histograms) {
1160 ds_put_cstr(reply, "\n");
1161 pmd_perf_format_histograms(reply, &pmd->perf_stats);
1162 }
1163 if (par->iter_hist_len > 0) {
1164 ds_put_cstr(reply, "\n");
1165 pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
1166 par->iter_hist_len);
1167 }
1168 if (par->ms_hist_len > 0) {
1169 ds_put_cstr(reply, "\n");
1170 pmd_perf_format_ms_history(reply, &pmd->perf_stats,
1171 par->ms_hist_len);
1172 }
1173 ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
1174 }
1175 free(time_str);
1176 }
1177 }
1178
1179 static int
1180 compare_poll_list(const void *a_, const void *b_)
1181 {
1182 const struct rxq_poll *a = a_;
1183 const struct rxq_poll *b = b_;
1184
1185 const char *namea = netdev_rxq_get_name(a->rxq->rx);
1186 const char *nameb = netdev_rxq_get_name(b->rxq->rx);
1187
1188 int cmp = strcmp(namea, nameb);
1189 if (!cmp) {
1190 return netdev_rxq_get_queue_id(a->rxq->rx)
1191 - netdev_rxq_get_queue_id(b->rxq->rx);
1192 } else {
1193 return cmp;
1194 }
1195 }
1196
1197 static void
1198 sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
1199 size_t *n)
1200 OVS_REQUIRES(pmd->port_mutex)
1201 {
1202 struct rxq_poll *ret, *poll;
1203 size_t i;
1204
1205 *n = hmap_count(&pmd->poll_list);
1206 if (!*n) {
1207 ret = NULL;
1208 } else {
1209 ret = xcalloc(*n, sizeof *ret);
1210 i = 0;
1211 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
1212 ret[i] = *poll;
1213 i++;
1214 }
1215 ovs_assert(i == *n);
1216 qsort(ret, *n, sizeof *ret, compare_poll_list);
1217 }
1218
1219 *list = ret;
1220 }
1221
1222 static void
1223 pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
1224 {
1225 if (pmd->core_id != NON_PMD_CORE_ID) {
1226 struct rxq_poll *list;
1227 size_t n_rxq;
1228 uint64_t total_cycles = 0;
1229
1230 ds_put_format(reply,
1231 "pmd thread numa_id %d core_id %u:\n isolated : %s\n",
1232 pmd->numa_id, pmd->core_id, (pmd->isolated)
1233 ? "true" : "false");
1234
1235 ovs_mutex_lock(&pmd->port_mutex);
1236 sorted_poll_list(pmd, &list, &n_rxq);
1237
1238 /* Get the total pmd cycles for an interval. */
1239 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
1240 /* Estimate the cycles to cover all intervals. */
1241 total_cycles *= PMD_RXQ_INTERVAL_MAX;
1242
1243 for (int i = 0; i < n_rxq; i++) {
1244 struct dp_netdev_rxq *rxq = list[i].rxq;
1245 const char *name = netdev_rxq_get_name(rxq->rx);
1246 uint64_t proc_cycles = 0;
1247
1248 for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
1249 proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
1250 }
1251 ds_put_format(reply, " port: %-16s queue-id: %2d", name,
1252 netdev_rxq_get_queue_id(list[i].rxq->rx));
1253 ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
1254 ? "(enabled) " : "(disabled)");
1255 ds_put_format(reply, " pmd usage: ");
1256 if (total_cycles) {
1257 ds_put_format(reply, "%2"PRIu64"",
1258 proc_cycles * 100 / total_cycles);
1259 ds_put_cstr(reply, " %");
1260 } else {
1261 ds_put_format(reply, "%s", "NOT AVAIL");
1262 }
1263 ds_put_cstr(reply, "\n");
1264 }
1265 ovs_mutex_unlock(&pmd->port_mutex);
1266 free(list);
1267 }
1268 }
1269
1270 static int
1271 compare_poll_thread_list(const void *a_, const void *b_)
1272 {
1273 const struct dp_netdev_pmd_thread *a, *b;
1274
1275 a = *(struct dp_netdev_pmd_thread **)a_;
1276 b = *(struct dp_netdev_pmd_thread **)b_;
1277
1278 if (a->core_id < b->core_id) {
1279 return -1;
1280 }
1281 if (a->core_id > b->core_id) {
1282 return 1;
1283 }
1284 return 0;
1285 }
1286
1287 /* Creates a sorted list of pmds from the dp->poll_threads cmap.  We can use
1288  * this list as long as we do not go into a quiescent state. */
1289 static void
1290 sorted_poll_thread_list(struct dp_netdev *dp,
1291 struct dp_netdev_pmd_thread ***list,
1292 size_t *n)
1293 {
1294 struct dp_netdev_pmd_thread *pmd;
1295 struct dp_netdev_pmd_thread **pmd_list;
1296 size_t k = 0, n_pmds;
1297
1298 n_pmds = cmap_count(&dp->poll_threads);
1299 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
1300
1301 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1302 if (k >= n_pmds) {
1303 break;
1304 }
1305 pmd_list[k++] = pmd;
1306 }
1307
1308 qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
1309
1310 *list = pmd_list;
1311 *n = k;
1312 }
1313
1314 static void
1315 dpif_netdev_subtable_lookup_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
1316 const char *argv[] OVS_UNUSED,
1317 void *aux OVS_UNUSED)
1318 {
1319 /* Get a list of all lookup functions. */
1320 struct dpcls_subtable_lookup_info_t *lookup_funcs = NULL;
1321 int32_t count = dpcls_subtable_lookup_info_get(&lookup_funcs);
1322 if (count < 0) {
1323 unixctl_command_reply_error(conn, "error getting lookup names");
1324 return;
1325 }
1326
1327 /* Add all lookup functions to reply string. */
1328 struct ds reply = DS_EMPTY_INITIALIZER;
1329 ds_put_cstr(&reply, "Available lookup functions (priority : name)\n");
1330 for (int i = 0; i < count; i++) {
1331 ds_put_format(&reply, " %d : %s\n", lookup_funcs[i].prio,
1332 lookup_funcs[i].name);
1333 }
1334 unixctl_command_reply(conn, ds_cstr(&reply));
1335 ds_destroy(&reply);
1336 }
1337
1338 static void
1339 dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, int argc,
1340 const char *argv[], void *aux OVS_UNUSED)
1341 {
1342 /* This function requires 2 parameters (argv[1] and argv[2]) to execute.
1343      * argv[1] is the name of the subtable lookup function
1344 * argv[2] is priority
1345 * argv[3] is the datapath name (optional if only 1 datapath exists)
1346 */
1347 const char *func_name = argv[1];
1348
1349 errno = 0;
1350 char *err_char;
1351 uint32_t new_prio = strtoul(argv[2], &err_char, 10);
1352 if (errno != 0 || new_prio > UINT8_MAX) {
1353 unixctl_command_reply_error(conn,
1354 "error converting priority, use integer in range 0-255\n");
1355 return;
1356 }
1357
1358 int32_t err = dpcls_subtable_set_prio(func_name, new_prio);
1359 if (err) {
1360 unixctl_command_reply_error(conn,
1361 "error, subtable lookup function not found\n");
1362 return;
1363 }
1364
1365 /* argv[3] is optional datapath instance. If no datapath name is provided
1366 * and only one datapath exists, the one existing datapath is reprobed.
1367 */
1368 ovs_mutex_lock(&dp_netdev_mutex);
1369 struct dp_netdev *dp = NULL;
1370
1371 if (argc == 4) {
1372 dp = shash_find_data(&dp_netdevs, argv[3]);
1373 } else if (shash_count(&dp_netdevs) == 1) {
1374 dp = shash_first(&dp_netdevs)->data;
1375 }
1376
1377 if (!dp) {
1378 ovs_mutex_unlock(&dp_netdev_mutex);
1379 unixctl_command_reply_error(conn,
1380 "please specify an existing datapath");
1381 return;
1382 }
1383
1384 /* Get PMD threads list, required to get DPCLS instances. */
1385 size_t n;
1386 uint32_t lookup_dpcls_changed = 0;
1387 uint32_t lookup_subtable_changed = 0;
1388 struct dp_netdev_pmd_thread **pmd_list;
1389 sorted_poll_thread_list(dp, &pmd_list, &n);
1390
1391     /* Take the port mutex, as the HMAP of ports is iterated over below. */
1392 ovs_mutex_lock(&dp->port_mutex);
1393
1394 for (size_t i = 0; i < n; i++) {
1395 struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1396 if (pmd->core_id == NON_PMD_CORE_ID) {
1397 continue;
1398 }
1399
1400 struct dp_netdev_port *port = NULL;
1401 HMAP_FOR_EACH (port, node, &dp->ports) {
1402 odp_port_t in_port = port->port_no;
1403 struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
1404 if (!cls) {
1405 continue;
1406 }
1407 uint32_t subtbl_changes = dpcls_subtable_lookup_reprobe(cls);
1408 if (subtbl_changes) {
1409 lookup_dpcls_changed++;
1410 lookup_subtable_changed += subtbl_changes;
1411 }
1412 }
1413 }
1414
1415     /* Release the port mutex before the dp_netdev mutex. */
1416 ovs_mutex_unlock(&dp->port_mutex);
1417 ovs_mutex_unlock(&dp_netdev_mutex);
1418
1419 struct ds reply = DS_EMPTY_INITIALIZER;
1420 ds_put_format(&reply,
1421 "Lookup priority change affected %d dpcls ports and %d subtables.\n",
1422 lookup_dpcls_changed, lookup_subtable_changed);
1423 const char *reply_str = ds_cstr(&reply);
1424 unixctl_command_reply(conn, reply_str);
1425 VLOG_INFO("%s", reply_str);
1426 ds_destroy(&reply);
1427 }
1428
1429 static void
1430 dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1431 const char *argv[], void *aux OVS_UNUSED)
1432 {
1433 struct ds reply = DS_EMPTY_INITIALIZER;
1434 struct dp_netdev *dp = NULL;
1435
1436 ovs_mutex_lock(&dp_netdev_mutex);
1437
1438 if (argc == 2) {
1439 dp = shash_find_data(&dp_netdevs, argv[1]);
1440 } else if (shash_count(&dp_netdevs) == 1) {
1441 /* There's only one datapath */
1442 dp = shash_first(&dp_netdevs)->data;
1443 }
1444
1445 if (!dp) {
1446 ovs_mutex_unlock(&dp_netdev_mutex);
1447 unixctl_command_reply_error(conn,
1448 "please specify an existing datapath");
1449 return;
1450 }
1451
1452 dp_netdev_request_reconfigure(dp);
1453 ovs_mutex_unlock(&dp_netdev_mutex);
1454 ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1455 unixctl_command_reply(conn, ds_cstr(&reply));
1456 ds_destroy(&reply);
1457 }
1458
1459 static void
1460 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1461 void *aux)
1462 {
1463 struct ds reply = DS_EMPTY_INITIALIZER;
1464 struct dp_netdev_pmd_thread **pmd_list;
1465 struct dp_netdev *dp = NULL;
1466 enum pmd_info_type type = *(enum pmd_info_type *) aux;
1467 unsigned int core_id;
1468 bool filter_on_pmd = false;
1469 size_t n;
1470
1471 ovs_mutex_lock(&dp_netdev_mutex);
1472
1473 while (argc > 1) {
1474 if (!strcmp(argv[1], "-pmd") && argc > 2) {
1475 if (str_to_uint(argv[2], 10, &core_id)) {
1476 filter_on_pmd = true;
1477 }
1478 argc -= 2;
1479 argv += 2;
1480 } else {
1481 dp = shash_find_data(&dp_netdevs, argv[1]);
1482 argc -= 1;
1483 argv += 1;
1484 }
1485 }
1486
1487 if (!dp) {
1488 if (shash_count(&dp_netdevs) == 1) {
1489 /* There's only one datapath */
1490 dp = shash_first(&dp_netdevs)->data;
1491 } else {
1492 ovs_mutex_unlock(&dp_netdev_mutex);
1493 unixctl_command_reply_error(conn,
1494 "please specify an existing datapath");
1495 return;
1496 }
1497 }
1498
1499 sorted_poll_thread_list(dp, &pmd_list, &n);
1500 for (size_t i = 0; i < n; i++) {
1501 struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1502 if (!pmd) {
1503 break;
1504 }
1505 if (filter_on_pmd && pmd->core_id != core_id) {
1506 continue;
1507 }
1508 if (type == PMD_INFO_SHOW_RXQ) {
1509 pmd_info_show_rxq(&reply, pmd);
1510 } else if (type == PMD_INFO_CLEAR_STATS) {
1511 pmd_perf_stats_clear(&pmd->perf_stats);
1512 } else if (type == PMD_INFO_SHOW_STATS) {
1513 pmd_info_show_stats(&reply, pmd);
1514 } else if (type == PMD_INFO_PERF_SHOW) {
1515 pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
1516 }
1517 }
1518 free(pmd_list);
1519
1520 ovs_mutex_unlock(&dp_netdev_mutex);
1521
1522 unixctl_command_reply(conn, ds_cstr(&reply));
1523 ds_destroy(&reply);
1524 }
1525
1526 static void
1527 pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1528 const char *argv[],
1529 void *aux OVS_UNUSED)
1530 {
1531 struct pmd_perf_params par;
1532 long int it_hist = 0, ms_hist = 0;
1533 par.histograms = true;
1534
1535 while (argc > 1) {
1536 if (!strcmp(argv[1], "-nh")) {
1537 par.histograms = false;
1538 argc -= 1;
1539 argv += 1;
1540 } else if (!strcmp(argv[1], "-it") && argc > 2) {
1541 it_hist = strtol(argv[2], NULL, 10);
1542 if (it_hist < 0) {
1543 it_hist = 0;
1544 } else if (it_hist > HISTORY_LEN) {
1545 it_hist = HISTORY_LEN;
1546 }
1547 argc -= 2;
1548 argv += 2;
1549 } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1550 ms_hist = strtol(argv[2], NULL, 10);
1551 if (ms_hist < 0) {
1552 ms_hist = 0;
1553 } else if (ms_hist > HISTORY_LEN) {
1554 ms_hist = HISTORY_LEN;
1555 }
1556 argc -= 2;
1557 argv += 2;
1558 } else {
1559 break;
1560 }
1561 }
1562 par.iter_hist_len = it_hist;
1563 par.ms_hist_len = ms_hist;
1564 par.command_type = PMD_INFO_PERF_SHOW;
1565 dpif_netdev_pmd_info(conn, argc, argv, &par);
1566 }
1567
1568 static void
1569 dpif_netdev_bond_show(struct unixctl_conn *conn, int argc,
1570 const char *argv[], void *aux OVS_UNUSED)
1571 {
1572 struct ds reply = DS_EMPTY_INITIALIZER;
1573 struct dp_netdev *dp = NULL;
1574
1575 ovs_mutex_lock(&dp_netdev_mutex);
1576 if (argc == 2) {
1577 dp = shash_find_data(&dp_netdevs, argv[1]);
1578 } else if (shash_count(&dp_netdevs) == 1) {
1579 /* There's only one datapath. */
1580 dp = shash_first(&dp_netdevs)->data;
1581 }
1582 if (!dp) {
1583 ovs_mutex_unlock(&dp_netdev_mutex);
1584 unixctl_command_reply_error(conn,
1585 "please specify an existing datapath");
1586 return;
1587 }
1588
1589 if (cmap_count(&dp->tx_bonds) > 0) {
1590 struct tx_bond *dp_bond_entry;
1591
1592 ds_put_cstr(&reply, "Bonds:\n");
1593 CMAP_FOR_EACH (dp_bond_entry, node, &dp->tx_bonds) {
1594 ds_put_format(&reply, " bond-id %"PRIu32":\n",
1595 dp_bond_entry->bond_id);
1596 for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
1597 uint32_t member_id = odp_to_u32(
1598 dp_bond_entry->member_buckets[bucket].member_id);
1599 ds_put_format(&reply,
1600 " bucket %d - member %"PRIu32"\n",
1601 bucket, member_id);
1602 }
1603 }
1604 }
1605 ovs_mutex_unlock(&dp_netdev_mutex);
1606 unixctl_command_reply(conn, ds_cstr(&reply));
1607 ds_destroy(&reply);
1608 }
1609
1610 \f
1611 static int
1612 dpif_netdev_init(void)
1613 {
1614 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
1615 clear_aux = PMD_INFO_CLEAR_STATS,
1616 poll_aux = PMD_INFO_SHOW_RXQ;
1617
1618 unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1619 0, 3, dpif_netdev_pmd_info,
1620 (void *)&show_aux);
1621 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1622 0, 3, dpif_netdev_pmd_info,
1623 (void *)&clear_aux);
1624 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
1625 0, 3, dpif_netdev_pmd_info,
1626 (void *)&poll_aux);
1627 unixctl_command_register("dpif-netdev/pmd-perf-show",
1628 "[-nh] [-it iter-history-len]"
1629 " [-ms ms-history-len]"
1630 " [-pmd core] [dp]",
1631 0, 8, pmd_perf_show_cmd,
1632 NULL);
1633 unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1634 0, 1, dpif_netdev_pmd_rebalance,
1635 NULL);
1636 unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1637 "on|off [-b before] [-a after] [-e|-ne] "
1638 "[-us usec] [-q qlen]",
1639 0, 10, pmd_perf_log_set_cmd,
1640 NULL);
1641 unixctl_command_register("dpif-netdev/bond-show", "[dp]",
1642 0, 1, dpif_netdev_bond_show,
1643 NULL);
1644 unixctl_command_register("dpif-netdev/subtable-lookup-prio-set",
1645 "[lookup_func] [prio] [dp]",
1646 2, 3, dpif_netdev_subtable_lookup_set,
1647 NULL);
1648 unixctl_command_register("dpif-netdev/subtable-lookup-prio-get", "",
1649 0, 0, dpif_netdev_subtable_lookup_get,
1650 NULL);
1651 return 0;
1652 }
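
/* Example invocations of the commands registered above (the lookup function
 * name "generic" is only an assumed example; the available names can be
 * listed with subtable-lookup-prio-get):
 *
 *     ovs-appctl dpif-netdev/pmd-stats-show
 *     ovs-appctl dpif-netdev/pmd-rxq-show -pmd 3
 *     ovs-appctl dpif-netdev/subtable-lookup-prio-set generic 2
 */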
1653
1654 static int
1655 dpif_netdev_enumerate(struct sset *all_dps,
1656 const struct dpif_class *dpif_class)
1657 {
1658 struct shash_node *node;
1659
1660 ovs_mutex_lock(&dp_netdev_mutex);
1661 SHASH_FOR_EACH(node, &dp_netdevs) {
1662 struct dp_netdev *dp = node->data;
1663 if (dpif_class != dp->class) {
1664 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1665 * If the class doesn't match, skip this dpif. */
1666 continue;
1667 }
1668 sset_add(all_dps, node->name);
1669 }
1670 ovs_mutex_unlock(&dp_netdev_mutex);
1671
1672 return 0;
1673 }
1674
1675 static bool
1676 dpif_netdev_class_is_dummy(const struct dpif_class *class)
1677 {
1678 return class != &dpif_netdev_class;
1679 }
1680
1681 static const char *
1682 dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
1683 {
1684 return strcmp(type, "internal") ? type
1685 : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
1686 : "tap";
1687 }
1688
1689 static struct dpif *
1690 create_dpif_netdev(struct dp_netdev *dp)
1691 {
1692 uint16_t netflow_id = hash_string(dp->name, 0);
1693 struct dpif_netdev *dpif;
1694
1695 ovs_refcount_ref(&dp->ref_cnt);
1696
1697 dpif = xmalloc(sizeof *dpif);
1698 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1699 dpif->dp = dp;
1700 dpif->last_port_seq = seq_read(dp->port_seq);
1701
1702 return &dpif->dpif;
1703 }
1704
1705 /* Choose an unused, non-zero port number and return it on success.
1706 * Return ODPP_NONE on failure. */
1707 static odp_port_t
1708 choose_port(struct dp_netdev *dp, const char *name)
1709 OVS_REQUIRES(dp->port_mutex)
1710 {
1711 uint32_t port_no;
1712
1713 if (dp->class != &dpif_netdev_class) {
1714 const char *p;
1715 int start_no = 0;
1716
1717 /* If the port name begins with "br", start the number search at
1718 * 100 to make writing tests easier. */
1719 if (!strncmp(name, "br", 2)) {
1720 start_no = 100;
1721 }
1722
1723 /* If the port name contains a number, try to assign that port number.
1724 * This can make writing unit tests easier because port numbers are
1725 * predictable. */
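/* E.g., on a dummy datapath a port named "br3" would get port number
 * 103 (100 + 3), provided that number is still free. */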
1726 for (p = name; *p != '\0'; p++) {
1727 if (isdigit((unsigned char) *p)) {
1728 port_no = start_no + strtol(p, NULL, 10);
1729 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1730 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1731 return u32_to_odp(port_no);
1732 }
1733 break;
1734 }
1735 }
1736 }
1737
1738 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1739 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1740 return u32_to_odp(port_no);
1741 }
1742 }
1743
1744 return ODPP_NONE;
1745 }
1746
1747 static int
1748 create_dp_netdev(const char *name, const struct dpif_class *class,
1749 struct dp_netdev **dpp)
1750 OVS_REQUIRES(dp_netdev_mutex)
1751 {
1752 static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER;
1753 struct dp_netdev *dp;
1754 int error;
1755
1756 /* Avoid estimating TSC frequency for the dummy datapath so as not to
1757 * slow down unit tests. */
1758 if (!dpif_netdev_class_is_dummy(class)
1759 && ovsthread_once_start(&tsc_freq_check)) {
1760 pmd_perf_estimate_tsc_frequency();
1761 ovsthread_once_done(&tsc_freq_check);
1762 }
1763
1764 dp = xzalloc(sizeof *dp);
1765 shash_add(&dp_netdevs, name, dp);
1766
1767 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1768 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1769 ovs_refcount_init(&dp->ref_cnt);
1770 atomic_flag_clear(&dp->destroyed);
1771
1772 ovs_mutex_init_recursive(&dp->port_mutex);
1773 hmap_init(&dp->ports);
1774 dp->port_seq = seq_create();
1775 ovs_mutex_init(&dp->bond_mutex);
1776 cmap_init(&dp->tx_bonds);
1777
1778 fat_rwlock_init(&dp->upcall_rwlock);
1779
1780 dp->reconfigure_seq = seq_create();
1781 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1782
1783 for (int i = 0; i < N_METER_LOCKS; ++i) {
1784 ovs_mutex_init_adaptive(&dp->meter_locks[i]);
1785 }
1786
1787 /* Disable upcalls by default. */
1788 dp_netdev_disable_upcall(dp);
1789 dp->upcall_aux = NULL;
1790 dp->upcall_cb = NULL;
1791
1792 dp->conntrack = conntrack_init();
1793
1794 atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1795 atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
1796
1797 cmap_init(&dp->poll_threads);
1798 dp->pmd_rxq_assign_cyc = true;
1799
1800 ovs_mutex_init(&dp->tx_qid_pool_mutex);
1801 /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1802 dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1803
1804 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1805 ovsthread_key_create(&dp->per_pmd_key, NULL);
1806
1807 ovs_mutex_lock(&dp->port_mutex);
1808 /* non-PMD will be created before all other threads and will
1809 * allocate static_tx_qid = 0. */
1810 dp_netdev_set_nonpmd(dp);
1811
1812 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1813 "internal"),
1814 ODPP_LOCAL);
1815 ovs_mutex_unlock(&dp->port_mutex);
1816 if (error) {
1817 dp_netdev_free(dp);
1818 return error;
1819 }
1820
1821 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1822 *dpp = dp;
1823 return 0;
1824 }
1825
1826 static void
1827 dp_netdev_request_reconfigure(struct dp_netdev *dp)
1828 {
1829 seq_change(dp->reconfigure_seq);
1830 }
1831
1832 static bool
1833 dp_netdev_is_reconf_required(struct dp_netdev *dp)
1834 {
1835 return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1836 }
1837
1838 static int
1839 dpif_netdev_open(const struct dpif_class *class, const char *name,
1840 bool create, struct dpif **dpifp)
1841 {
1842 struct dp_netdev *dp;
1843 int error;
1844
1845 ovs_mutex_lock(&dp_netdev_mutex);
1846 dp = shash_find_data(&dp_netdevs, name);
1847 if (!dp) {
1848 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1849 } else {
1850 error = (dp->class != class ? EINVAL
1851 : create ? EEXIST
1852 : 0);
1853 }
1854 if (!error) {
1855 *dpifp = create_dpif_netdev(dp);
1856 }
1857 ovs_mutex_unlock(&dp_netdev_mutex);
1858
1859 return error;
1860 }
1861
1862 static void
1863 dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1864 OVS_NO_THREAD_SAFETY_ANALYSIS
1865 {
1866 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1867 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1868
1869 /* Before freeing a lock we should release it */
1870 fat_rwlock_unlock(&dp->upcall_rwlock);
1871 fat_rwlock_destroy(&dp->upcall_rwlock);
1872 }
1873
1874 static void
1875 dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
1876 OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
1877 {
1878 if (dp->meters[meter_id]) {
1879 free(dp->meters[meter_id]);
1880 dp->meters[meter_id] = NULL;
1881 }
1882 }
1883
1884 static uint32_t
1885 hash_bond_id(uint32_t bond_id)
1886 {
1887 return hash_int(bond_id, 0);
1888 }
1889
1890 /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1891 * through the 'dp_netdevs' shash while freeing 'dp'. */
1892 static void
1893 dp_netdev_free(struct dp_netdev *dp)
1894 OVS_REQUIRES(dp_netdev_mutex)
1895 {
1896 struct dp_netdev_port *port, *next;
1897 struct tx_bond *bond;
1898
1899 shash_find_and_delete(&dp_netdevs, dp->name);
1900
1901 ovs_mutex_lock(&dp->port_mutex);
1902 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
1903 do_del_port(dp, port);
1904 }
1905 ovs_mutex_unlock(&dp->port_mutex);
1906
1907 ovs_mutex_lock(&dp->bond_mutex);
1908 CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
1909 cmap_remove(&dp->tx_bonds, &bond->node, hash_bond_id(bond->bond_id));
1910 ovsrcu_postpone(free, bond);
1911 }
1912 ovs_mutex_unlock(&dp->bond_mutex);
1913
1914 dp_netdev_destroy_all_pmds(dp, true);
1915 cmap_destroy(&dp->poll_threads);
1916
1917 ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1918 id_pool_destroy(dp->tx_qid_pool);
1919
1920 ovs_mutex_destroy(&dp->non_pmd_mutex);
1921 ovsthread_key_delete(dp->per_pmd_key);
1922
1923 conntrack_destroy(dp->conntrack);
1924
1925
1926 seq_destroy(dp->reconfigure_seq);
1927
1928 seq_destroy(dp->port_seq);
1929 hmap_destroy(&dp->ports);
1930 ovs_mutex_destroy(&dp->port_mutex);
1931
1932 cmap_destroy(&dp->tx_bonds);
1933 ovs_mutex_destroy(&dp->bond_mutex);
1934
1935 /* Upcalls must be disabled at this point */
1936 dp_netdev_destroy_upcall_lock(dp);
1937
1938 int i;
1939
1940 for (i = 0; i < MAX_METERS; ++i) {
1941 meter_lock(dp, i);
1942 dp_delete_meter(dp, i);
1943 meter_unlock(dp, i);
1944 }
1945 for (i = 0; i < N_METER_LOCKS; ++i) {
1946 ovs_mutex_destroy(&dp->meter_locks[i]);
1947 }
1948
1949 free(dp->pmd_cmask);
1950 free(CONST_CAST(char *, dp->name));
1951 free(dp);
1952 }
1953
1954 static void
1955 dp_netdev_unref(struct dp_netdev *dp)
1956 {
1957 if (dp) {
1958 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1959 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1960 ovs_mutex_lock(&dp_netdev_mutex);
1961 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1962 dp_netdev_free(dp);
1963 }
1964 ovs_mutex_unlock(&dp_netdev_mutex);
1965 }
1966 }
1967
1968 static void
1969 dpif_netdev_close(struct dpif *dpif)
1970 {
1971 struct dp_netdev *dp = get_dp_netdev(dpif);
1972
1973 dp_netdev_unref(dp);
1974 free(dpif);
1975 }
1976
1977 static int
1978 dpif_netdev_destroy(struct dpif *dpif)
1979 {
1980 struct dp_netdev *dp = get_dp_netdev(dpif);
1981
1982 if (!atomic_flag_test_and_set(&dp->destroyed)) {
1983 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1984 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1985 OVS_NOT_REACHED();
1986 }
1987 }
1988
1989 return 0;
1990 }
1991
1992 /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1993 * load/store semantics. While the increment is not atomic, the load and
1994 * store operations are, making it impossible to read inconsistent values.
1995 *
1996 * This is used to update thread local stats counters. */
1997 static void
1998 non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1999 {
2000 unsigned long long tmp;
2001
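/* The counter is only ever written by its owning thread, so this
 * non-atomic read-modify-write cannot lose updates from concurrent
 * writers; readers may observe a slightly stale value, but never a
 * torn one. */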
2002 atomic_read_relaxed(var, &tmp);
2003 tmp += n;
2004 atomic_store_relaxed(var, tmp);
2005 }
2006
2007 static int
2008 dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
2009 {
2010 struct dp_netdev *dp = get_dp_netdev(dpif);
2011 struct dp_netdev_pmd_thread *pmd;
2012 uint64_t pmd_stats[PMD_N_STATS];
2013
2014 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
2015 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2016 stats->n_flows += cmap_count(&pmd->flow_table);
2017 pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
2018 stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
2019 stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
2020 stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
2021 stats->n_missed += pmd_stats[PMD_STAT_MISS];
2022 stats->n_lost += pmd_stats[PMD_STAT_LOST];
2023 }
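/* Mask-based statistics are not tracked by the userspace datapath;
 * report placeholder maximum values instead. */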
2024 stats->n_masks = UINT32_MAX;
2025 stats->n_mask_hit = UINT64_MAX;
2026
2027 return 0;
2028 }
2029
2030 static void
2031 dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
2032 {
2033 if (pmd->core_id == NON_PMD_CORE_ID) {
2034 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
2035 ovs_mutex_lock(&pmd->port_mutex);
2036 pmd_load_cached_ports(pmd);
2037 ovs_mutex_unlock(&pmd->port_mutex);
2038 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
2039 return;
2040 }
2041
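/* Change reload_seq before setting 'reload'; the release store
 * guarantees that a thread reading 'reload' with acquire semantics
 * also sees the new sequence value. */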
2042 seq_change(pmd->reload_seq);
2043 atomic_store_explicit(&pmd->reload, true, memory_order_release);
2044 }
2045
2046 static uint32_t
2047 hash_port_no(odp_port_t port_no)
2048 {
2049 return hash_int(odp_to_u32(port_no), 0);
2050 }
2051
2052 static int
2053 port_create(const char *devname, const char *type,
2054 odp_port_t port_no, struct dp_netdev_port **portp)
2055 {
2056 struct dp_netdev_port *port;
2057 enum netdev_flags flags;
2058 struct netdev *netdev;
2059 int error;
2060
2061 *portp = NULL;
2062
2063 /* Open and validate network device. */
2064 error = netdev_open(devname, type, &netdev);
2065 if (error) {
2066 return error;
2067 }
2068 /* XXX reject non-Ethernet devices */
2069
2070 netdev_get_flags(netdev, &flags);
2071 if (flags & NETDEV_LOOPBACK) {
2072 VLOG_ERR("%s: cannot add a loopback device", devname);
2073 error = EINVAL;
2074 goto out;
2075 }
2076
2077 port = xzalloc(sizeof *port);
2078 port->port_no = port_no;
2079 port->netdev = netdev;
2080 port->type = xstrdup(type);
2081 port->sf = NULL;
2082 port->emc_enabled = true;
2083 port->need_reconfigure = true;
2084 ovs_mutex_init(&port->txq_used_mutex);
2085
2086 *portp = port;
2087
2088 return 0;
2089
2090 out:
2091 netdev_close(netdev);
2092 return error;
2093 }
2094
2095 static int
2096 do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
2097 odp_port_t port_no)
2098 OVS_REQUIRES(dp->port_mutex)
2099 {
2100 struct netdev_saved_flags *sf;
2101 struct dp_netdev_port *port;
2102 int error;
2103
2104 /* Reject devices already in 'dp'. */
2105 if (!get_port_by_name(dp, devname, &port)) {
2106 return EEXIST;
2107 }
2108
2109 error = port_create(devname, type, port_no, &port);
2110 if (error) {
2111 return error;
2112 }
2113
2114 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
2115 seq_change(dp->port_seq);
2116
2117 reconfigure_datapath(dp);
2118
2119 /* Check that port was successfully configured. */
2120 if (!dp_netdev_lookup_port(dp, port_no)) {
2121 return EINVAL;
2122 }
2123
2124 /* Updating device flags triggers an if_notifier, which triggers a bridge
2125 * reconfiguration and another attempt to add this port, leading to an
2126 * infinite loop if the device is configured incorrectly and cannot be
2127 * added. We set promiscuous mode only after a successful reconfiguration,
2128 * since at that point we already know the device is properly configured. */
2129 error = netdev_turn_flags_on(port->netdev, NETDEV_PROMISC, &sf);
2130 if (error) {
2131 VLOG_ERR("%s: cannot set promisc flag", devname);
2132 do_del_port(dp, port);
2133 return error;
2134 }
2135 port->sf = sf;
2136
2137 return 0;
2138 }
2139
2140 static int
2141 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
2142 odp_port_t *port_nop)
2143 {
2144 struct dp_netdev *dp = get_dp_netdev(dpif);
2145 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
2146 const char *dpif_port;
2147 odp_port_t port_no;
2148 int error;
2149
2150 ovs_mutex_lock(&dp->port_mutex);
2151 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
2152 if (*port_nop != ODPP_NONE) {
2153 port_no = *port_nop;
2154 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
2155 } else {
2156 port_no = choose_port(dp, dpif_port);
2157 error = port_no == ODPP_NONE ? EFBIG : 0;
2158 }
2159 if (!error) {
2160 *port_nop = port_no;
2161 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
2162 }
2163 ovs_mutex_unlock(&dp->port_mutex);
2164
2165 return error;
2166 }
2167
2168 static int
2169 dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
2170 {
2171 struct dp_netdev *dp = get_dp_netdev(dpif);
2172 int error;
2173
2174 ovs_mutex_lock(&dp->port_mutex);
2175 if (port_no == ODPP_LOCAL) {
2176 error = EINVAL;
2177 } else {
2178 struct dp_netdev_port *port;
2179
2180 error = get_port_by_number(dp, port_no, &port);
2181 if (!error) {
2182 do_del_port(dp, port);
2183 }
2184 }
2185 ovs_mutex_unlock(&dp->port_mutex);
2186
2187 return error;
2188 }
2189
2190 static bool
2191 is_valid_port_number(odp_port_t port_no)
2192 {
2193 return port_no != ODPP_NONE;
2194 }
2195
2196 static struct dp_netdev_port *
2197 dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
2198 OVS_REQUIRES(dp->port_mutex)
2199 {
2200 struct dp_netdev_port *port;
2201
2202 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
2203 if (port->port_no == port_no) {
2204 return port;
2205 }
2206 }
2207 return NULL;
2208 }
2209
2210 static int
2211 get_port_by_number(struct dp_netdev *dp,
2212 odp_port_t port_no, struct dp_netdev_port **portp)
2213 OVS_REQUIRES(dp->port_mutex)
2214 {
2215 if (!is_valid_port_number(port_no)) {
2216 *portp = NULL;
2217 return EINVAL;
2218 } else {
2219 *portp = dp_netdev_lookup_port(dp, port_no);
2220 return *portp ? 0 : ENODEV;
2221 }
2222 }
2223
2224 static void
2225 port_destroy(struct dp_netdev_port *port)
2226 {
2227 if (!port) {
2228 return;
2229 }
2230
2231 netdev_close(port->netdev);
2232 netdev_restore_flags(port->sf);
2233
2234 for (unsigned i = 0; i < port->n_rxq; i++) {
2235 netdev_rxq_close(port->rxqs[i].rx);
2236 }
2237 ovs_mutex_destroy(&port->txq_used_mutex);
2238 free(port->rxq_affinity_list);
2239 free(port->txq_used);
2240 free(port->rxqs);
2241 free(port->type);
2242 free(port);
2243 }
2244
2245 static int
2246 get_port_by_name(struct dp_netdev *dp,
2247 const char *devname, struct dp_netdev_port **portp)
2248 OVS_REQUIRES(dp->port_mutex)
2249 {
2250 struct dp_netdev_port *port;
2251
2252 HMAP_FOR_EACH (port, node, &dp->ports) {
2253 if (!strcmp(netdev_get_name(port->netdev), devname)) {
2254 *portp = port;
2255 return 0;
2256 }
2257 }
2258
2259 /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
2260 * non-existent port. */
2261 return ENODEV;
2262 }
2263
2264 /* Returns 'true' if there is a port with pmd netdev. */
2265 static bool
2266 has_pmd_port(struct dp_netdev *dp)
2267 OVS_REQUIRES(dp->port_mutex)
2268 {
2269 struct dp_netdev_port *port;
2270
2271 HMAP_FOR_EACH (port, node, &dp->ports) {
2272 if (netdev_is_pmd(port->netdev)) {
2273 return true;
2274 }
2275 }
2276
2277 return false;
2278 }
2279
2280 static void
2281 do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
2282 OVS_REQUIRES(dp->port_mutex)
2283 {
2284 hmap_remove(&dp->ports, &port->node);
2285 seq_change(dp->port_seq);
2286
2287 reconfigure_datapath(dp);
2288
2289 port_destroy(port);
2290 }
2291
2292 static void
2293 answer_port_query(const struct dp_netdev_port *port,
2294 struct dpif_port *dpif_port)
2295 {
2296 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
2297 dpif_port->type = xstrdup(port->type);
2298 dpif_port->port_no = port->port_no;
2299 }
2300
2301 static int
2302 dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
2303 struct dpif_port *dpif_port)
2304 {
2305 struct dp_netdev *dp = get_dp_netdev(dpif);
2306 struct dp_netdev_port *port;
2307 int error;
2308
2309 ovs_mutex_lock(&dp->port_mutex);
2310 error = get_port_by_number(dp, port_no, &port);
2311 if (!error && dpif_port) {
2312 answer_port_query(port, dpif_port);
2313 }
2314 ovs_mutex_unlock(&dp->port_mutex);
2315
2316 return error;
2317 }
2318
2319 static int
2320 dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
2321 struct dpif_port *dpif_port)
2322 {
2323 struct dp_netdev *dp = get_dp_netdev(dpif);
2324 struct dp_netdev_port *port;
2325 int error;
2326
2327 ovs_mutex_lock(&dp->port_mutex);
2328 error = get_port_by_name(dp, devname, &port);
2329 if (!error && dpif_port) {
2330 answer_port_query(port, dpif_port);
2331 }
2332 ovs_mutex_unlock(&dp->port_mutex);
2333
2334 return error;
2335 }
2336
2337 static void
2338 dp_netdev_flow_free(struct dp_netdev_flow *flow)
2339 {
2340 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
2341 free(flow->dp_extra_info);
2342 free(flow);
2343 }
2344
2345 static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
2346 {
2347 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
2348 ovsrcu_postpone(dp_netdev_flow_free, flow);
2349 }
2350 }
2351
2352 static uint32_t
2353 dp_netdev_flow_hash(const ovs_u128 *ufid)
2354 {
2355 return ufid->u32[0];
2356 }
2357
2358 static inline struct dpcls *
2359 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
2360 odp_port_t in_port)
2361 {
2362 struct dpcls *cls;
2363 uint32_t hash = hash_port_no(in_port);
2364 CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
2365 if (cls->in_port == in_port) {
2366 /* Port classifier exists already */
2367 return cls;
2368 }
2369 }
2370 return NULL;
2371 }
2372
2373 static inline struct dpcls *
2374 dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
2375 odp_port_t in_port)
2376 OVS_REQUIRES(pmd->flow_mutex)
2377 {
2378 struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2379 uint32_t hash = hash_port_no(in_port);
2380
2381 if (!cls) {
2382 /* Create new classifier for in_port */
2383 cls = xmalloc(sizeof(*cls));
2384 dpcls_init(cls);
2385 cls->in_port = in_port;
2386 cmap_insert(&pmd->classifiers, &cls->node, hash);
2387 VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
2388 }
2389 return cls;
2390 }
2391
2392 #define MAX_FLOW_MARK (UINT32_MAX - 1)
2393 #define INVALID_FLOW_MARK 0
2394 /* A zero flow mark tells the HW to remove the mark. A packet carrying a
2395 * zero mark is received in SW as if it had no mark at all, so zero
2396 * cannot be used as a valid mark.
2397 */
2398
2399 struct megaflow_to_mark_data {
2400 const struct cmap_node node;
2401 ovs_u128 mega_ufid;
2402 uint32_t mark;
2403 };
2404
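/* Global flow mark state: 'megaflow_to_mark' maps a mega ufid to its
 * mark (1:1), 'mark_to_flow' maps a mark to the per-PMD flows that use
 * it (1:N), and 'pool' hands out unused mark values. */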
2405 struct flow_mark {
2406 struct cmap megaflow_to_mark;
2407 struct cmap mark_to_flow;
2408 struct id_pool *pool;
2409 };
2410
2411 static struct flow_mark flow_mark = {
2412 .megaflow_to_mark = CMAP_INITIALIZER,
2413 .mark_to_flow = CMAP_INITIALIZER,
2414 };
2415
2416 static uint32_t
2417 flow_mark_alloc(void)
2418 {
2419 uint32_t mark;
2420
2421 if (!flow_mark.pool) {
2422 /* Hasn't been initialized yet, do it here. */
2423 flow_mark.pool = id_pool_create(1, MAX_FLOW_MARK);
2424 }
2425
2426 if (id_pool_alloc_id(flow_mark.pool, &mark)) {
2427 return mark;
2428 }
2429
2430 return INVALID_FLOW_MARK;
2431 }
2432
2433 static void
2434 flow_mark_free(uint32_t mark)
2435 {
2436 id_pool_free_id(flow_mark.pool, mark);
2437 }
2438
2439 /* Associate a megaflow with a mark, which is a 1:1 mapping. */
2440 static void
2441 megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
2442 {
2443 size_t hash = dp_netdev_flow_hash(mega_ufid);
2444 struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
2445
2446 data->mega_ufid = *mega_ufid;
2447 data->mark = mark;
2448
2449 cmap_insert(&flow_mark.megaflow_to_mark,
2450 CONST_CAST(struct cmap_node *, &data->node), hash);
2451 }
2452
2453 /* Disassociate a megaflow from its mark. */
2454 static void
2455 megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
2456 {
2457 size_t hash = dp_netdev_flow_hash(mega_ufid);
2458 struct megaflow_to_mark_data *data;
2459
2460 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2461 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2462 cmap_remove(&flow_mark.megaflow_to_mark,
2463 CONST_CAST(struct cmap_node *, &data->node), hash);
2464 ovsrcu_postpone(free, data);
2465 return;
2466 }
2467 }
2468
2469 VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
2470 UUID_ARGS((struct uuid *)mega_ufid));
2471 }
2472
2473 static inline uint32_t
2474 megaflow_to_mark_find(const ovs_u128 *mega_ufid)
2475 {
2476 size_t hash = dp_netdev_flow_hash(mega_ufid);
2477 struct megaflow_to_mark_data *data;
2478
2479 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2480 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2481 return data->mark;
2482 }
2483 }
2484
2485 VLOG_DBG("Mark id for ufid "UUID_FMT" was not found\n",
2486 UUID_ARGS((struct uuid *)mega_ufid));
2487 return INVALID_FLOW_MARK;
2488 }
2489
2490 /* Associate a mark with a flow, which is a 1:N mapping. */
2491 static void
2492 mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
2493 {
2494 dp_netdev_flow_ref(flow);
2495
2496 cmap_insert(&flow_mark.mark_to_flow,
2497 CONST_CAST(struct cmap_node *, &flow->mark_node),
2498 hash_int(mark, 0));
2499 flow->mark = mark;
2500
2501 VLOG_DBG("Associated dp_netdev flow %p with mark %u mega_ufid "UUID_FMT,
2502 flow, mark, UUID_ARGS((struct uuid *) &flow->mega_ufid));
2503 }
2504
2505 static bool
2506 flow_mark_has_no_ref(uint32_t mark)
2507 {
2508 struct dp_netdev_flow *flow;
2509
2510 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2511 &flow_mark.mark_to_flow) {
2512 if (flow->mark == mark) {
2513 return false;
2514 }
2515 }
2516
2517 return true;
2518 }
2519
2520 static int
2521 mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd,
2522 struct dp_netdev_flow *flow)
2523 {
2524 const char *dpif_type_str = dpif_normalize_type(pmd->dp->class->type);
2525 struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
2526 &flow->mark_node);
2527 uint32_t mark = flow->mark;
2528 int ret = 0;
2529
2530 /* INVALID_FLOW_MARK may mean that the flow has been disassociated or
2531 * never associated. */
2532 if (OVS_UNLIKELY(mark == INVALID_FLOW_MARK)) {
2533 return EINVAL;
2534 }
2535
2536 cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0));
2537 flow->mark = INVALID_FLOW_MARK;
2538
2539 /*
2540 * If no flow is referencing the mark any more, remove the flow
2541 * from hardware and free the mark.
2542 */
2543 if (flow_mark_has_no_ref(mark)) {
2544 struct netdev *port;
2545 odp_port_t in_port = flow->flow.in_port.odp_port;
2546
2547 port = netdev_ports_get(in_port, dpif_type_str);
2548 if (port) {
2549 /* Taking a global 'port_mutex' to fulfill thread safety
2550 * restrictions for the netdev-offload-dpdk module. */
2551 ovs_mutex_lock(&pmd->dp->port_mutex);
2552 ret = netdev_flow_del(port, &flow->mega_ufid, NULL);
2553 ovs_mutex_unlock(&pmd->dp->port_mutex);
2554 netdev_close(port);
2555 }
2556
2557 flow_mark_free(mark);
2558 VLOG_DBG("Freed flow mark %u mega_ufid "UUID_FMT, mark,
2559 UUID_ARGS((struct uuid *) &flow->mega_ufid));
2560
2561 megaflow_to_mark_disassociate(&flow->mega_ufid);
2562 }
2563 dp_netdev_flow_unref(flow);
2564
2565 return ret;
2566 }
2567
2568 static void
2569 flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
2570 {
2571 struct dp_netdev_flow *flow;
2572
2573 CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) {
2574 if (flow->pmd_id == pmd->core_id) {
2575 queue_netdev_flow_del(pmd, flow);
2576 }
2577 }
2578 }
2579
2580 static struct dp_netdev_flow *
2581 mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
2582 const uint32_t mark)
2583 {
2584 struct dp_netdev_flow *flow;
2585
2586 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2587 &flow_mark.mark_to_flow) {
2588 if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
2589 flow->dead == false) {
2590 return flow;
2591 }
2592 }
2593
2594 return NULL;
2595 }
2596
2597 static struct dp_flow_offload_item *
2598 dp_netdev_alloc_flow_offload(struct dp_netdev_pmd_thread *pmd,
2599 struct dp_netdev_flow *flow,
2600 int op)
2601 {
2602 struct dp_flow_offload_item *offload;
2603
2604 offload = xzalloc(sizeof(*offload));
2605 offload->pmd = pmd;
2606 offload->flow = flow;
2607 offload->op = op;
2608
2609 dp_netdev_flow_ref(flow);
2610 dp_netdev_pmd_try_ref(pmd);
2611
2612 return offload;
2613 }
2614
2615 static void
2616 dp_netdev_free_flow_offload(struct dp_flow_offload_item *offload)
2617 {
2618 dp_netdev_pmd_unref(offload->pmd);
2619 dp_netdev_flow_unref(offload->flow);
2620
2621 free(offload->actions);
2622 free(offload);
2623 }
2624
2625 static void
2626 dp_netdev_append_flow_offload(struct dp_flow_offload_item *offload)
2627 {
2628 ovs_mutex_lock(&dp_flow_offload.mutex);
2629 ovs_list_push_back(&dp_flow_offload.list, &offload->node);
2630 xpthread_cond_signal(&dp_flow_offload.cond);
2631 ovs_mutex_unlock(&dp_flow_offload.mutex);
2632 }
2633
2634 static int
2635 dp_netdev_flow_offload_del(struct dp_flow_offload_item *offload)
2636 {
2637 return mark_to_flow_disassociate(offload->pmd, offload->flow);
2638 }
2639
2640 /*
2641 * There are two flow offload operations here: addition and modification.
2642 *
2643 * For flow addition, this function does:
2644 * - allocate a new flow mark id
2645 * - perform hardware flow offload
2646 * - associate the flow mark with flow and mega flow
2647 *
2648 * For flow modification, both the flow mark and the associations are
2649 * still valid, so only the second step (hardware flow offload) is needed.
2650 */
2651 static int
2652 dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
2653 {
2654 struct dp_netdev_pmd_thread *pmd = offload->pmd;
2655 struct dp_netdev_flow *flow = offload->flow;
2656 odp_port_t in_port = flow->flow.in_port.odp_port;
2657 const char *dpif_type_str = dpif_normalize_type(pmd->dp->class->type);
2658 bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2659 struct offload_info info;
2660 struct netdev *port;
2661 uint32_t mark;
2662 int ret;
2663
2664 if (flow->dead) {
2665 return -1;
2666 }
2667
2668 if (modification) {
2669 mark = flow->mark;
2670 ovs_assert(mark != INVALID_FLOW_MARK);
2671 } else {
2672 /*
2673 * If a mega flow has already been offloaded (from other PMD
2674 * instances), do not offload it again.
2675 */
2676 mark = megaflow_to_mark_find(&flow->mega_ufid);
2677 if (mark != INVALID_FLOW_MARK) {
2678 VLOG_DBG("Flow has already been offloaded with mark %u\n", mark);
2679 if (flow->mark != INVALID_FLOW_MARK) {
2680 ovs_assert(flow->mark == mark);
2681 } else {
2682 mark_to_flow_associate(mark, flow);
2683 }
2684 return 0;
2685 }
2686
2687 mark = flow_mark_alloc();
2688 if (mark == INVALID_FLOW_MARK) {
2689 VLOG_ERR("Failed to allocate flow mark!\n");
2690 return -1;
2691 }
2692 }
2693 info.flow_mark = mark;
2694
2695 port = netdev_ports_get(in_port, dpif_type_str);
2696 if (!port || netdev_vport_is_vport_class(port->netdev_class)) {
2697 netdev_close(port);
2698 goto err_free;
2699 }
2700 /* Taking a global 'port_mutex' to fulfill thread safety restrictions for
2701 * the netdev-offload-dpdk module. */
2702 ovs_mutex_lock(&pmd->dp->port_mutex);
2703 ret = netdev_flow_put(port, &offload->match,
2704 CONST_CAST(struct nlattr *, offload->actions),
2705 offload->actions_len, &flow->mega_ufid, &info,
2706 NULL);
2707 ovs_mutex_unlock(&pmd->dp->port_mutex);
2708 netdev_close(port);
2709
2710 if (ret) {
2711 goto err_free;
2712 }
2713
2714 if (!modification) {
2715 megaflow_to_mark_associate(&flow->mega_ufid, mark);
2716 mark_to_flow_associate(mark, flow);
2717 }
2718 return 0;
2719
2720 err_free:
2721 if (!modification) {
2722 flow_mark_free(mark);
2723 } else {
2724 mark_to_flow_disassociate(pmd, flow);
2725 }
2726 return -1;
2727 }
2728
2729 static void *
2730 dp_netdev_flow_offload_main(void *data OVS_UNUSED)
2731 {
2732 struct dp_flow_offload_item *offload;
2733 struct ovs_list *list;
2734 const char *op;
2735 int ret;
2736
2737 for (;;) {
2738 ovs_mutex_lock(&dp_flow_offload.mutex);
2739 if (ovs_list_is_empty(&dp_flow_offload.list)) {
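/* Quiesce while blocked on the condition variable so that an idle
 * offload thread does not postpone RCU grace periods. */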
2740 ovsrcu_quiesce_start();
2741 ovs_mutex_cond_wait(&dp_flow_offload.cond,
2742 &dp_flow_offload.mutex);
2743 ovsrcu_quiesce_end();
2744 }
2745 list = ovs_list_pop_front(&dp_flow_offload.list);
2746 offload = CONTAINER_OF(list, struct dp_flow_offload_item, node);
2747 ovs_mutex_unlock(&dp_flow_offload.mutex);
2748
2749 switch (offload->op) {
2750 case DP_NETDEV_FLOW_OFFLOAD_OP_ADD:
2751 op = "add";
2752 ret = dp_netdev_flow_offload_put(offload);
2753 break;
2754 case DP_NETDEV_FLOW_OFFLOAD_OP_MOD:
2755 op = "modify";
2756 ret = dp_netdev_flow_offload_put(offload);
2757 break;
2758 case DP_NETDEV_FLOW_OFFLOAD_OP_DEL:
2759 op = "delete";
2760 ret = dp_netdev_flow_offload_del(offload);
2761 break;
2762 default:
2763 OVS_NOT_REACHED();
2764 }
2765
2766 VLOG_DBG("%s to %s netdev flow "UUID_FMT,
2767 ret == 0 ? "succeeded" : "failed", op,
2768 UUID_ARGS((struct uuid *) &offload->flow->mega_ufid));
2769 dp_netdev_free_flow_offload(offload);
2770 ovsrcu_quiesce();
2771 }
2772
2773 return NULL;
2774 }
2775
2776 static void
2777 queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
2778 struct dp_netdev_flow *flow)
2779 {
2780 struct dp_flow_offload_item *offload;
2781
2782 if (ovsthread_once_start(&offload_thread_once)) {
2783 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2784 ovs_thread_create("dp_netdev_flow_offload",
2785 dp_netdev_flow_offload_main, NULL);
2786 ovsthread_once_done(&offload_thread_once);
2787 }
2788
2789 offload = dp_netdev_alloc_flow_offload(pmd, flow,
2790 DP_NETDEV_FLOW_OFFLOAD_OP_DEL);
2791 dp_netdev_append_flow_offload(offload);
2792 }
2793
2794 static void
2795 queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd,
2796 struct dp_netdev_flow *flow, struct match *match,
2797 const struct nlattr *actions, size_t actions_len)
2798 {
2799 struct dp_flow_offload_item *offload;
2800 int op;
2801
2802 if (!netdev_is_flow_api_enabled()) {
2803 return;
2804 }
2805
2806 if (ovsthread_once_start(&offload_thread_once)) {
2807 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2808 ovs_thread_create("dp_netdev_flow_offload",
2809 dp_netdev_flow_offload_main, NULL);
2810 ovsthread_once_done(&offload_thread_once);
2811 }
2812
2813 if (flow->mark != INVALID_FLOW_MARK) {
2814 op = DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2815 } else {
2816 op = DP_NETDEV_FLOW_OFFLOAD_OP_ADD;
2817 }
2818 offload = dp_netdev_alloc_flow_offload(pmd, flow, op);
2819 offload->match = *match;
2820 offload->actions = xmalloc(actions_len);
2821 memcpy(offload->actions, actions, actions_len);
2822 offload->actions_len = actions_len;
2823
2824 dp_netdev_append_flow_offload(offload);
2825 }
2826
2827 static void
2828 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
2829 struct dp_netdev_flow *flow)
2830 OVS_REQUIRES(pmd->flow_mutex)
2831 {
2832 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2833 struct dpcls *cls;
2834 odp_port_t in_port = flow->flow.in_port.odp_port;
2835
2836 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2837 ovs_assert(cls != NULL);
2838 dpcls_remove(cls, &flow->cr);
2839 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
2840 if (flow->mark != INVALID_FLOW_MARK) {
2841 queue_netdev_flow_del(pmd, flow);
2842 }
2843 flow->dead = true;
2844
2845 dp_netdev_flow_unref(flow);
2846 }
2847
2848 static void
2849 dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
2850 {
2851 struct dp_netdev_flow *netdev_flow;
2852
2853 ovs_mutex_lock(&pmd->flow_mutex);
2854 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2855 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2856 }
2857 ovs_mutex_unlock(&pmd->flow_mutex);
2858 }
2859
2860 static int
2861 dpif_netdev_flow_flush(struct dpif *dpif)
2862 {
2863 struct dp_netdev *dp = get_dp_netdev(dpif);
2864 struct dp_netdev_pmd_thread *pmd;
2865
2866 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2867 dp_netdev_pmd_flow_flush(pmd);
2868 }
2869
2870 return 0;
2871 }
2872
2873 struct dp_netdev_port_state {
2874 struct hmap_position position;
2875 char *name;
2876 };
2877
2878 static int
2879 dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2880 {
2881 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2882 return 0;
2883 }
2884
2885 static int
2886 dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
2887 struct dpif_port *dpif_port)
2888 {
2889 struct dp_netdev_port_state *state = state_;
2890 struct dp_netdev *dp = get_dp_netdev(dpif);
2891 struct hmap_node *node;
2892 int retval;
2893
2894 ovs_mutex_lock(&dp->port_mutex);
2895 node = hmap_at_position(&dp->ports, &state->position);
2896 if (node) {
2897 struct dp_netdev_port *port;
2898
2899 port = CONTAINER_OF(node, struct dp_netdev_port, node);
2900
2901 free(state->name);
2902 state->name = xstrdup(netdev_get_name(port->netdev));
2903 dpif_port->name = state->name;
2904 dpif_port->type = port->type;
2905 dpif_port->port_no = port->port_no;
2906
2907 retval = 0;
2908 } else {
2909 retval = EOF;
2910 }
2911 ovs_mutex_unlock(&dp->port_mutex);
2912
2913 return retval;
2914 }
2915
2916 static int
2917 dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
2918 {
2919 struct dp_netdev_port_state *state = state_;
2920 free(state->name);
2921 free(state);
2922 return 0;
2923 }
2924
2925 static int
2926 dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
2927 {
2928 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2929 uint64_t new_port_seq;
2930 int error;
2931
2932 new_port_seq = seq_read(dpif->dp->port_seq);
2933 if (dpif->last_port_seq != new_port_seq) {
2934 dpif->last_port_seq = new_port_seq;
2935 error = ENOBUFS;
2936 } else {
2937 error = EAGAIN;
2938 }
2939
2940 return error;
2941 }
2942
2943 static void
2944 dpif_netdev_port_poll_wait(const struct dpif *dpif_)
2945 {
2946 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2947
2948 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
2949 }
2950
2951 static struct dp_netdev_flow *
2952 dp_netdev_flow_cast(const struct dpcls_rule *cr)
2953 {
2954 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
2955 }
2956
2957 static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
2958 {
2959 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
2960 }
2961
2962 /* netdev_flow_key utilities.
2963 *
2964 * netdev_flow_key is basically a miniflow. We use these functions
2965 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2966 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2967 *
2968 * - Since we are dealing exclusively with miniflows created by
2969 * miniflow_extract(), if the map is different the miniflow is different.
2970 * Therefore we can be faster by comparing the map and the miniflow in a
2971 * single memcmp().
2972 * - These functions can be inlined by the compiler. */
2973
2974 /* Given the number of bits set in miniflow's maps, returns the size of the
2975 * 'netdev_flow_key.mf' */
2976 static inline size_t
2977 netdev_flow_key_size(size_t flow_u64s)
2978 {
2979 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
2980 }
2981
2982 static inline bool
2983 netdev_flow_key_equal(const struct netdev_flow_key *a,
2984 const struct netdev_flow_key *b)
2985 {
2986 /* 'b->len' may not be set yet. */
2987 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
2988 }
2989
2990 /* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
2991 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
2992 * generated by miniflow_extract. */
2993 static inline bool
2994 netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
2995 const struct miniflow *mf)
2996 {
2997 return !memcmp(&key->mf, mf, key->len);
2998 }
2999
3000 static inline void
3001 netdev_flow_key_clone(struct netdev_flow_key *dst,
3002 const struct netdev_flow_key *src)
3003 {
3004 memcpy(dst, src,
3005 offsetof(struct netdev_flow_key, mf) + src->len);
3006 }
3007
3008 /* Initialize a netdev_flow_key 'mask' from 'match'. */
3009 static inline void
3010 netdev_flow_mask_init(struct netdev_flow_key *mask,
3011 const struct match *match)
3012 {
3013 uint64_t *dst = miniflow_values(&mask->mf);
3014 struct flowmap fmap;
3015 uint32_t hash = 0;
3016 size_t idx;
3017
3018 /* Only check masks that make sense for the flow. */
3019 flow_wc_map(&match->flow, &fmap);
3020 flowmap_init(&mask->mf.map);
3021
3022 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
3023 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
3024
3025 if (mask_u64) {
3026 flowmap_set(&mask->mf.map, idx, 1);
3027 *dst++ = mask_u64;
3028 hash = hash_add64(hash, mask_u64);
3029 }
3030 }
3031
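/* Fold the flowmap itself into the hash so that masks that differ only
 * in which fields they cover still hash differently. */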
3032 map_t map;
3033
3034 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
3035 hash = hash_add64(hash, map);
3036 }
3037
3038 size_t n = dst - miniflow_get_values(&mask->mf);
3039
3040 mask->hash = hash_finish(hash, n * 8);
3041 mask->len = netdev_flow_key_size(n);
3042 }
3043
3044 /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
3045 static inline void
3046 netdev_flow_key_init_masked(struct netdev_flow_key *dst,
3047 const struct flow *flow,
3048 const struct netdev_flow_key *mask)
3049 {
3050 uint64_t *dst_u64 = miniflow_values(&dst->mf);
3051 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
3052 uint32_t hash = 0;
3053 uint64_t value;
3054
3055 dst->len = mask->len;
3056 dst->mf = mask->mf; /* Copy maps. */
3057
3058 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
3059 *dst_u64 = value & *mask_u64++;
3060 hash = hash_add64(hash, *dst_u64++);
3061 }
3062 dst->hash = hash_finish(hash,
3063 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
3064 }
3065
3066 static inline bool
3067 emc_entry_alive(struct emc_entry *ce)
3068 {
3069 return ce->flow && !ce->flow->dead;
3070 }
3071
3072 static void
3073 emc_clear_entry(struct emc_entry *ce)
3074 {
3075 if (ce->flow) {
3076 dp_netdev_flow_unref(ce->flow);
3077 ce->flow = NULL;
3078 }
3079 }
3080
3081 static inline void
3082 emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
3083 const struct netdev_flow_key *key)
3084 {
3085 if (ce->flow != flow) {
3086 if (ce->flow) {
3087 dp_netdev_flow_unref(ce->flow);
3088 }
3089
3090 if (dp_netdev_flow_ref(flow)) {
3091 ce->flow = flow;
3092 } else {
3093 ce->flow = NULL;
3094 }
3095 }
3096 if (key) {
3097 netdev_flow_key_clone(&ce->key, key);
3098 }
3099 }
3100
3101 static inline void
3102 emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
3103 struct dp_netdev_flow *flow)
3104 {
3105 struct emc_entry *to_be_replaced = NULL;
3106 struct emc_entry *current_entry;
3107
3108 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
3109 if (netdev_flow_key_equal(&current_entry->key, key)) {
3110 /* We found the entry with the 'mf' miniflow */
3111 emc_change_entry(current_entry, flow, NULL);
3112 return;
3113 }
3114
3115 /* Replacement policy: put the flow in an empty (not alive) entry, or
3116 * in the first entry where it can be */
3117 if (!to_be_replaced
3118 || (emc_entry_alive(to_be_replaced)
3119 && !emc_entry_alive(current_entry))
3120 || current_entry->key.hash < to_be_replaced->key.hash) {
3121 to_be_replaced = current_entry;
3122 }
3123 }
3124 /* We didn't find the miniflow in the cache.
3125 * The 'to_be_replaced' entry is where the new flow will be stored */
3126
3127 emc_change_entry(to_be_replaced, flow, key);
3128 }
3129
3130 static inline void
3131 emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
3132 const struct netdev_flow_key *key,
3133 struct dp_netdev_flow *flow)
3134 {
3135 /* Insert an entry into the EMC based on probability value 'min'. By
3136 * default the value is UINT32_MAX / 100, which yields an insertion
3137 * probability of 1/100, i.e. 1%. */
3138
3139 uint32_t min = pmd->ctx.emc_insert_min;
3140
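/* A zero 'min' disables EMC insertion altogether. */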
3141 if (min && random_uint32() <= min) {
3142 emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
3143 }
3144 }
3145
3146 static inline struct dp_netdev_flow *
3147 emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
3148 {
3149 struct emc_entry *current_entry;
3150
3151 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
3152 if (current_entry->key.hash == key->hash
3153 && emc_entry_alive(current_entry)
3154 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
3155
3156 /* We found the entry with the 'key->mf' miniflow */
3157 return current_entry->flow;
3158 }
3159 }
3160
3161 return NULL;
3162 }
3163
3164 static inline const struct cmap_node *
3165 smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
3166 {
3167 struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
3168 struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
3169 uint16_t sig = hash >> 16;
3170 uint16_t index = UINT16_MAX;
3171
3172 for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3173 if (bucket->sig[i] == sig) {
3174 index = bucket->flow_idx[i];
3175 break;
3176 }
3177 }
3178 if (index != UINT16_MAX) {
3179 return cmap_find_by_index(&pmd->flow_table, index);
3180 }
3181 return NULL;
3182 }
3183
3184 static void
3185 smc_clear_entry(struct smc_bucket *b, int idx)
3186 {
3187 b->flow_idx[idx] = UINT16_MAX;
3188 }
3189
3190 /* Insert the flow_table index into SMC. Insertion may fail when 1) SMC is
3191 * turned off, or 2) the flow_table index is larger than a uint16_t can hold.
3192 * If an SMC entry with the same signature already exists, its index is
3193 * updated. Otherwise, if an empty entry is available, that entry is taken.
3194 * If there is neither an empty entry nor one with the same signature, a
3195 * random entry in the hashed bucket is overwritten. */
3196 static inline void
3197 smc_insert(struct dp_netdev_pmd_thread *pmd,
3198 const struct netdev_flow_key *key,
3199 uint32_t hash)
3200 {
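/* The bucket is selected by the low-order bits of the key hash; the
 * 16-bit signature stored in a bucket entry is the upper half of the
 * same hash. */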
3201 struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
3202 struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
3203 uint16_t index;
3204 uint32_t cmap_index;
3205 bool smc_enable_db;
3206 int i;
3207
3208 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
3209 if (!smc_enable_db) {
3210 return;
3211 }
3212
3213 cmap_index = cmap_find_index(&pmd->flow_table, hash);
3214 index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
3215
3216 /* If the index is larger than SMC can handle (uint16_t), we don't
3217 * insert */
3218 if (index == UINT16_MAX) {
3219 return;
3220 }
3221
3222 /* If an entry with same signature already exists, update the index */
3223 uint16_t sig = key->hash >> 16;
3224 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3225 if (bucket->sig[i] == sig) {
3226 bucket->flow_idx[i] = index;
3227 return;
3228 }
3229 }
3230 /* If there is an empty entry, occupy it. */
3231 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3232 if (bucket->flow_idx[i] == UINT16_MAX) {
3233 bucket->sig[i] = sig;
3234 bucket->flow_idx[i] = index;
3235 return;
3236 }
3237 }
3238 /* Otherwise, pick a random entry. */
3239 i = random_uint32() % SMC_ENTRY_PER_BUCKET;
3240 bucket->sig[i] = sig;
3241 bucket->flow_idx[i] = index;
3242 }
3243
3244 static struct dp_netdev_flow *
3245 dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
3246 const struct netdev_flow_key *key,
3247 int *lookup_num_p)
3248 {
3249 struct dpcls *cls;
3250 struct dpcls_rule *rule;
3251 odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
3252 in_port.odp_port));
3253 struct dp_netdev_flow *netdev_flow = NULL;
3254
3255 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
3256 if (OVS_LIKELY(cls)) {
3257 dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
3258 netdev_flow = dp_netdev_flow_cast(rule);
3259 }
3260 return netdev_flow;
3261 }
3262
3263 static struct dp_netdev_flow *
3264 dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
3265 const ovs_u128 *ufidp, const struct nlattr *key,
3266 size_t key_len)
3267 {
3268 struct dp_netdev_flow *netdev_flow;
3269 struct flow flow;
3270 ovs_u128 ufid;
3271
3272 /* If a UFID is not provided, determine one based on the key. */
3273 if (!ufidp && key && key_len
3274 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
3275 odp_flow_key_hash(&flow, sizeof flow, &ufid);
3276 ufidp = &ufid;
3277 }
3278
3279 if (ufidp) {
3280 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
3281 &pmd->flow_table) {
3282 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
3283 return netdev_flow;
3284 }
3285 }
3286 }
3287
3288 return NULL;
3289 }
3290
3291 static void
3292 dp_netdev_flow_set_last_stats_attrs(struct dp_netdev_flow *netdev_flow,
3293 const struct dpif_flow_stats *stats,
3294 const struct dpif_flow_attrs *attrs,
3295 int result)
3296 {
3297 struct dp_netdev_flow_stats *last_stats = &netdev_flow->last_stats;
3298 struct dp_netdev_flow_attrs *last_attrs = &netdev_flow->last_attrs;
3299
3300 atomic_store_relaxed(&netdev_flow->netdev_flow_get_result, result);
3301 if (result) {
3302 return;
3303 }
3304
3305 atomic_store_relaxed(&last_stats->used, stats->used);
3306 atomic_store_relaxed(&last_stats->packet_count, stats->n_packets);
3307 atomic_store_relaxed(&last_stats->byte_count, stats->n_bytes);
3308 atomic_store_relaxed(&last_stats->tcp_flags, stats->tcp_flags);
3309
3310 atomic_store_relaxed(&last_attrs->offloaded, attrs->offloaded);
3311 atomic_store_relaxed(&last_attrs->dp_layer, attrs->dp_layer);
3312
3313 }
3314
3315 static void
3316 dp_netdev_flow_get_last_stats_attrs(struct dp_netdev_flow *netdev_flow,
3317 struct dpif_flow_stats *stats,
3318 struct dpif_flow_attrs *attrs,
3319 int *result)
3320 {
3321 struct dp_netdev_flow_stats *last_stats = &netdev_flow->last_stats;
3322 struct dp_netdev_flow_attrs *last_attrs = &netdev_flow->last_attrs;
3323
3324 atomic_read_relaxed(&netdev_flow->netdev_flow_get_result, result);
3325 if (*result) {
3326 return;
3327 }
3328
3329 atomic_read_relaxed(&last_stats->used, &stats->used);
3330 atomic_read_relaxed(&last_stats->packet_count, &stats->n_packets);
3331 atomic_read_relaxed(&last_stats->byte_count, &stats->n_bytes);
3332 atomic_read_relaxed(&last_stats->tcp_flags, &stats->tcp_flags);
3333
3334 atomic_read_relaxed(&last_attrs->offloaded, &attrs->offloaded);
3335 atomic_read_relaxed(&last_attrs->dp_layer, &attrs->dp_layer);
3336 }
3337
3338 static bool
3339 dpif_netdev_get_flow_offload_status(const struct dp_netdev *dp,
3340 struct dp_netdev_flow *netdev_flow,
3341 struct dpif_flow_stats *stats,
3342 struct dpif_flow_attrs *attrs)
3343 {
3344 uint64_t act_buf[1024 / 8];
3345 struct nlattr *actions;
3346 struct netdev *netdev;
3347 struct match match;
3348 struct ofpbuf buf;
3349
3350 int ret = 0;
3351
3352 if (!netdev_is_flow_api_enabled()) {
3353 return false;
3354 }
3355
3356 netdev = netdev_ports_get(netdev_flow->flow.in_port.odp_port,
3357 dpif_normalize_type(dp->class->type));
3358 if (!netdev) {
3359 return false;
3360 }
3361 ofpbuf_use_stack(&buf, &act_buf, sizeof act_buf);
3362 /* Taking a global 'port_mutex' to fulfill thread safety
3363 * restrictions for the netdev-offload-dpdk module.
3364 *
3365 * XXX: Main thread will try to pause/stop all revalidators during datapath
3366 * reconfiguration via datapath purge callback (dp_purge_cb) while
3367 * holding 'dp->port_mutex'. So we do not wait for the mutex here.
3368 * Otherwise, deadlock is possible, because revalidators might sleep
3369 * waiting for the main thread to release the lock while the main
3370 * thread waits for them to stop processing.
3371 * This workaround might make statistics less accurate, especially
3372 * for the flow deletion case, since there will be no other attempt. */
3373 if (!ovs_mutex_trylock(&dp->port_mutex)) {
3374 ret = netdev_flow_get(netdev, &match, &actions,
3375 &netdev_flow->mega_ufid, stats, attrs, &buf);
3376 /* Store statistics and attributes from the last request for
3377 * later use in case of mutex contention. */
3378 dp_netdev_flow_set_last_stats_attrs(netdev_flow, stats, attrs, ret);
3379 ovs_mutex_unlock(&dp->port_mutex);
3380 } else {
3381 dp_netdev_flow_get_last_stats_attrs(netdev_flow, stats, attrs, &ret);
3382 if (!ret && !attrs->dp_layer) {
3383 /* Flow was never reported as 'offloaded' so it's harmless
3384 * to continue to think so. */
3385 ret = EAGAIN;
3386 }
3387 }
3388 netdev_close(netdev);
3389 if (ret) {
3390 return false;
3391 }
3392
3393 return true;
3394 }
3395
3396 static void
3397 get_dpif_flow_status(const struct dp_netdev *dp,
3398 const struct dp_netdev_flow *netdev_flow_,
3399 struct dpif_flow_stats *stats,
3400 struct dpif_flow_attrs *attrs)
3401 {
3402 struct dpif_flow_stats offload_stats;
3403 struct dpif_flow_attrs offload_attrs;
3404 struct dp_netdev_flow *netdev_flow;
3405 unsigned long long n;
3406 long long used;
3407 uint16_t flags;
3408
3409 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
3410
3411 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
3412 stats->n_packets = n;
3413 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
3414 stats->n_bytes = n;
3415 atomic_read_relaxed(&netdev_flow->stats.used, &used);
3416 stats->used = used;
3417 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3418 stats->tcp_flags = flags;
3419
3420 if (dpif_netdev_get_flow_offload_status(dp, netdev_flow,
3421 &offload_stats, &offload_attrs)) {
3422 stats->n_packets += offload_stats.n_packets;
3423 stats->n_bytes += offload_stats.n_bytes;
3424 stats->used = MAX(stats->used, offload_stats.used);
3425 stats->tcp_flags |= offload_stats.tcp_flags;
3426 if (attrs) {
3427 attrs->offloaded = offload_attrs.offloaded;
3428 attrs->dp_layer = offload_attrs.dp_layer;
3429 }
3430 } else if (attrs) {
3431 attrs->offloaded = false;
3432 attrs->dp_layer = "ovs";
3433 }
3434 }
3435
3436 /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
3437 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
3438 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
3439 * protect them. */
3440 static void
3441 dp_netdev_flow_to_dpif_flow(const struct dp_netdev *dp,
3442 const struct dp_netdev_flow *netdev_flow,
3443 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
3444 struct dpif_flow *flow, bool terse)
3445 {
3446 if (terse) {
3447 memset(flow, 0, sizeof *flow);
3448 } else {
3449 struct flow_wildcards wc;
3450 struct dp_netdev_actions *actions;
3451 size_t offset;
3452 struct odp_flow_key_parms odp_parms = {
3453 .flow = &netdev_flow->flow,
3454 .mask = &wc.masks,
3455 .support = dp_netdev_support,
3456 };
3457
3458 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
3459 /* in_port is exact-matched, but we have left it out of the mask for
3460 * optimization reasons. Add in_port back to the mask. */
3461 wc.masks.in_port.odp_port = ODPP_NONE;
3462
3463 /* Key */
3464 offset = key_buf->size;
3465 flow->key = ofpbuf_tail(key_buf);
3466 odp_flow_key_from_flow(&odp_parms, key_buf);
3467 flow->key_len = key_buf->size - offset;
3468
3469 /* Mask */
3470 offset = mask_buf->size;
3471 flow->mask = ofpbuf_tail(mask_buf);
3472 odp_parms.key_buf = key_buf;
3473 odp_flow_key_from_mask(&odp_parms, mask_buf);
3474 flow->mask_len = mask_buf->size - offset;
3475
3476 /* Actions */
3477 actions = dp_netdev_flow_get_actions(netdev_flow);
3478 flow->actions = actions->actions;
3479 flow->actions_len = actions->size;
3480 }
3481
3482 flow->ufid = netdev_flow->ufid;
3483 flow->ufid_present = true;
3484 flow->pmd_id = netdev_flow->pmd_id;
3485
3486 get_dpif_flow_status(dp, netdev_flow, &flow->stats, &flow->attrs);
3487 flow->attrs.dp_extra_info = netdev_flow->dp_extra_info;
3488 }
3489
3490 static int
3491 dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3492 const struct nlattr *mask_key,
3493 uint32_t mask_key_len, const struct flow *flow,
3494 struct flow_wildcards *wc, bool probe)
3495 {
3496 enum odp_key_fitness fitness;
3497
3498 fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL);
3499 if (fitness) {
3500 if (!probe) {
3501 /* This should not happen: it indicates that
3502 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
3503 * disagree on the acceptable form of a mask. Log the problem
3504 * as an error, with enough details to enable debugging. */
3505 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3506
3507 if (!VLOG_DROP_ERR(&rl)) {
3508 struct ds s;
3509
3510 ds_init(&s);
3511 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
3512 true);
3513 VLOG_ERR("internal error parsing flow mask %s (%s)",
3514 ds_cstr(&s), odp_key_fitness_to_string(fitness));
3515 ds_destroy(&s);
3516 }
3517 }
3518
3519 return EINVAL;
3520 }
3521
3522 return 0;
3523 }
3524
3525 static int
3526 dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3527 struct flow *flow, bool probe)
3528 {
3529 if (odp_flow_key_to_flow(key, key_len, flow, NULL)) {
3530 if (!probe) {
3531 /* This should not happen: it indicates that
3532 * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
3533 * the acceptable form of a flow. Log the problem as an error,
3534 * with enough details to enable debugging. */
3535 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3536
3537 if (!VLOG_DROP_ERR(&rl)) {
3538 struct ds s;
3539
3540 ds_init(&s);
3541 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
3542 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
3543 ds_destroy(&s);
3544 }
3545 }
3546
3547 return EINVAL;
3548 }
3549
3550 if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
3551 return EINVAL;
3552 }
3553
3554 return 0;
3555 }
3556
3557 static int
3558 dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
3559 {
3560 struct dp_netdev *dp = get_dp_netdev(dpif);
3561 struct dp_netdev_flow *netdev_flow;
3562 struct dp_netdev_pmd_thread *pmd;
3563 struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
3564 struct hmapx_node *node;
3565 int error = EINVAL;
3566
3567 if (get->pmd_id == PMD_ID_NULL) {
3568 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3569 if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
3570 dp_netdev_pmd_unref(pmd);
3571 }
3572 }
3573 } else {
3574 pmd = dp_netdev_get_pmd(dp, get->pmd_id);
3575 if (!pmd) {
3576 goto out;
3577 }
3578 hmapx_add(&to_find, pmd);
3579 }
3580
3581 if (!hmapx_count(&to_find)) {
3582 goto out;
3583 }
3584
3585 HMAPX_FOR_EACH (node, &to_find) {
3586 pmd = (struct dp_netdev_pmd_thread *) node->data;
3587 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
3588 get->key_len);
3589 if (netdev_flow) {
3590 dp_netdev_flow_to_dpif_flow(dp, netdev_flow, get->buffer,
3591 get->buffer, get->flow, false);
3592 error = 0;
3593 break;
3594 } else {
3595 error = ENOENT;
3596 }
3597 }
3598
3599 HMAPX_FOR_EACH (node, &to_find) {
3600 pmd = (struct dp_netdev_pmd_thread *) node->data;
3601 dp_netdev_pmd_unref(pmd);
3602 }
3603 out:
3604 hmapx_destroy(&to_find);
3605 return error;
3606 }
3607
3608 static void
3609 dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
3610 {
3611 struct flow masked_flow;
3612 size_t i;
3613
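/* Byte-wise AND the flow with its wildcard mask so that the hash below
 * depends only on the matched bits; all flows covered by the same megaflow
 * therefore share one mega ufid. */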
3614 for (i = 0; i < sizeof(struct flow); i++) {
3615 ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
3616 ((uint8_t *)&match->wc)[i];
3617 }
3618 odp_flow_key_hash(&masked_flow, sizeof masked_flow, mega_ufid);
3619 }
3620
3621 static struct dp_netdev_flow *
3622 dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
3623 struct match *match, const ovs_u128 *ufid,
3624 const struct nlattr *actions, size_t actions_len)
3625 OVS_REQUIRES(pmd->flow_mutex)
3626 {
3627 struct ds extra_info = DS_EMPTY_INITIALIZER;
3628 struct dp_netdev_flow *flow;
3629 struct netdev_flow_key mask;
3630 struct dpcls *cls;
3631 size_t unit;
3632
3633 /* Make sure in_port is an exact match before we read it. */
3634 ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
3635 odp_port_t in_port = match->flow.in_port.odp_port;
3636
3637 /* As we select the dpcls based on the port number, each netdev flow
3638 * belonging to the same dpcls will have the same odp_port value.
3639 * For performance reasons we wildcard odp_port here in the mask. In the
3640 * typical case dp_hash is also wildcarded, and the resulting 8-byte
3641 * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
3642 * will not be part of the subtable mask.
3643 * This will speed up the hash computation during dpcls_lookup() because
3644 * there is one less call to hash_add64() in this case. */
3645 match->wc.masks.in_port.odp_port = 0;
3646 netdev_flow_mask_init(&mask, match);
3647 match->wc.masks.in_port.odp_port = ODPP_NONE;
3648
3649 /* Make sure wc does not have metadata. */
3650 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
3651 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
3652
3653 /* Do not allocate extra space. */
3654 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
3655 memset(&flow->stats, 0, sizeof flow->stats);
3656 atomic_init(&flow->netdev_flow_get_result, 0);
3657 memset(&flow->last_stats, 0, sizeof flow->last_stats);
3658 memset(&flow->last_attrs, 0, sizeof flow->last_attrs);
3659 flow->dead = false;
3660 flow->batch = NULL;
3661 flow->mark = INVALID_FLOW_MARK;
3662 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
3663 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
3664 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
3665 ovs_refcount_init(&flow->ref_cnt);
3666 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
3667
3668 dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
3669 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
3670
3671 /* Select dpcls for in_port. Relies on in_port being an exact match. */
3672 cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
3673 dpcls_insert(cls, &flow->cr, &mask);
3674
3675 ds_put_cstr(&extra_info, "miniflow_bits(");
3676 FLOWMAP_FOR_EACH_UNIT (unit) {
3677 if (unit) {
3678 ds_put_char(&extra_info, ',');
3679 }
3680 ds_put_format(&extra_info, "%d",
3681 count_1bits(flow->cr.mask->mf.map.bits[unit]));
3682 }
3683 ds_put_char(&extra_info, ')');
3684 flow->dp_extra_info = ds_steal_cstr(&extra_info);
3685 ds_destroy(&extra_info);
3686
3687 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
3688 dp_netdev_flow_hash(&flow->ufid));
3689
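/* Queue the flow for netdev offload; the actual offload attempt is made
 * asynchronously by the offload thread. */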
3690 queue_netdev_flow_put(pmd, flow, match, actions, actions_len);
3691
3692 if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
3693 struct ds ds = DS_EMPTY_INITIALIZER;
3694 struct ofpbuf key_buf, mask_buf;
3695 struct odp_flow_key_parms odp_parms = {
3696 .flow = &match->flow,
3697 .mask = &match->wc.masks,
3698 .support = dp_netdev_support,
3699 };
3700
3701 ofpbuf_init(&key_buf, 0);
3702 ofpbuf_init(&mask_buf, 0);
3703
3704 odp_flow_key_from_flow(&odp_parms, &key_buf);
3705 odp_parms.key_buf = &key_buf;
3706 odp_flow_key_from_mask(&odp_parms, &mask_buf);
3707
3708 ds_put_cstr(&ds, "flow_add: ");
3709 odp_format_ufid(ufid, &ds);
3710 ds_put_cstr(&ds, " mega_");
3711 odp_format_ufid(&flow->mega_ufid, &ds);
3712 ds_put_cstr(&ds, " ");
3713 odp_flow_format(key_buf.data, key_buf.size,
3714 mask_buf.data, mask_buf.size,
3715 NULL, &ds, false);
3716 ds_put_cstr(&ds, ", actions:");
3717 format_odp_actions(&ds, actions, actions_len, NULL);
3718
3719 VLOG_DBG("%s", ds_cstr(&ds));
3720
3721 ofpbuf_uninit(&key_buf);
3722 ofpbuf_uninit(&mask_buf);
3723
3724 /* Add a printout of the actual match installed. */
3725 struct match m;
3726 ds_clear(&ds);
3727 ds_put_cstr(&ds, "flow match: ");
3728 miniflow_expand(&flow->cr.flow.mf, &m.flow);
3729 miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
3730 memset(&m.tun_md, 0, sizeof m.tun_md);
3731 match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
3732
3733 VLOG_DBG("%s", ds_cstr(&ds));
3734
3735 ds_destroy(&ds);
3736 }
3737
3738 return flow;
3739 }
3740
3741 static int
3742 flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
3743 struct netdev_flow_key *key,
3744 struct match *match,
3745 ovs_u128 *ufid,
3746 const struct dpif_flow_put *put,
3747 struct dpif_flow_stats *stats)
3748 {
3749 struct dp_netdev_flow *netdev_flow;
3750 int error = 0;
3751
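/* A missing flow may only be created when DPIF_FP_CREATE is set (ENOENT
 * otherwise); an existing flow may only be modified when DPIF_FP_MODIFY is
 * set (EEXIST if DPIF_FP_CREATE asks to re-create it, EINVAL otherwise). */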
3752 if (stats) {
3753 memset(stats, 0, sizeof *stats);
3754 }
3755
3756 ovs_mutex_lock(&pmd->flow_mutex);
3757 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
3758 if (!netdev_flow) {
3759 if (put->flags & DPIF_FP_CREATE) {
3760 dp_netdev_flow_add(pmd, match, ufid, put->actions,
3761 put->actions_len);
3762 } else {
3763 error = ENOENT;
3764 }
3765 } else {
3766 if (put->flags & DPIF_FP_MODIFY) {
3767 struct dp_netdev_actions *new_actions;
3768 struct dp_netdev_actions *old_actions;
3769
3770 new_actions = dp_netdev_actions_create(put->actions,
3771 put->actions_len);
3772
3773 old_actions = dp_netdev_flow_get_actions(netdev_flow);
3774 ovsrcu_set(&netdev_flow->actions, new_actions);
3775
3776 queue_netdev_flow_put(pmd, netdev_flow, match,
3777 put->actions, put->actions_len);
3778
3779 if (stats) {
3780 get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
3781 }
3782 if (put->flags & DPIF_FP_ZERO_STATS) {
3783 /* XXX: The userspace datapath uses thread local statistics
3784 * (for flows), which should be updated only by the owning
3785 * thread. Since we cannot write on stats memory here,
3786 * we choose not to support this flag. Please note:
3787 * - This feature is currently used only by dpctl commands with
3788 * option --clear.
3789 * - Should the need arise, this operation can be implemented
3790 * by keeping a base value (to be updated here) for each
3791 * counter, and subtracting it before outputting the stats. */
3792 error = EOPNOTSUPP;
3793 }
3794
3795 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
3796 } else if (put->flags & DPIF_FP_CREATE) {
3797 error = EEXIST;
3798 } else {
3799 /* Overlapping flow. */
3800 error = EINVAL;
3801 }
3802 }
3803 ovs_mutex_unlock(&pmd->flow_mutex);
3804 return error;
3805 }
3806
3807 static int
3808 dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
3809 {
3810 struct dp_netdev *dp = get_dp_netdev(dpif);
3811 struct netdev_flow_key key, mask;
3812 struct dp_netdev_pmd_thread *pmd;
3813 struct match match;
3814 ovs_u128 ufid;
3815 int error;
3816 bool probe = put->flags & DPIF_FP_PROBE;
3817
3818 if (put->stats) {
3819 memset(put->stats, 0, sizeof *put->stats);
3820 }
3821 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
3822 probe);
3823 if (error) {
3824 return error;
3825 }
3826 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
3827 put->mask, put->mask_len,
3828 &match.flow, &match.wc, probe);
3829 if (error) {
3830 return error;
3831 }
3832
3833 if (put->ufid) {
3834 ufid = *put->ufid;
3835 } else {
3836 odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
3837 }
3838
3839 /* The Netlink encoding of datapath flow keys cannot express
3840 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3841 * tag is interpreted as exact match on the fact that there is no
3842 * VLAN. Unless we refactor a lot of code that translates between
3843 * Netlink and struct flow representations, we have to do the same
3844 * here. This must be in sync with 'match' in handle_packet_upcall(). */
3845 if (!match.wc.masks.vlans[0].tci) {
3846 match.wc.masks.vlans[0].tci = htons(0xffff);
3847 }
3848
3849 /* Must produce a netdev_flow_key for lookup.
3850 * Use the same method as employed to create the key when adding
3851 * the flow to the dpcls to make sure they match. */
3852 netdev_flow_mask_init(&mask, &match);
3853 netdev_flow_key_init_masked(&key, &match.flow, &mask);
3854
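/* PMD_ID_NULL means the operation applies to every PMD thread; per-thread
 * statistics are then aggregated into 'put->stats'. */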
3855 if (put->pmd_id == PMD_ID_NULL) {
3856 if (cmap_count(&dp->poll_threads) == 0) {
3857 return EINVAL;
3858 }
3859 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3860 struct dpif_flow_stats pmd_stats;
3861 int pmd_error;
3862
3863 pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
3864 &pmd_stats);
3865 if (pmd_error) {
3866 error = pmd_error;
3867 } else if (put->stats) {
3868 put->stats->n_packets += pmd_stats.n_packets;
3869 put->stats->n_bytes += pmd_stats.n_bytes;
3870 put->stats->used = MAX(put->stats->used, pmd_stats.used);
3871 put->stats->tcp_flags |= pmd_stats.tcp_flags;
3872 }
3873 }
3874 } else {
3875 pmd = dp_netdev_get_pmd(dp, put->pmd_id);
3876 if (!pmd) {
3877 return EINVAL;
3878 }
3879 error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
3880 dp_netdev_pmd_unref(pmd);
3881 }
3882
3883 return error;
3884 }
3885
3886 static int
3887 flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3888 struct dpif_flow_stats *stats,
3889 const struct dpif_flow_del *del)
3890 {
3891 struct dp_netdev_flow *netdev_flow;
3892 int error = 0;
3893
3894 ovs_mutex_lock(&pmd->flow_mutex);
3895 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3896 del->key_len);
3897 if (netdev_flow) {
3898 if (stats) {
3899 get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
3900 }
3901 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
3902 } else {
3903 error = ENOENT;
3904 }
3905 ovs_mutex_unlock(&pmd->flow_mutex);
3906
3907 return error;
3908 }
3909
3910 static int
3911 dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
3912 {
3913 struct dp_netdev *dp = get_dp_netdev(dpif);
3914 struct dp_netdev_pmd_thread *pmd;
3915 int error = 0;
3916
3917 if (del->stats) {
3918 memset(del->stats, 0, sizeof *del->stats);
3919 }
3920
3921 if (del->pmd_id == PMD_ID_NULL) {
3922 if (cmap_count(&dp->poll_threads) == 0) {
3923 return EINVAL;
3924 }
3925 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3926 struct dpif_flow_stats pmd_stats;
3927 int pmd_error;
3928
3929 pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
3930 if (pmd_error) {
3931 error = pmd_error;
3932 } else if (del->stats) {
3933 del->stats->n_packets += pmd_stats.n_packets;
3934 del->stats->n_bytes += pmd_stats.n_bytes;
3935 del->stats->used = MAX(del->stats->used, pmd_stats.used);
3936 del->stats->tcp_flags |= pmd_stats.tcp_flags;
3937 }
3938 }
3939 } else {
3940 pmd = dp_netdev_get_pmd(dp, del->pmd_id);
3941 if (!pmd) {
3942 return EINVAL;
3943 }
3944 error = flow_del_on_pmd(pmd, del->stats, del);
3945 dp_netdev_pmd_unref(pmd);
3946 }
3947
3948
3949 return error;
3950 }
3951
3952 struct dpif_netdev_flow_dump {
3953 struct dpif_flow_dump up;
3954 struct cmap_position poll_thread_pos;
3955 struct cmap_position flow_pos;
3956 struct dp_netdev_pmd_thread *cur_pmd;
3957 int status;
3958 struct ovs_mutex mutex;
3959 };
3960
3961 static struct dpif_netdev_flow_dump *
3962 dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
3963 {
3964 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
3965 }
3966
3967 static struct dpif_flow_dump *
3968 dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
3969 struct dpif_flow_dump_types *types OVS_UNUSED)
3970 {
3971 struct dpif_netdev_flow_dump *dump;
3972
3973 dump = xzalloc(sizeof *dump);
3974 dpif_flow_dump_init(&dump->up, dpif_);
3975 dump->up.terse = terse;
3976 ovs_mutex_init(&dump->mutex);
3977
3978 return &dump->up;
3979 }
3980
3981 static int
3982 dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
3983 {
3984 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3985
3986 ovs_mutex_destroy(&dump->mutex);
3987 free(dump);
3988 return 0;
3989 }
3990
3991 struct dpif_netdev_flow_dump_thread {
3992 struct dpif_flow_dump_thread up;
3993 struct dpif_netdev_flow_dump *dump;
3994 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
3995 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
3996 };
3997
3998 static struct dpif_netdev_flow_dump_thread *
3999 dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
4000 {
4001 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
4002 }
4003
4004 static struct dpif_flow_dump_thread *
4005 dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
4006 {
4007 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
4008 struct dpif_netdev_flow_dump_thread *thread;
4009
4010 thread = xmalloc(sizeof *thread);
4011 dpif_flow_dump_thread_init(&thread->up, &dump->up);
4012 thread->dump = dump;
4013 return &thread->up;
4014 }
4015
4016 static void
4017 dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
4018 {
4019 struct dpif_netdev_flow_dump_thread *thread
4020 = dpif_netdev_flow_dump_thread_cast(thread_);
4021
4022 free(thread);
4023 }
4024
4025 static int
4026 dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
4027 struct dpif_flow *flows, int max_flows)
4028 {
4029 struct dpif_netdev_flow_dump_thread *thread
4030 = dpif_netdev_flow_dump_thread_cast(thread_);
4031 struct dpif_netdev_flow_dump *dump = thread->dump;
4032 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
4033 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
4034 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
4035 int n_flows = 0;
4036 int i;
4037
4038 ovs_mutex_lock(&dump->mutex);
4039 if (!dump->status) {
4040 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
4041 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
4042
4043 /* The first call to dump_next() extracts the first pmd thread.
4044 * If there is no pmd thread, return immediately. */
4045 if (!pmd) {
4046 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
4047 if (!pmd) {
4048 ovs_mutex_unlock(&dump->mutex);
4049 return n_flows;
4050
4051 }
4052 }
4053
4054 do {
4055 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
4056 struct cmap_node *node;
4057
4058 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
4059 if (!node) {
4060 break;
4061 }
4062 netdev_flows[n_flows] = CONTAINER_OF(node,
4063 struct dp_netdev_flow,
4064 node);
4065 }
4066 /* When the current pmd thread has been fully dumped, move on to
4067 * the next one. */
4068 if (n_flows < flow_limit) {
4069 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
4070 dp_netdev_pmd_unref(pmd);
4071 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
4072 if (!pmd) {
4073 dump->status = EOF;
4074 break;
4075 }
4076 }
4077 /* Keep the reference for the next caller. */
4078 dump->cur_pmd = pmd;
4079
4080 /* If the current dump is empty, do not exit the loop, since the
4081 * remaining pmds could have flows to be dumped. Just dump again
4082 * on the new 'pmd'. */
4083 } while (!n_flows);
4084 }
4085 ovs_mutex_unlock(&dump->mutex);
4086
4087 for (i = 0; i < n_flows; i++) {
4088 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
4089 struct odputil_keybuf *keybuf = &thread->keybuf[i];
4090 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
4091 struct dpif_flow *f = &flows[i];
4092 struct ofpbuf key, mask;
4093
4094 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
4095 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
4096 dp_netdev_flow_to_dpif_flow(dp, netdev_flow, &key, &mask, f,
4097 dump->up.terse);
4098 }
4099
4100 return n_flows;
4101 }
4102
4103 static int
4104 dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
4105 OVS_NO_THREAD_SAFETY_ANALYSIS
4106 {
4107 struct dp_netdev *dp = get_dp_netdev(dpif);
4108 struct dp_netdev_pmd_thread *pmd;
4109 struct dp_packet_batch pp;
4110
4111 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
4112 dp_packet_size(execute->packet) > UINT16_MAX) {
4113 return EINVAL;
4114 }
4115
4116 /* Try to find the 'pmd'. If NULL is returned, that means
4117 * the current thread is a non-pmd thread and should use
4118 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
4119 pmd = ovsthread_getspecific(dp->per_pmd_key);
4120 if (!pmd) {
4121 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
4122 if (!pmd) {
4123 return EBUSY;
4124 }
4125 }
4126
4127 if (execute->probe) {
4128 /* If this is part of a probe, drop the packet, since executing
4129 * the action may actually cause spurious packets to be sent into
4130 * the network. */
4131 if (pmd->core_id == NON_PMD_CORE_ID) {
4132 dp_netdev_pmd_unref(pmd);
4133 }
4134 return 0;
4135 }
4136
4137 /* If the current thread is a non-pmd thread, acquire
4138 * the 'non_pmd_mutex'. */
4139 if (pmd->core_id == NON_PMD_CORE_ID) {
4140 ovs_mutex_lock(&dp->non_pmd_mutex);
4141 }
4142
4143 /* Update current time in PMD context. We don't care about EMC insertion
4144 * probability, because we are on a slow path. */
4145 pmd_thread_ctx_time_update(pmd);
4146
4147 /* The action processing expects the RSS hash to be valid, because
4148 * it's always initialized at the beginning of datapath processing.
4149 * In this case, though, 'execute->packet' may not have gone through
4150 * the datapath at all, it may have been generated by the upper layer
4151 * (OpenFlow packet-out, BFD frame, ...). */
4152 if (!dp_packet_rss_valid(execute->packet)) {
4153 dp_packet_set_rss_hash(execute->packet,
4154 flow_hash_5tuple(execute->flow, 0));
4155 }
4156
4157 dp_packet_batch_init_packet(&pp, execute->packet);
4158 pp.do_not_steal = true;
4159 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
4160 execute->actions, execute->actions_len);
4161 dp_netdev_pmd_flush_output_packets(pmd, true);
4162
4163 if (pmd->core_id == NON_PMD_CORE_ID) {
4164 ovs_mutex_unlock(&dp->non_pmd_mutex);
4165 dp_netdev_pmd_unref(pmd);
4166 }
4167
4168 return 0;
4169 }
4170
4171 static void
4172 dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
4173 enum dpif_offload_type offload_type OVS_UNUSED)
4174 {
4175 size_t i;
4176
4177 for (i = 0; i < n_ops; i++) {
4178 struct dpif_op *op = ops[i];
4179
4180 switch (op->type) {
4181 case DPIF_OP_FLOW_PUT:
4182 op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
4183 break;
4184
4185 case DPIF_OP_FLOW_DEL:
4186 op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
4187 break;
4188
4189 case DPIF_OP_EXECUTE:
4190 op->error = dpif_netdev_execute(dpif, &op->execute);
4191 break;
4192
4193 case DPIF_OP_FLOW_GET:
4194 op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
4195 break;
4196 }
4197 }
4198 }
4199
4200 /* Enable or Disable PMD auto load balancing. */
4201 static void
4202 set_pmd_auto_lb(struct dp_netdev *dp)
4203 {
4204 unsigned int cnt = 0;
4205 struct dp_netdev_pmd_thread *pmd;
4206 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
4207
4208 bool enable_alb = false;
4209 bool multi_rxq = false;
4210 bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
4211
4212 /* Ensure that there are at least two non-isolated PMDs and that
4213 * at least one of them is polling more than one rxq. */
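/* Rebalancing is only useful when there is at least one rxq that could be
 * moved to another non-isolated PMD. */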
4214 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4215 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
4216 continue;
4217 }
4218
4219 if (hmap_count(&pmd->poll_list) > 1) {
4220 multi_rxq = true;
4221 }
4222 if (cnt && multi_rxq) {
4223 enable_alb = true;
4224 break;
4225 }
4226 cnt++;
4227 }
4228
4229 /* Enable auto LB only if it is requested and cycle-based assignment is in use. */
4230 enable_alb = enable_alb && pmd_rxq_assign_cyc &&
4231 pmd_alb->auto_lb_requested;
4232
4233 if (pmd_alb->is_enabled != enable_alb) {
4234 pmd_alb->is_enabled = enable_alb;
4235 if (pmd_alb->is_enabled) {
4236 VLOG_INFO("PMD auto load balance is enabled "
4237 "(with rebalance interval:%"PRIu64" msec)",
4238 pmd_alb->rebalance_intvl);
4239 } else {
4240 pmd_alb->rebalance_poll_timer = 0;
4241 VLOG_INFO("PMD auto load balance is disabled");
4242 }
4243 }
4244
4245 }
4246
4247 /* Applies datapath configuration from the database. Some of the changes are
4248 * actually applied in dpif_netdev_run(). */
4249 static int
4250 dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
4251 {
4252 struct dp_netdev *dp = get_dp_netdev(dpif);
4253 const char *cmask = smap_get(other_config, "pmd-cpu-mask");
4254 const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
4255 "cycles");
4256 unsigned long long insert_prob =
4257 smap_get_ullong(other_config, "emc-insert-inv-prob",
4258 DEFAULT_EM_FLOW_INSERT_INV_PROB);
4259 uint32_t insert_min, cur_min;
4260 uint32_t tx_flush_interval, cur_tx_flush_interval;
4261 uint64_t rebalance_intvl;
4262
4263 tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
4264 DEFAULT_TX_FLUSH_INTERVAL);
4265 atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
4266 if (tx_flush_interval != cur_tx_flush_interval) {
4267 atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
4268 VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
4269 tx_flush_interval);
4270 }
4271
4272 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
4273 free(dp->pmd_cmask);
4274 dp->pmd_cmask = nullable_xstrdup(cmask);
4275 dp_netdev_request_reconfigure(dp);
4276 }
4277
4278 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
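/* The stored minimum acts as a probability threshold for EMC insertion: an
 * insertion happens when a random 32-bit value falls at or below it, so
 * UINT32_MAX / insert_prob corresponds to a probability of roughly
 * 1 / insert_prob. */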
4279 if (insert_prob <= UINT32_MAX) {
4280 insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
4281 } else {
4282 insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
4283 insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
4284 }
4285
4286 if (insert_min != cur_min) {
4287 atomic_store_relaxed(&dp->emc_insert_min, insert_min);
4288 if (insert_min == 0) {
4289 VLOG_INFO("EMC insertion probability changed to zero");
4290 } else {
4291 VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
4292 insert_prob, (100 / (float)insert_prob));
4293 }
4294 }
4295
4296 bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
4297 bool cur_perf_enabled;
4298 atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
4299 if (perf_enabled != cur_perf_enabled) {
4300 atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
4301 if (perf_enabled) {
4302 VLOG_INFO("PMD performance metrics collection enabled");
4303 } else {
4304 VLOG_INFO("PMD performance metrics collection disabled");
4305 }
4306 }
4307
4308 bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
4309 bool cur_smc;
4310 atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
4311 if (smc_enable != cur_smc) {
4312 atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
4313 if (smc_enable) {
4314 VLOG_INFO("SMC cache is enabled");
4315 } else {
4316 VLOG_INFO("SMC cache is disabled");
4317 }
4318 }
4319
4320 bool pmd_rxq_assign_cyc = !strcmp(pmd_rxq_assign, "cycles");
4321 if (!pmd_rxq_assign_cyc && strcmp(pmd_rxq_assign, "roundrobin")) {
4322 VLOG_WARN("Unsupported Rxq to PMD assignment mode in pmd-rxq-assign. "
4323 "Defaulting to 'cycles'.");
4324 pmd_rxq_assign_cyc = true;
4325 pmd_rxq_assign = "cycles";
4326 }
4327 if (dp->pmd_rxq_assign_cyc != pmd_rxq_assign_cyc) {
4328 dp->pmd_rxq_assign_cyc = pmd_rxq_assign_cyc;
4329 VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
4330 pmd_rxq_assign);
4331 dp_netdev_request_reconfigure(dp);
4332 }
4333
4334 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
4335 pmd_alb->auto_lb_requested = smap_get_bool(other_config, "pmd-auto-lb",
4336 false);
4337
4338 rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebal-interval",
4339 ALB_PMD_REBALANCE_POLL_INTERVAL);
4340
4341 /* Input is in minutes; convert it to msec. */
4342 rebalance_intvl =
4343 rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
4344
4345 if (pmd_alb->rebalance_intvl != rebalance_intvl) {
4346 pmd_alb->rebalance_intvl = rebalance_intvl;
4347 }
4348
4349 set_pmd_auto_lb(dp);
4350 return 0;
4351 }
4352
4353 /* Parses affinity list and returns result in 'core_ids'. */
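/* The expected format is the one used by 'pmd-rxq-affinity', i.e. a list of
 * <rxq-id>:<core-id> pairs; for example "0:3,1:7" maps rxq 0 to core 3 and
 * rxq 1 to core 7. */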
4354 static int
4355 parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
4356 {
4357 unsigned i;
4358 char *list, *copy, *key, *value;
4359 int error = 0;
4360
4361 for (i = 0; i < n_rxq; i++) {
4362 core_ids[i] = OVS_CORE_UNSPEC;
4363 }
4364
4365 if (!affinity_list) {
4366 return 0;
4367 }
4368
4369 list = copy = xstrdup(affinity_list);
4370
4371 while (ofputil_parse_key_value(&list, &key, &value)) {
4372 int rxq_id, core_id;
4373
4374 if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
4375 || !str_to_int(value, 0, &core_id) || core_id < 0) {
4376 error = EINVAL;
4377 break;
4378 }
4379
4380 if (rxq_id < n_rxq) {
4381 core_ids[rxq_id] = core_id;
4382 }
4383 }
4384
4385 free(copy);
4386 return error;
4387 }
4388
4389 /* Parses 'affinity_list' and applies configuration if it is valid. */
4390 static int
4391 dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
4392 const char *affinity_list)
4393 {
4394 unsigned *core_ids, i;
4395 int error = 0;
4396
4397 core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
4398 if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
4399 error = EINVAL;
4400 goto exit;
4401 }
4402
4403 for (i = 0; i < port->n_rxq; i++) {
4404 port->rxqs[i].core_id = core_ids[i];
4405 }
4406
4407 exit:
4408 free(core_ids);
4409 return error;
4410 }
4411
4412 /* Returns 'true' if one of the 'port's RX queues exists in 'poll_list'
4413 * of the given PMD thread. */
4414 static bool
4415 dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
4416 struct dp_netdev_port *port)
4417 OVS_EXCLUDED(pmd->port_mutex)
4418 {
4419 struct rxq_poll *poll;
4420 bool found = false;
4421
4422 ovs_mutex_lock(&pmd->port_mutex);
4423 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4424 if (port == poll->rxq->port) {
4425 found = true;
4426 break;
4427 }
4428 }
4429 ovs_mutex_unlock(&pmd->port_mutex);
4430 return found;
4431 }
4432
4433 /* Updates port configuration from the database. The changes are actually
4434 * applied in dpif_netdev_run(). */
4435 static int
4436 dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
4437 const struct smap *cfg)
4438 {
4439 struct dp_netdev *dp = get_dp_netdev(dpif);
4440 struct dp_netdev_port *port;
4441 int error = 0;
4442 const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
4443 bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
4444
4445 ovs_mutex_lock(&dp->port_mutex);
4446 error = get_port_by_number(dp, port_no, &port);
4447 if (error) {
4448 goto unlock;
4449 }
4450
4451 if (emc_enabled != port->emc_enabled) {
4452 struct dp_netdev_pmd_thread *pmd;
4453 struct ds ds = DS_EMPTY_INITIALIZER;
4454 uint32_t cur_min, insert_prob;
4455
4456 port->emc_enabled = emc_enabled;
4457 /* Mark for reload all the threads that poll this port and request
4458 * a reconfiguration so that the threads are actually reloaded. */
4459 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4460 if (dpif_netdev_pmd_polls_port(pmd, port)) {
4461 pmd->need_reload = true;
4462 }
4463 }
4464 dp_netdev_request_reconfigure(dp);
4465
4466 ds_put_format(&ds, "%s: EMC has been %s.",
4467 netdev_get_name(port->netdev),
4468 (emc_enabled) ? "enabled" : "disabled");
4469 if (emc_enabled) {
4470 ds_put_cstr(&ds, " Current insertion probability is ");
4471 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4472 if (!cur_min) {
4473 ds_put_cstr(&ds, "zero.");
4474 } else {
4475 insert_prob = UINT32_MAX / cur_min;
4476 ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
4477 insert_prob, 100 / (float) insert_prob);
4478 }
4479 }
4480 VLOG_INFO("%s", ds_cstr(&ds));
4481 ds_destroy(&ds);
4482 }
4483
4484 /* Check for Rxq affinity changes. */
4485 if (!netdev_is_pmd(port->netdev)
4486 || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
4487 goto unlock;
4488 }
4489
4490 error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
4491 if (error) {
4492 goto unlock;
4493 }
4494 free(port->rxq_affinity_list);
4495 port->rxq_affinity_list = nullable_xstrdup(affinity_list);
4496
4497 dp_netdev_request_reconfigure(dp);
4498 unlock:
4499 ovs_mutex_unlock(&dp->port_mutex);
4500 return error;
4501 }
4502
4503 static int
4504 dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
4505 uint32_t queue_id, uint32_t *priority)
4506 {
4507 *priority = queue_id;
4508 return 0;
4509 }
4510
4511 \f
4512 /* Creates and returns a new 'struct dp_netdev_actions', whose actions are
4513 * a copy of the 'size' bytes of the 'actions' input parameter. */
4514 struct dp_netdev_actions *
4515 dp_netdev_actions_create(const struct nlattr *actions, size_t size)
4516 {
4517 struct dp_netdev_actions *netdev_actions;
4518
4519 netdev_actions = xmalloc(sizeof *netdev_actions + size);
4520 memcpy(netdev_actions->actions, actions, size);
4521 netdev_actions->size = size;
4522
4523 return netdev_actions;
4524 }
4525
4526 struct dp_netdev_actions *
4527 dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
4528 {
4529 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
4530 }
4531
4532 static void
4533 dp_netdev_actions_free(struct dp_netdev_actions *actions)
4534 {
4535 free(actions);
4536 }
4537 \f
4538 static void
4539 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
4540 enum rxq_cycles_counter_type type,
4541 unsigned long long cycles)
4542 {
4543 atomic_store_relaxed(&rx->cycles[type], cycles);
4544 }
4545
4546 static void
4547 dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
4548 enum rxq_cycles_counter_type type,
4549 unsigned long long cycles)
4550 {
4551 non_atomic_ullong_add(&rx->cycles[type], cycles);
4552 }
4553
4554 static uint64_t
4555 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
4556 enum rxq_cycles_counter_type type)
4557 {
4558 unsigned long long processing_cycles;
4559 atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
4560 return processing_cycles;
4561 }
4562
4563 static void
4564 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
4565 unsigned long long cycles)
4566 {
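/* 'cycles_intrvl' is a circular buffer: once PMD_RXQ_INTERVAL_MAX samples
 * have been stored, the oldest interval is overwritten. */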
4567 unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
4568 atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
4569 }
4570
4571 static uint64_t
4572 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
4573 {
4574 unsigned long long processing_cycles;
4575 atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
4576 return processing_cycles;
4577 }
4578
4579 #if ATOMIC_ALWAYS_LOCK_FREE_8B
4580 static inline bool
4581 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
4582 {
4583 bool pmd_perf_enabled;
4584 atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
4585 return pmd_perf_enabled;
4586 }
4587 #else
4588 /* If stores and reads of 64-bit integers are not atomic, the full PMD
4589 * performance metrics are not available as locked access to 64 bit
4590 * integers would be prohibitively expensive. */
4591 static inline bool
4592 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
4593 {
4594 return false;
4595 }
4596 #endif
4597
4598 static int
4599 dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
4600 struct tx_port *p)
4601 {
4602 int i;
4603 int tx_qid;
4604 int output_cnt;
4605 bool dynamic_txqs;
4606 struct cycle_timer timer;
4607 uint64_t cycles;
4608 uint32_t tx_flush_interval;
4609
4610 cycle_timer_start(&pmd->perf_stats, &timer);
4611
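/* With dynamic tx queues (the netdev has fewer tx queues than there are
 * datapath threads) a queue id is picked via XPS; otherwise every PMD
 * thread owns its own static tx queue id. */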
4612 dynamic_txqs = p->port->dynamic_txqs;
4613 if (dynamic_txqs) {
4614 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
4615 } else {
4616 tx_qid = pmd->static_tx_qid;
4617 }
4618
4619 output_cnt = dp_packet_batch_size(&p->output_pkts);
4620 ovs_assert(output_cnt > 0);
4621
4622 netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
4623 dp_packet_batch_init(&p->output_pkts);
4624
4625 /* Update time of the next flush. */
4626 atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
4627 p->flush_time = pmd->ctx.now + tx_flush_interval;
4628
4629 ovs_assert(pmd->n_output_batches > 0);
4630 pmd->n_output_batches--;
4631
4632 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
4633 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
4634
4635 /* Distribute send cycles evenly among transmitted packets and assign to
4636 * their respective rx queues. */
4637 cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
4638 for (i = 0; i < output_cnt; i++) {
4639 if (p->output_pkts_rxqs[i]) {
4640 dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
4641 RXQ_CYCLES_PROC_CURR, cycles);
4642 }
4643 }
4644
4645 return output_cnt;
4646 }
4647
4648 static int
4649 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
4650 bool force)
4651 {
4652 struct tx_port *p;
4653 int output_cnt = 0;
4654
4655 if (!pmd->n_output_batches) {
4656 return 0;
4657 }
4658
4659 HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
4660 if (!dp_packet_batch_is_empty(&p->output_pkts)
4661 && (force || pmd->ctx.now >= p->flush_time)) {
4662 output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
4663 }
4664 }
4665 return output_cnt;
4666 }
4667
4668 static int
4669 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
4670 struct dp_netdev_rxq *rxq,
4671 odp_port_t port_no)
4672 {
4673 struct pmd_perf_stats *s = &pmd->perf_stats;
4674 struct dp_packet_batch batch;
4675 struct cycle_timer timer;
4676 int error;
4677 int batch_cnt = 0;
4678 int rem_qlen = 0, *qlen_p = NULL;
4679 uint64_t cycles;
4680
4681 /* Measure duration for polling and processing rx burst. */
4682 cycle_timer_start(&pmd->perf_stats, &timer);
4683
4684 pmd->ctx.last_rxq = rxq;
4685 dp_packet_batch_init(&batch);
4686
4687 /* Fetch the rx queue length only for vhostuser ports. */
4688 if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
4689 qlen_p = &rem_qlen;
4690 }
4691
4692 error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
4693 if (!error) {
4694 /* At least one packet received. */
4695 *recirc_depth_get() = 0;
4696 pmd_thread_ctx_time_update(pmd);
4697 batch_cnt = dp_packet_batch_size(&batch);
4698 if (pmd_perf_metrics_enabled(pmd)) {
4699 /* Update batch histogram. */
4700 s->current.batches++;
4701 histogram_add_sample(&s->pkts_per_batch, batch_cnt);
4702 /* Update the maximum vhost rx queue fill level. */
4703 if (rxq->is_vhost && rem_qlen >= 0) {
4704 uint32_t qfill = batch_cnt + rem_qlen;
4705 if (qfill > s->current.max_vhost_qfill) {
4706 s->current.max_vhost_qfill = qfill;
4707 }
4708 }
4709 }
4710 /* Process packet batch. */
4711 dp_netdev_input(pmd, &batch, port_no);
4712
4713 /* Assign processing cycles to rx queue. */
4714 cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
4715 dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
4716
4717 dp_netdev_pmd_flush_output_packets(pmd, false);
4718 } else {
4719 /* Discard cycles. */
4720 cycle_timer_stop(&pmd->perf_stats, &timer);
4721 if (error != EAGAIN && error != EOPNOTSUPP) {
4722 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
4723
4724 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
4725 netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
4726 }
4727 }
4728
4729 pmd->ctx.last_rxq = NULL;
4730
4731 return batch_cnt;
4732 }
4733
4734 static struct tx_port *
4735 tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
4736 {
4737 struct tx_port *tx;
4738
4739 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
4740 if (tx->port->port_no == port_no) {
4741 return tx;
4742 }
4743 }
4744
4745 return NULL;
4746 }
4747
4748 static struct tx_bond *
4749 tx_bond_lookup(const struct cmap *tx_bonds, uint32_t bond_id)
4750 {
4751 uint32_t hash = hash_bond_id(bond_id);
4752 struct tx_bond *tx;
4753
4754 CMAP_FOR_EACH_WITH_HASH (tx, node, hash, tx_bonds) {
4755 if (tx->bond_id == bond_id) {
4756 return tx;
4757 }
4758 }
4759 return NULL;
4760 }
4761
4762 static int
4763 port_reconfigure(struct dp_netdev_port *port)
4764 {
4765 struct netdev *netdev = port->netdev;
4766 int i, err;
4767
4768 /* Closes the existing 'rxq's. */
4769 for (i = 0; i < port->n_rxq; i++) {
4770 netdev_rxq_close(port->rxqs[i].rx);
4771 port->rxqs[i].rx = NULL;
4772 }
4773 unsigned last_nrxq = port->n_rxq;
4774 port->n_rxq = 0;
4775
4776 /* Allows 'netdev' to apply the pending configuration changes. */
4777 if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
4778 err = netdev_reconfigure(netdev);
4779 if (err && (err != EOPNOTSUPP)) {
4780 VLOG_ERR("Failed to set interface %s new configuration",
4781 netdev_get_name(netdev));
4782 return err;
4783 }
4784 }
4785 /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
4786 port->rxqs = xrealloc(port->rxqs,
4787 sizeof *port->rxqs * netdev_n_rxq(netdev));
4788 /* Realloc 'used' counters for tx queues. */
4789 free(port->txq_used);
4790 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
4791
4792 for (i = 0; i < netdev_n_rxq(netdev); i++) {
4793 bool new_queue = i >= last_nrxq;
4794 if (new_queue) {
4795 memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
4796 }
4797
4798 port->rxqs[i].port = port;
4799 port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
4800
4801 err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
4802 if (err) {
4803 return err;
4804 }
4805 port->n_rxq++;
4806 }
4807
4808 /* Parse affinity list to apply configuration for new queues. */
4809 dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
4810
4811 /* If reconfiguration was successful, mark it as such so we can use it. */
4812 port->need_reconfigure = false;
4813
4814 return 0;
4815 }
4816
4817 struct rr_numa_list {
4818 struct hmap numas; /* Contains 'struct rr_numa' */
4819 };
4820
4821 struct rr_numa {
4822 struct hmap_node node;
4823
4824 int numa_id;
4825
4826 /* Non isolated pmds on numa node 'numa_id' */
4827 struct dp_netdev_pmd_thread **pmds;
4828 int n_pmds;
4829
4830 int cur_index;
4831 bool idx_inc;
4832 };
4833
4834 static struct rr_numa *
4835 rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
4836 {
4837 struct rr_numa *numa;
4838
4839 HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
4840 if (numa->numa_id == numa_id) {
4841 return numa;
4842 }
4843 }
4844
4845 return NULL;
4846 }
4847
4848 /* Returns the next node in numa list following 'numa' in round-robin fashion.
4849 * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
4850 * Returns NULL if 'rr' numa list is empty. */
4851 static struct rr_numa *
4852 rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
4853 {
4854 struct hmap_node *node = NULL;
4855
4856 if (numa) {
4857 node = hmap_next(&rr->numas, &numa->node);
4858 }
4859 if (!node) {
4860 node = hmap_first(&rr->numas);
4861 }
4862
4863 return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
4864 }
4865
4866 static void
4867 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
4868 {
4869 struct dp_netdev_pmd_thread *pmd;
4870 struct rr_numa *numa;
4871
4872 hmap_init(&rr->numas);
4873
4874 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4875 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
4876 continue;
4877 }
4878
4879 numa = rr_numa_list_lookup(rr, pmd->numa_id);
4880 if (!numa) {
4881 numa = xzalloc(sizeof *numa);
4882 numa->numa_id = pmd->numa_id;
4883 hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
4884 }
4885 numa->n_pmds++;
4886 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
4887 numa->pmds[numa->n_pmds - 1] = pmd;
4888 /* At least one pmd, so initialise 'cur_index' and 'idx_inc'. */
4889 numa->cur_index = 0;
4890 numa->idx_inc = true;
4891 }
4892 }
4893
4894 /*
4895 * Returns the next pmd from the numa node.
4896 *
4897 * If 'updown' is 'true' it will alternate between selecting the next pmd in
4898 * either an up or down walk, switching between up/down when the first or last
4899 * core is reached. e.g. 1,2,3,3,2,1,1,2...
4900 *
4901 * If 'updown' is 'false' it will select the next pmd wrapping around when last
4902 * core reached. e.g. 1,2,3,1,2,3,1,2...
4903 */
4904 static struct dp_netdev_pmd_thread *
4905 rr_numa_get_pmd(struct rr_numa *numa, bool updown)
4906 {
4907 int numa_idx = numa->cur_index;
4908
4909 if (numa->idx_inc == true) {
4910 /* Incrementing through list of pmds. */
4911 if (numa->cur_index == numa->n_pmds-1) {
4912 /* Reached the last pmd. */
4913 if (updown) {
4914 numa->idx_inc = false;
4915 } else {
4916 numa->cur_index = 0;
4917 }
4918 } else {
4919 numa->cur_index++;
4920 }
4921 } else {
4922 /* Decrementing through list of pmds. */
4923 if (numa->cur_index == 0) {
4924 /* Reached the first pmd. */
4925 numa->idx_inc = true;
4926 } else {
4927 numa->cur_index--;
4928 }
4929 }
4930 return numa->pmds[numa_idx];
4931 }
4932
4933 static void
4934 rr_numa_list_destroy(struct rr_numa_list *rr)
4935 {
4936 struct rr_numa *numa;
4937
4938 HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
4939 free(numa->pmds);
4940 free(numa);
4941 }
4942 hmap_destroy(&rr->numas);
4943 }
4944
4945 /* Sort Rx queues in descending order of the processing cycles they consume. */
4946 static int
4947 compare_rxq_cycles(const void *a, const void *b)
4948 {
4949 struct dp_netdev_rxq *qa;
4950 struct dp_netdev_rxq *qb;
4951 uint64_t cycles_qa, cycles_qb;
4952
4953 qa = *(struct dp_netdev_rxq **) a;
4954 qb = *(struct dp_netdev_rxq **) b;
4955
4956 cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
4957 cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
4958
4959 if (cycles_qa != cycles_qb) {
4960 return (cycles_qa < cycles_qb) ? 1 : -1;
4961 } else {
4962 /* Cycles are the same so tiebreak on port/queue id.
4963 * Tiebreaking (as opposed to return 0) ensures consistent
4964 * sort results across multiple OS's. */
4965 uint32_t port_qa = odp_to_u32(qa->port->port_no);
4966 uint32_t port_qb = odp_to_u32(qb->port->port_no);
4967 if (port_qa != port_qb) {
4968 return port_qa > port_qb ? 1 : -1;
4969 } else {
4970 return netdev_rxq_get_queue_id(qa->rx)
4971 - netdev_rxq_get_queue_id(qb->rx);
4972 }
4973 }
4974 }
4975
4976 /* Assign pmds to queues. If 'pinned' is true, assign pmds to pinned
4977 * queues and mark the pmds as isolated. Otherwise, assign non-isolated
4978 * pmds to unpinned queues.
4979 *
4980 * The function doesn't touch the pmd threads, it just stores the assignment
4981 * in the 'pmd' member of each rxq. */
4982 static void
4983 rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
4984 {
4985 struct dp_netdev_port *port;
4986 struct rr_numa_list rr;
4987 struct rr_numa *non_local_numa = NULL;
4988 struct dp_netdev_rxq ** rxqs = NULL;
4989 int n_rxqs = 0;
4990 struct rr_numa *numa = NULL;
4991 int numa_id;
4992 bool assign_cyc = dp->pmd_rxq_assign_cyc;
4993
4994 HMAP_FOR_EACH (port, node, &dp->ports) {
4995 if (!netdev_is_pmd(port->netdev)) {
4996 continue;
4997 }
4998
4999 for (int qid = 0; qid < port->n_rxq; qid++) {
5000 struct dp_netdev_rxq *q = &port->rxqs[qid];
5001
5002 if (pinned && q->core_id != OVS_CORE_UNSPEC) {
5003 struct dp_netdev_pmd_thread *pmd;
5004
5005 pmd = dp_netdev_get_pmd(dp, q->core_id);
5006 if (!pmd) {
5007 VLOG_WARN("There is no PMD thread on core %d. Queue "
5008 "%d on port \'%s\' will not be polled.",
5009 q->core_id, qid, netdev_get_name(port->netdev));
5010 } else {
5011 q->pmd = pmd;
5012 pmd->isolated = true;
5013 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
5014 "rx queue %d.", pmd->core_id, pmd->numa_id,
5015 netdev_rxq_get_name(q->rx),
5016 netdev_rxq_get_queue_id(q->rx));
5017 dp_netdev_pmd_unref(pmd);
5018 }
5019 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
5020 uint64_t cycle_hist = 0;
5021
5022 if (n_rxqs == 0) {
5023 rxqs = xmalloc(sizeof *rxqs);
5024 } else {
5025 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
5026 }
5027
5028 if (assign_cyc) {
5029 /* Sum the queue intervals and store the cycle history. */
5030 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5031 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
5032 }
5033 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
5034 cycle_hist);
5035 }
5036 /* Store the queue. */
5037 rxqs[n_rxqs++] = q;
5038 }
5039 }
5040 }
5041
5042 if (n_rxqs > 1 && assign_cyc) {
5043 /* Sort the queues in order of the processing cycles
5044 * they consumed during their last pmd interval. */
5045 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
5046 }
5047
5048 rr_numa_list_populate(dp, &rr);
5049 /* Assign the sorted queues to pmds in round robin. */
5050 for (int i = 0; i < n_rxqs; i++) {
5051 numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
5052 numa = rr_numa_list_lookup(&rr, numa_id);
5053 if (!numa) {
5054 /* There are no pmds on the queue's local NUMA node.
5055 * Round robin on the NUMA nodes that do have pmds. */
5056 non_local_numa = rr_numa_list_next(&rr, non_local_numa);
5057 if (!non_local_numa) {
5058 VLOG_ERR("There is no available (non-isolated) pmd "
5059 "thread for port \'%s\' queue %d. This queue "
5060 "will not be polled. Is pmd-cpu-mask set to "
5061 "zero? Or are all PMDs isolated to other "
5062 "queues?", netdev_rxq_get_name(rxqs[i]->rx),
5063 netdev_rxq_get_queue_id(rxqs[i]->rx));
5064 continue;
5065 }
5066 rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa, assign_cyc);
5067 VLOG_WARN("There's no available (non-isolated) pmd thread "
5068 "on numa node %d. Queue %d on port \'%s\' will "
5069 "be assigned to the pmd on core %d "
5070 "(numa node %d). Expect reduced performance.",
5071 numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
5072 netdev_rxq_get_name(rxqs[i]->rx),
5073 rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
5074 } else {
5075 rxqs[i]->pmd = rr_numa_get_pmd(numa, assign_cyc);
5076 if (assign_cyc) {
5077 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
5078 "rx queue %d "
5079 "(measured processing cycles %"PRIu64").",
5080 rxqs[i]->pmd->core_id, numa_id,
5081 netdev_rxq_get_name(rxqs[i]->rx),
5082 netdev_rxq_get_queue_id(rxqs[i]->rx),
5083 dp_netdev_rxq_get_cycles(rxqs[i],
5084 RXQ_CYCLES_PROC_HIST));
5085 } else {
5086 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
5087 "rx queue %d.", rxqs[i]->pmd->core_id, numa_id,
5088 netdev_rxq_get_name(rxqs[i]->rx),
5089 netdev_rxq_get_queue_id(rxqs[i]->rx));
5090 }
5091 }
5092 }
5093
5094 rr_numa_list_destroy(&rr);
5095 free(rxqs);
5096 }
5097
5098 static void
5099 reload_affected_pmds(struct dp_netdev *dp)
5100 {
5101 struct dp_netdev_pmd_thread *pmd;
5102
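/* First pass: flush the flow marks of every PMD thread that needs a reload
 * and ask it to reload. */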
5103 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5104 if (pmd->need_reload) {
5105 flow_mark_flush(pmd);
5106 dp_netdev_reload_pmd__(pmd);
5107 }
5108 }
5109
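/* Second pass: wait until each PMD thread has acknowledged its reload by
 * clearing 'pmd->reload', then clear 'need_reload'. */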
5110 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5111 if (pmd->need_reload) {
5112 if (pmd->core_id != NON_PMD_CORE_ID) {
5113 bool reload;
5114
5115 do {
5116 atomic_read_explicit(&pmd->reload, &reload,
5117 memory_order_acquire);
5118 } while (reload);
5119 }
5120 pmd->need_reload = false;
5121 }
5122 }
5123 }
5124
5125 static void
5126 reconfigure_pmd_threads(struct dp_netdev *dp)
5127 OVS_REQUIRES(dp->port_mutex)
5128 {
5129 struct dp_netdev_pmd_thread *pmd;
5130 struct ovs_numa_dump *pmd_cores;
5131 struct ovs_numa_info_core *core;
5132 struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
5133 struct hmapx_node *node;
5134 bool changed = false;
5135 bool need_to_adjust_static_tx_qids = false;
5136
5137 /* The pmd threads should be started only if there's a pmd port in the
5138 * datapath. If the user didn't provide any "pmd-cpu-mask", we start
5139 * NR_PMD_THREADS per numa node. */
5140 if (!has_pmd_port(dp)) {
5141 pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
5142 } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
5143 pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
5144 } else {
5145 pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
5146 }
5147
5148 /* We need to adjust 'static_tx_qid's only if we're reducing the number of
5149 * PMD threads. Otherwise, new threads will allocate all the freed ids. */
5150 if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
5151 /* Adjustment is required to keep 'static_tx_qid's sequential and
5152 * avoid possible issues, for example, imbalanced tx queue usage
5153 * and unnecessary locking caused by remapping on netdev level. */
5154 need_to_adjust_static_tx_qids = true;
5155 }
5156
5157 /* Check for unwanted pmd threads */
5158 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5159 if (pmd->core_id == NON_PMD_CORE_ID) {
5160 continue;
5161 }
5162 if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
5163 pmd->core_id)) {
5164 hmapx_add(&to_delete, pmd);
5165 } else if (need_to_adjust_static_tx_qids) {
5166 atomic_store_relaxed(&pmd->reload_tx_qid, true);
5167 pmd->need_reload = true;
5168 }
5169 }
5170
5171 HMAPX_FOR_EACH (node, &to_delete) {
5172 pmd = (struct dp_netdev_pmd_thread *) node->data;
5173 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
5174 pmd->numa_id, pmd->core_id);
5175 dp_netdev_del_pmd(dp, pmd);
5176 }
5177 changed = !hmapx_is_empty(&to_delete);
5178 hmapx_destroy(&to_delete);
5179
5180 if (need_to_adjust_static_tx_qids) {
5181 /* 'static_tx_qid's are not sequential now.
5182 * Reload remaining threads to fix this. */
5183 reload_affected_pmds(dp);
5184 }
5185
5186 /* Check for required new pmd threads */
5187 FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
5188 pmd = dp_netdev_get_pmd(dp, core->core_id);
5189 if (!pmd) {
5190 struct ds name = DS_EMPTY_INITIALIZER;
5191
5192 pmd = xzalloc(sizeof *pmd);
5193 dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
5194
5195 ds_put_format(&name, "pmd-c%02d/id:", core->core_id);
5196 pmd->thread = ovs_thread_create(ds_cstr(&name),
5197 pmd_thread_main, pmd);
5198 ds_destroy(&name);
5199
5200 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
5201 pmd->numa_id, pmd->core_id);
5202 changed = true;
5203 } else {
5204 dp_netdev_pmd_unref(pmd);
5205 }
5206 }
5207
5208 if (changed) {
5209 struct ovs_numa_info_numa *numa;
5210
5211 /* Log the number of pmd threads per numa node. */
5212 FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
5213 VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
5214 numa->n_cores, numa->numa_id);
5215 }
5216 }
5217
5218 ovs_numa_dump_destroy(pmd_cores);
5219 }
5220
5221 static void
5222 pmd_remove_stale_ports(struct dp_netdev *dp,
5223 struct dp_netdev_pmd_thread *pmd)
5224 OVS_EXCLUDED(pmd->port_mutex)
5225 OVS_REQUIRES(dp->port_mutex)
5226 {
5227 struct rxq_poll *poll, *poll_next;
5228 struct tx_port *tx, *tx_next;
5229
5230 ovs_mutex_lock(&pmd->port_mutex);
5231 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
5232 struct dp_netdev_port *port = poll->rxq->port;
5233
5234 if (port->need_reconfigure
5235 || !hmap_contains(&dp->ports, &port->node)) {
5236 dp_netdev_del_rxq_from_pmd(pmd, poll);
5237 }
5238 }
5239 HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
5240 struct dp_netdev_port *port = tx->port;
5241
5242 if (port->need_reconfigure
5243 || !hmap_contains(&dp->ports, &port->node)) {
5244 dp_netdev_del_port_tx_from_pmd(pmd, tx);
5245 }
5246 }
5247 ovs_mutex_unlock(&pmd->port_mutex);
5248 }
5249
5250 /* Must be called each time a port is added/removed or the cmask changes.
5251 * This creates and destroys pmd threads, reconfigures ports, opens their
5252 * rxqs and assigns all rxqs/txqs to pmd threads. */
5253 static void
5254 reconfigure_datapath(struct dp_netdev *dp)
5255 OVS_REQUIRES(dp->port_mutex)
5256 {
5257 struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads);
5258 struct dp_netdev_pmd_thread *pmd;
5259 struct dp_netdev_port *port;
5260 int wanted_txqs;
5261
5262 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
5263
5264 /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
5265 * on the system and the user configuration. */
5266 reconfigure_pmd_threads(dp);
5267
5268 wanted_txqs = cmap_count(&dp->poll_threads);
5269
5270 /* The number of pmd threads might have changed, or a port can be new:
5271 * adjust the txqs. */
5272 HMAP_FOR_EACH (port, node, &dp->ports) {
5273 netdev_set_tx_multiq(port->netdev, wanted_txqs);
5274 }
5275
5276 /* Step 2: Remove from the pmd threads ports that have been removed or
5277 * need reconfiguration. */
5278
5279 /* Check for all the ports that need reconfiguration. We cache this in
5280 * 'port->need_reconfigure', because netdev_is_reconf_required() can
5281 * change at any time.
5282 * Also mark for reconfiguration all ports which will likely change their
5283 * 'dynamic_txqs' parameter. It's required to stop using them before
5284 * changing this setting and it's simpler to mark ports here and allow
5285 * 'pmd_remove_stale_ports' to remove them from threads. There will be
5286 * no actual reconfiguration in 'port_reconfigure' because it's
5287 * unnecessary. */
5288 HMAP_FOR_EACH (port, node, &dp->ports) {
5289 if (netdev_is_reconf_required(port->netdev)
5290 || (port->dynamic_txqs
5291 != (netdev_n_txq(port->netdev) < wanted_txqs))) {
5292 port->need_reconfigure = true;
5293 }
5294 }
5295
5296 /* Remove from the pmd threads all the ports that have been deleted or
5297 * need reconfiguration. */
5298 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5299 pmd_remove_stale_ports(dp, pmd);
5300 }
5301
5302 /* Reload affected pmd threads. We must wait for the pmd threads before
5303 * reconfiguring the ports, because a port cannot be reconfigured while
5304 * it's being used. */
5305 reload_affected_pmds(dp);
5306
5307 /* Step 3: Reconfigure ports. */
5308
5309 /* We only reconfigure the ports that we determined above, because they're
5310 * not being used by any pmd thread at the moment. If a port fails to
5311 * reconfigure we remove it from the datapath. */
5312 struct dp_netdev_port *next_port;
5313 HMAP_FOR_EACH_SAFE (port, next_port, node, &dp->ports) {
5314 int err;
5315
5316 if (!port->need_reconfigure) {
5317 continue;
5318 }
5319
5320 err = port_reconfigure(port);
5321 if (err) {
5322 hmap_remove(&dp->ports, &port->node);
5323 seq_change(dp->port_seq);
5324 port_destroy(port);
5325 } else {
5326 port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
5327 }
5328 }
5329
5330 /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads
5331 * for now, we just update the 'pmd' pointer in each rxq to point to the
5332 * wanted thread according to the scheduling policy. */
5333
5334 /* Reset all the pmd threads to non isolated. */
5335 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5336 pmd->isolated = false;
5337 }
5338
5339 /* Reset all the queues to unassigned */
5340 HMAP_FOR_EACH (port, node, &dp->ports) {
5341 for (int i = 0; i < port->n_rxq; i++) {
5342 port->rxqs[i].pmd = NULL;
5343 }
5344 }
5345
5346 /* Add pinned queues and mark pmd threads isolated. */
5347 rxq_scheduling(dp, true);
5348
5349 /* Add non-pinned queues. */
5350 rxq_scheduling(dp, false);
5351
5352 /* Step 5: Remove queues not compliant with new scheduling. */
5353
5354 /* Count all the threads that will have at least one queue to poll. */
5355 HMAP_FOR_EACH (port, node, &dp->ports) {
5356 for (int qid = 0; qid < port->n_rxq; qid++) {
5357 struct dp_netdev_rxq *q = &port->rxqs[qid];
5358
5359 if (q->pmd) {
5360 hmapx_add(&busy_threads, q->pmd);
5361 }
5362 }
5363 }
5364
5365 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5366 struct rxq_poll *poll, *poll_next;
5367
5368 ovs_mutex_lock(&pmd->port_mutex);
5369 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
5370 if (poll->rxq->pmd != pmd) {
5371 dp_netdev_del_rxq_from_pmd(pmd, poll);
5372
5373 /* This pmd might sleep after this step if it has no rxq
5374 * remaining. Tell it to busy wait for new assignment if it
5375 * has at least one scheduled queue. */
5376 if (hmap_count(&pmd->poll_list) == 0 &&
5377 hmapx_contains(&busy_threads, pmd)) {
5378 atomic_store_relaxed(&pmd->wait_for_reload, true);
5379 }
5380 }
5381 }
5382 ovs_mutex_unlock(&pmd->port_mutex);
5383 }
5384
5385 hmapx_destroy(&busy_threads);
5386
5387 /* Reload affected pmd threads. We must wait for the pmd threads to remove
5388 * the old queues before re-adding them; otherwise a queue can be polled by
5389 * two threads at the same time. */
5390 reload_affected_pmds(dp);
5391
5392 /* Step 6: Add queues from scheduling, if they're not there already. */
5393 HMAP_FOR_EACH (port, node, &dp->ports) {
5394 if (!netdev_is_pmd(port->netdev)) {
5395 continue;
5396 }
5397
5398 for (int qid = 0; qid < port->n_rxq; qid++) {
5399 struct dp_netdev_rxq *q = &port->rxqs[qid];
5400
5401 if (q->pmd) {
5402 ovs_mutex_lock(&q->pmd->port_mutex);
5403 dp_netdev_add_rxq_to_pmd(q->pmd, q);
5404 ovs_mutex_unlock(&q->pmd->port_mutex);
5405 }
5406 }
5407 }
5408
5409 /* Add every port and bond to the tx port and bond caches of
5410 * every pmd thread, if it's not there already and if this pmd
5411 * has at least one rxq to poll.
5412 */
5413 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5414 ovs_mutex_lock(&pmd->port_mutex);
5415 if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
5416 struct tx_bond *bond;
5417
5418 HMAP_FOR_EACH (port, node, &dp->ports) {
5419 dp_netdev_add_port_tx_to_pmd(pmd, port);
5420 }
5421
5422 CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
5423 dp_netdev_add_bond_tx_to_pmd(pmd, bond, false);
5424 }
5425 }
5426 ovs_mutex_unlock(&pmd->port_mutex);
5427 }
5428
5429 /* Reload affected pmd threads. */
5430 reload_affected_pmds(dp);
5431
5432 /* Check if PMD Auto LB is to be enabled */
5433 set_pmd_auto_lb(dp);
5434 }
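
/* Recap of reconfigure_datapath() above (descriptive note, not upstream
 * documentation): stale or to-be-reconfigured ports are first removed from
 * the pmd threads and the threads are reloaded; only then are the ports
 * reconfigured, since a port cannot be reconfigured while in use.  Rxqs are
 * then rescheduled, queues that no longer match the new scheduling are
 * removed (another reload), the new assignments plus the tx port/bond caches
 * are added, and a final reload makes the pmd threads pick up the new
 * polling lists. */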
5435
5436 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
5437 static bool
5438 ports_require_restart(const struct dp_netdev *dp)
5439 OVS_REQUIRES(dp->port_mutex)
5440 {
5441 struct dp_netdev_port *port;
5442
5443 HMAP_FOR_EACH (port, node, &dp->ports) {
5444 if (netdev_is_reconf_required(port->netdev)) {
5445 return true;
5446 }
5447 }
5448
5449 return false;
5450 }
5451
5452 /* Calculates the variance of the values stored in array 'a'. 'n' is the
5453 * number of elements in the array to consider when calculating the variance.
5454 * Usage example: data array 'a' contains the processing load of each pmd and
5455 * 'n' is the number of PMDs. It returns the variance in the processing load
5456 * of the PMDs. */
5457 static uint64_t
5458 variance(uint64_t a[], int n)
5459 {
5460 /* Compute mean (average of elements). */
5461 uint64_t sum = 0;
5462 uint64_t mean = 0;
5463 uint64_t sqDiff = 0;
5464
5465 if (!n) {
5466 return 0;
5467 }
5468
5469 for (int i = 0; i < n; i++) {
5470 sum += a[i];
5471 }
5472
5473 if (sum) {
5474 mean = sum / n;
5475
5476 /* Compute sum squared differences with mean. */
5477 for (int i = 0; i < n; i++) {
5478 sqDiff += (a[i] - mean)*(a[i] - mean);
5479 }
5480 }
5481 return (sqDiff ? (sqDiff / n) : 0);
5482 }
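
/* Worked example for variance() (illustrative only, hypothetical values):
 * for two PMD loads a[] = { 50, 90 } and n = 2:
 *     sum    = 140, mean = 140 / 2 = 70
 *     sqDiff = (50 - 70)^2 + (90 - 70)^2 = 400 + 400 = 800
 *     result = sqDiff / n = 800 / 2 = 400
 * All arithmetic is integer arithmetic, so the mean and the result are
 * truncated rather than rounded. */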
5483
5484
5485 /* Returns the variance in PMD usage as part of a dry run of rxq
5486 * assignment to PMDs. */
5487 static bool
5488 get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
5489 uint32_t num_pmds, uint64_t *predicted_variance)
5490 OVS_REQUIRES(dp->port_mutex)
5491 {
5492 struct dp_netdev_port *port;
5493 struct dp_netdev_pmd_thread *pmd;
5494 struct dp_netdev_rxq **rxqs = NULL;
5495 struct rr_numa *numa = NULL;
5496 struct rr_numa_list rr;
5497 int n_rxqs = 0;
5498 bool ret = false;
5499 uint64_t *pmd_usage;
5500
5501 if (!predicted_variance) {
5502 return ret;
5503 }
5504
5505 pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5506
5507 HMAP_FOR_EACH (port, node, &dp->ports) {
5508 if (!netdev_is_pmd(port->netdev)) {
5509 continue;
5510 }
5511
5512 for (int qid = 0; qid < port->n_rxq; qid++) {
5513 struct dp_netdev_rxq *q = &port->rxqs[qid];
5514 uint64_t cycle_hist = 0;
5515
5516 if (q->pmd->isolated) {
5517 continue;
5518 }
5519
5520 if (n_rxqs == 0) {
5521 rxqs = xmalloc(sizeof *rxqs);
5522 } else {
5523 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
5524 }
5525
5526 /* Sum the queue intervals and store the cycle history. */
5527 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5528 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
5529 }
5530 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
5531 cycle_hist);
5532 /* Store the queue. */
5533 rxqs[n_rxqs++] = q;
5534 }
5535 }
5536 if (n_rxqs > 1) {
5537 /* Sort the queues in order of the processing cycles
5538 * they consumed during their last pmd interval. */
5539 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
5540 }
5541 rr_numa_list_populate(dp, &rr);
5542
5543 for (int i = 0; i < n_rxqs; i++) {
5544 int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
5545 numa = rr_numa_list_lookup(&rr, numa_id);
5546 if (!numa) {
5547 /* Abort if cross NUMA polling. */
5548 VLOG_DBG("PMD auto lb dry run."
5549 " Aborting due to cross-numa polling.");
5550 goto cleanup;
5551 }
5552
5553 pmd = rr_numa_get_pmd(numa, true);
5554 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d on numa node %d "
5555 "to be assigned port \'%s\' rx queue %d "
5556 "(measured processing cycles %"PRIu64").",
5557 pmd->core_id, numa_id,
5558 netdev_rxq_get_name(rxqs[i]->rx),
5559 netdev_rxq_get_queue_id(rxqs[i]->rx),
5560 dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
5561
5562 for (int id = 0; id < num_pmds; id++) {
5563 if (pmd->core_id == core_list[id]) {
5564 /* Add the processing cycles of rxq to pmd polling it. */
5565 pmd_usage[id] += dp_netdev_rxq_get_cycles(rxqs[i],
5566 RXQ_CYCLES_PROC_HIST);
5567 }
5568 }
5569 }
5570
5571 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5572 uint64_t total_cycles = 0;
5573
5574 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5575 continue;
5576 }
5577
5578 /* Get the total pmd cycles for an interval. */
5579 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5580 /* Estimate the cycles to cover all intervals. */
5581 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5582 for (int id = 0; id < num_pmds; id++) {
5583 if (pmd->core_id == core_list[id]) {
5584 if (pmd_usage[id]) {
5585 pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
5586 }
5587 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d, "
5588 "usage %"PRIu64"", pmd->core_id, pmd_usage[id]);
5589 }
5590 }
5591 }
5592 *predicted_variance = variance(pmd_usage, num_pmds);
5593 ret = true;
5594
5595 cleanup:
5596 rr_numa_list_destroy(&rr);
5597 free(rxqs);
5598 free(pmd_usage);
5599 return ret;
5600 }
5601
5602 /* Performs a dry run of rxq assignment to PMDs and returns true if it gives
5603 * a better distribution of load across the PMDs. */
5604 static bool
5605 pmd_rebalance_dry_run(struct dp_netdev *dp)
5606 OVS_REQUIRES(dp->port_mutex)
5607 {
5608 struct dp_netdev_pmd_thread *pmd;
5609 uint64_t *curr_pmd_usage;
5610
5611 uint64_t curr_variance;
5612 uint64_t new_variance;
5613 uint64_t improvement = 0;
5614 uint32_t num_pmds;
5615 uint32_t *pmd_corelist;
5616 struct rxq_poll *poll;
5617 bool ret;
5618
5619 num_pmds = cmap_count(&dp->poll_threads);
5620
5621 if (num_pmds > 1) {
5622 curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5623 pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
5624 } else {
5625 return false;
5626 }
5627
5628 num_pmds = 0;
5629 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5630 uint64_t total_cycles = 0;
5631 uint64_t total_proc = 0;
5632
5633 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5634 continue;
5635 }
5636
5637 /* Get the total pmd cycles for an interval. */
5638 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5639 /* Estimate the cycles to cover all intervals. */
5640 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5641
5642 ovs_mutex_lock(&pmd->port_mutex);
5643 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5644 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5645 total_proc += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
5646 }
5647 }
5648 ovs_mutex_unlock(&pmd->port_mutex);
5649
5650 if (total_proc) {
5651 curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
5652 }
5653
5654 VLOG_DBG("PMD auto lb dry run. Current: Core %d, usage %"PRIu64"",
5655 pmd->core_id, curr_pmd_usage[num_pmds]);
5656
5657 if (atomic_count_get(&pmd->pmd_overloaded)) {
5658 atomic_count_set(&pmd->pmd_overloaded, 0);
5659 }
5660
5661 pmd_corelist[num_pmds] = pmd->core_id;
5662 num_pmds++;
5663 }
5664
5665 curr_variance = variance(curr_pmd_usage, num_pmds);
5666 ret = get_dry_run_variance(dp, pmd_corelist, num_pmds, &new_variance);
5667
5668 if (ret) {
5669 VLOG_DBG("PMD auto lb dry run. Current PMD variance: %"PRIu64","
5670 " Predicted PMD variance: %"PRIu64"",
5671 curr_variance, new_variance);
5672
5673 if (new_variance < curr_variance) {
5674 improvement =
5675 ((curr_variance - new_variance) * 100) / curr_variance;
5676 }
5677 if (improvement < ALB_ACCEPTABLE_IMPROVEMENT) {
5678 ret = false;
5679 }
5680 }
5681
5682 free(curr_pmd_usage);
5683 free(pmd_corelist);
5684 return ret;
5685 }
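
/* Illustrative example for the dry run decision above (hypothetical
 * numbers): with a current variance of 400 and a predicted variance of 100,
 * improvement = ((400 - 100) * 100) / 400 = 75; with a predicted variance of
 * 350 it is only ((400 - 350) * 100) / 400 = 12.  The reassignment is
 * requested only when the improvement meets or exceeds
 * ALB_ACCEPTABLE_IMPROVEMENT. */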
5686
5687
5688 /* Return true if needs to revalidate datapath flows. */
5689 static bool
5690 dpif_netdev_run(struct dpif *dpif)
5691 {
5692 struct dp_netdev_port *port;
5693 struct dp_netdev *dp = get_dp_netdev(dpif);
5694 struct dp_netdev_pmd_thread *non_pmd;
5695 uint64_t new_tnl_seq;
5696 bool need_to_flush = true;
5697 bool pmd_rebalance = false;
5698 long long int now = time_msec();
5699 struct dp_netdev_pmd_thread *pmd;
5700
5701 ovs_mutex_lock(&dp->port_mutex);
5702 non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
5703 if (non_pmd) {
5704 ovs_mutex_lock(&dp->non_pmd_mutex);
5705 HMAP_FOR_EACH (port, node, &dp->ports) {
5706 if (!netdev_is_pmd(port->netdev)) {
5707 int i;
5708
5709 if (port->emc_enabled) {
5710 atomic_read_relaxed(&dp->emc_insert_min,
5711 &non_pmd->ctx.emc_insert_min);
5712 } else {
5713 non_pmd->ctx.emc_insert_min = 0;
5714 }
5715
5716 for (i = 0; i < port->n_rxq; i++) {
5717
5718 if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
5719 continue;
5720 }
5721
5722 if (dp_netdev_process_rxq_port(non_pmd,
5723 &port->rxqs[i],
5724 port->port_no)) {
5725 need_to_flush = false;
5726 }
5727 }
5728 }
5729 }
5730 if (need_to_flush) {
5731 /* We didn't receive anything in the process loop.
5732 * Check if we need to send something.
5733 * There were no time updates in the current iteration. */
5734 pmd_thread_ctx_time_update(non_pmd);
5735 dp_netdev_pmd_flush_output_packets(non_pmd, false);
5736 }
5737
5738 dpif_netdev_xps_revalidate_pmd(non_pmd, false);
5739 ovs_mutex_unlock(&dp->non_pmd_mutex);
5740
5741 dp_netdev_pmd_unref(non_pmd);
5742 }
5743
5744 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
5745 if (pmd_alb->is_enabled) {
5746 if (!pmd_alb->rebalance_poll_timer) {
5747 pmd_alb->rebalance_poll_timer = now;
5748 } else if ((pmd_alb->rebalance_poll_timer +
5749 pmd_alb->rebalance_intvl) < now) {
5750 pmd_alb->rebalance_poll_timer = now;
5751 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5752 if (atomic_count_get(&pmd->pmd_overloaded) >=
5753 PMD_RXQ_INTERVAL_MAX) {
5754 pmd_rebalance = true;
5755 break;
5756 }
5757 }
5758
5759 if (pmd_rebalance &&
5760 !dp_netdev_is_reconf_required(dp) &&
5761 !ports_require_restart(dp) &&
5762 pmd_rebalance_dry_run(dp)) {
5763 VLOG_INFO("PMD auto lb dry run."
5764 " requesting datapath reconfigure.");
5765 dp_netdev_request_reconfigure(dp);
5766 }
5767 }
5768 }
5769
5770 if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
5771 reconfigure_datapath(dp);
5772 }
5773 ovs_mutex_unlock(&dp->port_mutex);
5774
5775 tnl_neigh_cache_run();
5776 tnl_port_map_run();
5777 new_tnl_seq = seq_read(tnl_conf_seq);
5778
5779 if (dp->last_tnl_conf_seq != new_tnl_seq) {
5780 dp->last_tnl_conf_seq = new_tnl_seq;
5781 return true;
5782 }
5783 return false;
5784 }
5785
5786 static void
5787 dpif_netdev_wait(struct dpif *dpif)
5788 {
5789 struct dp_netdev_port *port;
5790 struct dp_netdev *dp = get_dp_netdev(dpif);
5791
5792 ovs_mutex_lock(&dp_netdev_mutex);
5793 ovs_mutex_lock(&dp->port_mutex);
5794 HMAP_FOR_EACH (port, node, &dp->ports) {
5795 netdev_wait_reconf_required(port->netdev);
5796 if (!netdev_is_pmd(port->netdev)) {
5797 int i;
5798
5799 for (i = 0; i < port->n_rxq; i++) {
5800 netdev_rxq_wait(port->rxqs[i].rx);
5801 }
5802 }
5803 }
5804 ovs_mutex_unlock(&dp->port_mutex);
5805 ovs_mutex_unlock(&dp_netdev_mutex);
5806 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
5807 }
5808
5809 static void
5810 pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
5811 {
5812 struct tx_port *tx_port_cached;
5813
5814 /* Flush all the queued packets. */
5815 dp_netdev_pmd_flush_output_packets(pmd, true);
5816 /* Free all used tx queue ids. */
5817 dpif_netdev_xps_revalidate_pmd(pmd, true);
5818
5819 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
5820 free(tx_port_cached);
5821 }
5822 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
5823 free(tx_port_cached);
5824 }
5825 }
5826
5827 /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
5828 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
5829 * device, otherwise to 'pmd->send_port_cache' if the port has at least
5830 * one txq. */
5831 static void
5832 pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
5833 OVS_REQUIRES(pmd->port_mutex)
5834 {
5835 struct tx_port *tx_port, *tx_port_cached;
5836
5837 pmd_free_cached_ports(pmd);
5838 hmap_shrink(&pmd->send_port_cache);
5839 hmap_shrink(&pmd->tnl_port_cache);
5840
5841 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
5842 if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
5843 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5844 hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
5845 hash_port_no(tx_port_cached->port->port_no));
5846 }
5847
5848 if (netdev_n_txq(tx_port->port->netdev)) {
5849 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5850 hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
5851 hash_port_no(tx_port_cached->port->port_no));
5852 }
5853 }
5854 }
5855
5856 static void
5857 pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5858 {
5859 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5860 if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
5861 VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
5862 ", numa_id %d.", pmd->core_id, pmd->numa_id);
5863 }
5864 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5865
5866 VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
5867 ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
5868 }
5869
5870 static void
5871 pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5872 {
5873 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5874 id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
5875 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5876 }
5877
5878 static int
5879 pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
5880 struct polled_queue **ppoll_list)
5881 {
5882 struct polled_queue *poll_list = *ppoll_list;
5883 struct rxq_poll *poll;
5884 int i;
5885
5886 ovs_mutex_lock(&pmd->port_mutex);
5887 poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
5888 * sizeof *poll_list);
5889
5890 i = 0;
5891 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5892 poll_list[i].rxq = poll->rxq;
5893 poll_list[i].port_no = poll->rxq->port->port_no;
5894 poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
5895 poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
5896 poll_list[i].change_seq =
5897 netdev_get_change_seq(poll->rxq->port->netdev);
5898 i++;
5899 }
5900
5901 pmd_load_cached_ports(pmd);
5902
5903 ovs_mutex_unlock(&pmd->port_mutex);
5904
5905 *ppoll_list = poll_list;
5906 return i;
5907 }
5908
5909 static void *
5910 pmd_thread_main(void *f_)
5911 {
5912 struct dp_netdev_pmd_thread *pmd = f_;
5913 struct pmd_perf_stats *s = &pmd->perf_stats;
5914 unsigned int lc = 0;
5915 struct polled_queue *poll_list;
5916 bool wait_for_reload = false;
5917 bool reload_tx_qid;
5918 bool exiting;
5919 bool reload;
5920 int poll_cnt;
5921 int i;
5922 int process_packets = 0;
5923
5924 poll_list = NULL;
5925
5926 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
5927 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
5928 ovs_numa_thread_setaffinity_core(pmd->core_id);
5929 dpdk_set_lcore_id(pmd->core_id);
5930 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
5931 dfc_cache_init(&pmd->flow_cache);
5932 pmd_alloc_static_tx_qid(pmd);
5933
5934 reload:
5935 atomic_count_init(&pmd->pmd_overloaded, 0);
5936
5937 /* List port/core affinity */
5938 for (i = 0; i < poll_cnt; i++) {
5939 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
5940 pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
5941 netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
5942 /* Reset the rxq current cycles counter. */
5943 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
5944 }
5945
5946 if (!poll_cnt) {
5947 if (wait_for_reload) {
5948 /* Don't sleep, control thread will ask for a reload shortly. */
5949 do {
5950 atomic_read_explicit(&pmd->reload, &reload,
5951 memory_order_acquire);
5952 } while (!reload);
5953 } else {
5954 while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
5955 seq_wait(pmd->reload_seq, pmd->last_reload_seq);
5956 poll_block();
5957 }
5958 }
5959 }
5960
5961 pmd->intrvl_tsc_prev = 0;
5962 atomic_store_relaxed(&pmd->intrvl_cycles, 0);
5963 cycles_counter_update(s);
5964
5965 pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
5966
5967 /* Protect pmd stats from external clearing while polling. */
5968 ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
5969 for (;;) {
5970 uint64_t rx_packets = 0, tx_packets = 0;
5971
5972 pmd_perf_start_iteration(s);
5973
5974 for (i = 0; i < poll_cnt; i++) {
5975
5976 if (!poll_list[i].rxq_enabled) {
5977 continue;
5978 }
5979
5980 if (poll_list[i].emc_enabled) {
5981 atomic_read_relaxed(&pmd->dp->emc_insert_min,
5982 &pmd->ctx.emc_insert_min);
5983 } else {
5984 pmd->ctx.emc_insert_min = 0;
5985 }
5986
5987 process_packets =
5988 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
5989 poll_list[i].port_no);
5990 rx_packets += process_packets;
5991 }
5992
5993 if (!rx_packets) {
5994 /* We didn't receive anything in the process loop.
5995 * Check if we need to send something.
5996 * There were no time updates in the current iteration. */
5997 pmd_thread_ctx_time_update(pmd);
5998 tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
5999 }
6000
6001 /* Do RCU synchronization at a fixed interval. This ensures that
6002 * synchronization is not delayed for long even under a heavy
6003 * packet-processing load. */
6004 if (pmd->ctx.now > pmd->next_rcu_quiesce) {
6005 if (!ovsrcu_try_quiesce()) {
6006 pmd->next_rcu_quiesce =
6007 pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6008 }
6009 }
6010
6011 if (lc++ > 1024) {
6012 lc = 0;
6013
6014 coverage_try_clear();
6015 dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
6016 if (!ovsrcu_try_quiesce()) {
6017 emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
6018 pmd->next_rcu_quiesce =
6019 pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6020 }
6021
6022 for (i = 0; i < poll_cnt; i++) {
6023 uint64_t current_seq =
6024 netdev_get_change_seq(poll_list[i].rxq->port->netdev);
6025 if (poll_list[i].change_seq != current_seq) {
6026 poll_list[i].change_seq = current_seq;
6027 poll_list[i].rxq_enabled =
6028 netdev_rxq_enabled(poll_list[i].rxq->rx);
6029 }
6030 }
6031 }
6032
6033 atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
6034 if (OVS_UNLIKELY(reload)) {
6035 break;
6036 }
6037
6038 pmd_perf_end_iteration(s, rx_packets, tx_packets,
6039 pmd_perf_metrics_enabled(pmd));
6040 }
6041 ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
6042
6043 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
6044 atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
6045 atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
6046 atomic_read_relaxed(&pmd->exit, &exiting);
6047 /* Signal here to make sure the pmd finishes
6048 * reloading the updated configuration. */
6049 dp_netdev_pmd_reload_done(pmd);
6050
6051 if (reload_tx_qid) {
6052 pmd_free_static_tx_qid(pmd);
6053 pmd_alloc_static_tx_qid(pmd);
6054 }
6055
6056 if (!exiting) {
6057 goto reload;
6058 }
6059
6060 pmd_free_static_tx_qid(pmd);
6061 dfc_cache_uninit(&pmd->flow_cache);
6062 free(poll_list);
6063 pmd_free_cached_ports(pmd);
6064 return NULL;
6065 }
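
/* Reload protocol recap for pmd_thread_main() (descriptive note): the main
 * thread sets 'pmd->reload' (and possibly 'wait_for_reload', 'reload_tx_qid'
 * or 'exit') and wakes the pmd; the polling loop above notices the flag with
 * an acquire load, breaks out, re-reads its queue and port lists,
 * acknowledges via dp_netdev_pmd_reload_done(), and then either jumps back
 * to the 'reload:' label or tears down its caches and exits. */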
6066
6067 static void
6068 dp_netdev_disable_upcall(struct dp_netdev *dp)
6069 OVS_ACQUIRES(dp->upcall_rwlock)
6070 {
6071 fat_rwlock_wrlock(&dp->upcall_rwlock);
6072 }
6073
6074 \f
6075 /* Meters */
6076 static void
6077 dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
6078 struct ofputil_meter_features *features)
6079 {
6080 features->max_meters = MAX_METERS;
6081 features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
6082 features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
6083 features->max_bands = MAX_BANDS;
6084 features->max_color = 0;
6085 }
6086
6087 /* Applies the meter identified by 'meter_id' to 'packets_'. Packets
6088 * that exceed a band are dropped in-place. */
6089 static void
6090 dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
6091 uint32_t meter_id, long long int now)
6092 {
6093 struct dp_meter *meter;
6094 struct dp_meter_band *band;
6095 struct dp_packet *packet;
6096 long long int long_delta_t; /* msec */
6097 uint32_t delta_t; /* msec */
6098 const size_t cnt = dp_packet_batch_size(packets_);
6099 uint32_t bytes, volume;
6100 int exceeded_band[NETDEV_MAX_BURST];
6101 uint32_t exceeded_rate[NETDEV_MAX_BURST];
6102 int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
6103
6104 if (meter_id >= MAX_METERS) {
6105 return;
6106 }
6107
6108 meter_lock(dp, meter_id);
6109 meter = dp->meters[meter_id];
6110 if (!meter) {
6111 goto out;
6112 }
6113
6114 /* Initialize as negative values. */
6115 memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
6116 /* Initialize as zeroes. */
6117 memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
6118
6119 /* All packets will hit the meter at the same time. */
6120 long_delta_t = now / 1000 - meter->used / 1000; /* msec */
6121
6122 if (long_delta_t < 0) {
6123 /* This condition means that several threads are fighting for the
6124 meter lock and the one that received its packets a bit later wins.
6125 We assume that all racing threads received their packets at the same
6126 time, to avoid overflow. */
6127 long_delta_t = 0;
6128 }
6129
6130 /* Make sure delta_t will not be too large, so that bucket will not
6131 * wrap around below. */
6132 delta_t = (long_delta_t > (long long int)meter->max_delta_t)
6133 ? meter->max_delta_t : (uint32_t)long_delta_t;
6134
6135 /* Update meter stats. */
6136 meter->used = now;
6137 meter->packet_count += cnt;
6138 bytes = 0;
6139 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6140 bytes += dp_packet_size(packet);
6141 }
6142 meter->byte_count += bytes;
6143
6144 /* Meters can operate in terms of packets per second or kilobits per
6145 * second. */
6146 if (meter->flags & OFPMF13_PKTPS) {
6147 /* Rate in packets/second, bucket 1/1000 packets. */
6148 /* msec * packets/sec = 1/1000 packets. */
6149 volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
6150 } else {
6151 /* Rate in kbps, bucket in bits. */
6152 /* msec * kbps = bits */
6153 volume = bytes * 8;
6154 }
6155
6156 /* Update all bands and find the one hit with the highest rate for each
6157 * packet (if any). */
6158 for (int m = 0; m < meter->n_bands; ++m) {
6159 band = &meter->bands[m];
6160
6161 /* Update band's bucket. */
6162 band->bucket += delta_t * band->up.rate;
6163 if (band->bucket > band->up.burst_size) {
6164 band->bucket = band->up.burst_size;
6165 }
6166
6167 /* Drain the bucket for all the packets, if possible. */
6168 if (band->bucket >= volume) {
6169 band->bucket -= volume;
6170 } else {
6171 int band_exceeded_pkt;
6172
6173 /* Band limit hit, must process packet-by-packet. */
6174 if (meter->flags & OFPMF13_PKTPS) {
6175 band_exceeded_pkt = band->bucket / 1000;
6176 band->bucket %= 1000; /* Remainder stays in bucket. */
6177
6178 /* Update the exceeding band for each exceeding packet.
6179 * (Only one band will be fired by a packet, and that
6180 * can be different for each packet.) */
6181 for (int i = band_exceeded_pkt; i < cnt; i++) {
6182 if (band->up.rate > exceeded_rate[i]) {
6183 exceeded_rate[i] = band->up.rate;
6184 exceeded_band[i] = m;
6185 }
6186 }
6187 } else {
6188 /* Packet sizes differ, must process one-by-one. */
6189 band_exceeded_pkt = cnt;
6190 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6191 uint32_t bits = dp_packet_size(packet) * 8;
6192
6193 if (band->bucket >= bits) {
6194 band->bucket -= bits;
6195 } else {
6196 if (i < band_exceeded_pkt) {
6197 band_exceeded_pkt = i;
6198 }
6199 /* Update the exceeding band for the exceeding packet.
6200 * (Only one band will be fired by a packet, and that
6201 * can be different for each packet.) */
6202 if (band->up.rate > exceeded_rate[i]) {
6203 exceeded_rate[i] = band->up.rate;
6204 exceeded_band[i] = m;
6205 }
6206 }
6207 }
6208 }
6209 /* Remember the first exceeding packet. */
6210 if (exceeded_pkt > band_exceeded_pkt) {
6211 exceeded_pkt = band_exceeded_pkt;
6212 }
6213 }
6214 }
6215
6216 /* Fire the highest rate band exceeded by each packet, and drop
6217 * packets if needed. */
6218 size_t j;
6219 DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
6220 if (exceeded_band[j] >= 0) {
6221 /* Meter drop packet. */
6222 band = &meter->bands[exceeded_band[j]];
6223 band->packet_count += 1;
6224 band->byte_count += dp_packet_size(packet);
6225 COVERAGE_INC(datapath_drop_meter);
6226 dp_packet_delete(packet);
6227 } else {
6228 /* Meter accepts packet. */
6229 dp_packet_batch_refill(packets_, packet, j);
6230 }
6231 }
6232 out:
6233 meter_unlock(dp, meter_id);
6234 }
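
/* Illustrative example for the kbps path above (hypothetical numbers): a
 * single DROP band with rate = 1000 (kbps) and a stored burst of 100000 bits
 * (after the x1000 conversion in dpif_netdev_meter_set() below).  With
 * delta_t = 50 msec the bucket gains 50 * 1000 = 50000 bits, capped at the
 * 100000-bit burst.  A batch of ten 1500-byte packets needs
 * volume = 15000 * 8 = 120000 bits, more than the bucket holds, so the
 * per-packet path runs: eight packets drain 8 * 12000 = 96000 bits, and the
 * remaining two exceed the band and are dropped. */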
6235
6236 /* Meter set/get/del processing is still single-threaded. */
6237 static int
6238 dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
6239 struct ofputil_meter_config *config)
6240 {
6241 struct dp_netdev *dp = get_dp_netdev(dpif);
6242 uint32_t mid = meter_id.uint32;
6243 struct dp_meter *meter;
6244 int i;
6245
6246 if (mid >= MAX_METERS) {
6247 return EFBIG; /* Meter_id out of range. */
6248 }
6249
6250 if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
6251 return EBADF; /* Unsupported flags set */
6252 }
6253
6254 if (config->n_bands > MAX_BANDS) {
6255 return EINVAL;
6256 }
6257
6258 for (i = 0; i < config->n_bands; ++i) {
6259 switch (config->bands[i].type) {
6260 case OFPMBT13_DROP:
6261 break;
6262 default:
6263 return ENODEV; /* Unsupported band type */
6264 }
6265 }
6266
6267 /* Allocate meter */
6268 meter = xzalloc(sizeof *meter
6269 + config->n_bands * sizeof(struct dp_meter_band));
6270
6271 meter->flags = config->flags;
6272 meter->n_bands = config->n_bands;
6273 meter->max_delta_t = 0;
6274 meter->used = time_usec();
6275
6276 /* set up bands */
6277 for (i = 0; i < config->n_bands; ++i) {
6278 uint32_t band_max_delta_t;
6279
6280 /* Set burst size to a workable value if none specified. */
6281 if (config->bands[i].burst_size == 0) {
6282 config->bands[i].burst_size = config->bands[i].rate;
6283 }
6284
6285 meter->bands[i].up = config->bands[i];
6286 /* Convert burst size to the bucket units: */
6287 /* pkts => 1/1000 packets, kilobits => bits. */
6288 meter->bands[i].up.burst_size *= 1000;
6289 /* Initialize bucket to empty. */
6290 meter->bands[i].bucket = 0;
6291
6292 /* Figure out max delta_t that is enough to fill any bucket. */
6293 band_max_delta_t
6294 = meter->bands[i].up.burst_size / meter->bands[i].up.rate;
6295 if (band_max_delta_t > meter->max_delta_t) {
6296 meter->max_delta_t = band_max_delta_t;
6297 }
6298 }
6299
6300 meter_lock(dp, mid);
6301 dp_delete_meter(dp, mid); /* Free existing meter, if any */
6302 dp->meters[mid] = meter;
6303 meter_unlock(dp, mid);
6304
6305 return 0;
6306 }
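
/* Illustrative example for the conversion above (hypothetical numbers): a
 * band configured with rate = 1000 (kbps) and burst_size = 100 (kilobits) is
 * stored with up.burst_size = 100 * 1000 = 100000 bits and contributes
 * band_max_delta_t = 100000 / 1000 = 100 msec, i.e. the time needed to
 * refill that bucket from empty; 'max_delta_t' keeps the largest such value
 * across all bands. */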
6307
6308 static int
6309 dpif_netdev_meter_get(const struct dpif *dpif,
6310 ofproto_meter_id meter_id_,
6311 struct ofputil_meter_stats *stats, uint16_t n_bands)
6312 {
6313 const struct dp_netdev *dp = get_dp_netdev(dpif);
6314 uint32_t meter_id = meter_id_.uint32;
6315 int retval = 0;
6316
6317 if (meter_id >= MAX_METERS) {
6318 return EFBIG;
6319 }
6320
6321 meter_lock(dp, meter_id);
6322 const struct dp_meter *meter = dp->meters[meter_id];
6323 if (!meter) {
6324 retval = ENOENT;
6325 goto done;
6326 }
6327 if (stats) {
6328 int i = 0;
6329
6330 stats->packet_in_count = meter->packet_count;
6331 stats->byte_in_count = meter->byte_count;
6332
6333 for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
6334 stats->bands[i].packet_count = meter->bands[i].packet_count;
6335 stats->bands[i].byte_count = meter->bands[i].byte_count;
6336 }
6337
6338 stats->n_bands = i;
6339 }
6340
6341 done:
6342 meter_unlock(dp, meter_id);
6343 return retval;
6344 }
6345
6346 static int
6347 dpif_netdev_meter_del(struct dpif *dpif,
6348 ofproto_meter_id meter_id_,
6349 struct ofputil_meter_stats *stats, uint16_t n_bands)
6350 {
6351 struct dp_netdev *dp = get_dp_netdev(dpif);
6352 int error;
6353
6354 error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
6355 if (!error) {
6356 uint32_t meter_id = meter_id_.uint32;
6357
6358 meter_lock(dp, meter_id);
6359 dp_delete_meter(dp, meter_id);
6360 meter_unlock(dp, meter_id);
6361 }
6362 return error;
6363 }
6364
6365 \f
6366 static void
6367 dpif_netdev_disable_upcall(struct dpif *dpif)
6368 OVS_NO_THREAD_SAFETY_ANALYSIS
6369 {
6370 struct dp_netdev *dp = get_dp_netdev(dpif);
6371 dp_netdev_disable_upcall(dp);
6372 }
6373
6374 static void
6375 dp_netdev_enable_upcall(struct dp_netdev *dp)
6376 OVS_RELEASES(dp->upcall_rwlock)
6377 {
6378 fat_rwlock_unlock(&dp->upcall_rwlock);
6379 }
6380
6381 static void
6382 dpif_netdev_enable_upcall(struct dpif *dpif)
6383 OVS_NO_THREAD_SAFETY_ANALYSIS
6384 {
6385 struct dp_netdev *dp = get_dp_netdev(dpif);
6386 dp_netdev_enable_upcall(dp);
6387 }
6388
6389 static void
6390 dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
6391 {
6392 atomic_store_relaxed(&pmd->wait_for_reload, false);
6393 atomic_store_relaxed(&pmd->reload_tx_qid, false);
6394 pmd->last_reload_seq = seq_read(pmd->reload_seq);
6395 atomic_store_explicit(&pmd->reload, false, memory_order_release);
6396 }
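
/* Note (descriptive, not upstream documentation): the release store to
 * 'pmd->reload' above is meant to pair with an acquire load in the thread
 * that requested the reload and waits for it to complete, so that the pmd's
 * updates to its polling and port lists become visible to that thread. */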
6397
6398 /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
6399 * the pointer if it succeeds, otherwise NULL (it can return NULL even if
6400 * 'core_id' is NON_PMD_CORE_ID).
6401 *
6402 * The caller must unref the returned reference. */
6403 static struct dp_netdev_pmd_thread *
6404 dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
6405 {
6406 struct dp_netdev_pmd_thread *pmd;
6407 const struct cmap_node *pnode;
6408
6409 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
6410 if (!pnode) {
6411 return NULL;
6412 }
6413 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
6414
6415 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
6416 }
6417
6418 /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
6419 static void
6420 dp_netdev_set_nonpmd(struct dp_netdev *dp)
6421 OVS_REQUIRES(dp->port_mutex)
6422 {
6423 struct dp_netdev_pmd_thread *non_pmd;
6424
6425 non_pmd = xzalloc(sizeof *non_pmd);
6426 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
6427 }
6428
6429 /* Caller must have valid pointer to 'pmd'. */
6430 static bool
6431 dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
6432 {
6433 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
6434 }
6435
6436 static void
6437 dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
6438 {
6439 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
6440 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
6441 }
6442 }
6443
6444 /* Given cmap position 'pos', tries to ref the next node. If try_ref()
6445 * fails, keeps checking for the next node until reaching the end of the cmap.
6446 *
6447 * The caller must unref the returned reference. */
6448 static struct dp_netdev_pmd_thread *
6449 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
6450 {
6451 struct dp_netdev_pmd_thread *next;
6452
6453 do {
6454 struct cmap_node *node;
6455
6456 node = cmap_next_position(&dp->poll_threads, pos);
6457 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
6458 : NULL;
6459 } while (next && !dp_netdev_pmd_try_ref(next));
6460
6461 return next;
6462 }
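
/* Minimal usage sketch (illustrative only): iterate over all pmd threads,
 * holding a reference on each in turn:
 *
 *     struct cmap_position pos;
 *     struct dp_netdev_pmd_thread *pmd;
 *
 *     memset(&pos, 0, sizeof pos);
 *     while ((pmd = dp_netdev_pmd_get_next(dp, &pos)) != NULL) {
 *         ... use 'pmd' ...
 *         dp_netdev_pmd_unref(pmd);
 *     }
 */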
6463
6464 /* Configures the 'pmd' based on the input argument. */
6465 static void
6466 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
6467 unsigned core_id, int numa_id)
6468 {
6469 pmd->dp = dp;
6470 pmd->core_id = core_id;
6471 pmd->numa_id = numa_id;
6472 pmd->need_reload = false;
6473 pmd->n_output_batches = 0;
6474
6475 ovs_refcount_init(&pmd->ref_cnt);
6476 atomic_init(&pmd->exit, false);
6477 pmd->reload_seq = seq_create();
6478 pmd->last_reload_seq = seq_read(pmd->reload_seq);
6479 atomic_init(&pmd->reload, false);
6480 ovs_mutex_init(&pmd->flow_mutex);
6481 ovs_mutex_init(&pmd->port_mutex);
6482 ovs_mutex_init(&pmd->bond_mutex);
6483 cmap_init(&pmd->flow_table);
6484 cmap_init(&pmd->classifiers);
6485 pmd->ctx.last_rxq = NULL;
6486 pmd_thread_ctx_time_update(pmd);
6487 pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
6488 pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6489 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
6490 hmap_init(&pmd->poll_list);
6491 hmap_init(&pmd->tx_ports);
6492 hmap_init(&pmd->tnl_port_cache);
6493 hmap_init(&pmd->send_port_cache);
6494 cmap_init(&pmd->tx_bonds);
6495 /* Initialize the 'flow_cache' since there is no
6496 * actual thread created for NON_PMD_CORE_ID. */
6497 if (core_id == NON_PMD_CORE_ID) {
6498 dfc_cache_init(&pmd->flow_cache);
6499 pmd_alloc_static_tx_qid(pmd);
6500 }
6501 pmd_perf_stats_init(&pmd->perf_stats);
6502 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
6503 hash_int(core_id, 0));
6504 }
6505
6506 static void
6507 dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
6508 {
6509 struct dpcls *cls;
6510
6511 dp_netdev_pmd_flow_flush(pmd);
6512 hmap_destroy(&pmd->send_port_cache);
6513 hmap_destroy(&pmd->tnl_port_cache);
6514 hmap_destroy(&pmd->tx_ports);
6515 cmap_destroy(&pmd->tx_bonds);
6516 hmap_destroy(&pmd->poll_list);
6517 /* All flows (including their dpcls_rules) have been deleted already */
6518 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
6519 dpcls_destroy(cls);
6520 ovsrcu_postpone(free, cls);
6521 }
6522 cmap_destroy(&pmd->classifiers);
6523 cmap_destroy(&pmd->flow_table);
6524 ovs_mutex_destroy(&pmd->flow_mutex);
6525 seq_destroy(pmd->reload_seq);
6526 ovs_mutex_destroy(&pmd->port_mutex);
6527 ovs_mutex_destroy(&pmd->bond_mutex);
6528 free(pmd);
6529 }
6530
6531 /* Stops the pmd thread, removes it from the 'dp->poll_threads',
6532 * and unrefs the struct. */
6533 static void
6534 dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
6535 {
6536 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
6537 * but extra cleanup is necessary */
6538 if (pmd->core_id == NON_PMD_CORE_ID) {
6539 ovs_mutex_lock(&dp->non_pmd_mutex);
6540 dfc_cache_uninit(&pmd->flow_cache);
6541 pmd_free_cached_ports(pmd);
6542 pmd_free_static_tx_qid(pmd);
6543 ovs_mutex_unlock(&dp->non_pmd_mutex);
6544 } else {
6545 atomic_store_relaxed(&pmd->exit, true);
6546 dp_netdev_reload_pmd__(pmd);
6547 xpthread_join(pmd->thread, NULL);
6548 }
6549
6550 dp_netdev_pmd_clear_ports(pmd);
6551
6552 /* Purges the 'pmd''s flows after stopping the thread, but before
6553 * destroying the flows, so that the flow stats can be collected. */
6554 if (dp->dp_purge_cb) {
6555 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
6556 }
6557 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
6558 dp_netdev_pmd_unref(pmd);
6559 }
6560
6561 /* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
6562 * thread. */
6563 static void
6564 dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
6565 {
6566 struct dp_netdev_pmd_thread *pmd;
6567 struct dp_netdev_pmd_thread **pmd_list;
6568 size_t k = 0, n_pmds;
6569
6570 n_pmds = cmap_count(&dp->poll_threads);
6571 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
6572
6573 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6574 if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
6575 continue;
6576 }
6577 /* We cannot call dp_netdev_del_pmd(), since it alters
6578 * 'dp->poll_threads' (while we're iterating it) and it
6579 * might quiesce. */
6580 ovs_assert(k < n_pmds);
6581 pmd_list[k++] = pmd;
6582 }
6583
6584 for (size_t i = 0; i < k; i++) {
6585 dp_netdev_del_pmd(dp, pmd_list[i]);
6586 }
6587 free(pmd_list);
6588 }
6589
6590 /* Deletes all rx queues from pmd->poll_list and all the ports from
6591 * pmd->tx_ports. */
6592 static void
6593 dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
6594 {
6595 struct rxq_poll *poll;
6596 struct tx_port *port;
6597 struct tx_bond *tx;
6598
6599 ovs_mutex_lock(&pmd->port_mutex);
6600 HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
6601 free(poll);
6602 }
6603 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
6604 free(port);
6605 }
6606 ovs_mutex_unlock(&pmd->port_mutex);
6607
6608 ovs_mutex_lock(&pmd->bond_mutex);
6609 CMAP_FOR_EACH (tx, node, &pmd->tx_bonds) {
6610 cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
6611 ovsrcu_postpone(free, tx);
6612 }
6613 ovs_mutex_unlock(&pmd->bond_mutex);
6614 }
6615
6616 /* Adds rx queue to poll_list of PMD thread, if it's not there already. */
6617 static void
6618 dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
6619 struct dp_netdev_rxq *rxq)
6620 OVS_REQUIRES(pmd->port_mutex)
6621 {
6622 int qid = netdev_rxq_get_queue_id(rxq->rx);
6623 uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
6624 struct rxq_poll *poll;
6625
6626 HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
6627 if (poll->rxq == rxq) {
6628 /* 'rxq' is already polled by this thread. Do nothing. */
6629 return;
6630 }
6631 }
6632
6633 poll = xmalloc(sizeof *poll);
6634 poll->rxq = rxq;
6635 hmap_insert(&pmd->poll_list, &poll->node, hash);
6636
6637 pmd->need_reload = true;
6638 }
6639
6640 /* Delete 'poll' from poll_list of PMD thread. */
6641 static void
6642 dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
6643 struct rxq_poll *poll)
6644 OVS_REQUIRES(pmd->port_mutex)
6645 {
6646 hmap_remove(&pmd->poll_list, &poll->node);
6647 free(poll);
6648
6649 pmd->need_reload = true;
6650 }
6651
6652 /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
6653 * changes to take effect. */
6654 static void
6655 dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6656 struct dp_netdev_port *port)
6657 OVS_REQUIRES(pmd->port_mutex)
6658 {
6659 struct tx_port *tx;
6660
6661 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
6662 if (tx) {
6663 /* 'port' is already in this thread's tx cache. Do nothing. */
6664 return;
6665 }
6666
6667 tx = xzalloc(sizeof *tx);
6668
6669 tx->port = port;
6670 tx->qid = -1;
6671 tx->flush_time = 0LL;
6672 dp_packet_batch_init(&tx->output_pkts);
6673
6674 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
6675 pmd->need_reload = true;
6676 }
6677
6678 /* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
6679 * changes to take effect. */
6680 static void
6681 dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6682 struct tx_port *tx)
6683 OVS_REQUIRES(pmd->port_mutex)
6684 {
6685 hmap_remove(&pmd->tx_ports, &tx->node);
6686 free(tx);
6687 pmd->need_reload = true;
6688 }
6689
6690 /* Add bond to the tx bond cmap of 'pmd'. */
6691 static void
6692 dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6693 struct tx_bond *bond, bool update)
6694 OVS_EXCLUDED(pmd->bond_mutex)
6695 {
6696 struct tx_bond *tx;
6697
6698 ovs_mutex_lock(&pmd->bond_mutex);
6699 tx = tx_bond_lookup(&pmd->tx_bonds, bond->bond_id);
6700
6701 if (tx && !update) {
6702 /* It's not an update and the entry already exists. Do nothing. */
6703 goto unlock;
6704 }
6705
6706 if (tx) {
6707 struct tx_bond *new_tx = xmemdup(bond, sizeof *bond);
6708
6709 /* Copy the stats for each bucket. */
6710 for (int i = 0; i < BOND_BUCKETS; i++) {
6711 uint64_t n_packets, n_bytes;
6712
6713 atomic_read_relaxed(&tx->member_buckets[i].n_packets, &n_packets);
6714 atomic_read_relaxed(&tx->member_buckets[i].n_bytes, &n_bytes);
6715 atomic_init(&new_tx->member_buckets[i].n_packets, n_packets);
6716 atomic_init(&new_tx->member_buckets[i].n_bytes, n_bytes);
6717 }
6718 cmap_replace(&pmd->tx_bonds, &tx->node, &new_tx->node,
6719 hash_bond_id(bond->bond_id));
6720 ovsrcu_postpone(free, tx);
6721 } else {
6722 tx = xmemdup(bond, sizeof *bond);
6723 cmap_insert(&pmd->tx_bonds, &tx->node, hash_bond_id(bond->bond_id));
6724 }
6725 unlock:
6726 ovs_mutex_unlock(&pmd->bond_mutex);
6727 }
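
/* Note on the update path above (descriptive): the old 'tx_bond' entry is
 * swapped out with cmap_replace() and freed via ovsrcu_postpone(), so a pmd
 * thread that is concurrently reading the old entry from 'pmd->tx_bonds'
 * (without taking 'bond_mutex') keeps a valid pointer until it quiesces. */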
6728
6729 /* Delete bond from the tx bond cmap of 'pmd'. */
6730 static void
6731 dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6732 uint32_t bond_id)
6733 OVS_EXCLUDED(pmd->bond_mutex)
6734 {
6735 struct tx_bond *tx;
6736
6737 ovs_mutex_lock(&pmd->bond_mutex);
6738 tx = tx_bond_lookup(&pmd->tx_bonds, bond_id);
6739 if (tx) {
6740 cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
6741 ovsrcu_postpone(free, tx);
6742 }
6743 ovs_mutex_unlock(&pmd->bond_mutex);
6744 }
6745 \f
6746 static char *
6747 dpif_netdev_get_datapath_version(void)
6748 {
6749 return xstrdup("<built-in>");
6750 }
6751
6752 static void
6753 dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
6754 uint16_t tcp_flags, long long now)
6755 {
6756 uint16_t flags;
6757
6758 atomic_store_relaxed(&netdev_flow->stats.used, now);
6759 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
6760 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
6761 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
6762 flags |= tcp_flags;
6763 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
6764 }
6765
6766 static int
6767 dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
6768 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
6769 enum dpif_upcall_type type, const struct nlattr *userdata,
6770 struct ofpbuf *actions, struct ofpbuf *put_actions)
6771 {
6772 struct dp_netdev *dp = pmd->dp;
6773
6774 if (OVS_UNLIKELY(!dp->upcall_cb)) {
6775 return ENODEV;
6776 }
6777
6778 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
6779 struct ds ds = DS_EMPTY_INITIALIZER;
6780 char *packet_str;
6781 struct ofpbuf key;
6782 struct odp_flow_key_parms odp_parms = {
6783 .flow = flow,
6784 .mask = wc ? &wc->masks : NULL,
6785 .support = dp_netdev_support,
6786 };
6787
6788 ofpbuf_init(&key, 0);
6789 odp_flow_key_from_flow(&odp_parms, &key);
6790 packet_str = ofp_dp_packet_to_string(packet_);
6791
6792 odp_flow_key_format(key.data, key.size, &ds);
6793
6794 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
6795 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
6796
6797 ofpbuf_uninit(&key);
6798 free(packet_str);
6799
6800 ds_destroy(&ds);
6801 }
6802
6803 return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
6804 actions, wc, put_actions, dp->upcall_aux);
6805 }
6806
6807 static inline uint32_t
6808 dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet,
6809 const struct miniflow *mf)
6810 {
6811 uint32_t hash;
6812
6813 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6814 hash = dp_packet_get_rss_hash(packet);
6815 } else {
6816 hash = miniflow_hash_5tuple(mf, 0);
6817 dp_packet_set_rss_hash(packet, hash);
6818 }
6819
6820 return hash;
6821 }
6822
6823 static inline uint32_t
6824 dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
6825 const struct miniflow *mf)
6826 {
6827 uint32_t hash, recirc_depth;
6828
6829 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6830 hash = dp_packet_get_rss_hash(packet);
6831 } else {
6832 hash = miniflow_hash_5tuple(mf, 0);
6833 dp_packet_set_rss_hash(packet, hash);
6834 }
6835
6836 /* The RSS hash must account for the recirculation depth to avoid
6837 * collisions in the exact match cache */
6838 recirc_depth = *recirc_depth_get_unsafe();
6839 if (OVS_UNLIKELY(recirc_depth)) {
6840 hash = hash_finish(hash, recirc_depth);
6841 }
6842 return hash;
6843 }
6844
6845 struct packet_batch_per_flow {
6846 unsigned int byte_count;
6847 uint16_t tcp_flags;
6848 struct dp_netdev_flow *flow;
6849
6850 struct dp_packet_batch array;
6851 };
6852
6853 static inline void
6854 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
6855 struct dp_packet *packet,
6856 uint16_t tcp_flags)
6857 {
6858 batch->byte_count += dp_packet_size(packet);
6859 batch->tcp_flags |= tcp_flags;
6860 dp_packet_batch_add(&batch->array, packet);
6861 }
6862
6863 static inline void
6864 packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
6865 struct dp_netdev_flow *flow)
6866 {
6867 flow->batch = batch;
6868
6869 batch->flow = flow;
6870 dp_packet_batch_init(&batch->array);
6871 batch->byte_count = 0;
6872 batch->tcp_flags = 0;
6873 }
6874
6875 static inline void
6876 packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
6877 struct dp_netdev_pmd_thread *pmd)
6878 {
6879 struct dp_netdev_actions *actions;
6880 struct dp_netdev_flow *flow = batch->flow;
6881
6882 dp_netdev_flow_used(flow, dp_packet_batch_size(&batch->array),
6883 batch->byte_count,
6884 batch->tcp_flags, pmd->ctx.now / 1000);
6885
6886 actions = dp_netdev_flow_get_actions(flow);
6887
6888 dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
6889 actions->actions, actions->size);
6890 }
6891
6892 static inline void
6893 dp_netdev_queue_batches(struct dp_packet *pkt,
6894 struct dp_netdev_flow *flow, uint16_t tcp_flags,
6895 struct packet_batch_per_flow *batches,
6896 size_t *n_batches)
6897 {
6898 struct packet_batch_per_flow *batch = flow->batch;
6899
6900 if (OVS_UNLIKELY(!batch)) {
6901 batch = &batches[(*n_batches)++];
6902 packet_batch_per_flow_init(batch, flow);
6903 }
6904
6905 packet_batch_per_flow_update(batch, pkt, tcp_flags);
6906 }
6907
6908 static inline void
6909 packet_enqueue_to_flow_map(struct dp_packet *packet,
6910 struct dp_netdev_flow *flow,
6911 uint16_t tcp_flags,
6912 struct dp_packet_flow_map *flow_map,
6913 size_t index)
6914 {
6915 struct dp_packet_flow_map *map = &flow_map[index];
6916 map->flow = flow;
6917 map->packet = packet;
6918 map->tcp_flags = tcp_flags;
6919 }
6920
6921 /* SMC lookup function for a batch of packets.
6922 * By batching SMC lookups, we can use prefetching
6923 * to hide memory access latency.
6924 */
6925 static inline void
6926 smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
6927 struct netdev_flow_key *keys,
6928 struct netdev_flow_key **missed_keys,
6929 struct dp_packet_batch *packets_,
6930 const int cnt,
6931 struct dp_packet_flow_map *flow_map,
6932 uint8_t *index_map)
6933 {
6934 int i;
6935 struct dp_packet *packet;
6936 size_t n_smc_hit = 0, n_missed = 0;
6937 struct dfc_cache *cache = &pmd->flow_cache;
6938 struct smc_cache *smc_cache = &cache->smc_cache;
6939 const struct cmap_node *flow_node;
6940 int recv_idx;
6941 uint16_t tcp_flags;
6942
6943 /* Prefetch buckets for all packets */
6944 for (i = 0; i < cnt; i++) {
6945 OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
6946 }
6947
6948 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
6949 struct dp_netdev_flow *flow = NULL;
6950 flow_node = smc_entry_get(pmd, keys[i].hash);
6951 bool hit = false;
6952 /* Get the original order of this packet in received batch. */
6953 recv_idx = index_map[i];
6954
6955 if (OVS_LIKELY(flow_node != NULL)) {
6956 CMAP_NODE_FOR_EACH (flow, node, flow_node) {
6957 /* Since we don't have a per-port megaflow to check the port
6958 * number, we need to verify that the input ports match. */
6959 if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
6960 flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
6961 tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
6962
6963 /* SMC hit and EMC miss: insert the flow into the EMC. */
6964 keys[i].len =
6965 netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
6966 emc_probabilistic_insert(pmd, &keys[i], flow);
6967 /* Add these packets into the flow map in the same order
6968 * as received.
6969 */
6970 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6971 flow_map, recv_idx);
6972 n_smc_hit++;
6973 hit = true;
6974 break;
6975 }
6976 }
6977 if (hit) {
6978 continue;
6979 }
6980 }
6981
6982 /* SMC missed. Group missed packets together at
6983 * the beginning of the 'packets' array. */
6984 dp_packet_batch_refill(packets_, packet, i);
6985
6986 /* Preserve the original packet order for flow batching. */
6987 index_map[n_missed] = recv_idx;
6988
6989 /* Put missed keys into the pointer array returned to the caller. */
6990 missed_keys[n_missed++] = &keys[i];
6991 }
6992
6993 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
6994 }
6995
6996 /* Tries to process all 'cnt' packets in 'packets_' using only the datapath
6997 * flow cache 'pmd->flow_cache'. If a flow is not found for a packet
6998 * 'packets[i]', its miniflow is copied into 'keys' and the packet pointer is
6999 * moved to the beginning of the 'packets' array. Pointers to the missed keys
7000 * are put in the 'missed_keys' array for further processing.
7001 *
7002 * The function returns the number of packets that need to be processed in the
7003 * 'packets' array (they have been moved to the beginning of the vector).
7004 *
7005 * For performance reasons a caller may choose not to initialize the metadata
7006 * in 'packets_'. If 'md_is_valid' is false, the metadata in 'packets'
7007 * is not valid and must be initialized by this function using 'port_no'.
7008 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
7009 * will be ignored.
7010 */
7011 static inline size_t
7012 dfc_processing(struct dp_netdev_pmd_thread *pmd,
7013 struct dp_packet_batch *packets_,
7014 struct netdev_flow_key *keys,
7015 struct netdev_flow_key **missed_keys,
7016 struct packet_batch_per_flow batches[], size_t *n_batches,
7017 struct dp_packet_flow_map *flow_map,
7018 size_t *n_flows, uint8_t *index_map,
7019 bool md_is_valid, odp_port_t port_no)
7020 {
7021 struct netdev_flow_key *key = &keys[0];
7022 size_t n_missed = 0, n_emc_hit = 0;
7023 struct dfc_cache *cache = &pmd->flow_cache;
7024 struct dp_packet *packet;
7025 const size_t cnt = dp_packet_batch_size(packets_);
7026 uint32_t cur_min = pmd->ctx.emc_insert_min;
7027 int i;
7028 uint16_t tcp_flags;
7029 bool smc_enable_db;
7030 size_t map_cnt = 0;
7031 bool batch_enable = true;
7032
7033 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
7034 pmd_perf_update_counter(&pmd->perf_stats,
7035 md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
7036 cnt);
7037
7038 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
7039 struct dp_netdev_flow *flow;
7040 uint32_t mark;
7041
7042 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
7043 dp_packet_delete(packet);
7044 COVERAGE_INC(datapath_drop_rx_invalid_packet);
7045 continue;
7046 }
7047
7048 if (i != cnt - 1) {
7049 struct dp_packet **packets = packets_->packets;
7050 /* Prefetch next packet data and metadata. */
7051 OVS_PREFETCH(dp_packet_data(packets[i+1]));
7052 pkt_metadata_prefetch_init(&packets[i+1]->md);
7053 }
7054
7055 if (!md_is_valid) {
7056 pkt_metadata_init(&packet->md, port_no);
7057 }
7058
7059 if ((*recirc_depth_get() == 0) &&
7060 dp_packet_has_flow_mark(packet, &mark)) {
7061 flow = mark_to_flow_find(pmd, mark);
7062 if (OVS_LIKELY(flow)) {
7063 tcp_flags = parse_tcp_flags(packet);
7064 if (OVS_LIKELY(batch_enable)) {
7065 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
7066 n_batches);
7067 } else {
7068 /* Flow batching should be performed only after fast-path
7069 * processing is also completed for packets with an EMC miss;
7070 * otherwise it would result in reordering of packets with
7071 * the same datapath flow. */
7072 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
7073 flow_map, map_cnt++);
7074 }
7075 continue;
7076 }
7077 }
7078
7079 miniflow_extract(packet, &key->mf);
7080 key->len = 0; /* Not computed yet. */
7081 key->hash =
7082 (md_is_valid == false)
7083 ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
7084 : dpif_netdev_packet_get_rss_hash(packet, &key->mf);
7085
7086 /* If the EMC is disabled, skip emc_lookup(). */
7087 flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
7088 if (OVS_LIKELY(flow)) {
7089 tcp_flags = miniflow_get_tcp_flags(&key->mf);
7090 n_emc_hit++;
7091 if (OVS_LIKELY(batch_enable)) {
7092 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
7093 n_batches);
7094 } else {
7095 /* Flow batching should be performed only after fast-path
7096 * processing is also completed for packets with an EMC miss;
7097 * otherwise it would result in reordering of packets with
7098 * the same datapath flow. */
7099 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
7100 flow_map, map_cnt++);
7101 }
7102 } else {
7103 /* Exact match cache missed. Group missed packets together at
7104 * the beginning of the 'packets' array. */
7105 dp_packet_batch_refill(packets_, packet, i);
7106
7107 /* Preserve the original packet order for flow batching. */
7108 index_map[n_missed] = map_cnt;
7109 flow_map[map_cnt++].flow = NULL;
7110
7111 /* 'key[n_missed]' contains the key of the current packet and it
7112 * will be passed to SMC lookup. The next key should be extracted
7113 * to 'keys[n_missed + 1]'.
7114 * We also maintain a pointer array for the keys that missed both SMC and
7115 * EMC, which will be returned to the caller for further processing. */
7116 missed_keys[n_missed] = key;
7117 key = &keys[++n_missed];
7118
7119 /* Skip batching for subsequent packets to avoid reordering. */
7120 batch_enable = false;
7121 }
7122 }
7123 /* Count of packets which are not flow batched. */
7124 *n_flows = map_cnt;
7125
7126 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
7127
7128 if (!smc_enable_db) {
7129 return dp_packet_batch_size(packets_);
7130 }
7131
7132 /* Packets that missed the EMC do a batch lookup in the SMC, if enabled. */
7133 smc_lookup_batch(pmd, keys, missed_keys, packets_,
7134 n_missed, flow_map, index_map);
7135
7136 return dp_packet_batch_size(packets_);
7137 }
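
/* Lookup order recap for the fast path (descriptive note): dfc_processing()
 * tries a hardware offload flow mark first (top-level packets only), then
 * the EMC, then a batched SMC lookup when enabled; whatever is still
 * unmatched is left at the front of 'packets_' for the dpcls lookup in
 * fast_path_processing() and, if that also misses, an upcall. */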
7138
7139 static inline int
7140 handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
7141 struct dp_packet *packet,
7142 const struct netdev_flow_key *key,
7143 struct ofpbuf *actions, struct ofpbuf *put_actions)
7144 {
7145 struct ofpbuf *add_actions;
7146 struct dp_packet_batch b;
7147 struct match match;
7148 ovs_u128 ufid;
7149 int error;
7150 uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
7151
7152 match.tun_md.valid = false;
7153 miniflow_expand(&key->mf, &match.flow);
7154 memset(&match.wc, 0, sizeof match.wc);
7155
7156 ofpbuf_clear(actions);
7157 ofpbuf_clear(put_actions);
7158
7159 odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
7160 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
7161 &ufid, DPIF_UC_MISS, NULL, actions,
7162 put_actions);
7163 if (OVS_UNLIKELY(error && error != ENOSPC)) {
7164 dp_packet_delete(packet);
7165 COVERAGE_INC(datapath_drop_upcall_error);
7166 return error;
7167 }
7168
7169 /* The Netlink encoding of datapath flow keys cannot express
7170 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
7171 * tag is interpreted as exact match on the fact that there is no
7172 * VLAN. Unless we refactor a lot of code that translates between
7173 * Netlink and struct flow representations, we have to do the same
7174 * here. This must be in sync with 'match' in dpif_netdev_flow_put(). */
7175 if (!match.wc.masks.vlans[0].tci) {
7176 match.wc.masks.vlans[0].tci = htons(0xffff);
7177 }
7178
7179 /* We can't allow the packet batching in the next loop to execute
7180 * the actions. Otherwise, if there are any slow path actions,
7181 * we'll send the packet up twice. */
7182 dp_packet_batch_init_packet(&b, packet);
7183 dp_netdev_execute_actions(pmd, &b, true, &match.flow,
7184 actions->data, actions->size);
7185
7186 add_actions = put_actions->size ? put_actions : actions;
7187 if (OVS_LIKELY(error != ENOSPC)) {
7188 struct dp_netdev_flow *netdev_flow;
7189
7190 /* XXX: There's a race window where a flow covering this packet
7191 * could have already been installed since we last did the flow
7192 * lookup before upcall. This could be solved by moving the
7193 * mutex lock outside the loop, but that's an awful long time
7194 * to be locking revalidators out of making flow modifications. */
7195 ovs_mutex_lock(&pmd->flow_mutex);
7196 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
7197 if (OVS_LIKELY(!netdev_flow)) {
7198 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
7199 add_actions->data,
7200 add_actions->size);
7201 }
7202 ovs_mutex_unlock(&pmd->flow_mutex);
7203 uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
7204 smc_insert(pmd, key, hash);
7205 emc_probabilistic_insert(pmd, key, netdev_flow);
7206 }
7207 if (pmd_perf_metrics_enabled(pmd)) {
7208 /* Update upcall stats. */
7209 cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
7210 struct pmd_perf_stats *s = &pmd->perf_stats;
7211 s->current.upcalls++;
7212 s->current.upcall_cycles += cycles;
7213 histogram_add_sample(&s->cycles_per_upcall, cycles);
7214 }
7215 return error;
7216 }
7217
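/* Illustrative example (not compiled): the TCI fix-up above turns a fully
 * wildcarded VLAN into an exact match on "no VLAN tag present".  Assuming the
 * upcall for an untagged packet hands back
 *
 *     match.flow.vlans[0].tci      == 0       // no tag seen
 *     match.wc.masks.vlans[0].tci  == 0       // TCI fully wildcarded
 *
 * the adjustment installs the megaflow with
 *
 *     match.wc.masks.vlans[0].tci  == htons(0xffff)
 *
 * so that a later tagged packet cannot incorrectly hit this flow. */
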
7218 static inline void
7219 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
7220 struct dp_packet_batch *packets_,
7221 struct netdev_flow_key **keys,
7222 struct dp_packet_flow_map *flow_map,
7223 uint8_t *index_map,
7224 odp_port_t in_port)
7225 {
7226 const size_t cnt = dp_packet_batch_size(packets_);
7227 #if !defined(__CHECKER__) && !defined(_WIN32)
7228 const size_t PKT_ARRAY_SIZE = cnt;
7229 #else
7230 /* Sparse or MSVC doesn't like variable-length arrays. */
7231 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
7232 #endif
7233 struct dp_packet *packet;
7234 struct dpcls *cls;
7235 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
7236 struct dp_netdev *dp = pmd->dp;
7237 int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
7238 int lookup_cnt = 0, add_lookup_cnt;
7239 bool any_miss;
7240
7241 for (size_t i = 0; i < cnt; i++) {
7242 /* Key length is needed in all cases; the hash is computed on demand. */
7243 keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
7244 }
7245 /* Get the classifier for the in_port. */
7246 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
7247 if (OVS_LIKELY(cls)) {
7248 any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
7249 rules, cnt, &lookup_cnt);
7250 } else {
7251 any_miss = true;
7252 memset(rules, 0, sizeof(rules));
7253 }
7254 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
7255 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
7256 struct ofpbuf actions, put_actions;
7257
7258 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
7259 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
7260
7261 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7262 struct dp_netdev_flow *netdev_flow;
7263
7264 if (OVS_LIKELY(rules[i])) {
7265 continue;
7266 }
7267
7268 /* It's possible that an earlier slow path execution installed
7269 * a rule covering this flow. In this case, it's a lot cheaper
7270 * to catch it here than to execute a miss. */
7271 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
7272 &add_lookup_cnt);
7273 if (netdev_flow) {
7274 lookup_cnt += add_lookup_cnt;
7275 rules[i] = &netdev_flow->cr;
7276 continue;
7277 }
7278
7279 int error = handle_packet_upcall(pmd, packet, keys[i],
7280 &actions, &put_actions);
7281
7282 if (OVS_UNLIKELY(error)) {
7283 upcall_fail_cnt++;
7284 } else {
7285 upcall_ok_cnt++;
7286 }
7287 }
7288
7289 ofpbuf_uninit(&actions);
7290 ofpbuf_uninit(&put_actions);
7291 fat_rwlock_unlock(&dp->upcall_rwlock);
7292 } else if (OVS_UNLIKELY(any_miss)) {
7293 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7294 if (OVS_UNLIKELY(!rules[i])) {
7295 dp_packet_delete(packet);
7296 COVERAGE_INC(datapath_drop_lock_error);
7297 upcall_fail_cnt++;
7298 }
7299 }
7300 }
7301
7302 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7303 struct dp_netdev_flow *flow;
7304 /* Get the original order of this packet in received batch. */
7305 int recv_idx = index_map[i];
7306 uint16_t tcp_flags;
7307
7308 if (OVS_UNLIKELY(!rules[i])) {
7309 continue;
7310 }
7311
7312 flow = dp_netdev_flow_cast(rules[i]);
7313 uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
7314 smc_insert(pmd, keys[i], hash);
7315
7316 emc_probabilistic_insert(pmd, keys[i], flow);
7317 /* Add these packets into the flow map in the same order
7318 * as received.
7319 */
7320 tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
7321 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
7322 flow_map, recv_idx);
7323 }
7324
7325 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
7326 cnt - upcall_ok_cnt - upcall_fail_cnt);
7327 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
7328 lookup_cnt);
7329 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
7330 upcall_ok_cnt);
7331 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
7332 upcall_fail_cnt);
7333 }
7334
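/* Worked example (illustrative, based on the code above): suppose a received
 * batch [p0, p1, p2, p3] where p1 hit the EMC in dfc_processing().  Only the
 * misses reach this function, so here:
 *
 *     packets_  = [p0, p2, p3]
 *     index_map = { 0,  2,  3 }   // original positions in the received batch
 *
 * Each matched packet is enqueued into flow_map[] at its recv_idx, so the
 * flow_map consumed by dp_netdev_input__() preserves the original receive
 * order even though EMC hits and dpcls hits were resolved separately. */
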
7335 /* Packets enter the datapath from a port (or from recirculation) here.
7336 *
7337 * When 'md_is_valid' is true, the metadata in 'packets' is already valid.
7338 * When false, the metadata in 'packets' needs to be initialized. */
7339 static void
7340 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
7341 struct dp_packet_batch *packets,
7342 bool md_is_valid, odp_port_t port_no)
7343 {
7344 #if !defined(__CHECKER__) && !defined(_WIN32)
7345 const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
7346 #else
7347 /* Sparse or MSVC doesn't like variable-length arrays. */
7348 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
7349 #endif
7350 OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
7351 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
7352 struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
7353 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
7354 size_t n_batches;
7355 struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
7356 uint8_t index_map[PKT_ARRAY_SIZE];
7357 size_t n_flows, i;
7358
7359 odp_port_t in_port;
7360
7361 n_batches = 0;
7362 dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
7363 flow_map, &n_flows, index_map, md_is_valid, port_no);
7364
7365 if (!dp_packet_batch_is_empty(packets)) {
7366 /* Get ingress port from first packet's metadata. */
7367 in_port = packets->packets[0]->md.in_port.odp_port;
7368 fast_path_processing(pmd, packets, missed_keys,
7369 flow_map, index_map, in_port);
7370 }
7371
7372 /* Batch rest of packets which are in flow map. */
7373 for (i = 0; i < n_flows; i++) {
7374 struct dp_packet_flow_map *map = &flow_map[i];
7375
7376 if (OVS_UNLIKELY(!map->flow)) {
7377 continue;
7378 }
7379 dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
7380 batches, &n_batches);
7381 }
7382
7383 /* All the flow batches need to be reset before any call to
7384 * packet_batch_per_flow_execute() as it could potentially trigger
7385 * recirculation. When a packet matching flow 'j' happens to be
7386 * recirculated, the nested call to dp_netdev_input__() could potentially
7387 * classify the packet as matching another flow - say 'k'. It could happen
7388 * that in the previous call to dp_netdev_input__() that same flow 'k'
7389 * already had its own batches[k] still waiting to be served. So if its
7390 * 'batch' member is not reset, the recirculated packet would be wrongly
7391 * appended to batches[k] of the 1st call to dp_netdev_input__(). */
7392 for (i = 0; i < n_batches; i++) {
7393 batches[i].flow->batch = NULL;
7394 }
7395
7396 for (i = 0; i < n_batches; i++) {
7397 packet_batch_per_flow_execute(&batches[i], pmd);
7398 }
7399 }
7400
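/* Illustrative call chain (a rough sketch, not code from this file): a packet
 * that matches a flow whose actions include recirc(id) re-enters the datapath
 * while its original batch is still being executed, roughly:
 *
 *     dp_netdev_input__()
 *       packet_batch_per_flow_execute(batches[j])
 *         dp_execute_cb(OVS_ACTION_ATTR_RECIRC)
 *           dp_netdev_recirculate()
 *             dp_netdev_input__()          // nested, depth-limited by
 *                                          // MAX_RECIRC_DEPTH
 *
 * Resetting every batches[i].flow->batch pointer before execution (see the
 * comment above) keeps the nested call from appending packets to a batch
 * that the outer call still owns. */
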
7401 static void
7402 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
7403 struct dp_packet_batch *packets,
7404 odp_port_t port_no)
7405 {
7406 dp_netdev_input__(pmd, packets, false, port_no);
7407 }
7408
7409 static void
7410 dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
7411 struct dp_packet_batch *packets)
7412 {
7413 dp_netdev_input__(pmd, packets, true, 0);
7414 }
7415
7416 struct dp_netdev_execute_aux {
7417 struct dp_netdev_pmd_thread *pmd;
7418 const struct flow *flow;
7419 };
7420
7421 static void
7422 dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
7423 void *aux)
7424 {
7425 struct dp_netdev *dp = get_dp_netdev(dpif);
7426 dp->dp_purge_aux = aux;
7427 dp->dp_purge_cb = cb;
7428 }
7429
7430 static void
7431 dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
7432 void *aux)
7433 {
7434 struct dp_netdev *dp = get_dp_netdev(dpif);
7435 dp->upcall_aux = aux;
7436 dp->upcall_cb = cb;
7437 }
7438
7439 static void
7440 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
7441 bool purge)
7442 {
7443 struct tx_port *tx;
7444 struct dp_netdev_port *port;
7445 long long interval;
7446
7447 HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
7448 if (!tx->port->dynamic_txqs) {
7449 continue;
7450 }
7451 interval = pmd->ctx.now - tx->last_used;
7452 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
7453 port = tx->port;
7454 ovs_mutex_lock(&port->txq_used_mutex);
7455 port->txq_used[tx->qid]--;
7456 ovs_mutex_unlock(&port->txq_used_mutex);
7457 tx->qid = -1;
7458 }
7459 }
7460 }
7461
7462 static int
7463 dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
7464 struct tx_port *tx)
7465 {
7466 struct dp_netdev_port *port;
7467 long long interval;
7468 int i, min_cnt, min_qid;
7469
7470 interval = pmd->ctx.now - tx->last_used;
7471 tx->last_used = pmd->ctx.now;
7472
7473 if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
7474 return tx->qid;
7475 }
7476
7477 port = tx->port;
7478
7479 ovs_mutex_lock(&port->txq_used_mutex);
7480 if (tx->qid >= 0) {
7481 port->txq_used[tx->qid]--;
7482 tx->qid = -1;
7483 }
7484
7485 min_cnt = -1;
7486 min_qid = 0;
7487 for (i = 0; i < netdev_n_txq(port->netdev); i++) {
7488 if (port->txq_used[i] < min_cnt || min_cnt == -1) {
7489 min_cnt = port->txq_used[i];
7490 min_qid = i;
7491 }
7492 }
7493
7494 port->txq_used[min_qid]++;
7495 tx->qid = min_qid;
7496
7497 ovs_mutex_unlock(&port->txq_used_mutex);
7498
7499 dpif_netdev_xps_revalidate_pmd(pmd, false);
7500
7501 VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
7502 pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
7503 return min_qid;
7504 }
7505
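/* Worked example (illustrative): assume a port with 3 TX queues and
 * txq_used = {2, 0, 1}.  The scan above yields min_qid = 1, so this PMD is
 * assigned queue 1 and txq_used becomes {2, 1, 1}.  The assignment is cached
 * in tx->qid and only revisited after XPS_TIMEOUT of inactivity (or when
 * dpif_netdev_xps_revalidate_pmd() purges it), so the mutex is not taken on
 * every packet. */
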
7506 static struct tx_port *
7507 pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7508 odp_port_t port_no)
7509 {
7510 return tx_port_lookup(&pmd->tnl_port_cache, port_no);
7511 }
7512
7513 static struct tx_port *
7514 pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7515 odp_port_t port_no)
7516 {
7517 return tx_port_lookup(&pmd->send_port_cache, port_no);
7518 }
7519
7520 static int
7521 push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
7522 const struct nlattr *attr,
7523 struct dp_packet_batch *batch)
7524 {
7525 struct tx_port *tun_port;
7526 const struct ovs_action_push_tnl *data;
7527 int err;
7528
7529 data = nl_attr_get(attr);
7530
7531 tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
7532 if (!tun_port) {
7533 err = -EINVAL;
7534 goto error;
7535 }
7536 err = netdev_push_header(tun_port->port->netdev, batch, data);
7537 if (!err) {
7538 return 0;
7539 }
7540 error:
7541 dp_packet_delete_batch(batch, true);
7542 return err;
7543 }
7544
7545 static void
7546 dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
7547 struct dp_packet *packet, bool should_steal,
7548 struct flow *flow, ovs_u128 *ufid,
7549 struct ofpbuf *actions,
7550 const struct nlattr *userdata)
7551 {
7552 struct dp_packet_batch b;
7553 int error;
7554
7555 ofpbuf_clear(actions);
7556
7557 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
7558 DPIF_UC_ACTION, userdata, actions,
7559 NULL);
7560 if (!error || error == ENOSPC) {
7561 dp_packet_batch_init_packet(&b, packet);
7562 dp_netdev_execute_actions(pmd, &b, should_steal, flow,
7563 actions->data, actions->size);
7564 } else if (should_steal) {
7565 dp_packet_delete(packet);
7566 COVERAGE_INC(datapath_drop_userspace_action_error);
7567 }
7568 }
7569
7570 static bool
7571 dp_execute_output_action(struct dp_netdev_pmd_thread *pmd,
7572 struct dp_packet_batch *packets_,
7573 bool should_steal, odp_port_t port_no)
7574 {
7575 struct tx_port *p = pmd_send_port_cache_lookup(pmd, port_no);
7576 struct dp_packet_batch out;
7577
7578 if (!OVS_LIKELY(p)) {
7579 COVERAGE_ADD(datapath_drop_invalid_port,
7580 dp_packet_batch_size(packets_));
7581 dp_packet_delete_batch(packets_, should_steal);
7582 return false;
7583 }
7584 if (!should_steal) {
7585 dp_packet_batch_clone(&out, packets_);
7586 dp_packet_batch_reset_cutlen(packets_);
7587 packets_ = &out;
7588 }
7589 dp_packet_batch_apply_cutlen(packets_);
7590 #ifdef DPDK_NETDEV
7591 if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
7592 && packets_->packets[0]->source
7593 != p->output_pkts.packets[0]->source)) {
7594 /* XXX: netdev-dpdk assumes that all packets in a single
7595 * output batch have the same source. Flush here to
7596 * avoid memory access issues. */
7597 dp_netdev_pmd_flush_output_on_port(pmd, p);
7598 }
7599 #endif
7600 if (dp_packet_batch_size(&p->output_pkts)
7601 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
7602 /* Flush here to avoid overflow. */
7603 dp_netdev_pmd_flush_output_on_port(pmd, p);
7604 }
7605 if (dp_packet_batch_is_empty(&p->output_pkts)) {
7606 pmd->n_output_batches++;
7607 }
7608
7609 struct dp_packet *packet;
7610 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7611 p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
7612 pmd->ctx.last_rxq;
7613 dp_packet_batch_add(&p->output_pkts, packet);
7614 }
7615 return true;
7616 }
7617
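/* Worked example (illustrative, assuming NETDEV_MAX_BURST is 32): if
 * p->output_pkts already holds 30 packets and this call adds 8 more, the size
 * check above flushes the 30 queued packets first and only then queues the
 * new 8, so a flushed batch never exceeds NETDEV_MAX_BURST.  Packets are not
 * sent here; they sit in p->output_pkts until
 * dp_netdev_pmd_flush_output_on_port() runs. */
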
7618 static void
7619 dp_execute_lb_output_action(struct dp_netdev_pmd_thread *pmd,
7620 struct dp_packet_batch *packets_,
7621 bool should_steal, uint32_t bond)
7622 {
7623 struct tx_bond *p_bond = tx_bond_lookup(&pmd->tx_bonds, bond);
7624 struct dp_packet_batch out;
7625 struct dp_packet *packet;
7626
7627 if (!p_bond) {
7628 COVERAGE_ADD(datapath_drop_invalid_bond,
7629 dp_packet_batch_size(packets_));
7630 dp_packet_delete_batch(packets_, should_steal);
7631 return;
7632 }
7633 if (!should_steal) {
7634 dp_packet_batch_clone(&out, packets_);
7635 dp_packet_batch_reset_cutlen(packets_);
7636 packets_ = &out;
7637 }
7638 dp_packet_batch_apply_cutlen(packets_);
7639
7640 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7641 /*
7642 * Look up the bond-hash table using the packet's RSS hash to get the member.
7643 */
7644 uint32_t hash = dp_packet_get_rss_hash(packet);
7645 struct member_entry *s_entry
7646 = &p_bond->member_buckets[hash & BOND_MASK];
7647 odp_port_t bond_member = s_entry->member_id;
7648 uint32_t size = dp_packet_size(packet);
7649 struct dp_packet_batch output_pkt;
7650
7651 dp_packet_batch_init_packet(&output_pkt, packet);
7652 if (OVS_LIKELY(dp_execute_output_action(pmd, &output_pkt, true,
7653 bond_member))) {
7654 /* Update member stats. */
7655 non_atomic_ullong_add(&s_entry->n_packets, 1);
7656 non_atomic_ullong_add(&s_entry->n_bytes, size);
7657 }
7658 }
7659 }
7660
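/* Worked example (illustrative; assumes BOND_MASK == BOND_BUCKETS - 1): for a
 * packet with RSS hash 0x00f3a741 and 256 buckets, the bucket index is
 * 0x00f3a741 & 0xff == 0x41, so member_buckets[0x41].member_id names the
 * output port and that bucket's n_packets/n_bytes counters are credited.
 * The counters can be updated non-atomically because each PMD thread keeps
 * its own copy of the tx_bond entry. */
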
7661 static void
7662 dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
7663 const struct nlattr *a, bool should_steal)
7664 OVS_NO_THREAD_SAFETY_ANALYSIS
7665 {
7666 struct dp_netdev_execute_aux *aux = aux_;
7667 uint32_t *depth = recirc_depth_get();
7668 struct dp_netdev_pmd_thread *pmd = aux->pmd;
7669 struct dp_netdev *dp = pmd->dp;
7670 int type = nl_attr_type(a);
7671 struct tx_port *p;
7672 uint32_t packet_count, packets_dropped;
7673
7674 switch ((enum ovs_action_attr)type) {
7675 case OVS_ACTION_ATTR_OUTPUT:
7676 dp_execute_output_action(pmd, packets_, should_steal,
7677 nl_attr_get_odp_port(a));
7678 return;
7679
7680 case OVS_ACTION_ATTR_LB_OUTPUT:
7681 dp_execute_lb_output_action(pmd, packets_, should_steal,
7682 nl_attr_get_u32(a));
7683 return;
7684
7685 case OVS_ACTION_ATTR_TUNNEL_PUSH:
7686 if (should_steal) {
7687 /* We're requested to push a tunnel header, but we also need to take
7688 * ownership of these packets. Thus, we can skip performing the
7689 * action, because the caller will not use the result anyway.
7690 * Just break to free the batch. */
7691 break;
7692 }
7693 dp_packet_batch_apply_cutlen(packets_);
7694 packet_count = dp_packet_batch_size(packets_);
7695 if (push_tnl_action(pmd, a, packets_)) {
7696 COVERAGE_ADD(datapath_drop_tunnel_push_error,
7697 packet_count);
7698 }
7699 return;
7700
7701 case OVS_ACTION_ATTR_TUNNEL_POP:
7702 if (*depth < MAX_RECIRC_DEPTH) {
7703 struct dp_packet_batch *orig_packets_ = packets_;
7704 odp_port_t portno = nl_attr_get_odp_port(a);
7705
7706 p = pmd_tnl_port_cache_lookup(pmd, portno);
7707 if (p) {
7708 struct dp_packet_batch tnl_pkt;
7709
7710 if (!should_steal) {
7711 dp_packet_batch_clone(&tnl_pkt, packets_);
7712 packets_ = &tnl_pkt;
7713 dp_packet_batch_reset_cutlen(orig_packets_);
7714 }
7715
7716 dp_packet_batch_apply_cutlen(packets_);
7717
7718 packet_count = dp_packet_batch_size(packets_);
7719 netdev_pop_header(p->port->netdev, packets_);
7720 packets_dropped =
7721 packet_count - dp_packet_batch_size(packets_);
7722 if (packets_dropped) {
7723 COVERAGE_ADD(datapath_drop_tunnel_pop_error,
7724 packets_dropped);
7725 }
7726 if (dp_packet_batch_is_empty(packets_)) {
7727 return;
7728 }
7729
7730 struct dp_packet *packet;
7731 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7732 packet->md.in_port.odp_port = portno;
7733 }
7734
7735 (*depth)++;
7736 dp_netdev_recirculate(pmd, packets_);
7737 (*depth)--;
7738 return;
7739 }
7740 COVERAGE_ADD(datapath_drop_invalid_tnl_port,
7741 dp_packet_batch_size(packets_));
7742 } else {
7743 COVERAGE_ADD(datapath_drop_recirc_error,
7744 dp_packet_batch_size(packets_));
7745 }
7746 break;
7747
7748 case OVS_ACTION_ATTR_USERSPACE:
7749 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
7750 struct dp_packet_batch *orig_packets_ = packets_;
7751 const struct nlattr *userdata;
7752 struct dp_packet_batch usr_pkt;
7753 struct ofpbuf actions;
7754 struct flow flow;
7755 ovs_u128 ufid;
7756 bool clone = false;
7757
7758 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
7759 ofpbuf_init(&actions, 0);
7760
7761 if (packets_->trunc) {
7762 if (!should_steal) {
7763 dp_packet_batch_clone(&usr_pkt, packets_);
7764 packets_ = &usr_pkt;
7765 clone = true;
7766 dp_packet_batch_reset_cutlen(orig_packets_);
7767 }
7768
7769 dp_packet_batch_apply_cutlen(packets_);
7770 }
7771
7772 struct dp_packet *packet;
7773 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7774 flow_extract(packet, &flow);
7775 odp_flow_key_hash(&flow, sizeof flow, &ufid);
7776 dp_execute_userspace_action(pmd, packet, should_steal, &flow,
7777 &ufid, &actions, userdata);
7778 }
7779
7780 if (clone) {
7781 dp_packet_delete_batch(packets_, true);
7782 }
7783
7784 ofpbuf_uninit(&actions);
7785 fat_rwlock_unlock(&dp->upcall_rwlock);
7786
7787 return;
7788 }
7789 COVERAGE_ADD(datapath_drop_lock_error,
7790 dp_packet_batch_size(packets_));
7791 break;
7792
7793 case OVS_ACTION_ATTR_RECIRC:
7794 if (*depth < MAX_RECIRC_DEPTH) {
7795 struct dp_packet_batch recirc_pkts;
7796
7797 if (!should_steal) {
7798 dp_packet_batch_clone(&recirc_pkts, packets_);
7799 packets_ = &recirc_pkts;
7800 }
7801
7802 struct dp_packet *packet;
7803 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7804 packet->md.recirc_id = nl_attr_get_u32(a);
7805 }
7806
7807 (*depth)++;
7808 dp_netdev_recirculate(pmd, packets_);
7809 (*depth)--;
7810
7811 return;
7812 }
7813
7814 COVERAGE_ADD(datapath_drop_recirc_error,
7815 dp_packet_batch_size(packets_));
7816 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
7817 break;
7818
7819 case OVS_ACTION_ATTR_CT: {
7820 const struct nlattr *b;
7821 bool force = false;
7822 bool commit = false;
7823 unsigned int left;
7824 uint16_t zone = 0;
7825 uint32_t tp_id = 0;
7826 const char *helper = NULL;
7827 const uint32_t *setmark = NULL;
7828 const struct ovs_key_ct_labels *setlabel = NULL;
7829 struct nat_action_info_t nat_action_info;
7830 struct nat_action_info_t *nat_action_info_ref = NULL;
7831 bool nat_config = false;
7832
7833 NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
7834 nl_attr_get_size(a)) {
7835 enum ovs_ct_attr sub_type = nl_attr_type(b);
7836
7837 switch (sub_type) {
7838 case OVS_CT_ATTR_FORCE_COMMIT:
7839 force = true;
7840 /* fall through. */
7841 case OVS_CT_ATTR_COMMIT:
7842 commit = true;
7843 break;
7844 case OVS_CT_ATTR_ZONE:
7845 zone = nl_attr_get_u16(b);
7846 break;
7847 case OVS_CT_ATTR_HELPER:
7848 helper = nl_attr_get_string(b);
7849 break;
7850 case OVS_CT_ATTR_MARK:
7851 setmark = nl_attr_get(b);
7852 break;
7853 case OVS_CT_ATTR_LABELS:
7854 setlabel = nl_attr_get(b);
7855 break;
7856 case OVS_CT_ATTR_EVENTMASK:
7857 /* Silently ignored, as the userspace datapath does not generate
7858 * netlink events. */
7859 break;
7860 case OVS_CT_ATTR_TIMEOUT:
7861 if (!str_to_uint(nl_attr_get_string(b), 10, &tp_id)) {
7862 VLOG_WARN("Invalid Timeout Policy ID: %s.",
7863 nl_attr_get_string(b));
7864 tp_id = DEFAULT_TP_ID;
7865 }
7866 break;
7867 case OVS_CT_ATTR_NAT: {
7868 const struct nlattr *b_nest;
7869 unsigned int left_nest;
7870 bool ip_min_specified = false;
7871 bool proto_num_min_specified = false;
7872 bool ip_max_specified = false;
7873 bool proto_num_max_specified = false;
7874 memset(&nat_action_info, 0, sizeof nat_action_info);
7875 nat_action_info_ref = &nat_action_info;
7876
7877 NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
7878 enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
7879
7880 switch (sub_type_nest) {
7881 case OVS_NAT_ATTR_SRC:
7882 case OVS_NAT_ATTR_DST:
7883 nat_config = true;
7884 nat_action_info.nat_action |=
7885 ((sub_type_nest == OVS_NAT_ATTR_SRC)
7886 ? NAT_ACTION_SRC : NAT_ACTION_DST);
7887 break;
7888 case OVS_NAT_ATTR_IP_MIN:
7889 memcpy(&nat_action_info.min_addr,
7890 nl_attr_get(b_nest),
7891 nl_attr_get_size(b_nest));
7892 ip_min_specified = true;
7893 break;
7894 case OVS_NAT_ATTR_IP_MAX:
7895 memcpy(&nat_action_info.max_addr,
7896 nl_attr_get(b_nest),
7897 nl_attr_get_size(b_nest));
7898 ip_max_specified = true;
7899 break;
7900 case OVS_NAT_ATTR_PROTO_MIN:
7901 nat_action_info.min_port =
7902 nl_attr_get_u16(b_nest);
7903 proto_num_min_specified = true;
7904 break;
7905 case OVS_NAT_ATTR_PROTO_MAX:
7906 nat_action_info.max_port =
7907 nl_attr_get_u16(b_nest);
7908 proto_num_max_specified = true;
7909 break;
7910 case OVS_NAT_ATTR_PERSISTENT:
7911 case OVS_NAT_ATTR_PROTO_HASH:
7912 case OVS_NAT_ATTR_PROTO_RANDOM:
7913 break;
7914 case OVS_NAT_ATTR_UNSPEC:
7915 case __OVS_NAT_ATTR_MAX:
7916 OVS_NOT_REACHED();
7917 }
7918 }
7919
7920 if (ip_min_specified && !ip_max_specified) {
7921 nat_action_info.max_addr = nat_action_info.min_addr;
7922 }
7923 if (proto_num_min_specified && !proto_num_max_specified) {
7924 nat_action_info.max_port = nat_action_info.min_port;
7925 }
7926 if (proto_num_min_specified || proto_num_max_specified) {
7927 if (nat_action_info.nat_action & NAT_ACTION_SRC) {
7928 nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
7929 } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
7930 nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
7931 }
7932 }
7933 break;
7934 }
7935 case OVS_CT_ATTR_UNSPEC:
7936 case __OVS_CT_ATTR_MAX:
7937 OVS_NOT_REACHED();
7938 }
7939 }
7940
7941 /* We won't be able to function properly in this case, hence
7942 * complain loudly. */
7943 if (nat_config && !commit) {
7944 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
7945 VLOG_WARN_RL(&rl, "NAT specified without commit.");
7946 }
7947
7948 conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
7949 commit, zone, setmark, setlabel, aux->flow->tp_src,
7950 aux->flow->tp_dst, helper, nat_action_info_ref,
7951 pmd->ctx.now / 1000, tp_id);
7952 break;
7953 }
7954
7955 case OVS_ACTION_ATTR_METER:
7956 dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
7957 pmd->ctx.now);
7958 break;
7959
7960 case OVS_ACTION_ATTR_PUSH_VLAN:
7961 case OVS_ACTION_ATTR_POP_VLAN:
7962 case OVS_ACTION_ATTR_PUSH_MPLS:
7963 case OVS_ACTION_ATTR_POP_MPLS:
7964 case OVS_ACTION_ATTR_SET:
7965 case OVS_ACTION_ATTR_SET_MASKED:
7966 case OVS_ACTION_ATTR_SAMPLE:
7967 case OVS_ACTION_ATTR_HASH:
7968 case OVS_ACTION_ATTR_UNSPEC:
7969 case OVS_ACTION_ATTR_TRUNC:
7970 case OVS_ACTION_ATTR_PUSH_ETH:
7971 case OVS_ACTION_ATTR_POP_ETH:
7972 case OVS_ACTION_ATTR_CLONE:
7973 case OVS_ACTION_ATTR_PUSH_NSH:
7974 case OVS_ACTION_ATTR_POP_NSH:
7975 case OVS_ACTION_ATTR_CT_CLEAR:
7976 case OVS_ACTION_ATTR_CHECK_PKT_LEN:
7977 case OVS_ACTION_ATTR_DROP:
7978 case __OVS_ACTION_ATTR_MAX:
7979 OVS_NOT_REACHED();
7980 }
7981
7982 dp_packet_delete_batch(packets_, should_steal);
7983 }
7984
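/* Illustrative note on the 'should_steal' convention used throughout
 * dp_execute_cb() (a summary, not normative): when should_steal is false the
 * caller still owns 'packets_', so any action that consumes or modifies the
 * batch first clones it, e.g.:
 *
 *     struct dp_packet_batch copy;
 *     dp_packet_batch_clone(&copy, packets_);
 *     packets_ = &copy;                  // work on the clone, caller keeps
 *                                        // the originals
 *
 * When should_steal is true the action owns the packets and must either
 * forward or free them, which is why the fall-through at the end of the
 * switch calls dp_packet_delete_batch(packets_, should_steal). */
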
7985 static void
7986 dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
7987 struct dp_packet_batch *packets,
7988 bool should_steal, const struct flow *flow,
7989 const struct nlattr *actions, size_t actions_len)
7990 {
7991 struct dp_netdev_execute_aux aux = { pmd, flow };
7992
7993 odp_execute_actions(&aux, packets, should_steal, actions,
7994 actions_len, dp_execute_cb);
7995 }
7996
7997 struct dp_netdev_ct_dump {
7998 struct ct_dpif_dump_state up;
7999 struct conntrack_dump dump;
8000 struct conntrack *ct;
8001 struct dp_netdev *dp;
8002 };
8003
8004 static int
8005 dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
8006 const uint16_t *pzone, int *ptot_bkts)
8007 {
8008 struct dp_netdev *dp = get_dp_netdev(dpif);
8009 struct dp_netdev_ct_dump *dump;
8010
8011 dump = xzalloc(sizeof *dump);
8012 dump->dp = dp;
8013 dump->ct = dp->conntrack;
8014
8015 conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
8016
8017 *dump_ = &dump->up;
8018
8019 return 0;
8020 }
8021
8022 static int
8023 dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
8024 struct ct_dpif_dump_state *dump_,
8025 struct ct_dpif_entry *entry)
8026 {
8027 struct dp_netdev_ct_dump *dump;
8028
8029 INIT_CONTAINER(dump, dump_, up);
8030
8031 return conntrack_dump_next(&dump->dump, entry);
8032 }
8033
8034 static int
8035 dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
8036 struct ct_dpif_dump_state *dump_)
8037 {
8038 struct dp_netdev_ct_dump *dump;
8039 int err;
8040
8041 INIT_CONTAINER(dump, dump_, up);
8042
8043 err = conntrack_dump_done(&dump->dump);
8044
8045 free(dump);
8046
8047 return err;
8048 }
8049
8050 static int
8051 dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
8052 const struct ct_dpif_tuple *tuple)
8053 {
8054 struct dp_netdev *dp = get_dp_netdev(dpif);
8055
8056 if (tuple) {
8057 return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
8058 }
8059 return conntrack_flush(dp->conntrack, zone);
8060 }
8061
8062 static int
8063 dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
8064 {
8065 struct dp_netdev *dp = get_dp_netdev(dpif);
8066
8067 return conntrack_set_maxconns(dp->conntrack, maxconns);
8068 }
8069
8070 static int
8071 dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
8072 {
8073 struct dp_netdev *dp = get_dp_netdev(dpif);
8074
8075 return conntrack_get_maxconns(dp->conntrack, maxconns);
8076 }
8077
8078 static int
8079 dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
8080 {
8081 struct dp_netdev *dp = get_dp_netdev(dpif);
8082
8083 return conntrack_get_nconns(dp->conntrack, nconns);
8084 }
8085
8086 static int
8087 dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled)
8088 {
8089 struct dp_netdev *dp = get_dp_netdev(dpif);
8090
8091 return conntrack_set_tcp_seq_chk(dp->conntrack, enabled);
8092 }
8093
8094 static int
8095 dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled)
8096 {
8097 struct dp_netdev *dp = get_dp_netdev(dpif);
8098 *enabled = conntrack_get_tcp_seq_chk(dp->conntrack);
8099 return 0;
8100 }
8101
8102 static int
8103 dpif_netdev_ct_set_limits(struct dpif *dpif OVS_UNUSED,
8104 const uint32_t *default_limits,
8105 const struct ovs_list *zone_limits)
8106 {
8107 int err = 0;
8108 struct dp_netdev *dp = get_dp_netdev(dpif);
8109 if (default_limits) {
8110 err = zone_limit_update(dp->conntrack, DEFAULT_ZONE, *default_limits);
8111 if (err != 0) {
8112 return err;
8113 }
8114 }
8115
8116 struct ct_dpif_zone_limit *zone_limit;
8117 LIST_FOR_EACH (zone_limit, node, zone_limits) {
8118 err = zone_limit_update(dp->conntrack, zone_limit->zone,
8119 zone_limit->limit);
8120 if (err != 0) {
8121 break;
8122 }
8123 }
8124 return err;
8125 }
8126
8127 static int
8128 dpif_netdev_ct_get_limits(struct dpif *dpif OVS_UNUSED,
8129 uint32_t *default_limit,
8130 const struct ovs_list *zone_limits_request,
8131 struct ovs_list *zone_limits_reply)
8132 {
8133 struct dp_netdev *dp = get_dp_netdev(dpif);
8134 struct conntrack_zone_limit czl;
8135
8136 czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE);
8137 if (czl.zone == DEFAULT_ZONE) {
8138 *default_limit = czl.limit;
8139 } else {
8140 return EINVAL;
8141 }
8142
8143 if (!ovs_list_is_empty(zone_limits_request)) {
8144 struct ct_dpif_zone_limit *zone_limit;
8145 LIST_FOR_EACH (zone_limit, node, zone_limits_request) {
8146 czl = zone_limit_get(dp->conntrack, zone_limit->zone);
8147 if (czl.zone == zone_limit->zone || czl.zone == DEFAULT_ZONE) {
8148 ct_dpif_push_zone_limit(zone_limits_reply, zone_limit->zone,
8149 czl.limit, czl.count);
8150 } else {
8151 return EINVAL;
8152 }
8153 }
8154 } else {
8155 for (int z = MIN_ZONE; z <= MAX_ZONE; z++) {
8156 czl = zone_limit_get(dp->conntrack, z);
8157 if (czl.zone == z) {
8158 ct_dpif_push_zone_limit(zone_limits_reply, z, czl.limit,
8159 czl.count);
8160 }
8161 }
8162 }
8163
8164 return 0;
8165 }
8166
8167 static int
8168 dpif_netdev_ct_del_limits(struct dpif *dpif OVS_UNUSED,
8169 const struct ovs_list *zone_limits)
8170 {
8171 int err = 0;
8172 struct dp_netdev *dp = get_dp_netdev(dpif);
8173 struct ct_dpif_zone_limit *zone_limit;
8174 LIST_FOR_EACH (zone_limit, node, zone_limits) {
8175 err = zone_limit_delete(dp->conntrack, zone_limit->zone);
8176 if (err != 0) {
8177 break;
8178 }
8179 }
8180
8181 return err;
8182 }
8183
8184 static int
8185 dpif_netdev_ct_set_timeout_policy(struct dpif *dpif,
8186 const struct ct_dpif_timeout_policy *dpif_tp)
8187 {
8188 struct timeout_policy tp;
8189 struct dp_netdev *dp;
8190
8191 dp = get_dp_netdev(dpif);
8192 memcpy(&tp.policy, dpif_tp, sizeof tp.policy);
8193 return timeout_policy_update(dp->conntrack, &tp);
8194 }
8195
8196 static int
8197 dpif_netdev_ct_get_timeout_policy(struct dpif *dpif, uint32_t tp_id,
8198 struct ct_dpif_timeout_policy *dpif_tp)
8199 {
8200 struct timeout_policy *tp;
8201 struct dp_netdev *dp;
8202 int err = 0;
8203
8204 dp = get_dp_netdev(dpif);
8205 tp = timeout_policy_get(dp->conntrack, tp_id);
8206 if (!tp) {
8207 return ENOENT;
8208 }
8209 memcpy(dpif_tp, &tp->policy, sizeof tp->policy);
8210 return err;
8211 }
8212
8213 static int
8214 dpif_netdev_ct_del_timeout_policy(struct dpif *dpif,
8215 uint32_t tp_id)
8216 {
8217 struct dp_netdev *dp;
8218 int err = 0;
8219
8220 dp = get_dp_netdev(dpif);
8221 err = timeout_policy_delete(dp->conntrack, tp_id);
8222 return err;
8223 }
8224
8225 static int
8226 dpif_netdev_ct_get_timeout_policy_name(struct dpif *dpif OVS_UNUSED,
8227 uint32_t tp_id,
8228 uint16_t dl_type OVS_UNUSED,
8229 uint8_t nw_proto OVS_UNUSED,
8230 char **tp_name, bool *is_generic)
8231 {
8232 struct ds ds = DS_EMPTY_INITIALIZER;
8233
8234 ds_put_format(&ds, "%"PRIu32, tp_id);
8235 *tp_name = ds_steal_cstr(&ds);
8236 *is_generic = true;
8237 return 0;
8238 }
8239
8240 static int
8241 dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
8242 {
8243 struct dp_netdev *dp = get_dp_netdev(dpif);
8244 return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
8245 }
8246
8247 static int
8248 dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
8249 {
8250 struct dp_netdev *dp = get_dp_netdev(dpif);
8251 return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
8252 }
8253
8254 static int
8255 dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
8256 {
8257 struct dp_netdev *dp = get_dp_netdev(dpif);
8258 return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
8259 }
8260
8261 /* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
8262 * diverge. */
8263 static int
8264 dpif_netdev_ipf_get_status(struct dpif *dpif,
8265 struct dpif_ipf_status *dpif_ipf_status)
8266 {
8267 struct dp_netdev *dp = get_dp_netdev(dpif);
8268 ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
8269 (struct ipf_status *) dpif_ipf_status);
8270 return 0;
8271 }
8272
8273 static int
8274 dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
8275 struct ipf_dump_ctx **ipf_dump_ctx)
8276 {
8277 return ipf_dump_start(ipf_dump_ctx);
8278 }
8279
8280 static int
8281 dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
8282 {
8283 struct dp_netdev *dp = get_dp_netdev(dpif);
8284 return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
8285 dump);
8286 }
8287
8288 static int
8289 dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
8290 {
8291 return ipf_dump_done(ipf_dump_ctx);
8292
8293 }
8294
8295 static int
8296 dpif_netdev_bond_add(struct dpif *dpif, uint32_t bond_id,
8297 odp_port_t *member_map)
8298 {
8299 struct tx_bond *new_tx = xzalloc(sizeof *new_tx);
8300 struct dp_netdev *dp = get_dp_netdev(dpif);
8301 struct dp_netdev_pmd_thread *pmd;
8302
8303 /* Prepare new bond mapping. */
8304 new_tx->bond_id = bond_id;
8305 for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
8306 new_tx->member_buckets[bucket].member_id = member_map[bucket];
8307 }
8308
8309 ovs_mutex_lock(&dp->bond_mutex);
8310 /* Check if the bond already exists. */
8311 struct tx_bond *old_tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
8312 if (old_tx) {
8313 cmap_replace(&dp->tx_bonds, &old_tx->node, &new_tx->node,
8314 hash_bond_id(bond_id));
8315 ovsrcu_postpone(free, old_tx);
8316 } else {
8317 cmap_insert(&dp->tx_bonds, &new_tx->node, hash_bond_id(bond_id));
8318 }
8319 ovs_mutex_unlock(&dp->bond_mutex);
8320
8321 /* Update all PMDs with new bond mapping. */
8322 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8323 dp_netdev_add_bond_tx_to_pmd(pmd, new_tx, true);
8324 }
8325 return 0;
8326 }
8327
8328 static int
8329 dpif_netdev_bond_del(struct dpif *dpif, uint32_t bond_id)
8330 {
8331 struct dp_netdev *dp = get_dp_netdev(dpif);
8332 struct dp_netdev_pmd_thread *pmd;
8333 struct tx_bond *tx;
8334
8335 ovs_mutex_lock(&dp->bond_mutex);
8336 /* Check if the bond exists. */
8337 tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
8338 if (tx) {
8339 cmap_remove(&dp->tx_bonds, &tx->node, hash_bond_id(bond_id));
8340 ovsrcu_postpone(free, tx);
8341 } else {
8342 /* Bond is not present. */
8343 ovs_mutex_unlock(&dp->bond_mutex);
8344 return ENOENT;
8345 }
8346 ovs_mutex_unlock(&dp->bond_mutex);
8347
8348 /* Remove the bond mapping from all PMDs. */
8349 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8350 dp_netdev_del_bond_tx_from_pmd(pmd, bond_id);
8351 }
8352 return 0;
8353 }
8354
8355 static int
8356 dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id,
8357 uint64_t *n_bytes)
8358 {
8359 struct dp_netdev *dp = get_dp_netdev(dpif);
8360 struct dp_netdev_pmd_thread *pmd;
8361
8362 if (!tx_bond_lookup(&dp->tx_bonds, bond_id)) {
8363 return ENOENT;
8364 }
8365
8366 /* Search for the bond in all PMDs. */
8367 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8368 struct tx_bond *pmd_bond_entry
8369 = tx_bond_lookup(&pmd->tx_bonds, bond_id);
8370
8371 if (!pmd_bond_entry) {
8372 continue;
8373 }
8374
8375 /* Read bond stats. */
8376 for (int i = 0; i < BOND_BUCKETS; i++) {
8377 uint64_t pmd_n_bytes;
8378
8379 atomic_read_relaxed(&pmd_bond_entry->member_buckets[i].n_bytes,
8380 &pmd_n_bytes);
8381 n_bytes[i] += pmd_n_bytes;
8382 }
8383 }
8384 return 0;
8385 }
8386
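/* Usage sketch (illustrative): dpif_netdev_bond_stats_get() only accumulates
 * into 'n_bytes', so the caller is expected to pass a zeroed array with one
 * slot per bucket, along the lines of:
 *
 *     uint64_t n_bytes[BOND_BUCKETS];
 *
 *     memset(n_bytes, 0, sizeof n_bytes);
 *     if (!dpif_netdev_bond_stats_get(dpif, bond_id, n_bytes)) {
 *         // n_bytes[i] now holds the byte count of bucket i summed over
 *         // all PMD threads.
 *     }
 */
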
8387 const struct dpif_class dpif_netdev_class = {
8388 "netdev",
8389 true, /* cleanup_required */
8390 dpif_netdev_init,
8391 dpif_netdev_enumerate,
8392 dpif_netdev_port_open_type,
8393 dpif_netdev_open,
8394 dpif_netdev_close,
8395 dpif_netdev_destroy,
8396 dpif_netdev_run,
8397 dpif_netdev_wait,
8398 dpif_netdev_get_stats,
8399 NULL, /* set_features */
8400 dpif_netdev_port_add,
8401 dpif_netdev_port_del,
8402 dpif_netdev_port_set_config,
8403 dpif_netdev_port_query_by_number,
8404 dpif_netdev_port_query_by_name,
8405 NULL, /* port_get_pid */
8406 dpif_netdev_port_dump_start,
8407 dpif_netdev_port_dump_next,
8408 dpif_netdev_port_dump_done,
8409 dpif_netdev_port_poll,
8410 dpif_netdev_port_poll_wait,
8411 dpif_netdev_flow_flush,
8412 dpif_netdev_flow_dump_create,
8413 dpif_netdev_flow_dump_destroy,
8414 dpif_netdev_flow_dump_thread_create,
8415 dpif_netdev_flow_dump_thread_destroy,
8416 dpif_netdev_flow_dump_next,
8417 dpif_netdev_operate,
8418 NULL, /* recv_set */
8419 NULL, /* handlers_set */
8420 dpif_netdev_set_config,
8421 dpif_netdev_queue_to_priority,
8422 NULL, /* recv */
8423 NULL, /* recv_wait */
8424 NULL, /* recv_purge */
8425 dpif_netdev_register_dp_purge_cb,
8426 dpif_netdev_register_upcall_cb,
8427 dpif_netdev_enable_upcall,
8428 dpif_netdev_disable_upcall,
8429 dpif_netdev_get_datapath_version,
8430 dpif_netdev_ct_dump_start,
8431 dpif_netdev_ct_dump_next,
8432 dpif_netdev_ct_dump_done,
8433 dpif_netdev_ct_flush,
8434 dpif_netdev_ct_set_maxconns,
8435 dpif_netdev_ct_get_maxconns,
8436 dpif_netdev_ct_get_nconns,
8437 dpif_netdev_ct_set_tcp_seq_chk,
8438 dpif_netdev_ct_get_tcp_seq_chk,
8439 dpif_netdev_ct_set_limits,
8440 dpif_netdev_ct_get_limits,
8441 dpif_netdev_ct_del_limits,
8442 dpif_netdev_ct_set_timeout_policy,
8443 dpif_netdev_ct_get_timeout_policy,
8444 dpif_netdev_ct_del_timeout_policy,
8445 NULL, /* ct_timeout_policy_dump_start */
8446 NULL, /* ct_timeout_policy_dump_next */
8447 NULL, /* ct_timeout_policy_dump_done */
8448 dpif_netdev_ct_get_timeout_policy_name,
8449 dpif_netdev_ipf_set_enabled,
8450 dpif_netdev_ipf_set_min_frag,
8451 dpif_netdev_ipf_set_max_nfrags,
8452 dpif_netdev_ipf_get_status,
8453 dpif_netdev_ipf_dump_start,
8454 dpif_netdev_ipf_dump_next,
8455 dpif_netdev_ipf_dump_done,
8456 dpif_netdev_meter_get_features,
8457 dpif_netdev_meter_set,
8458 dpif_netdev_meter_get,
8459 dpif_netdev_meter_del,
8460 dpif_netdev_bond_add,
8461 dpif_netdev_bond_del,
8462 dpif_netdev_bond_stats_get,
8463 };
8464
8465 static void
8466 dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
8467 const char *argv[], void *aux OVS_UNUSED)
8468 {
8469 struct dp_netdev_port *port;
8470 struct dp_netdev *dp;
8471 odp_port_t port_no;
8472
8473 ovs_mutex_lock(&dp_netdev_mutex);
8474 dp = shash_find_data(&dp_netdevs, argv[1]);
8475 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
8476 ovs_mutex_unlock(&dp_netdev_mutex);
8477 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
8478 return;
8479 }
8480 ovs_refcount_ref(&dp->ref_cnt);
8481 ovs_mutex_unlock(&dp_netdev_mutex);
8482
8483 ovs_mutex_lock(&dp->port_mutex);
8484 if (get_port_by_name(dp, argv[2], &port)) {
8485 unixctl_command_reply_error(conn, "unknown port");
8486 goto exit;
8487 }
8488
8489 port_no = u32_to_odp(atoi(argv[3]));
8490 if (!port_no || port_no == ODPP_NONE) {
8491 unixctl_command_reply_error(conn, "bad port number");
8492 goto exit;
8493 }
8494 if (dp_netdev_lookup_port(dp, port_no)) {
8495 unixctl_command_reply_error(conn, "port number already in use");
8496 goto exit;
8497 }
8498
8499 /* Remove port. */
8500 hmap_remove(&dp->ports, &port->node);
8501 reconfigure_datapath(dp);
8502
8503 /* Reinsert with new port number. */
8504 port->port_no = port_no;
8505 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
8506 reconfigure_datapath(dp);
8507
8508 seq_change(dp->port_seq);
8509 unixctl_command_reply(conn, NULL);
8510
8511 exit:
8512 ovs_mutex_unlock(&dp->port_mutex);
8513 dp_netdev_unref(dp);
8514 }
8515
8516 static void
8517 dpif_dummy_register__(const char *type)
8518 {
8519 struct dpif_class *class;
8520
8521 class = xmalloc(sizeof *class);
8522 *class = dpif_netdev_class;
8523 class->type = xstrdup(type);
8524 dp_register_provider(class);
8525 }
8526
8527 static void
8528 dpif_dummy_override(const char *type)
8529 {
8530 int error;
8531
8532 /*
8533 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
8534 * a userland-only build. It's useful for the testsuite.
8535 */
8536 error = dp_unregister_provider(type);
8537 if (error == 0 || error == EAFNOSUPPORT) {
8538 dpif_dummy_register__(type);
8539 }
8540 }
8541
8542 void
8543 dpif_dummy_register(enum dummy_level level)
8544 {
8545 if (level == DUMMY_OVERRIDE_ALL) {
8546 struct sset types;
8547 const char *type;
8548
8549 sset_init(&types);
8550 dp_enumerate_types(&types);
8551 SSET_FOR_EACH (type, &types) {
8552 dpif_dummy_override(type);
8553 }
8554 sset_destroy(&types);
8555 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
8556 dpif_dummy_override("system");
8557 }
8558
8559 dpif_dummy_register__("dummy");
8560
8561 unixctl_command_register("dpif-dummy/change-port-number",
8562 "dp port new-number",
8563 3, 3, dpif_dummy_change_port_number, NULL);
8564 }
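
/* Usage sketch (illustrative): with dummy datapaths enabled, the command
 * registered above can be driven from the shell, e.g.
 *
 *     ovs-appctl dpif-dummy/change-port-number dp0 p1 5
 *
 * which renumbers port "p1" of datapath "dp0" to odp port 5.  The datapath
 * and port names here are made up for the example. */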
8565 \f
8566 /* Datapath Classifier. */
8567
8568 static void
8569 dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
8570 {
8571 cmap_destroy(&subtable->rules);
8572 ovsrcu_postpone(free, subtable->mf_masks);
8573 ovsrcu_postpone(free, subtable);
8574 }
8575
8576 /* Initializes 'cls' as a classifier that initially contains no classification
8577 * rules. */
8578 static void
8579 dpcls_init(struct dpcls *cls)
8580 {
8581 cmap_init(&cls->subtables_map);
8582 pvector_init(&cls->subtables);
8583 }
8584
8585 static void
8586 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
8587 {
8588 VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
8589 pvector_remove(&cls->subtables, subtable);
8590 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
8591 subtable->mask.hash);
8592 ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable);
8593 }
8594
8595 /* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
8596 * caller's responsibility.
8597 * May only be called after all the readers have been terminated. */
8598 static void
8599 dpcls_destroy(struct dpcls *cls)
8600 {
8601 if (cls) {
8602 struct dpcls_subtable *subtable;
8603
8604 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
8605 ovs_assert(cmap_count(&subtable->rules) == 0);
8606 dpcls_destroy_subtable(cls, subtable);
8607 }
8608 cmap_destroy(&cls->subtables_map);
8609 pvector_destroy(&cls->subtables);
8610 }
8611 }
8612
8613 static struct dpcls_subtable *
8614 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
8615 {
8616 struct dpcls_subtable *subtable;
8617
8618 /* Need to add one. */
8619 subtable = xmalloc(sizeof *subtable
8620 - sizeof subtable->mask.mf + mask->len);
8621 cmap_init(&subtable->rules);
8622 subtable->hit_cnt = 0;
8623 netdev_flow_key_clone(&subtable->mask, mask);
8624
8625 /* The count of bits set in the mask defines the space required for the masks.
8626 * Then call netdev_flow_key_gen_masks() to create the appropriate masks up
8627 * front, avoiding the cost of computing them at lookup time. */
8628 uint32_t unit0 = count_1bits(mask->mf.map.bits[0]);
8629 uint32_t unit1 = count_1bits(mask->mf.map.bits[1]);
8630 subtable->mf_bits_set_unit0 = unit0;
8631 subtable->mf_bits_set_unit1 = unit1;
8632 subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
8633 netdev_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
8634
8635 /* Get the preferred subtable search function for this (u0,u1) subtable.
8636 * The function is guaranteed to return a valid implementation, possibly an
8637 * ISA-optimized and/or specialized one.
8638 */
8639 subtable->lookup_func = dpcls_subtable_get_best_impl(unit0, unit1);
8640
8641 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
8642 /* Add the new subtable at the end of the pvector (with no hits yet). */
8643 pvector_insert(&cls->subtables, subtable, 0);
8644 VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
8645 cmap_count(&cls->subtables_map), subtable, cls->in_port);
8646 pvector_publish(&cls->subtables);
8647
8648 return subtable;
8649 }
8650
8651 static inline struct dpcls_subtable *
8652 dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
8653 {
8654 struct dpcls_subtable *subtable;
8655
8656 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
8657 &cls->subtables_map) {
8658 if (netdev_flow_key_equal(&subtable->mask, mask)) {
8659 return subtable;
8660 }
8661 }
8662 return dpcls_create_subtable(cls, mask);
8663 }
8664
8665 /* Checks for the best available implementation for each subtable lookup
8666 * function, and assigns it as the lookup function pointer for each subtable.
8667 * Returns the number of subtables whose lookup implementation has changed.
8668 */
8669 static uint32_t
8670 dpcls_subtable_lookup_reprobe(struct dpcls *cls)
8671 {
8672 struct pvector *pvec = &cls->subtables;
8673 uint32_t subtables_changed = 0;
8674 struct dpcls_subtable *subtable = NULL;
8675
8676 PVECTOR_FOR_EACH (subtable, pvec) {
8677 uint32_t u0_bits = subtable->mf_bits_set_unit0;
8678 uint32_t u1_bits = subtable->mf_bits_set_unit1;
8679 void *old_func = subtable->lookup_func;
8680 subtable->lookup_func = dpcls_subtable_get_best_impl(u0_bits, u1_bits);
8681 subtables_changed += (old_func != subtable->lookup_func);
8682 }
8683 pvector_publish(pvec);
8684
8685 return subtables_changed;
8686 }
8687
8688 /* Periodically sort the dpcls subtable vectors according to hit counts. */
8689 static void
8690 dpcls_sort_subtable_vector(struct dpcls *cls)
8691 {
8692 struct pvector *pvec = &cls->subtables;
8693 struct dpcls_subtable *subtable;
8694
8695 PVECTOR_FOR_EACH (subtable, pvec) {
8696 pvector_change_priority(pvec, subtable, subtable->hit_cnt);
8697 subtable->hit_cnt = 0;
8698 }
8699 pvector_publish(pvec);
8700 }
8701
8702 static inline void
8703 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
8704 struct polled_queue *poll_list, int poll_cnt)
8705 {
8706 struct dpcls *cls;
8707 uint64_t tot_idle = 0, tot_proc = 0;
8708 unsigned int pmd_load = 0;
8709
8710 if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
8711 uint64_t curr_tsc;
8712 struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
8713 if (pmd_alb->is_enabled && !pmd->isolated
8714 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
8715 pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
8716 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
8717 pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
8718 {
8719 tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
8720 pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
8721 tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
8722 pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
8723
8724 if (tot_proc) {
8725 pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
8726 }
8727
8728 if (pmd_load >= ALB_PMD_LOAD_THRESHOLD) {
8729 atomic_count_inc(&pmd->pmd_overloaded);
8730 } else {
8731 atomic_count_set(&pmd->pmd_overloaded, 0);
8732 }
8733 }
8734
8735 pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
8736 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
8737 pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
8738 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
8739
8740 /* Get the cycles that were used to process each queue and store them. */
8741 for (unsigned i = 0; i < poll_cnt; i++) {
8742 uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
8743 RXQ_CYCLES_PROC_CURR);
8744 dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
8745 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
8746 0);
8747 }
8748 curr_tsc = cycles_counter_update(&pmd->perf_stats);
8749 if (pmd->intrvl_tsc_prev) {
8750 /* There is a prev timestamp, store a new intrvl cycle count. */
8751 atomic_store_relaxed(&pmd->intrvl_cycles,
8752 curr_tsc - pmd->intrvl_tsc_prev);
8753 }
8754 pmd->intrvl_tsc_prev = curr_tsc;
8755 /* Start a new measurement interval. */
8756 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
8757 }
8758
8759 if (pmd->ctx.now > pmd->next_optimization) {
8760 /* Try to obtain the flow lock to block out revalidator threads.
8761 * If not possible, just try next time. */
8762 if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
8763 /* Optimize each classifier */
8764 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
8765 dpcls_sort_subtable_vector(cls);
8766 }
8767 ovs_mutex_unlock(&pmd->flow_mutex);
8768 /* Start a new measurement interval. */
8769 pmd->next_optimization = pmd->ctx.now
8770 + DPCLS_OPTIMIZATION_INTERVAL;
8771 }
8772 }
8773 }
8774
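/* Worked example (illustrative): if, since the last interval, tot_idle = 400
 * and tot_proc = 9600 cycles, then
 *
 *     pmd_load = (9600 * 100) / (400 + 9600) = 96
 *
 * which is >= ALB_PMD_LOAD_THRESHOLD, so pmd_overloaded is incremented; the
 * auto load balancer later uses this count to decide whether an rxq rebalance
 * is worthwhile.  A single interval below the threshold resets the count to
 * zero. */
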
8775 /* Insert 'rule' into 'cls'. */
8776 static void
8777 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
8778 const struct netdev_flow_key *mask)
8779 {
8780 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
8781
8782 /* Refer to the subtable's mask; it is also needed for later removal. */
8783 rule->mask = &subtable->mask;
8784 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
8785 }
8786
8787 /* Removes 'rule' from 'cls', also destructing the 'rule'. */
8788 static void
8789 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
8790 {
8791 struct dpcls_subtable *subtable;
8792
8793 ovs_assert(rule->mask);
8794
8795 /* Get subtable from reference in rule->mask. */
8796 INIT_CONTAINER(subtable, rule->mask, mask);
8797 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
8798 == 0) {
8799 /* Delete empty subtable. */
8800 dpcls_destroy_subtable(cls, subtable);
8801 pvector_publish(&cls->subtables);
8802 }
8803 }
8804
8805 /* Inner loop for mask generation of a unit, see netdev_flow_key_gen_masks. */
8806 static inline void
8807 netdev_flow_key_gen_mask_unit(uint64_t iter,
8808 const uint64_t count,
8809 uint64_t *mf_masks)
8810 {
8811 int i;
8812 for (i = 0; i < count; i++) {
8813 uint64_t lowest_bit = (iter & -iter);
8814 iter &= ~lowest_bit;
8815 mf_masks[i] = (lowest_bit - 1);
8816 }
8817 /* Checks that count has covered all bits in the iter bitmap. */
8818 ovs_assert(iter == 0);
8819 }
8820
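/* Worked example (illustrative) for netdev_flow_key_gen_mask_unit(): with
 * iter = 0b10100 (bits 2 and 4 set) and count = 2, the loop peels off the
 * lowest set bit each time:
 *
 *     iteration 0: lowest_bit = 1 << 2, mf_masks[0] = 0b00011
 *     iteration 1: lowest_bit = 1 << 4, mf_masks[1] = 0b01111
 *
 * i.e. each entry is an "all lower blocks" mask for one set bit, in ascending
 * bit order, which is what the subtable lookup implementations consume. */
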
8821 /* Generate a mask for each block in the miniflow, based on the bits set. This
8822 * allows packets to be masked with the pre-generated array, without doing the
8823 * calculations at lookup time.
8824 * @param tbl The flow key to generate the mf_masks for
8825 * @param mf_masks Pointer to a u64 array of at least mf_bits_u0 + mf_bits_u1 elements
8826 * @param mf_bits_u0 Number of bits set in unit 0 of the miniflow
8827 * @param mf_bits_u1 Number of bits set in unit 1 of the miniflow
8828 */
8829 void
8830 netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
8831 uint64_t *mf_masks,
8832 const uint32_t mf_bits_u0,
8833 const uint32_t mf_bits_u1)
8834 {
8835 uint64_t iter_u0 = tbl->mf.map.bits[0];
8836 uint64_t iter_u1 = tbl->mf.map.bits[1];
8837
8838 netdev_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
8839 netdev_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
8840 }
8841
8842 /* Returns true if 'target' satisfies 'rule', that is, if for each 1-bit in the
8843 * rule's mask the corresponding values in 'rule' and 'target' are the same. */
8844 bool
8845 dpcls_rule_matches_key(const struct dpcls_rule *rule,
8846 const struct netdev_flow_key *target)
8847 {
8848 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
8849 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
8850 uint64_t value;
8851
8852 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
8853 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
8854 return false;
8855 }
8856 }
8857 return true;
8858 }
8859
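/* Worked example (illustrative): dpcls rules store their values already ANDed
 * with the subtable mask, so the comparison above is a single mask-and-compare
 * per 64-bit block.  For one block:
 *
 *     mask  = 0x00000000ffffffff
 *     rule  = 0x000000000a000001          // pre-masked value in rule->flow
 *     value = 0xdeadbeef0a000001          // block from 'target'
 *
 *     (value & mask) == rule              // -> match for this block
 *
 * The first block that fails the test rejects the rule. */
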
8860 /* For each miniflow in 'keys', performs a classifier lookup, writing the result
8861 * into the corresponding slot in 'rules'. If a particular entry in 'keys' is
8862 * NULL it is skipped.
8863 *
8864 * This function is optimized for use in the userspace datapath and therefore
8865 * does not implement a lot of features available in the standard
8866 * classifier_lookup() function. Specifically, it does not implement
8867 * priorities, instead returning any rule which matches the flow.
8868 *
8869 * Returns true if all miniflows found a corresponding rule. */
8870 static bool
8871 dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
8872 struct dpcls_rule **rules, const size_t cnt,
8873 int *num_lookups_p)
8874 {
8875 /* The received 'cnt' miniflows are the search-keys that will be processed
8876 * to find a matching entry in the available subtables.
8877 * The 'keys_map' bitmap must have at least NETDEV_MAX_BURST bits. */
8878 #define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
8879 BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
8880
8881 struct dpcls_subtable *subtable;
8882 uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
8883
8884 if (cnt != MAP_BITS) {
8885 keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
8886 }
8887 memset(rules, 0, cnt * sizeof *rules);
8888
8889 int lookups_match = 0, subtable_pos = 1;
8890 uint32_t found_map;
8891
8892 /* The Datapath classifier - aka dpcls - is composed of subtables.
8893 * Subtables are dynamically created as needed when new rules are inserted.
8894 * Each subtable collects rules with matches on a specific subset of packet
8895 * fields as defined by the subtable's mask. We proceed to process every
8896 * search-key against each subtable, but when a match is found for a
8897 * search-key, the search for that key can stop because the rules are
8898 * non-overlapping. */
8899 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
8900 /* Call the subtable specific lookup function. */
8901 found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
8902
8903 /* Count the number of subtables searched for this packet match. This
8904 * estimates the "spread" of subtables looked at per matched packet. */
8905 uint32_t pkts_matched = count_1bits(found_map);
8906 lookups_match += pkts_matched * subtable_pos;
8907
8908 /* Clear the found rules, and return early if all packets are found. */
8909 keys_map &= ~found_map;
8910 if (!keys_map) {
8911 if (num_lookups_p) {
8912 *num_lookups_p = lookups_match;
8913 }
8914 return true;
8915 }
8916 subtable_pos++;
8917 }
8918
8919 if (num_lookups_p) {
8920 *num_lookups_p = lookups_match;
8921 }
8922 return false;
8923 }
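
/* Worked example (illustrative) for dpcls_lookup(): with cnt = 4 the bitmap
 * starts as keys_map = 0b1111.  Suppose the first subtable matches keys 0 and
 * 2 (found_map = 0b0101) and the second matches key 3 (found_map = 0b1000):
 *
 *     after subtable 1: keys_map = 0b1010, lookups_match = 2 * 1
 *     after subtable 2: keys_map = 0b0010, lookups_match = 2 * 1 + 1 * 2
 *
 * Key 1 never matches, so the function returns false with rules[1] left NULL
 * and *num_lookups_p = 4, the weighted count fed into the
 * PMD_STAT_MASKED_LOOKUP statistic by fast_path_processing(). */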