1 /*
2 * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "dpif-netdev.h"
19 #include "dpif-netdev-private.h"
20
21 #include <ctype.h>
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <inttypes.h>
25 #include <net/if.h>
26 #include <sys/types.h>
27 #include <netinet/in.h>
28 #include <stdint.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <sys/ioctl.h>
32 #include <sys/socket.h>
33 #include <sys/stat.h>
34 #include <unistd.h>
35
36 #include "bitmap.h"
37 #include "cmap.h"
38 #include "conntrack.h"
39 #include "conntrack-tp.h"
40 #include "coverage.h"
41 #include "ct-dpif.h"
42 #include "csum.h"
43 #include "dp-packet.h"
44 #include "dpif.h"
45 #include "dpif-netdev-lookup.h"
46 #include "dpif-netdev-perf.h"
47 #include "dpif-provider.h"
48 #include "dummy.h"
49 #include "fat-rwlock.h"
50 #include "flow.h"
51 #include "hmapx.h"
52 #include "id-pool.h"
53 #include "ipf.h"
54 #include "netdev.h"
55 #include "netdev-offload.h"
56 #include "netdev-provider.h"
57 #include "netdev-vport.h"
58 #include "netlink.h"
59 #include "odp-execute.h"
60 #include "odp-util.h"
61 #include "openvswitch/dynamic-string.h"
62 #include "openvswitch/list.h"
63 #include "openvswitch/match.h"
64 #include "openvswitch/ofp-parse.h"
65 #include "openvswitch/ofp-print.h"
66 #include "openvswitch/ofpbuf.h"
67 #include "openvswitch/shash.h"
68 #include "openvswitch/vlog.h"
69 #include "ovs-numa.h"
70 #include "ovs-rcu.h"
71 #include "packets.h"
72 #include "openvswitch/poll-loop.h"
73 #include "pvector.h"
74 #include "random.h"
75 #include "seq.h"
76 #include "smap.h"
77 #include "sset.h"
78 #include "timeval.h"
79 #include "tnl-neigh-cache.h"
80 #include "tnl-ports.h"
81 #include "unixctl.h"
82 #include "util.h"
83 #include "uuid.h"
84
85 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
86
87 /* Auto Load Balancing Defaults */
88 #define ALB_ACCEPTABLE_IMPROVEMENT 25
89 #define ALB_PMD_LOAD_THRESHOLD 95
90 #define ALB_PMD_REBALANCE_POLL_INTERVAL 1 /* 1 Min */
91 #define MIN_TO_MSEC 60000
92
93 #define FLOW_DUMP_MAX_BATCH 50
94 /* Use per thread recirc_depth to prevent recirculation loop. */
95 #define MAX_RECIRC_DEPTH 6
96 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
97
98 /* Use instant packet send by default. */
99 #define DEFAULT_TX_FLUSH_INTERVAL 0
100
101 /* Configuration parameters. */
102 enum { MAX_METERS = 65536 }; /* Maximum number of meters. */
103 enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */
104 enum { N_METER_LOCKS = 64 };    /* Number of meter locks. */
105
106 COVERAGE_DEFINE(datapath_drop_meter);
107 COVERAGE_DEFINE(datapath_drop_upcall_error);
108 COVERAGE_DEFINE(datapath_drop_lock_error);
109 COVERAGE_DEFINE(datapath_drop_userspace_action_error);
110 COVERAGE_DEFINE(datapath_drop_tunnel_push_error);
111 COVERAGE_DEFINE(datapath_drop_tunnel_pop_error);
112 COVERAGE_DEFINE(datapath_drop_recirc_error);
113 COVERAGE_DEFINE(datapath_drop_invalid_port);
114 COVERAGE_DEFINE(datapath_drop_invalid_bond);
115 COVERAGE_DEFINE(datapath_drop_invalid_tnl_port);
116 COVERAGE_DEFINE(datapath_drop_rx_invalid_packet);
117
118 /* Protects against changes to 'dp_netdevs'. */
119 static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
120
121 /* Contains all 'struct dp_netdev's. */
122 static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
123 = SHASH_INITIALIZER(&dp_netdevs);
124
125 static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
126
127 #define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
128 | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
129 | CS_SRC_NAT | CS_DST_NAT)
130 #define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
131
132 static struct odp_support dp_netdev_support = {
133 .max_vlan_headers = SIZE_MAX,
134 .max_mpls_depth = SIZE_MAX,
135 .recirc = true,
136 .ct_state = true,
137 .ct_zone = true,
138 .ct_mark = true,
139 .ct_label = true,
140 .ct_state_nat = true,
141 .ct_orig_tuple = true,
142 .ct_orig_tuple6 = true,
143 };
144
145 /* EMC cache and SMC cache compose the datapath flow cache (DFC)
146 *
147 * Exact match cache for frequently used flows
148 *
149 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
150 * search its entries for a miniflow that matches exactly the miniflow of the
151 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
152 *
153 * A cache entry holds a reference to its 'dp_netdev_flow'.
154 *
155 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
156 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
157 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
158 * value is the index of a cache entry where the miniflow could be.
159 *
160 *
161 * Signature match cache (SMC)
162 *
163  * This cache stores a 16-bit signature for each flow without storing keys, and
164  * stores the corresponding 16-bit flow_table index of the 'dp_netdev_flow'.
165  * Each flow thus occupies 32 bits, which is much more memory efficient than
166  * the EMC.  The SMC uses a set-associative design in which each bucket
167  * contains SMC_ENTRY_PER_BUCKET entries.
168  * Since a 16-bit flow_table index is used, any dp_netdev_flow beyond the
169  * first 2^16 entries cannot be indexed and will always miss in the SMC.
170 *
171 *
172 * Thread-safety
173 * =============
174 *
175 * Each pmd_thread has its own private exact match cache.
176 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
177 */
178
179 #define EM_FLOW_HASH_SHIFT 13
180 #define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
181 #define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
182 #define EM_FLOW_HASH_SEGS 2
183
184 /* SMC uses a set-associative design. A bucket contains a set of entries that
185 * a flow item can occupy. For now, it uses one hash function rather than two
186 * as for the EMC design. */
187 #define SMC_ENTRY_PER_BUCKET 4
188 #define SMC_ENTRIES (1u << 20)
189 #define SMC_BUCKET_CNT (SMC_ENTRIES / SMC_ENTRY_PER_BUCKET)
190 #define SMC_MASK (SMC_BUCKET_CNT - 1)
191
192 /* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB */
193 #define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
194 #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
195 DEFAULT_EM_FLOW_INSERT_INV_PROB)
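
/* A minimal illustration of how the inverse probability above translates
 * into an insertion decision (a hypothetical helper, not used by the
 * datapath itself): with 'emc_insert_min' set to UINT32_MAX / N, comparing
 * a uniformly random 32-bit value against it accepts roughly 1 out of every
 * N candidate flows for EMC insertion, and a threshold of 0 disables
 * insertion altogether. */
static inline bool
emc_insert_roll_sketch(uint32_t emc_insert_min)
{
    return emc_insert_min && random_uint32() <= emc_insert_min;
}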
196
197 struct emc_entry {
198 struct dp_netdev_flow *flow;
199 struct netdev_flow_key key; /* key.hash used for emc hash value. */
200 };
201
202 struct emc_cache {
203 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
204 int sweep_idx; /* For emc_cache_slow_sweep(). */
205 };
206
207 struct smc_bucket {
208 uint16_t sig[SMC_ENTRY_PER_BUCKET];
209 uint16_t flow_idx[SMC_ENTRY_PER_BUCKET];
210 };
211
212 /* Signature match cache, as distinct from the EMC cache. */
213 struct smc_cache {
214 struct smc_bucket buckets[SMC_BUCKET_CNT];
215 };
216
217 struct dfc_cache {
218 struct emc_cache emc_cache;
219 struct smc_cache smc_cache;
220 };
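
/* Illustrative sketch of how a 32-bit packet hash maps onto the SMC (a
 * hypothetical helper that only restates the comment above, not part of the
 * lookup code): the low bits select one of SMC_BUCKET_CNT buckets and the
 * upper 16 bits form the signature compared against that bucket's entries. */
static inline uint16_t
smc_hash_split_sketch(const struct smc_cache *smc, uint32_t hash,
                      const struct smc_bucket **bucket)
{
    *bucket = &smc->buckets[hash & SMC_MASK];
    return hash >> 16;          /* Signature to match within the bucket. */
}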
221
222 /* Iterate in the exact match cache through every entry that might contain a
223 * miniflow with hash 'HASH'. */
224 #define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
225 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
226 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
227 i__ < EM_FLOW_HASH_SEGS; \
228 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
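
/* Sketch of how the macro above is typically used (a hypothetical helper
 * for illustration only; the real emc_lookup() further down also checks
 * entry liveness and compares the full miniflow before trusting a hash
 * match). */
static inline struct dp_netdev_flow *
emc_probe_sketch(struct emc_cache *cache, uint32_t hash)
{
    struct emc_entry *current_entry;

    EMC_FOR_EACH_POS_WITH_HASH (cache, current_entry, hash) {
        if (current_entry->key.hash == hash && current_entry->flow) {
            return current_entry->flow;     /* Candidate entry for 'hash'. */
        }
    }
    return NULL;
}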
229 \f
230 /* Simple non-wildcarding single-priority classifier. */
231
232 /* Time in microseconds between successive optimizations of the dpcls
233 * subtable vector */
234 #define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
235
236 /* Duration in microseconds of the interval over which the rxq processing
237  * cycles used for rxq to pmd assignment are measured and stored. */
238 #define PMD_RXQ_INTERVAL_LEN 10000000LL
239
240 /* Number of intervals for which cycles are stored
241 * and used during rxq to pmd assignment. */
242 #define PMD_RXQ_INTERVAL_MAX 6
243
244 /* Time in microseconds to try RCU quiescing. */
245 #define PMD_RCU_QUIESCE_INTERVAL 10000LL
246
247 struct dpcls {
248 struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */
249 odp_port_t in_port;
250 struct cmap subtables_map;
251 struct pvector subtables;
252 };
253
254 /* Data structure to keep packet order till fastpath processing. */
255 struct dp_packet_flow_map {
256 struct dp_packet *packet;
257 struct dp_netdev_flow *flow;
258 uint16_t tcp_flags;
259 };
260
261 static void dpcls_init(struct dpcls *);
262 static void dpcls_destroy(struct dpcls *);
263 static void dpcls_sort_subtable_vector(struct dpcls *);
264 static uint32_t dpcls_subtable_lookup_reprobe(struct dpcls *cls);
265 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
266 const struct netdev_flow_key *mask);
267 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
268 static bool dpcls_lookup(struct dpcls *cls,
269 const struct netdev_flow_key *keys[],
270 struct dpcls_rule **rules, size_t cnt,
271 int *num_lookups_p);
272
273 /* Set of supported meter flags */
274 #define DP_SUPPORTED_METER_FLAGS_MASK \
275 (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
276
277 /* Set of supported meter band types */
278 #define DP_SUPPORTED_METER_BAND_TYPES \
279 ( 1 << OFPMBT13_DROP )
280
281 struct dp_meter_band {
282 struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
283 uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
284 uint64_t packet_count;
285 uint64_t byte_count;
286 };
287
288 struct dp_meter {
289 uint16_t flags;
290 uint16_t n_bands;
291 uint32_t max_delta_t;
292 uint64_t used;
293 uint64_t packet_count;
294 uint64_t byte_count;
295 struct dp_meter_band bands[];
296 };
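
/* Generic token-bucket refill arithmetic, shown only to illustrate the
 * 'bucket' field above (an assumption for clarity, not the exact logic of
 * the meter implementation later in this file): the bucket gains 'rate'
 * worth of credit per unit of elapsed time and is capped at the configured
 * burst size. */
static inline uint64_t
token_bucket_refill_sketch(uint64_t bucket, uint64_t rate_per_ms,
                           uint64_t burst, uint64_t delta_ms)
{
    uint64_t refilled = bucket + rate_per_ms * delta_ms;

    return MIN(refilled, burst);
}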
297
298 struct pmd_auto_lb {
299 bool auto_lb_requested; /* Auto load balancing requested by user. */
300 bool is_enabled; /* Current status of Auto load balancing. */
301 uint64_t rebalance_intvl;
302 uint64_t rebalance_poll_timer;
303 };
304
305 /* Datapath based on the network device interface from netdev.h.
306 *
307 *
308 * Thread-safety
309 * =============
310 *
311 * Some members, marked 'const', are immutable. Accessing other members
312 * requires synchronization, as noted in more detail below.
313 *
314 * Acquisition order is, from outermost to innermost:
315 *
316 * dp_netdev_mutex (global)
317 * port_mutex
318 * bond_mutex
319 * non_pmd_mutex
320 */
321 struct dp_netdev {
322 const struct dpif_class *const class;
323 const char *const name;
324 struct ovs_refcount ref_cnt;
325 atomic_flag destroyed;
326
327 /* Ports.
328 *
329 * Any lookup into 'ports' or any access to the dp_netdev_ports found
330 * through 'ports' requires taking 'port_mutex'. */
331 struct ovs_mutex port_mutex;
332 struct hmap ports;
333 struct seq *port_seq; /* Incremented whenever a port changes. */
334
335 /* The time that a packet can wait in output batch for sending. */
336 atomic_uint32_t tx_flush_interval;
337
338 /* Meters. */
339 struct ovs_mutex meter_locks[N_METER_LOCKS];
340     struct dp_meter *meters[MAX_METERS]; /* Meters, each with its bands. */
341
342     /* Probability of EMC insertion is determined by 'emc_insert_min'. */
343 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
344 /* Enable collection of PMD performance metrics. */
345 atomic_bool pmd_perf_metrics;
346 /* Enable the SMC cache from ovsdb config */
347 atomic_bool smc_enable_db;
348
349 /* Protects access to ofproto-dpif-upcall interface during revalidator
350 * thread synchronization. */
351 struct fat_rwlock upcall_rwlock;
352 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
353 void *upcall_aux;
354
355     /* Callback function for notifying the purging of dp flows (during
356      * pmd reset or deletion). */
357 dp_purge_callback *dp_purge_cb;
358 void *dp_purge_aux;
359
360 /* Stores all 'struct dp_netdev_pmd_thread's. */
361 struct cmap poll_threads;
362 /* id pool for per thread static_tx_qid. */
363 struct id_pool *tx_qid_pool;
364 struct ovs_mutex tx_qid_pool_mutex;
365 /* Use measured cycles for rxq to pmd assignment. */
366 bool pmd_rxq_assign_cyc;
367
368 /* Protects the access of the 'struct dp_netdev_pmd_thread'
369 * instance for non-pmd thread. */
370 struct ovs_mutex non_pmd_mutex;
371
372 /* Each pmd thread will store its pointer to
373 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
374 ovsthread_key_t per_pmd_key;
375
376 struct seq *reconfigure_seq;
377 uint64_t last_reconfigure_seq;
378
379 /* Cpu mask for pin of pmd threads. */
380 char *pmd_cmask;
381
382 uint64_t last_tnl_conf_seq;
383
384 struct conntrack *conntrack;
385 struct pmd_auto_lb pmd_alb;
386
387 /* Bonds. */
388 struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
389 struct cmap tx_bonds; /* Contains 'struct tx_bond'. */
390 };
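
/* Illustrative sketch of the acquisition order documented above (a
 * hypothetical helper, not used elsewhere in this file): nested locking
 * must always follow dp_netdev_mutex -> port_mutex -> bond_mutex to avoid
 * deadlocks. */
static inline void
dp_netdev_lock_order_sketch(struct dp_netdev *dp)
{
    ovs_mutex_lock(&dp_netdev_mutex);
    ovs_mutex_lock(&dp->port_mutex);
    ovs_mutex_lock(&dp->bond_mutex);
    /* ... work that needs all three locks would go here ... */
    ovs_mutex_unlock(&dp->bond_mutex);
    ovs_mutex_unlock(&dp->port_mutex);
    ovs_mutex_unlock(&dp_netdev_mutex);
}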
391
392 static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
393 OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
394 {
395 ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
396 }
397
398 static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
399 OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
400 {
401 ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
402 }
403
404
405 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
406 odp_port_t)
407 OVS_REQUIRES(dp->port_mutex);
408
409 enum rxq_cycles_counter_type {
410 RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and
411 processing packets during the current
412 interval. */
413 RXQ_CYCLES_PROC_HIST, /* Total cycles of all intervals that are used
414 during rxq to pmd assignment. */
415 RXQ_N_CYCLES
416 };
417
418 enum {
419 DP_NETDEV_FLOW_OFFLOAD_OP_ADD,
420 DP_NETDEV_FLOW_OFFLOAD_OP_MOD,
421 DP_NETDEV_FLOW_OFFLOAD_OP_DEL,
422 };
423
424 struct dp_flow_offload_item {
425 struct dp_netdev_pmd_thread *pmd;
426 struct dp_netdev_flow *flow;
427 int op;
428 struct match match;
429 struct nlattr *actions;
430 size_t actions_len;
431
432 struct ovs_list node;
433 };
434
435 struct dp_flow_offload {
436 struct ovs_mutex mutex;
437 struct ovs_list list;
438 pthread_cond_t cond;
439 };
440
441 static struct dp_flow_offload dp_flow_offload = {
442 .mutex = OVS_MUTEX_INITIALIZER,
443 .list = OVS_LIST_INITIALIZER(&dp_flow_offload.list),
444 };
445
446 static struct ovsthread_once offload_thread_once
447 = OVSTHREAD_ONCE_INITIALIZER;
448
449 #define XPS_TIMEOUT 500000LL /* In microseconds. */
450
451 /* Contained by struct dp_netdev_port's 'rxqs' member. */
452 struct dp_netdev_rxq {
453 struct dp_netdev_port *port;
454 struct netdev_rxq *rx;
455 unsigned core_id; /* Core to which this queue should be
456 pinned. OVS_CORE_UNSPEC if the
457 queue doesn't need to be pinned to a
458 particular core. */
459 unsigned intrvl_idx; /* Write index for 'cycles_intrvl'. */
460 struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */
461 bool is_vhost; /* Is rxq of a vhost port. */
462
463 /* Counters of cycles spent successfully polling and processing pkts. */
464 atomic_ullong cycles[RXQ_N_CYCLES];
465 /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
466 sum them to yield the cycles used for an rxq. */
467 atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
468 };
469
470 /* A port in a netdev-based datapath. */
471 struct dp_netdev_port {
472 odp_port_t port_no;
473 bool dynamic_txqs; /* If true XPS will be used. */
474 bool need_reconfigure; /* True if we should reconfigure netdev. */
475 struct netdev *netdev;
476 struct hmap_node node; /* Node in dp_netdev's 'ports'. */
477 struct netdev_saved_flags *sf;
478 struct dp_netdev_rxq *rxqs;
479 unsigned n_rxq; /* Number of elements in 'rxqs' */
480 unsigned *txq_used; /* Number of threads that use each tx queue. */
481 struct ovs_mutex txq_used_mutex;
482 bool emc_enabled; /* If true EMC will be used. */
483 char *type; /* Port type as requested by user. */
484 char *rxq_affinity_list; /* Requested affinity of rx queues. */
485 };
486
487 /* Contained by struct dp_netdev_flow's 'stats' member. */
488 struct dp_netdev_flow_stats {
489 atomic_llong used; /* Last used time, in monotonic msecs. */
490 atomic_ullong packet_count; /* Number of packets matched. */
491 atomic_ullong byte_count; /* Number of bytes matched. */
492 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
493 };
494
495 /* Contained by struct dp_netdev_flow's 'last_attrs' member. */
496 struct dp_netdev_flow_attrs {
497 atomic_bool offloaded; /* True if flow is offloaded to HW. */
498 ATOMIC(const char *) dp_layer; /* DP layer the flow is handled in. */
499 };
500
501 /* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
502 *
503 *
504 * Thread-safety
505 * =============
506 *
507 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
508 * its pmd thread's classifier. The text below calls this classifier 'cls'.
509 *
510 * Motivation
511 * ----------
512 *
513 * The thread safety rules described here for "struct dp_netdev_flow" are
514 * motivated by two goals:
515 *
516 * - Prevent threads that read members of "struct dp_netdev_flow" from
517 * reading bad data due to changes by some thread concurrently modifying
518 * those members.
519 *
520 * - Prevent two threads making changes to members of a given "struct
521 * dp_netdev_flow" from interfering with each other.
522 *
523 *
524 * Rules
525 * -----
526 *
527 * A flow 'flow' may be accessed without a risk of being freed during an RCU
528 * grace period. Code that needs to hold onto a flow for a while
529 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
530 *
531 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
532 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
533 * from modification.
534 *
535 * Some members, marked 'const', are immutable. Accessing other members
536 * requires synchronization, as noted in more detail below.
537 */
538 struct dp_netdev_flow {
539 const struct flow flow; /* Unmasked flow that created this entry. */
540 /* Hash table index by unmasked flow. */
541 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
542 /* 'flow_table'. */
543 const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
544 const ovs_u128 ufid; /* Unique flow identifier. */
545 const ovs_u128 mega_ufid; /* Unique mega flow identifier. */
546 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
547 /* flow. */
548
549 /* Number of references.
550 * The classifier owns one reference.
551 * Any thread trying to keep a rule from being freed should hold its own
552 * reference. */
553 struct ovs_refcount ref_cnt;
554
555 bool dead;
556 uint32_t mark; /* Unique flow mark assigned to a flow */
557
558 /* Statistics. */
559 struct dp_netdev_flow_stats stats;
560
561 /* Statistics and attributes received from the netdev offload provider. */
562 atomic_int netdev_flow_get_result;
563 struct dp_netdev_flow_stats last_stats;
564 struct dp_netdev_flow_attrs last_attrs;
565
566 /* Actions. */
567 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
568
569 /* While processing a group of input packets, the datapath uses the next
570 * member to store a pointer to the output batch for the flow. It is
571 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
572 * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
573 struct packet_batch_per_flow *batch;
574
575 /* Packet classification. */
576 char *dp_extra_info; /* String to return in a flow dump/get. */
577 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
578 /* 'cr' must be the last member. */
579 };
580
581 static void dp_netdev_flow_unref(struct dp_netdev_flow *);
582 static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
583 static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
584 struct flow *, bool);
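
/* Sketch of the reference-counting rule described above 'struct
 * dp_netdev_flow' (a hypothetical helper for illustration; callers in this
 * file use dp_netdev_flow_ref() directly): a flow found under RCU may be
 * used in place, but keeping it past the grace period requires taking a
 * reference and releasing it with dp_netdev_flow_unref() when done. */
static inline struct dp_netdev_flow *
dp_netdev_flow_hold_sketch(struct dp_netdev_flow *flow)
{
    return flow && dp_netdev_flow_ref(flow) ? flow : NULL;
}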
585
586 /* A set of datapath actions within a "struct dp_netdev_flow".
587 *
588 *
589 * Thread-safety
590 * =============
591 *
592 * A struct dp_netdev_actions 'actions' is protected with RCU. */
593 struct dp_netdev_actions {
594 /* These members are immutable: they do not change during the struct's
595 * lifetime. */
596 unsigned int size; /* Size of 'actions', in bytes. */
597 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
598 };
599
600 struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
601 size_t);
602 struct dp_netdev_actions *dp_netdev_flow_get_actions(
603 const struct dp_netdev_flow *);
604 static void dp_netdev_actions_free(struct dp_netdev_actions *);
605
606 struct polled_queue {
607 struct dp_netdev_rxq *rxq;
608 odp_port_t port_no;
609 bool emc_enabled;
610 bool rxq_enabled;
611 uint64_t change_seq;
612 };
613
614 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
615 struct rxq_poll {
616 struct dp_netdev_rxq *rxq;
617 struct hmap_node node;
618 };
619
620 /* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
621 * 'tnl_port_cache' or 'tx_ports'. */
622 struct tx_port {
623 struct dp_netdev_port *port;
624 int qid;
625 long long last_used;
626 struct hmap_node node;
627 long long flush_time;
628 struct dp_packet_batch output_pkts;
629 struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
630 };
631
632 /* Contained by struct tx_bond 'member_buckets'. */
633 struct member_entry {
634 odp_port_t member_id;
635 atomic_ullong n_packets;
636 atomic_ullong n_bytes;
637 };
638
639 /* Contained by struct dp_netdev_pmd_thread's 'tx_bonds'. */
640 struct tx_bond {
641 struct cmap_node node;
642 uint32_t bond_id;
643 struct member_entry member_buckets[BOND_BUCKETS];
644 };
645
646 /* A set of properties for the current processing loop that is not directly
647 * associated with the pmd thread itself, but with the packets being
648 * processed or the short-term system configuration (for example, time).
649 * Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
650 struct dp_netdev_pmd_thread_ctx {
651 /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
652 long long now;
653 /* RX queue from which last packet was received. */
654 struct dp_netdev_rxq *last_rxq;
655 /* EMC insertion probability context for the current processing cycle. */
656 uint32_t emc_insert_min;
657 };
658
659 /* PMD: Poll mode drivers.  A PMD accesses devices via polling to eliminate
660  * the performance overhead of interrupt processing.  Therefore netdev cannot
661  * implement rx-wait for these devices; dpif-netdev needs to poll these
662  * devices to check for received packets.  A pmd thread polls the devices
663  * assigned to it.
664  *
665  * DPDK uses PMDs for accessing NICs.
666 *
667 * Note, instance with cpu core id NON_PMD_CORE_ID will be reserved for
668 * I/O of all non-pmd threads. There will be no actual thread created
669 * for the instance.
670 *
671 * Each struct has its own flow cache and classifier per managed ingress port.
672  * For packets received on an ingress port, a lookup is done in the
673  * corresponding PMD thread's flow cache and, in case of a miss, in the
674  * classifier of that port.  In either case the packets are executed with the
675  * actions that were found.
676  */
677 struct dp_netdev_pmd_thread {
678 struct dp_netdev *dp;
679 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
680 struct cmap_node node; /* In 'dp->poll_threads'. */
681
682 /* Per thread exact-match cache. Note, the instance for cpu core
683      * NON_PMD_CORE_ID can be accessed by multiple threads and thus needs
684      * to be protected by 'non_pmd_mutex'.  Every other instance
685 * will only be accessed by its own pmd thread. */
686 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct dfc_cache flow_cache;
687
688 /* Flow-Table and classifiers
689 *
690 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
691 * changes to 'classifiers' must be made while still holding the
692 * 'flow_mutex'.
693 */
694 struct ovs_mutex flow_mutex;
695 struct cmap flow_table OVS_GUARDED; /* Flow table. */
696
697 /* One classifier per in_port polled by the pmd */
698 struct cmap classifiers;
699 /* Periodically sort subtable vectors according to hit frequencies */
700 long long int next_optimization;
701 /* End of the next time interval for which processing cycles
702 are stored for each polled rxq. */
703 long long int rxq_next_cycle_store;
704
705 /* Last interval timestamp. */
706 uint64_t intrvl_tsc_prev;
707 /* Last interval cycles. */
708 atomic_ullong intrvl_cycles;
709
710 /* Current context of the PMD thread. */
711 struct dp_netdev_pmd_thread_ctx ctx;
712
713 struct seq *reload_seq;
714 uint64_t last_reload_seq;
715
716 /* These are atomic variables used as a synchronization and configuration
717 * points for thread reload/exit.
718 *
719 * 'reload' atomic is the main one and it's used as a memory
720 * synchronization point for all other knobs and data.
721 *
722 * For a thread that requests PMD reload:
723 *
724 * * All changes that should be visible to the PMD thread must be made
725 * before setting the 'reload'. These changes could use any memory
726 * ordering model including 'relaxed'.
727 * * Setting the 'reload' atomic should occur in the same thread where
728      *      all other PMD configuration options are updated.
729 * * Setting the 'reload' atomic should be done with 'release' memory
730 * ordering model or stricter. This will guarantee that all previous
731 * changes (including non-atomic and 'relaxed') will be visible to
732 * the PMD thread.
733      *    * To check that the reload is done, the thread should poll the
734      *      'reload' atomic until it becomes 'false'.  Polling should be done
735      *      with 'acquire' memory ordering model or stricter.  This ensures
736      *      that the PMD thread has completed the reload process.
737 *
738 * For the PMD thread:
739 *
740 * * PMD thread should read 'reload' atomic with 'acquire' memory
741 * ordering model or stricter. This will guarantee that all changes
742 * made before setting the 'reload' in the requesting thread will be
743 * visible to the PMD thread.
744 * * All other configuration data could be read with any memory
745 * ordering model (including non-atomic and 'relaxed') but *only after*
746 * reading the 'reload' atomic set to 'true'.
747      *    * When the PMD reload is done, the PMD should (optionally) set all
748      *      the below knobs except the 'reload' to their default ('false')
749      *      values and, as the mandatory last step, set the 'reload' to 'false'
750      *      using 'release' memory ordering model or stricter.  This will
751      *      inform the requesting thread that the PMD has completed a reload
752      *      cycle.
752 */
753 atomic_bool reload; /* Do we need to reload ports? */
754 atomic_bool wait_for_reload; /* Can we busy wait for the next reload? */
755 atomic_bool reload_tx_qid; /* Do we need to reload static_tx_qid? */
756 atomic_bool exit; /* For terminating the pmd thread. */
757
758 pthread_t thread;
759 unsigned core_id; /* CPU core id of this pmd thread. */
760 int numa_id; /* numa node id of this pmd thread. */
761 bool isolated;
762
763 /* Queue id used by this pmd thread to send packets on all netdevs if
764      * XPS is disabled for the netdev.  All static_tx_qid's are unique and less
765 * than 'cmap_count(dp->poll_threads)'. */
766 uint32_t static_tx_qid;
767
768 /* Number of filled output batches. */
769 int n_output_batches;
770
771 struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
772 /* List of rx queues to poll. */
773 struct hmap poll_list OVS_GUARDED;
774 /* Map of 'tx_port's used for transmission. Written by the main thread,
775 * read by the pmd thread. */
776 struct hmap tx_ports OVS_GUARDED;
777
778 struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
779 /* Map of 'tx_bond's used for transmission. Written by the main thread
780 * and read by the pmd thread. */
781 struct cmap tx_bonds;
782
783 /* These are thread-local copies of 'tx_ports'. One contains only tunnel
784 * ports (that support push_tunnel/pop_tunnel), the other contains ports
785 * with at least one txq (that support send). A port can be in both.
786 *
787 * There are two separate maps to make sure that we don't try to execute
788 * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
789 *
790 * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
791      * threads, and thus need to be protected by 'non_pmd_mutex'.  Every
792 * other instance will only be accessed by its own pmd thread. */
793 struct hmap tnl_port_cache;
794 struct hmap send_port_cache;
795
796 /* Keep track of detailed PMD performance statistics. */
797 struct pmd_perf_stats perf_stats;
798
799 /* Stats from previous iteration used by automatic pmd
800 * load balance logic. */
801 uint64_t prev_stats[PMD_N_STATS];
802 atomic_count pmd_overloaded;
803
804 /* Set to true if the pmd thread needs to be reloaded. */
805 bool need_reload;
806
807 /* Next time when PMD should try RCU quiescing. */
808 long long next_rcu_quiesce;
809 };
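
/* Requester-side sketch of the reload protocol documented inside 'struct
 * dp_netdev_pmd_thread' above (a hypothetical helper for illustration; the
 * real request and wait helpers appear later in this file): configuration
 * is published first, 'reload' is then set with release ordering, and
 * completion is detected by polling it back to 'false' with acquire
 * ordering. */
static inline void
pmd_reload_protocol_sketch(struct dp_netdev_pmd_thread *pmd)
{
    bool reloading;

    /* All configuration changes must be stored before this point. */
    atomic_store_explicit(&pmd->reload, true, memory_order_release);

    do {
        atomic_read_explicit(&pmd->reload, &reloading, memory_order_acquire);
    } while (reloading);
}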
810
811 /* Interface to netdev-based datapath. */
812 struct dpif_netdev {
813 struct dpif dpif;
814 struct dp_netdev *dp;
815 uint64_t last_port_seq;
816 };
817
818 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
819 struct dp_netdev_port **portp)
820 OVS_REQUIRES(dp->port_mutex);
821 static int get_port_by_name(struct dp_netdev *dp, const char *devname,
822 struct dp_netdev_port **portp)
823 OVS_REQUIRES(dp->port_mutex);
824 static void dp_netdev_free(struct dp_netdev *)
825 OVS_REQUIRES(dp_netdev_mutex);
826 static int do_add_port(struct dp_netdev *dp, const char *devname,
827 const char *type, odp_port_t port_no)
828 OVS_REQUIRES(dp->port_mutex);
829 static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
830 OVS_REQUIRES(dp->port_mutex);
831 static int dpif_netdev_open(const struct dpif_class *, const char *name,
832 bool create, struct dpif **);
833 static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
834 struct dp_packet_batch *,
835 bool should_steal,
836 const struct flow *flow,
837 const struct nlattr *actions,
838 size_t actions_len);
839 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
840 struct dp_packet_batch *, odp_port_t port_no);
841 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
842 struct dp_packet_batch *);
843
844 static void dp_netdev_disable_upcall(struct dp_netdev *);
845 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
846 static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
847 struct dp_netdev *dp, unsigned core_id,
848 int numa_id);
849 static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
850 static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
851 OVS_REQUIRES(dp->port_mutex);
852
853 static void *pmd_thread_main(void *);
854 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
855 unsigned core_id);
856 static struct dp_netdev_pmd_thread *
857 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
858 static void dp_netdev_del_pmd(struct dp_netdev *dp,
859 struct dp_netdev_pmd_thread *pmd);
860 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
861 static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
862 static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
863 struct dp_netdev_port *port)
864 OVS_REQUIRES(pmd->port_mutex);
865 static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
866 struct tx_port *tx)
867 OVS_REQUIRES(pmd->port_mutex);
868 static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
869 struct dp_netdev_rxq *rxq)
870 OVS_REQUIRES(pmd->port_mutex);
871 static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
872 struct rxq_poll *poll)
873 OVS_REQUIRES(pmd->port_mutex);
874 static int
875 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
876 bool force);
877 static void dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
878 struct tx_bond *bond, bool update)
879 OVS_EXCLUDED(pmd->bond_mutex);
880 static void dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
881 uint32_t bond_id)
882 OVS_EXCLUDED(pmd->bond_mutex);
883
884 static void reconfigure_datapath(struct dp_netdev *dp)
885 OVS_REQUIRES(dp->port_mutex);
886 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
887 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
888 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
889 static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
890 OVS_REQUIRES(pmd->port_mutex);
891 static inline void
892 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
893 struct polled_queue *poll_list, int poll_cnt);
894 static void
895 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
896 enum rxq_cycles_counter_type type,
897 unsigned long long cycles);
898 static uint64_t
899 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
900 enum rxq_cycles_counter_type type);
901 static void
902 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
903 unsigned long long cycles);
904 static uint64_t
905 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
906 static void
907 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
908 bool purge);
909 static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
910 struct tx_port *tx);
911 static inline struct dpcls *
912 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
913 odp_port_t in_port);
914
915 static inline bool emc_entry_alive(struct emc_entry *ce);
916 static void emc_clear_entry(struct emc_entry *ce);
917 static void smc_clear_entry(struct smc_bucket *b, int idx);
918
919 static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
920 static inline bool
921 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
922 static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
923 struct dp_netdev_flow *flow);
924
925 static void
926 emc_cache_init(struct emc_cache *flow_cache)
927 {
928 int i;
929
930 flow_cache->sweep_idx = 0;
931 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
932 flow_cache->entries[i].flow = NULL;
933 flow_cache->entries[i].key.hash = 0;
934 flow_cache->entries[i].key.len = sizeof(struct miniflow);
935 flowmap_init(&flow_cache->entries[i].key.mf.map);
936 }
937 }
938
939 static void
940 smc_cache_init(struct smc_cache *smc_cache)
941 {
942 int i, j;
943 for (i = 0; i < SMC_BUCKET_CNT; i++) {
944 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
945 smc_cache->buckets[i].flow_idx[j] = UINT16_MAX;
946 }
947 }
948 }
949
950 static void
951 dfc_cache_init(struct dfc_cache *flow_cache)
952 {
953 emc_cache_init(&flow_cache->emc_cache);
954 smc_cache_init(&flow_cache->smc_cache);
955 }
956
957 static void
958 emc_cache_uninit(struct emc_cache *flow_cache)
959 {
960 int i;
961
962 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
963 emc_clear_entry(&flow_cache->entries[i]);
964 }
965 }
966
967 static void
968 smc_cache_uninit(struct smc_cache *smc)
969 {
970 int i, j;
971
972 for (i = 0; i < SMC_BUCKET_CNT; i++) {
973 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
974 smc_clear_entry(&(smc->buckets[i]), j);
975 }
976 }
977 }
978
979 static void
980 dfc_cache_uninit(struct dfc_cache *flow_cache)
981 {
982 smc_cache_uninit(&flow_cache->smc_cache);
983 emc_cache_uninit(&flow_cache->emc_cache);
984 }
985
986 /* Check and clear dead flow references slowly (one entry at each
987 * invocation). */
988 static void
989 emc_cache_slow_sweep(struct emc_cache *flow_cache)
990 {
991 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
992
993 if (!emc_entry_alive(entry)) {
994 emc_clear_entry(entry);
995 }
996 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
997 }
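
/* Illustrative only (a hypothetical helper): the pmd main loop calls the
 * sweep periodically, one entry per call, so it takes EM_FLOW_HASH_ENTRIES
 * invocations to visit the whole cache once. */
static inline void
emc_cache_sweep_full_pass_sketch(struct emc_cache *flow_cache)
{
    for (uint32_t i = 0; i < EM_FLOW_HASH_ENTRIES; i++) {
        emc_cache_slow_sweep(flow_cache);
    }
}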
998
999 /* Updates the time in the PMD thread's context; it should be called in three cases:
1000 *
1001 * 1. PMD structure initialization:
1002 * - dp_netdev_configure_pmd()
1003 *
1004 * 2. Before processing of the new packet batch:
1005 * - dpif_netdev_execute()
1006 * - dp_netdev_process_rxq_port()
1007 *
1008 * 3. At least once per polling iteration in main polling threads if no
1009 * packets received on current iteration:
1010 * - dpif_netdev_run()
1011 * - pmd_thread_main()
1012 *
1013 * 'pmd->ctx.now' should be used without update in all other cases if possible.
1014 */
1015 static inline void
1016 pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
1017 {
1018 pmd->ctx.now = time_usec();
1019 }
1020
1021 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
1022 bool
1023 dpif_is_netdev(const struct dpif *dpif)
1024 {
1025 return dpif->dpif_class->open == dpif_netdev_open;
1026 }
1027
1028 static struct dpif_netdev *
1029 dpif_netdev_cast(const struct dpif *dpif)
1030 {
1031 ovs_assert(dpif_is_netdev(dpif));
1032 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
1033 }
1034
1035 static struct dp_netdev *
1036 get_dp_netdev(const struct dpif *dpif)
1037 {
1038 return dpif_netdev_cast(dpif)->dp;
1039 }
1040 \f
1041 enum pmd_info_type {
1042 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
1043 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
1044 PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */
1045 PMD_INFO_PERF_SHOW, /* Show pmd performance details. */
1046 };
1047
1048 static void
1049 format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
1050 {
1051 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
1052 ? "main thread" : "pmd thread");
1053 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
1054 ds_put_format(reply, " numa_id %d", pmd->numa_id);
1055 }
1056 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
1057 ds_put_format(reply, " core_id %u", pmd->core_id);
1058 }
1059 ds_put_cstr(reply, ":\n");
1060 }
1061
1062 static void
1063 pmd_info_show_stats(struct ds *reply,
1064 struct dp_netdev_pmd_thread *pmd)
1065 {
1066 uint64_t stats[PMD_N_STATS];
1067 uint64_t total_cycles, total_packets;
1068 double passes_per_pkt = 0;
1069 double lookups_per_hit = 0;
1070 double packets_per_batch = 0;
1071
1072 pmd_perf_read_counters(&pmd->perf_stats, stats);
1073 total_cycles = stats[PMD_CYCLES_ITER_IDLE]
1074 + stats[PMD_CYCLES_ITER_BUSY];
1075 total_packets = stats[PMD_STAT_RECV];
1076
1077 format_pmd_thread(reply, pmd);
1078
1079 if (total_packets > 0) {
1080 passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
1081 / (double) total_packets;
1082 }
1083 if (stats[PMD_STAT_MASKED_HIT] > 0) {
1084 lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
1085 / (double) stats[PMD_STAT_MASKED_HIT];
1086 }
1087 if (stats[PMD_STAT_SENT_BATCHES] > 0) {
1088 packets_per_batch = stats[PMD_STAT_SENT_PKTS]
1089 / (double) stats[PMD_STAT_SENT_BATCHES];
1090 }
1091
1092 ds_put_format(reply,
1093 " packets received: %"PRIu64"\n"
1094 " packet recirculations: %"PRIu64"\n"
1095 " avg. datapath passes per packet: %.02f\n"
1096 " emc hits: %"PRIu64"\n"
1097 " smc hits: %"PRIu64"\n"
1098 " megaflow hits: %"PRIu64"\n"
1099 " avg. subtable lookups per megaflow hit: %.02f\n"
1100 " miss with success upcall: %"PRIu64"\n"
1101 " miss with failed upcall: %"PRIu64"\n"
1102 " avg. packets per output batch: %.02f\n",
1103 total_packets, stats[PMD_STAT_RECIRC],
1104 passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
1105 stats[PMD_STAT_SMC_HIT],
1106 stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
1107 stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
1108 packets_per_batch);
1109
1110 if (total_cycles == 0) {
1111 return;
1112 }
1113
1114 ds_put_format(reply,
1115 " idle cycles: %"PRIu64" (%.02f%%)\n"
1116 " processing cycles: %"PRIu64" (%.02f%%)\n",
1117 stats[PMD_CYCLES_ITER_IDLE],
1118 stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
1119 stats[PMD_CYCLES_ITER_BUSY],
1120 stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
1121
1122 if (total_packets == 0) {
1123 return;
1124 }
1125
1126 ds_put_format(reply,
1127 " avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
1128 total_cycles / (double) total_packets,
1129 total_cycles, total_packets);
1130
1131 ds_put_format(reply,
1132 " avg processing cycles per packet: "
1133 "%.02f (%"PRIu64"/%"PRIu64")\n",
1134 stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
1135 stats[PMD_CYCLES_ITER_BUSY], total_packets);
1136 }
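
/* Worked example of the derived values above (the numbers are hypothetical):
 * with 1000 received packets, 250 recirculations, 400 masked lookups and
 * 200 masked hits, the report shows
 *     avg. datapath passes per packet        = (1000 + 250) / 1000 = 1.25
 *     avg. subtable lookups per megaflow hit = 400 / 200           = 2.00
 */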
1137
1138 static void
1139 pmd_info_show_perf(struct ds *reply,
1140 struct dp_netdev_pmd_thread *pmd,
1141 struct pmd_perf_params *par)
1142 {
1143 if (pmd->core_id != NON_PMD_CORE_ID) {
1144 char *time_str =
1145 xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
1146 long long now = time_msec();
1147 double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
1148
1149 ds_put_cstr(reply, "\n");
1150 ds_put_format(reply, "Time: %s\n", time_str);
1151 ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
1152 ds_put_cstr(reply, "\n");
1153 format_pmd_thread(reply, pmd);
1154 ds_put_cstr(reply, "\n");
1155 pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
1156 if (pmd_perf_metrics_enabled(pmd)) {
1157 /* Prevent parallel clearing of perf metrics. */
1158 ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
1159 if (par->histograms) {
1160 ds_put_cstr(reply, "\n");
1161 pmd_perf_format_histograms(reply, &pmd->perf_stats);
1162 }
1163 if (par->iter_hist_len > 0) {
1164 ds_put_cstr(reply, "\n");
1165 pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
1166 par->iter_hist_len);
1167 }
1168 if (par->ms_hist_len > 0) {
1169 ds_put_cstr(reply, "\n");
1170 pmd_perf_format_ms_history(reply, &pmd->perf_stats,
1171 par->ms_hist_len);
1172 }
1173 ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
1174 }
1175 free(time_str);
1176 }
1177 }
1178
1179 static int
1180 compare_poll_list(const void *a_, const void *b_)
1181 {
1182 const struct rxq_poll *a = a_;
1183 const struct rxq_poll *b = b_;
1184
1185 const char *namea = netdev_rxq_get_name(a->rxq->rx);
1186 const char *nameb = netdev_rxq_get_name(b->rxq->rx);
1187
1188 int cmp = strcmp(namea, nameb);
1189 if (!cmp) {
1190 return netdev_rxq_get_queue_id(a->rxq->rx)
1191 - netdev_rxq_get_queue_id(b->rxq->rx);
1192 } else {
1193 return cmp;
1194 }
1195 }
1196
1197 static void
1198 sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
1199 size_t *n)
1200 OVS_REQUIRES(pmd->port_mutex)
1201 {
1202 struct rxq_poll *ret, *poll;
1203 size_t i;
1204
1205 *n = hmap_count(&pmd->poll_list);
1206 if (!*n) {
1207 ret = NULL;
1208 } else {
1209 ret = xcalloc(*n, sizeof *ret);
1210 i = 0;
1211 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
1212 ret[i] = *poll;
1213 i++;
1214 }
1215 ovs_assert(i == *n);
1216 qsort(ret, *n, sizeof *ret, compare_poll_list);
1217 }
1218
1219 *list = ret;
1220 }
1221
1222 static void
1223 pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
1224 {
1225 if (pmd->core_id != NON_PMD_CORE_ID) {
1226 struct rxq_poll *list;
1227 size_t n_rxq;
1228 uint64_t total_cycles = 0;
1229
1230 ds_put_format(reply,
1231 "pmd thread numa_id %d core_id %u:\n isolated : %s\n",
1232 pmd->numa_id, pmd->core_id, (pmd->isolated)
1233 ? "true" : "false");
1234
1235 ovs_mutex_lock(&pmd->port_mutex);
1236 sorted_poll_list(pmd, &list, &n_rxq);
1237
1238 /* Get the total pmd cycles for an interval. */
1239 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
1240 /* Estimate the cycles to cover all intervals. */
1241 total_cycles *= PMD_RXQ_INTERVAL_MAX;
1242
1243 for (int i = 0; i < n_rxq; i++) {
1244 struct dp_netdev_rxq *rxq = list[i].rxq;
1245 const char *name = netdev_rxq_get_name(rxq->rx);
1246 uint64_t proc_cycles = 0;
1247
1248 for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
1249 proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
1250 }
1251 ds_put_format(reply, " port: %-16s queue-id: %2d", name,
1252 netdev_rxq_get_queue_id(list[i].rxq->rx));
1253 ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
1254 ? "(enabled) " : "(disabled)");
1255 ds_put_format(reply, " pmd usage: ");
1256 if (total_cycles) {
1257 ds_put_format(reply, "%2"PRIu64"",
1258 proc_cycles * 100 / total_cycles);
1259 ds_put_cstr(reply, " %");
1260 } else {
1261 ds_put_format(reply, "%s", "NOT AVAIL");
1262 }
1263 ds_put_cstr(reply, "\n");
1264 }
1265 ovs_mutex_unlock(&pmd->port_mutex);
1266 free(list);
1267 }
1268 }
1269
1270 static int
1271 compare_poll_thread_list(const void *a_, const void *b_)
1272 {
1273 const struct dp_netdev_pmd_thread *a, *b;
1274
1275 a = *(struct dp_netdev_pmd_thread **)a_;
1276 b = *(struct dp_netdev_pmd_thread **)b_;
1277
1278 if (a->core_id < b->core_id) {
1279 return -1;
1280 }
1281 if (a->core_id > b->core_id) {
1282 return 1;
1283 }
1284 return 0;
1285 }
1286
1287 /* Creates a sorted list of pmds from the dp->poll_threads cmap.  We can use
1288  * this list as long as we do not go into a quiescent state. */
1289 static void
1290 sorted_poll_thread_list(struct dp_netdev *dp,
1291 struct dp_netdev_pmd_thread ***list,
1292 size_t *n)
1293 {
1294 struct dp_netdev_pmd_thread *pmd;
1295 struct dp_netdev_pmd_thread **pmd_list;
1296 size_t k = 0, n_pmds;
1297
1298 n_pmds = cmap_count(&dp->poll_threads);
1299 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
1300
1301 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1302 if (k >= n_pmds) {
1303 break;
1304 }
1305 pmd_list[k++] = pmd;
1306 }
1307
1308 qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
1309
1310 *list = pmd_list;
1311 *n = k;
1312 }
1313
1314 static void
1315 dpif_netdev_subtable_lookup_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
1316 const char *argv[] OVS_UNUSED,
1317 void *aux OVS_UNUSED)
1318 {
1319 /* Get a list of all lookup functions. */
1320 struct dpcls_subtable_lookup_info_t *lookup_funcs = NULL;
1321 int32_t count = dpcls_subtable_lookup_info_get(&lookup_funcs);
1322 if (count < 0) {
1323 unixctl_command_reply_error(conn, "error getting lookup names");
1324 return;
1325 }
1326
1327 /* Add all lookup functions to reply string. */
1328 struct ds reply = DS_EMPTY_INITIALIZER;
1329 ds_put_cstr(&reply, "Available lookup functions (priority : name)\n");
1330 for (int i = 0; i < count; i++) {
1331 ds_put_format(&reply, " %d : %s\n", lookup_funcs[i].prio,
1332 lookup_funcs[i].name);
1333 }
1334 unixctl_command_reply(conn, ds_cstr(&reply));
1335 ds_destroy(&reply);
1336 }
1337
1338 static void
1339 dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, int argc,
1340 const char *argv[], void *aux OVS_UNUSED)
1341 {
1342 /* This function requires 2 parameters (argv[1] and argv[2]) to execute.
1343      * argv[1] is the name of the subtable lookup function
1344 * argv[2] is priority
1345 * argv[3] is the datapath name (optional if only 1 datapath exists)
1346 */
1347 const char *func_name = argv[1];
1348
1349 errno = 0;
1350 char *err_char;
1351 uint32_t new_prio = strtoul(argv[2], &err_char, 10);
1352 if (errno != 0 || new_prio > UINT8_MAX) {
1353 unixctl_command_reply_error(conn,
1354 "error converting priority, use integer in range 0-255\n");
1355 return;
1356 }
1357
1358 int32_t err = dpcls_subtable_set_prio(func_name, new_prio);
1359 if (err) {
1360 unixctl_command_reply_error(conn,
1361 "error, subtable lookup function not found\n");
1362 return;
1363 }
1364
1365 /* argv[3] is optional datapath instance. If no datapath name is provided
1366 * and only one datapath exists, the one existing datapath is reprobed.
1367 */
1368 ovs_mutex_lock(&dp_netdev_mutex);
1369 struct dp_netdev *dp = NULL;
1370
1371 if (argc == 4) {
1372 dp = shash_find_data(&dp_netdevs, argv[3]);
1373 } else if (shash_count(&dp_netdevs) == 1) {
1374 dp = shash_first(&dp_netdevs)->data;
1375 }
1376
1377 if (!dp) {
1378 ovs_mutex_unlock(&dp_netdev_mutex);
1379 unixctl_command_reply_error(conn,
1380 "please specify an existing datapath");
1381 return;
1382 }
1383
1384 /* Get PMD threads list, required to get DPCLS instances. */
1385 size_t n;
1386 uint32_t lookup_dpcls_changed = 0;
1387 uint32_t lookup_subtable_changed = 0;
1388 struct dp_netdev_pmd_thread **pmd_list;
1389 sorted_poll_thread_list(dp, &pmd_list, &n);
1390
1391     /* Take the port mutex, as the HMAP of ports is iterated over below. */
1392 ovs_mutex_lock(&dp->port_mutex);
1393
1394 for (size_t i = 0; i < n; i++) {
1395 struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1396 if (pmd->core_id == NON_PMD_CORE_ID) {
1397 continue;
1398 }
1399
1400 struct dp_netdev_port *port = NULL;
1401 HMAP_FOR_EACH (port, node, &dp->ports) {
1402 odp_port_t in_port = port->port_no;
1403 struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
1404 if (!cls) {
1405 continue;
1406 }
1407 uint32_t subtbl_changes = dpcls_subtable_lookup_reprobe(cls);
1408 if (subtbl_changes) {
1409 lookup_dpcls_changed++;
1410 lookup_subtable_changed += subtbl_changes;
1411 }
1412 }
1413 }
1414
1415     /* Release the port mutex before the dp_netdev mutex. */
1416 ovs_mutex_unlock(&dp->port_mutex);
1417 ovs_mutex_unlock(&dp_netdev_mutex);
1418
1419 struct ds reply = DS_EMPTY_INITIALIZER;
1420 ds_put_format(&reply,
1421 "Lookup priority change affected %d dpcls ports and %d subtables.\n",
1422 lookup_dpcls_changed, lookup_subtable_changed);
1423 const char *reply_str = ds_cstr(&reply);
1424 unixctl_command_reply(conn, reply_str);
1425 VLOG_INFO("%s", reply_str);
1426 ds_destroy(&reply);
1427 }
1428
1429 static void
1430 dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1431 const char *argv[], void *aux OVS_UNUSED)
1432 {
1433 struct ds reply = DS_EMPTY_INITIALIZER;
1434 struct dp_netdev *dp = NULL;
1435
1436 ovs_mutex_lock(&dp_netdev_mutex);
1437
1438 if (argc == 2) {
1439 dp = shash_find_data(&dp_netdevs, argv[1]);
1440 } else if (shash_count(&dp_netdevs) == 1) {
1441 /* There's only one datapath */
1442 dp = shash_first(&dp_netdevs)->data;
1443 }
1444
1445 if (!dp) {
1446 ovs_mutex_unlock(&dp_netdev_mutex);
1447 unixctl_command_reply_error(conn,
1448 "please specify an existing datapath");
1449 return;
1450 }
1451
1452 dp_netdev_request_reconfigure(dp);
1453 ovs_mutex_unlock(&dp_netdev_mutex);
1454 ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1455 unixctl_command_reply(conn, ds_cstr(&reply));
1456 ds_destroy(&reply);
1457 }
1458
1459 static void
1460 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1461 void *aux)
1462 {
1463 struct ds reply = DS_EMPTY_INITIALIZER;
1464 struct dp_netdev_pmd_thread **pmd_list;
1465 struct dp_netdev *dp = NULL;
1466 enum pmd_info_type type = *(enum pmd_info_type *) aux;
1467 unsigned int core_id;
1468 bool filter_on_pmd = false;
1469 size_t n;
1470
1471 ovs_mutex_lock(&dp_netdev_mutex);
1472
1473 while (argc > 1) {
1474 if (!strcmp(argv[1], "-pmd") && argc > 2) {
1475 if (str_to_uint(argv[2], 10, &core_id)) {
1476 filter_on_pmd = true;
1477 }
1478 argc -= 2;
1479 argv += 2;
1480 } else {
1481 dp = shash_find_data(&dp_netdevs, argv[1]);
1482 argc -= 1;
1483 argv += 1;
1484 }
1485 }
1486
1487 if (!dp) {
1488 if (shash_count(&dp_netdevs) == 1) {
1489 /* There's only one datapath */
1490 dp = shash_first(&dp_netdevs)->data;
1491 } else {
1492 ovs_mutex_unlock(&dp_netdev_mutex);
1493 unixctl_command_reply_error(conn,
1494 "please specify an existing datapath");
1495 return;
1496 }
1497 }
1498
1499 sorted_poll_thread_list(dp, &pmd_list, &n);
1500 for (size_t i = 0; i < n; i++) {
1501 struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1502 if (!pmd) {
1503 break;
1504 }
1505 if (filter_on_pmd && pmd->core_id != core_id) {
1506 continue;
1507 }
1508 if (type == PMD_INFO_SHOW_RXQ) {
1509 pmd_info_show_rxq(&reply, pmd);
1510 } else if (type == PMD_INFO_CLEAR_STATS) {
1511 pmd_perf_stats_clear(&pmd->perf_stats);
1512 } else if (type == PMD_INFO_SHOW_STATS) {
1513 pmd_info_show_stats(&reply, pmd);
1514 } else if (type == PMD_INFO_PERF_SHOW) {
1515 pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
1516 }
1517 }
1518 free(pmd_list);
1519
1520 ovs_mutex_unlock(&dp_netdev_mutex);
1521
1522 unixctl_command_reply(conn, ds_cstr(&reply));
1523 ds_destroy(&reply);
1524 }
1525
1526 static void
1527 pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1528 const char *argv[],
1529 void *aux OVS_UNUSED)
1530 {
1531 struct pmd_perf_params par;
1532 long int it_hist = 0, ms_hist = 0;
1533 par.histograms = true;
1534
1535 while (argc > 1) {
1536 if (!strcmp(argv[1], "-nh")) {
1537 par.histograms = false;
1538 argc -= 1;
1539 argv += 1;
1540 } else if (!strcmp(argv[1], "-it") && argc > 2) {
1541 it_hist = strtol(argv[2], NULL, 10);
1542 if (it_hist < 0) {
1543 it_hist = 0;
1544 } else if (it_hist > HISTORY_LEN) {
1545 it_hist = HISTORY_LEN;
1546 }
1547 argc -= 2;
1548 argv += 2;
1549 } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1550 ms_hist = strtol(argv[2], NULL, 10);
1551 if (ms_hist < 0) {
1552 ms_hist = 0;
1553 } else if (ms_hist > HISTORY_LEN) {
1554 ms_hist = HISTORY_LEN;
1555 }
1556 argc -= 2;
1557 argv += 2;
1558 } else {
1559 break;
1560 }
1561 }
1562 par.iter_hist_len = it_hist;
1563 par.ms_hist_len = ms_hist;
1564 par.command_type = PMD_INFO_PERF_SHOW;
1565 dpif_netdev_pmd_info(conn, argc, argv, &par);
1566 }
1567
1568 static void
1569 dpif_netdev_bond_show(struct unixctl_conn *conn, int argc,
1570 const char *argv[], void *aux OVS_UNUSED)
1571 {
1572 struct ds reply = DS_EMPTY_INITIALIZER;
1573 struct dp_netdev *dp = NULL;
1574
1575 ovs_mutex_lock(&dp_netdev_mutex);
1576 if (argc == 2) {
1577 dp = shash_find_data(&dp_netdevs, argv[1]);
1578 } else if (shash_count(&dp_netdevs) == 1) {
1579 /* There's only one datapath. */
1580 dp = shash_first(&dp_netdevs)->data;
1581 }
1582 if (!dp) {
1583 ovs_mutex_unlock(&dp_netdev_mutex);
1584 unixctl_command_reply_error(conn,
1585 "please specify an existing datapath");
1586 return;
1587 }
1588
1589 if (cmap_count(&dp->tx_bonds) > 0) {
1590 struct tx_bond *dp_bond_entry;
1591
1592 ds_put_cstr(&reply, "Bonds:\n");
1593 CMAP_FOR_EACH (dp_bond_entry, node, &dp->tx_bonds) {
1594 ds_put_format(&reply, " bond-id %"PRIu32":\n",
1595 dp_bond_entry->bond_id);
1596 for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
1597 uint32_t member_id = odp_to_u32(
1598 dp_bond_entry->member_buckets[bucket].member_id);
1599 ds_put_format(&reply,
1600 " bucket %d - member %"PRIu32"\n",
1601 bucket, member_id);
1602 }
1603 }
1604 }
1605 ovs_mutex_unlock(&dp_netdev_mutex);
1606 unixctl_command_reply(conn, ds_cstr(&reply));
1607 ds_destroy(&reply);
1608 }
1609
1610 \f
1611 static int
1612 dpif_netdev_init(void)
1613 {
1614 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
1615 clear_aux = PMD_INFO_CLEAR_STATS,
1616 poll_aux = PMD_INFO_SHOW_RXQ;
1617
1618 unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1619 0, 3, dpif_netdev_pmd_info,
1620 (void *)&show_aux);
1621 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1622 0, 3, dpif_netdev_pmd_info,
1623 (void *)&clear_aux);
1624 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
1625 0, 3, dpif_netdev_pmd_info,
1626 (void *)&poll_aux);
1627 unixctl_command_register("dpif-netdev/pmd-perf-show",
1628 "[-nh] [-it iter-history-len]"
1629 " [-ms ms-history-len]"
1630 " [-pmd core] [dp]",
1631 0, 8, pmd_perf_show_cmd,
1632 NULL);
1633 unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1634 0, 1, dpif_netdev_pmd_rebalance,
1635 NULL);
1636 unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1637 "on|off [-b before] [-a after] [-e|-ne] "
1638 "[-us usec] [-q qlen]",
1639 0, 10, pmd_perf_log_set_cmd,
1640 NULL);
1641 unixctl_command_register("dpif-netdev/bond-show", "[dp]",
1642 0, 1, dpif_netdev_bond_show,
1643 NULL);
1644 unixctl_command_register("dpif-netdev/subtable-lookup-prio-set",
1645 "[lookup_func] [prio] [dp]",
1646 2, 3, dpif_netdev_subtable_lookup_set,
1647 NULL);
1648 unixctl_command_register("dpif-netdev/subtable-lookup-prio-get", "",
1649 0, 0, dpif_netdev_subtable_lookup_get,
1650 NULL);
1651 return 0;
1652 }
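
/* Example invocations of the commands registered above (the lookup function
 * name "generic" is only an assumed example; the available names can be
 * listed with subtable-lookup-prio-get):
 *
 *     ovs-appctl dpif-netdev/pmd-stats-show
 *     ovs-appctl dpif-netdev/pmd-rxq-show -pmd 3
 *     ovs-appctl dpif-netdev/subtable-lookup-prio-set generic 2
 */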
1653
1654 static int
1655 dpif_netdev_enumerate(struct sset *all_dps,
1656 const struct dpif_class *dpif_class)
1657 {
1658 struct shash_node *node;
1659
1660 ovs_mutex_lock(&dp_netdev_mutex);
1661 SHASH_FOR_EACH(node, &dp_netdevs) {
1662 struct dp_netdev *dp = node->data;
1663 if (dpif_class != dp->class) {
1664 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1665 * If the class doesn't match, skip this dpif. */
1666 continue;
1667 }
1668 sset_add(all_dps, node->name);
1669 }
1670 ovs_mutex_unlock(&dp_netdev_mutex);
1671
1672 return 0;
1673 }
1674
1675 static bool
1676 dpif_netdev_class_is_dummy(const struct dpif_class *class)
1677 {
1678 return class != &dpif_netdev_class;
1679 }
1680
1681 static const char *
1682 dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
1683 {
1684 return strcmp(type, "internal") ? type
1685 : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
1686 : "tap";
1687 }
1688
1689 static struct dpif *
1690 create_dpif_netdev(struct dp_netdev *dp)
1691 {
1692 uint16_t netflow_id = hash_string(dp->name, 0);
1693 struct dpif_netdev *dpif;
1694
1695 ovs_refcount_ref(&dp->ref_cnt);
1696
1697 dpif = xmalloc(sizeof *dpif);
1698 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1699 dpif->dp = dp;
1700 dpif->last_port_seq = seq_read(dp->port_seq);
1701
1702 return &dpif->dpif;
1703 }
1704
1705 /* Choose an unused, non-zero port number and return it on success.
1706 * Return ODPP_NONE on failure. */
1707 static odp_port_t
1708 choose_port(struct dp_netdev *dp, const char *name)
1709 OVS_REQUIRES(dp->port_mutex)
1710 {
1711 uint32_t port_no;
1712
1713 if (dp->class != &dpif_netdev_class) {
1714 const char *p;
1715 int start_no = 0;
1716
1717 /* If the port name begins with "br", start the number search at
1718 * 100 to make writing tests easier. */
1719 if (!strncmp(name, "br", 2)) {
1720 start_no = 100;
1721 }
1722
1723 /* If the port name contains a number, try to assign that port number.
1724 * This can make writing unit tests easier because port numbers are
1725 * predictable. */
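/* E.g., on a dummy datapath a port named "br3" would get port number
 * 103 (100 + 3), provided that number is still free. */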
1726 for (p = name; *p != '\0'; p++) {
1727 if (isdigit((unsigned char) *p)) {
1728 port_no = start_no + strtol(p, NULL, 10);
1729 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1730 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1731 return u32_to_odp(port_no);
1732 }
1733 break;
1734 }
1735 }
1736 }
1737
1738 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1739 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1740 return u32_to_odp(port_no);
1741 }
1742 }
1743
1744 return ODPP_NONE;
1745 }
1746
1747 static int
1748 create_dp_netdev(const char *name, const struct dpif_class *class,
1749 struct dp_netdev **dpp)
1750 OVS_REQUIRES(dp_netdev_mutex)
1751 {
1752 static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER;
1753 struct dp_netdev *dp;
1754 int error;
1755
1756 /* Avoid estimating TSC frequency for the dummy datapath so as not to
1757 * slow down unit tests. */
1758 if (!dpif_netdev_class_is_dummy(class)
1759 && ovsthread_once_start(&tsc_freq_check)) {
1760 pmd_perf_estimate_tsc_frequency();
1761 ovsthread_once_done(&tsc_freq_check);
1762 }
1763
1764 dp = xzalloc(sizeof *dp);
1765 shash_add(&dp_netdevs, name, dp);
1766
1767 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1768 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1769 ovs_refcount_init(&dp->ref_cnt);
1770 atomic_flag_clear(&dp->destroyed);
1771
1772 ovs_mutex_init_recursive(&dp->port_mutex);
1773 hmap_init(&dp->ports);
1774 dp->port_seq = seq_create();
1775 ovs_mutex_init(&dp->bond_mutex);
1776 cmap_init(&dp->tx_bonds);
1777
1778 fat_rwlock_init(&dp->upcall_rwlock);
1779
1780 dp->reconfigure_seq = seq_create();
1781 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1782
1783 for (int i = 0; i < N_METER_LOCKS; ++i) {
1784 ovs_mutex_init_adaptive(&dp->meter_locks[i]);
1785 }
1786
1787 /* Disable upcalls by default. */
1788 dp_netdev_disable_upcall(dp);
1789 dp->upcall_aux = NULL;
1790 dp->upcall_cb = NULL;
1791
1792 dp->conntrack = conntrack_init();
1793
1794 atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1795 atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
1796
1797 cmap_init(&dp->poll_threads);
1798 dp->pmd_rxq_assign_cyc = true;
1799
1800 ovs_mutex_init(&dp->tx_qid_pool_mutex);
1801 /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1802 dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1803
1804 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1805 ovsthread_key_create(&dp->per_pmd_key, NULL);
1806
1807 ovs_mutex_lock(&dp->port_mutex);
1808 /* non-PMD will be created before all other threads and will
1809 * allocate static_tx_qid = 0. */
1810 dp_netdev_set_nonpmd(dp);
1811
1812 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1813 "internal"),
1814 ODPP_LOCAL);
1815 ovs_mutex_unlock(&dp->port_mutex);
1816 if (error) {
1817 dp_netdev_free(dp);
1818 return error;
1819 }
1820
1821 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1822 *dpp = dp;
1823 return 0;
1824 }
1825
1826 static void
1827 dp_netdev_request_reconfigure(struct dp_netdev *dp)
1828 {
1829 seq_change(dp->reconfigure_seq);
1830 }
1831
1832 static bool
1833 dp_netdev_is_reconf_required(struct dp_netdev *dp)
1834 {
1835 return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1836 }
1837
1838 static int
1839 dpif_netdev_open(const struct dpif_class *class, const char *name,
1840 bool create, struct dpif **dpifp)
1841 {
1842 struct dp_netdev *dp;
1843 int error;
1844
1845 ovs_mutex_lock(&dp_netdev_mutex);
1846 dp = shash_find_data(&dp_netdevs, name);
1847 if (!dp) {
1848 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1849 } else {
1850 error = (dp->class != class ? EINVAL
1851 : create ? EEXIST
1852 : 0);
1853 }
1854 if (!error) {
1855 *dpifp = create_dpif_netdev(dp);
1856 }
1857 ovs_mutex_unlock(&dp_netdev_mutex);
1858
1859 return error;
1860 }
1861
1862 static void
1863 dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1864 OVS_NO_THREAD_SAFETY_ANALYSIS
1865 {
1866 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1867 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1868
1869 /* Before freeing a lock we should release it */
1870 fat_rwlock_unlock(&dp->upcall_rwlock);
1871 fat_rwlock_destroy(&dp->upcall_rwlock);
1872 }
1873
1874 static void
1875 dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
1876 OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
1877 {
1878 if (dp->meters[meter_id]) {
1879 free(dp->meters[meter_id]);
1880 dp->meters[meter_id] = NULL;
1881 }
1882 }
1883
1884 static uint32_t
1885 hash_bond_id(uint32_t bond_id)
1886 {
1887 return hash_int(bond_id, 0);
1888 }
1889
1890 /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1891 * through the 'dp_netdevs' shash while freeing 'dp'. */
1892 static void
1893 dp_netdev_free(struct dp_netdev *dp)
1894 OVS_REQUIRES(dp_netdev_mutex)
1895 {
1896 struct dp_netdev_port *port, *next;
1897 struct tx_bond *bond;
1898
1899 shash_find_and_delete(&dp_netdevs, dp->name);
1900
1901 ovs_mutex_lock(&dp->port_mutex);
1902 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
1903 do_del_port(dp, port);
1904 }
1905 ovs_mutex_unlock(&dp->port_mutex);
1906
1907 ovs_mutex_lock(&dp->bond_mutex);
1908 CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
1909 cmap_remove(&dp->tx_bonds, &bond->node, hash_bond_id(bond->bond_id));
1910 ovsrcu_postpone(free, bond);
1911 }
1912 ovs_mutex_unlock(&dp->bond_mutex);
1913
1914 dp_netdev_destroy_all_pmds(dp, true);
1915 cmap_destroy(&dp->poll_threads);
1916
1917 ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1918 id_pool_destroy(dp->tx_qid_pool);
1919
1920 ovs_mutex_destroy(&dp->non_pmd_mutex);
1921 ovsthread_key_delete(dp->per_pmd_key);
1922
1923 conntrack_destroy(dp->conntrack);
1924
1925
1926 seq_destroy(dp->reconfigure_seq);
1927
1928 seq_destroy(dp->port_seq);
1929 hmap_destroy(&dp->ports);
1930 ovs_mutex_destroy(&dp->port_mutex);
1931
1932 cmap_destroy(&dp->tx_bonds);
1933 ovs_mutex_destroy(&dp->bond_mutex);
1934
1935 /* Upcalls must be disabled at this point */
1936 dp_netdev_destroy_upcall_lock(dp);
1937
1938 int i;
1939
1940 for (i = 0; i < MAX_METERS; ++i) {
1941 meter_lock(dp, i);
1942 dp_delete_meter(dp, i);
1943 meter_unlock(dp, i);
1944 }
1945 for (i = 0; i < N_METER_LOCKS; ++i) {
1946 ovs_mutex_destroy(&dp->meter_locks[i]);
1947 }
1948
1949 free(dp->pmd_cmask);
1950 free(CONST_CAST(char *, dp->name));
1951 free(dp);
1952 }
1953
1954 static void
1955 dp_netdev_unref(struct dp_netdev *dp)
1956 {
1957 if (dp) {
1958 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1959 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1960 ovs_mutex_lock(&dp_netdev_mutex);
1961 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1962 dp_netdev_free(dp);
1963 }
1964 ovs_mutex_unlock(&dp_netdev_mutex);
1965 }
1966 }
1967
1968 static void
1969 dpif_netdev_close(struct dpif *dpif)
1970 {
1971 struct dp_netdev *dp = get_dp_netdev(dpif);
1972
1973 dp_netdev_unref(dp);
1974 free(dpif);
1975 }
1976
1977 static int
1978 dpif_netdev_destroy(struct dpif *dpif)
1979 {
1980 struct dp_netdev *dp = get_dp_netdev(dpif);
1981
1982 if (!atomic_flag_test_and_set(&dp->destroyed)) {
1983 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1984 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1985 OVS_NOT_REACHED();
1986 }
1987 }
1988
1989 return 0;
1990 }
1991
1992 /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1993 * load/store semantics. While the increment is not atomic, the load and
1994 * store operations are, making it impossible to read inconsistent values.
1995 *
1996 * This is used to update thread local stats counters. */
1997 static void
1998 non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1999 {
2000 unsigned long long tmp;
2001
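/* The counter is only ever written by its owning thread, so this
 * non-atomic read-modify-write cannot lose updates from concurrent
 * writers; readers may observe a slightly stale value, but never a
 * torn one. */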
2002 atomic_read_relaxed(var, &tmp);
2003 tmp += n;
2004 atomic_store_relaxed(var, tmp);
2005 }
2006
2007 static int
2008 dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
2009 {
2010 struct dp_netdev *dp = get_dp_netdev(dpif);
2011 struct dp_netdev_pmd_thread *pmd;
2012 uint64_t pmd_stats[PMD_N_STATS];
2013
2014 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
2015 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2016 stats->n_flows += cmap_count(&pmd->flow_table);
2017 pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
2018 stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
2019 stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
2020 stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
2021 stats->n_missed += pmd_stats[PMD_STAT_MISS];
2022 stats->n_lost += pmd_stats[PMD_STAT_LOST];
2023 }
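/* Mask-based statistics are not tracked by the userspace datapath;
 * report placeholder maximum values instead. */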
2024 stats->n_masks = UINT32_MAX;
2025 stats->n_mask_hit = UINT64_MAX;
2026
2027 return 0;
2028 }
2029
2030 static void
2031 dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
2032 {
2033 if (pmd->core_id == NON_PMD_CORE_ID) {
2034 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
2035 ovs_mutex_lock(&pmd->port_mutex);
2036 pmd_load_cached_ports(pmd);
2037 ovs_mutex_unlock(&pmd->port_mutex);
2038 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
2039 return;
2040 }
2041
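/* Change reload_seq before setting 'reload'; the release store
 * guarantees that a thread reading 'reload' with acquire semantics
 * also sees the new sequence value. */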
2042 seq_change(pmd->reload_seq);
2043 atomic_store_explicit(&pmd->reload, true, memory_order_release);
2044 }
2045
2046 static uint32_t
2047 hash_port_no(odp_port_t port_no)
2048 {
2049 return hash_int(odp_to_u32(port_no), 0);
2050 }
2051
2052 static int
2053 port_create(const char *devname, const char *type,
2054 odp_port_t port_no, struct dp_netdev_port **portp)
2055 {
2056 struct dp_netdev_port *port;
2057 enum netdev_flags flags;
2058 struct netdev *netdev;
2059 int error;
2060
2061 *portp = NULL;
2062
2063 /* Open and validate network device. */
2064 error = netdev_open(devname, type, &netdev);
2065 if (error) {
2066 return error;
2067 }
2068 /* XXX reject non-Ethernet devices */
2069
2070 netdev_get_flags(netdev, &flags);
2071 if (flags & NETDEV_LOOPBACK) {
2072 VLOG_ERR("%s: cannot add a loopback device", devname);
2073 error = EINVAL;
2074 goto out;
2075 }
2076
2077 port = xzalloc(sizeof *port);
2078 port->port_no = port_no;
2079 port->netdev = netdev;
2080 port->type = xstrdup(type);
2081 port->sf = NULL;
2082 port->emc_enabled = true;
2083 port->need_reconfigure = true;
2084 ovs_mutex_init(&port->txq_used_mutex);
2085
2086 *portp = port;
2087
2088 return 0;
2089
2090 out:
2091 netdev_close(netdev);
2092 return error;
2093 }
2094
2095 static int
2096 do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
2097 odp_port_t port_no)
2098 OVS_REQUIRES(dp->port_mutex)
2099 {
2100 struct netdev_saved_flags *sf;
2101 struct dp_netdev_port *port;
2102 int error;
2103
2104 /* Reject devices already in 'dp'. */
2105 if (!get_port_by_name(dp, devname, &port)) {
2106 return EEXIST;
2107 }
2108
2109 error = port_create(devname, type, port_no, &port);
2110 if (error) {
2111 return error;
2112 }
2113
2114 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
2115 seq_change(dp->port_seq);
2116
2117 reconfigure_datapath(dp);
2118
2119 /* Check that port was successfully configured. */
2120 if (!dp_netdev_lookup_port(dp, port_no)) {
2121 return EINVAL;
2122 }
2123
2124 /* Updating device flags triggers an if_notifier, which triggers a bridge
2125 * reconfiguration and another attempt to add this port, leading to an
2126 * infinite loop if the device is configured incorrectly and cannot be
2127 * added. We set promiscuous mode only after a successful reconfiguration,
2128 * since at that point we already know the device is properly configured. */
2129 error = netdev_turn_flags_on(port->netdev, NETDEV_PROMISC, &sf);
2130 if (error) {
2131 VLOG_ERR("%s: cannot set promisc flag", devname);
2132 do_del_port(dp, port);
2133 return error;
2134 }
2135 port->sf = sf;
2136
2137 return 0;
2138 }
2139
2140 static int
2141 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
2142 odp_port_t *port_nop)
2143 {
2144 struct dp_netdev *dp = get_dp_netdev(dpif);
2145 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
2146 const char *dpif_port;
2147 odp_port_t port_no;
2148 int error;
2149
2150 ovs_mutex_lock(&dp->port_mutex);
2151 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
2152 if (*port_nop != ODPP_NONE) {
2153 port_no = *port_nop;
2154 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
2155 } else {
2156 port_no = choose_port(dp, dpif_port);
2157 error = port_no == ODPP_NONE ? EFBIG : 0;
2158 }
2159 if (!error) {
2160 *port_nop = port_no;
2161 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
2162 }
2163 ovs_mutex_unlock(&dp->port_mutex);
2164
2165 return error;
2166 }
2167
2168 static int
2169 dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
2170 {
2171 struct dp_netdev *dp = get_dp_netdev(dpif);
2172 int error;
2173
2174 ovs_mutex_lock(&dp->port_mutex);
2175 if (port_no == ODPP_LOCAL) {
2176 error = EINVAL;
2177 } else {
2178 struct dp_netdev_port *port;
2179
2180 error = get_port_by_number(dp, port_no, &port);
2181 if (!error) {
2182 do_del_port(dp, port);
2183 }
2184 }
2185 ovs_mutex_unlock(&dp->port_mutex);
2186
2187 return error;
2188 }
2189
2190 static bool
2191 is_valid_port_number(odp_port_t port_no)
2192 {
2193 return port_no != ODPP_NONE;
2194 }
2195
2196 static struct dp_netdev_port *
2197 dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
2198 OVS_REQUIRES(dp->port_mutex)
2199 {
2200 struct dp_netdev_port *port;
2201
2202 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
2203 if (port->port_no == port_no) {
2204 return port;
2205 }
2206 }
2207 return NULL;
2208 }
2209
2210 static int
2211 get_port_by_number(struct dp_netdev *dp,
2212 odp_port_t port_no, struct dp_netdev_port **portp)
2213 OVS_REQUIRES(dp->port_mutex)
2214 {
2215 if (!is_valid_port_number(port_no)) {
2216 *portp = NULL;
2217 return EINVAL;
2218 } else {
2219 *portp = dp_netdev_lookup_port(dp, port_no);
2220 return *portp ? 0 : ENODEV;
2221 }
2222 }
2223
2224 static void
2225 port_destroy(struct dp_netdev_port *port)
2226 {
2227 if (!port) {
2228 return;
2229 }
2230
2231 netdev_close(port->netdev);
2232 netdev_restore_flags(port->sf);
2233
2234 for (unsigned i = 0; i < port->n_rxq; i++) {
2235 netdev_rxq_close(port->rxqs[i].rx);
2236 }
2237 ovs_mutex_destroy(&port->txq_used_mutex);
2238 free(port->rxq_affinity_list);
2239 free(port->txq_used);
2240 free(port->rxqs);
2241 free(port->type);
2242 free(port);
2243 }
2244
2245 static int
2246 get_port_by_name(struct dp_netdev *dp,
2247 const char *devname, struct dp_netdev_port **portp)
2248 OVS_REQUIRES(dp->port_mutex)
2249 {
2250 struct dp_netdev_port *port;
2251
2252 HMAP_FOR_EACH (port, node, &dp->ports) {
2253 if (!strcmp(netdev_get_name(port->netdev), devname)) {
2254 *portp = port;
2255 return 0;
2256 }
2257 }
2258
2259 /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
2260 * non-existent port. */
2261 return ENODEV;
2262 }
2263
2264 /* Returns 'true' if there is a port with pmd netdev. */
2265 static bool
2266 has_pmd_port(struct dp_netdev *dp)
2267 OVS_REQUIRES(dp->port_mutex)
2268 {
2269 struct dp_netdev_port *port;
2270
2271 HMAP_FOR_EACH (port, node, &dp->ports) {
2272 if (netdev_is_pmd(port->netdev)) {
2273 return true;
2274 }
2275 }
2276
2277 return false;
2278 }
2279
2280 static void
2281 do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
2282 OVS_REQUIRES(dp->port_mutex)
2283 {
2284 hmap_remove(&dp->ports, &port->node);
2285 seq_change(dp->port_seq);
2286
2287 reconfigure_datapath(dp);
2288
2289 port_destroy(port);
2290 }
2291
2292 static void
2293 answer_port_query(const struct dp_netdev_port *port,
2294 struct dpif_port *dpif_port)
2295 {
2296 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
2297 dpif_port->type = xstrdup(port->type);
2298 dpif_port->port_no = port->port_no;
2299 }
2300
2301 static int
2302 dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
2303 struct dpif_port *dpif_port)
2304 {
2305 struct dp_netdev *dp = get_dp_netdev(dpif);
2306 struct dp_netdev_port *port;
2307 int error;
2308
2309 ovs_mutex_lock(&dp->port_mutex);
2310 error = get_port_by_number(dp, port_no, &port);
2311 if (!error && dpif_port) {
2312 answer_port_query(port, dpif_port);
2313 }
2314 ovs_mutex_unlock(&dp->port_mutex);
2315
2316 return error;
2317 }
2318
2319 static int
2320 dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
2321 struct dpif_port *dpif_port)
2322 {
2323 struct dp_netdev *dp = get_dp_netdev(dpif);
2324 struct dp_netdev_port *port;
2325 int error;
2326
2327 ovs_mutex_lock(&dp->port_mutex);
2328 error = get_port_by_name(dp, devname, &port);
2329 if (!error && dpif_port) {
2330 answer_port_query(port, dpif_port);
2331 }
2332 ovs_mutex_unlock(&dp->port_mutex);
2333
2334 return error;
2335 }
2336
2337 static void
2338 dp_netdev_flow_free(struct dp_netdev_flow *flow)
2339 {
2340 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
2341 free(flow->dp_extra_info);
2342 free(flow);
2343 }
2344
2345 static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
2346 {
2347 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
2348 ovsrcu_postpone(dp_netdev_flow_free, flow);
2349 }
2350 }
2351
2352 static uint32_t
2353 dp_netdev_flow_hash(const ovs_u128 *ufid)
2354 {
2355 return ufid->u32[0];
2356 }
2357
2358 static inline struct dpcls *
2359 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
2360 odp_port_t in_port)
2361 {
2362 struct dpcls *cls;
2363 uint32_t hash = hash_port_no(in_port);
2364 CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
2365 if (cls->in_port == in_port) {
2366 /* Port classifier exists already */
2367 return cls;
2368 }
2369 }
2370 return NULL;
2371 }
2372
2373 static inline struct dpcls *
2374 dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
2375 odp_port_t in_port)
2376 OVS_REQUIRES(pmd->flow_mutex)
2377 {
2378 struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2379 uint32_t hash = hash_port_no(in_port);
2380
2381 if (!cls) {
2382 /* Create new classifier for in_port */
2383 cls = xmalloc(sizeof(*cls));
2384 dpcls_init(cls);
2385 cls->in_port = in_port;
2386 cmap_insert(&pmd->classifiers, &cls->node, hash);
2387 VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
2388 }
2389 return cls;
2390 }
2391
2392 #define MAX_FLOW_MARK (UINT32_MAX - 1)
2393 #define INVALID_FLOW_MARK 0
2394 /* A zero flow mark tells the HW to remove the mark. A packet carrying a
2395 * zero mark is received in SW as if it had no mark at all, so zero
2396 * cannot be used as a valid mark.
2397 */
2398
2399 struct megaflow_to_mark_data {
2400 const struct cmap_node node;
2401 ovs_u128 mega_ufid;
2402 uint32_t mark;
2403 };
2404
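/* Global flow mark state: 'megaflow_to_mark' maps a mega ufid to its
 * mark (1:1), 'mark_to_flow' maps a mark to the per-PMD flows that use
 * it (1:N), and 'pool' hands out unused mark values. */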
2405 struct flow_mark {
2406 struct cmap megaflow_to_mark;
2407 struct cmap mark_to_flow;
2408 struct id_pool *pool;
2409 };
2410
2411 static struct flow_mark flow_mark = {
2412 .megaflow_to_mark = CMAP_INITIALIZER,
2413 .mark_to_flow = CMAP_INITIALIZER,
2414 };
2415
2416 static uint32_t
2417 flow_mark_alloc(void)
2418 {
2419 uint32_t mark;
2420
2421 if (!flow_mark.pool) {
2422 /* Hasn't been initialized yet, do it here. */
2423 flow_mark.pool = id_pool_create(1, MAX_FLOW_MARK);
2424 }
2425
2426 if (id_pool_alloc_id(flow_mark.pool, &mark)) {
2427 return mark;
2428 }
2429
2430 return INVALID_FLOW_MARK;
2431 }
2432
2433 static void
2434 flow_mark_free(uint32_t mark)
2435 {
2436 id_pool_free_id(flow_mark.pool, mark);
2437 }
2438
2439 /* Associate a megaflow with a mark, which is a 1:1 mapping. */
2440 static void
2441 megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
2442 {
2443 size_t hash = dp_netdev_flow_hash(mega_ufid);
2444 struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
2445
2446 data->mega_ufid = *mega_ufid;
2447 data->mark = mark;
2448
2449 cmap_insert(&flow_mark.megaflow_to_mark,
2450 CONST_CAST(struct cmap_node *, &data->node), hash);
2451 }
2452
2453 /* Disassociate a megaflow from its mark. */
2454 static void
2455 megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
2456 {
2457 size_t hash = dp_netdev_flow_hash(mega_ufid);
2458 struct megaflow_to_mark_data *data;
2459
2460 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2461 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2462 cmap_remove(&flow_mark.megaflow_to_mark,
2463 CONST_CAST(struct cmap_node *, &data->node), hash);
2464 ovsrcu_postpone(free, data);
2465 return;
2466 }
2467 }
2468
2469 VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
2470 UUID_ARGS((struct uuid *)mega_ufid));
2471 }
2472
2473 static inline uint32_t
2474 megaflow_to_mark_find(const ovs_u128 *mega_ufid)
2475 {
2476 size_t hash = dp_netdev_flow_hash(mega_ufid);
2477 struct megaflow_to_mark_data *data;
2478
2479 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2480 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2481 return data->mark;
2482 }
2483 }
2484
2485 VLOG_DBG("Mark id for ufid "UUID_FMT" was not found\n",
2486 UUID_ARGS((struct uuid *)mega_ufid));
2487 return INVALID_FLOW_MARK;
2488 }
2489
2490 /* Associate a mark with a flow, which is a 1:N mapping. */
2491 static void
2492 mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
2493 {
2494 dp_netdev_flow_ref(flow);
2495
2496 cmap_insert(&flow_mark.mark_to_flow,
2497 CONST_CAST(struct cmap_node *, &flow->mark_node),
2498 hash_int(mark, 0));
2499 flow->mark = mark;
2500
2501 VLOG_DBG("Associated dp_netdev flow %p with mark %u mega_ufid "UUID_FMT,
2502 flow, mark, UUID_ARGS((struct uuid *) &flow->mega_ufid));
2503 }
2504
2505 static bool
2506 flow_mark_has_no_ref(uint32_t mark)
2507 {
2508 struct dp_netdev_flow *flow;
2509
2510 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2511 &flow_mark.mark_to_flow) {
2512 if (flow->mark == mark) {
2513 return false;
2514 }
2515 }
2516
2517 return true;
2518 }
2519
2520 static int
2521 mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd,
2522 struct dp_netdev_flow *flow)
2523 {
2524 const char *dpif_type_str = dpif_normalize_type(pmd->dp->class->type);
2525 struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
2526 &flow->mark_node);
2527 uint32_t mark = flow->mark;
2528 int ret = 0;
2529
2530 /* INVALID_FLOW_MARK may mean that the flow has been disassociated or
2531 * never associated. */
2532 if (OVS_UNLIKELY(mark == INVALID_FLOW_MARK)) {
2533 return EINVAL;
2534 }
2535
2536 cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0));
2537 flow->mark = INVALID_FLOW_MARK;
2538
2539 /*
2540 * If no flow is referencing the mark any more, remove the flow
2541 * from hardware and free the mark.
2542 */
2543 if (flow_mark_has_no_ref(mark)) {
2544 struct netdev *port;
2545 odp_port_t in_port = flow->flow.in_port.odp_port;
2546
2547 port = netdev_ports_get(in_port, dpif_type_str);
2548 if (port) {
2549 /* Taking a global 'port_mutex' to fulfill thread safety
2550 * restrictions for the netdev-offload-dpdk module. */
2551 ovs_mutex_lock(&pmd->dp->port_mutex);
2552 ret = netdev_flow_del(port, &flow->mega_ufid, NULL);
2553 ovs_mutex_unlock(&pmd->dp->port_mutex);
2554 netdev_close(port);
2555 }
2556
2557 flow_mark_free(mark);
2558 VLOG_DBG("Freed flow mark %u mega_ufid "UUID_FMT, mark,
2559 UUID_ARGS((struct uuid *) &flow->mega_ufid));
2560
2561 megaflow_to_mark_disassociate(&flow->mega_ufid);
2562 }
2563 dp_netdev_flow_unref(flow);
2564
2565 return ret;
2566 }
2567
2568 static void
2569 flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
2570 {
2571 struct dp_netdev_flow *flow;
2572
2573 CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) {
2574 if (flow->pmd_id == pmd->core_id) {
2575 queue_netdev_flow_del(pmd, flow);
2576 }
2577 }
2578 }
2579
2580 static struct dp_netdev_flow *
2581 mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
2582 const uint32_t mark)
2583 {
2584 struct dp_netdev_flow *flow;
2585
2586 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2587 &flow_mark.mark_to_flow) {
2588 if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
2589 flow->dead == false) {
2590 return flow;
2591 }
2592 }
2593
2594 return NULL;
2595 }
2596
2597 static struct dp_flow_offload_item *
2598 dp_netdev_alloc_flow_offload(struct dp_netdev_pmd_thread *pmd,
2599 struct dp_netdev_flow *flow,
2600 int op)
2601 {
2602 struct dp_flow_offload_item *offload;
2603
2604 offload = xzalloc(sizeof(*offload));
2605 offload->pmd = pmd;
2606 offload->flow = flow;
2607 offload->op = op;
2608
2609 dp_netdev_flow_ref(flow);
2610 dp_netdev_pmd_try_ref(pmd);
2611
2612 return offload;
2613 }
2614
2615 static void
2616 dp_netdev_free_flow_offload(struct dp_flow_offload_item *offload)
2617 {
2618 dp_netdev_pmd_unref(offload->pmd);
2619 dp_netdev_flow_unref(offload->flow);
2620
2621 free(offload->actions);
2622 free(offload);
2623 }
2624
2625 static void
2626 dp_netdev_append_flow_offload(struct dp_flow_offload_item *offload)
2627 {
2628 ovs_mutex_lock(&dp_flow_offload.mutex);
2629 ovs_list_push_back(&dp_flow_offload.list, &offload->node);
2630 xpthread_cond_signal(&dp_flow_offload.cond);
2631 ovs_mutex_unlock(&dp_flow_offload.mutex);
2632 }
2633
2634 static int
2635 dp_netdev_flow_offload_del(struct dp_flow_offload_item *offload)
2636 {
2637 return mark_to_flow_disassociate(offload->pmd, offload->flow);
2638 }
2639
2640 /*
2641 * There are two flow offload operations here: addition and modification.
2642 *
2643 * For flow addition, this function does:
2644 * - allocate a new flow mark id
2645 * - perform hardware flow offload
2646 * - associate the flow mark with flow and mega flow
2647 *
2648 * For flow modification, both the flow mark and the associations are
2649 * still valid, so only the second step (hardware flow offload) is needed.
2650 */
2651 static int
2652 dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
2653 {
2654 struct dp_netdev_pmd_thread *pmd = offload->pmd;
2655 struct dp_netdev_flow *flow = offload->flow;
2656 odp_port_t in_port = flow->flow.in_port.odp_port;
2657 const char *dpif_type_str = dpif_normalize_type(pmd->dp->class->type);
2658 bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2659 struct offload_info info;
2660 struct netdev *port;
2661 uint32_t mark;
2662 int ret;
2663
2664 if (flow->dead) {
2665 return -1;
2666 }
2667
2668 if (modification) {
2669 mark = flow->mark;
2670 ovs_assert(mark != INVALID_FLOW_MARK);
2671 } else {
2672 /*
2673 * If a mega flow has already been offloaded (from other PMD
2674 * instances), do not offload it again.
2675 */
2676 mark = megaflow_to_mark_find(&flow->mega_ufid);
2677 if (mark != INVALID_FLOW_MARK) {
2678 VLOG_DBG("Flow has already been offloaded with mark %u\n", mark);
2679 if (flow->mark != INVALID_FLOW_MARK) {
2680 ovs_assert(flow->mark == mark);
2681 } else {
2682 mark_to_flow_associate(mark, flow);
2683 }
2684 return 0;
2685 }
2686
2687 mark = flow_mark_alloc();
2688 if (mark == INVALID_FLOW_MARK) {
2689 VLOG_ERR("Failed to allocate flow mark!\n");
2690 return -1;
2691 }
2692 }
2693 info.flow_mark = mark;
2694
2695 port = netdev_ports_get(in_port, dpif_type_str);
2696 if (!port || netdev_vport_is_vport_class(port->netdev_class)) {
2697 netdev_close(port);
2698 goto err_free;
2699 }
2700 /* Taking a global 'port_mutex' to fulfill thread safety restrictions for
2701 * the netdev-offload-dpdk module. */
2702 ovs_mutex_lock(&pmd->dp->port_mutex);
2703 ret = netdev_flow_put(port, &offload->match,
2704 CONST_CAST(struct nlattr *, offload->actions),
2705 offload->actions_len, &flow->mega_ufid, &info,
2706 NULL);
2707 ovs_mutex_unlock(&pmd->dp->port_mutex);
2708 netdev_close(port);
2709
2710 if (ret) {
2711 goto err_free;
2712 }
2713
2714 if (!modification) {
2715 megaflow_to_mark_associate(&flow->mega_ufid, mark);
2716 mark_to_flow_associate(mark, flow);
2717 }
2718 return 0;
2719
2720 err_free:
2721 if (!modification) {
2722 flow_mark_free(mark);
2723 } else {
2724 mark_to_flow_disassociate(pmd, flow);
2725 }
2726 return -1;
2727 }
2728
2729 static void *
2730 dp_netdev_flow_offload_main(void *data OVS_UNUSED)
2731 {
2732 struct dp_flow_offload_item *offload;
2733 struct ovs_list *list;
2734 const char *op;
2735 int ret;
2736
2737 for (;;) {
2738 ovs_mutex_lock(&dp_flow_offload.mutex);
2739 if (ovs_list_is_empty(&dp_flow_offload.list)) {
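/* Quiesce while blocked on the condition variable so that an idle
 * offload thread does not postpone RCU grace periods. */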
2740 ovsrcu_quiesce_start();
2741 ovs_mutex_cond_wait(&dp_flow_offload.cond,
2742 &dp_flow_offload.mutex);
2743 ovsrcu_quiesce_end();
2744 }
2745 list = ovs_list_pop_front(&dp_flow_offload.list);
2746 offload = CONTAINER_OF(list, struct dp_flow_offload_item, node);
2747 ovs_mutex_unlock(&dp_flow_offload.mutex);
2748
2749 switch (offload->op) {
2750 case DP_NETDEV_FLOW_OFFLOAD_OP_ADD:
2751 op = "add";
2752 ret = dp_netdev_flow_offload_put(offload);
2753 break;
2754 case DP_NETDEV_FLOW_OFFLOAD_OP_MOD:
2755 op = "modify";
2756 ret = dp_netdev_flow_offload_put(offload);
2757 break;
2758 case DP_NETDEV_FLOW_OFFLOAD_OP_DEL:
2759 op = "delete";
2760 ret = dp_netdev_flow_offload_del(offload);
2761 break;
2762 default:
2763 OVS_NOT_REACHED();
2764 }
2765
2766 VLOG_DBG("%s to %s netdev flow "UUID_FMT,
2767 ret == 0 ? "succeeded" : "failed", op,
2768 UUID_ARGS((struct uuid *) &offload->flow->mega_ufid));
2769 dp_netdev_free_flow_offload(offload);
2770 ovsrcu_quiesce();
2771 }
2772
2773 return NULL;
2774 }
2775
2776 static void
2777 queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
2778 struct dp_netdev_flow *flow)
2779 {
2780 struct dp_flow_offload_item *offload;
2781
2782 if (ovsthread_once_start(&offload_thread_once)) {
2783 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2784 ovs_thread_create("dp_netdev_flow_offload",
2785 dp_netdev_flow_offload_main, NULL);
2786 ovsthread_once_done(&offload_thread_once);
2787 }
2788
2789 offload = dp_netdev_alloc_flow_offload(pmd, flow,
2790 DP_NETDEV_FLOW_OFFLOAD_OP_DEL);
2791 dp_netdev_append_flow_offload(offload);
2792 }
2793
2794 static void
2795 queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd,
2796 struct dp_netdev_flow *flow, struct match *match,
2797 const struct nlattr *actions, size_t actions_len)
2798 {
2799 struct dp_flow_offload_item *offload;
2800 int op;
2801
2802 if (!netdev_is_flow_api_enabled()) {
2803 return;
2804 }
2805
2806 if (ovsthread_once_start(&offload_thread_once)) {
2807 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2808 ovs_thread_create("dp_netdev_flow_offload",
2809 dp_netdev_flow_offload_main, NULL);
2810 ovsthread_once_done(&offload_thread_once);
2811 }
2812
2813 if (flow->mark != INVALID_FLOW_MARK) {
2814 op = DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2815 } else {
2816 op = DP_NETDEV_FLOW_OFFLOAD_OP_ADD;
2817 }
2818 offload = dp_netdev_alloc_flow_offload(pmd, flow, op);
2819 offload->match = *match;
2820 offload->actions = xmalloc(actions_len);
2821 memcpy(offload->actions, actions, actions_len);
2822 offload->actions_len = actions_len;
2823
2824 dp_netdev_append_flow_offload(offload);
2825 }
2826
2827 static void
2828 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
2829 struct dp_netdev_flow *flow)
2830 OVS_REQUIRES(pmd->flow_mutex)
2831 {
2832 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2833 struct dpcls *cls;
2834 odp_port_t in_port = flow->flow.in_port.odp_port;
2835
2836 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2837 ovs_assert(cls != NULL);
2838 dpcls_remove(cls, &flow->cr);
2839 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
2840 if (flow->mark != INVALID_FLOW_MARK) {
2841 queue_netdev_flow_del(pmd, flow);
2842 }
2843 flow->dead = true;
2844
2845 dp_netdev_flow_unref(flow);
2846 }
2847
2848 static void
2849 dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
2850 {
2851 struct dp_netdev_flow *netdev_flow;
2852
2853 ovs_mutex_lock(&pmd->flow_mutex);
2854 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2855 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2856 }
2857 ovs_mutex_unlock(&pmd->flow_mutex);
2858 }
2859
2860 static int
2861 dpif_netdev_flow_flush(struct dpif *dpif)
2862 {
2863 struct dp_netdev *dp = get_dp_netdev(dpif);
2864 struct dp_netdev_pmd_thread *pmd;
2865
2866 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2867 dp_netdev_pmd_flow_flush(pmd);
2868 }
2869
2870 return 0;
2871 }
2872
2873 struct dp_netdev_port_state {
2874 struct hmap_position position;
2875 char *name;
2876 };
2877
2878 static int
2879 dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2880 {
2881 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2882 return 0;
2883 }
2884
2885 static int
2886 dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
2887 struct dpif_port *dpif_port)
2888 {
2889 struct dp_netdev_port_state *state = state_;
2890 struct dp_netdev *dp = get_dp_netdev(dpif);
2891 struct hmap_node *node;
2892 int retval;
2893
2894 ovs_mutex_lock(&dp->port_mutex);
2895 node = hmap_at_position(&dp->ports, &state->position);
2896 if (node) {
2897 struct dp_netdev_port *port;
2898
2899 port = CONTAINER_OF(node, struct dp_netdev_port, node);
2900
2901 free(state->name);
2902 state->name = xstrdup(netdev_get_name(port->netdev));
2903 dpif_port->name = state->name;
2904 dpif_port->type = port->type;
2905 dpif_port->port_no = port->port_no;
2906
2907 retval = 0;
2908 } else {
2909 retval = EOF;
2910 }
2911 ovs_mutex_unlock(&dp->port_mutex);
2912
2913 return retval;
2914 }
2915
2916 static int
2917 dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
2918 {
2919 struct dp_netdev_port_state *state = state_;
2920 free(state->name);
2921 free(state);
2922 return 0;
2923 }
2924
2925 static int
2926 dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
2927 {
2928 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2929 uint64_t new_port_seq;
2930 int error;
2931
2932 new_port_seq = seq_read(dpif->dp->port_seq);
2933 if (dpif->last_port_seq != new_port_seq) {
2934 dpif->last_port_seq = new_port_seq;
2935 error = ENOBUFS;
2936 } else {
2937 error = EAGAIN;
2938 }
2939
2940 return error;
2941 }
2942
2943 static void
2944 dpif_netdev_port_poll_wait(const struct dpif *dpif_)
2945 {
2946 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2947
2948 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
2949 }
2950
2951 static struct dp_netdev_flow *
2952 dp_netdev_flow_cast(const struct dpcls_rule *cr)
2953 {
2954 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
2955 }
2956
2957 static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
2958 {
2959 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
2960 }
2961
2962 /* netdev_flow_key utilities.
2963 *
2964 * netdev_flow_key is basically a miniflow. We use these functions
2965 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2966 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2967 *
2968 * - Since we are dealing exclusively with miniflows created by
2969 * miniflow_extract(), if the map is different the miniflow is different.
2970 * Therefore we can be faster by comparing the map and the miniflow in a
2971 * single memcmp().
2972 * - These functions can be inlined by the compiler. */
2973
2974 /* Given the number of bits set in miniflow's maps, returns the size of the
2975 * 'netdev_flow_key.mf' */
2976 static inline size_t
2977 netdev_flow_key_size(size_t flow_u64s)
2978 {
2979 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
2980 }
2981
2982 static inline bool
2983 netdev_flow_key_equal(const struct netdev_flow_key *a,
2984 const struct netdev_flow_key *b)
2985 {
2986 /* 'b->len' may not be set yet. */
2987 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
2988 }
2989
2990 /* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
2991 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
2992 * generated by miniflow_extract. */
2993 static inline bool
2994 netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
2995 const struct miniflow *mf)
2996 {
2997 return !memcmp(&key->mf, mf, key->len);
2998 }
2999
3000 static inline void
3001 netdev_flow_key_clone(struct netdev_flow_key *dst,
3002 const struct netdev_flow_key *src)
3003 {
3004 memcpy(dst, src,
3005 offsetof(struct netdev_flow_key, mf) + src->len);
3006 }
3007
3008 /* Initialize a netdev_flow_key 'mask' from 'match'. */
3009 static inline void
3010 netdev_flow_mask_init(struct netdev_flow_key *mask,
3011 const struct match *match)
3012 {
3013 uint64_t *dst = miniflow_values(&mask->mf);
3014 struct flowmap fmap;
3015 uint32_t hash = 0;
3016 size_t idx;
3017
3018 /* Only check masks that make sense for the flow. */
3019 flow_wc_map(&match->flow, &fmap);
3020 flowmap_init(&mask->mf.map);
3021
3022 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
3023 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
3024
3025 if (mask_u64) {
3026 flowmap_set(&mask->mf.map, idx, 1);
3027 *dst++ = mask_u64;
3028 hash = hash_add64(hash, mask_u64);
3029 }
3030 }
3031
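/* Fold the flowmap itself into the hash so that masks that differ only
 * in which fields they cover still hash differently. */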
3032 map_t map;
3033
3034 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
3035 hash = hash_add64(hash, map);
3036 }
3037
3038 size_t n = dst - miniflow_get_values(&mask->mf);
3039
3040 mask->hash = hash_finish(hash, n * 8);
3041 mask->len = netdev_flow_key_size(n);
3042 }
3043
3044 /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
3045 static inline void
3046 netdev_flow_key_init_masked(struct netdev_flow_key *dst,
3047 const struct flow *flow,
3048 const struct netdev_flow_key *mask)
3049 {
3050 uint64_t *dst_u64 = miniflow_values(&dst->mf);
3051 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
3052 uint32_t hash = 0;
3053 uint64_t value;
3054
3055 dst->len = mask->len;
3056 dst->mf = mask->mf; /* Copy maps. */
3057
3058 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
3059 *dst_u64 = value & *mask_u64++;
3060 hash = hash_add64(hash, *dst_u64++);
3061 }
3062 dst->hash = hash_finish(hash,
3063 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
3064 }
3065
3066 static inline bool
3067 emc_entry_alive(struct emc_entry *ce)
3068 {
3069 return ce->flow && !ce->flow->dead;
3070 }
3071
3072 static void
3073 emc_clear_entry(struct emc_entry *ce)
3074 {
3075 if (ce->flow) {
3076 dp_netdev_flow_unref(ce->flow);
3077 ce->flow = NULL;
3078 }
3079 }
3080
3081 static inline void
3082 emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
3083 const struct netdev_flow_key *key)
3084 {
3085 if (ce->flow != flow) {
3086 if (ce->flow) {
3087 dp_netdev_flow_unref(ce->flow);
3088 }
3089
3090 if (dp_netdev_flow_ref(flow)) {
3091 ce->flow = flow;
3092 } else {
3093 ce->flow = NULL;
3094 }
3095 }
3096 if (key) {
3097 netdev_flow_key_clone(&ce->key, key);
3098 }
3099 }
3100
3101 static inline void
3102 emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
3103 struct dp_netdev_flow *flow)
3104 {
3105 struct emc_entry *to_be_replaced = NULL;
3106 struct emc_entry *current_entry;
3107
3108 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
3109 if (netdev_flow_key_equal(&current_entry->key, key)) {
3110 /* We found the entry with the 'mf' miniflow */
3111 emc_change_entry(current_entry, flow, NULL);
3112 return;
3113 }
3114
3115 /* Replacement policy: put the flow in an empty (not alive) entry, or
3116 * in the first entry where it can be */
3117 if (!to_be_replaced
3118 || (emc_entry_alive(to_be_replaced)
3119 && !emc_entry_alive(current_entry))
3120 || current_entry->key.hash < to_be_replaced->key.hash) {
3121 to_be_replaced = current_entry;
3122 }
3123 }
3124 /* We didn't find the miniflow in the cache.
3125 * The 'to_be_replaced' entry is where the new flow will be stored */
3126
3127 emc_change_entry(to_be_replaced, flow, key);
3128 }
3129
3130 static inline void
3131 emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
3132 const struct netdev_flow_key *key,
3133 struct dp_netdev_flow *flow)
3134 {
3135 /* Insert an entry into the EMC based on probability value 'min'. By
3136 * default the value is UINT32_MAX / 100, which yields an insertion
3137 * probability of 1/100, i.e. 1%. */
3138
3139 uint32_t min = pmd->ctx.emc_insert_min;
3140
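/* A zero 'min' disables EMC insertion altogether. */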
3141 if (min && random_uint32() <= min) {
3142 emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
3143 }
3144 }
3145
3146 static inline struct dp_netdev_flow *
3147 emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
3148 {
3149 struct emc_entry *current_entry;
3150
3151 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
3152 if (current_entry->key.hash == key->hash
3153 && emc_entry_alive(current_entry)
3154 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
3155
3156 /* We found the entry with the 'key->mf' miniflow */
3157 return current_entry->flow;
3158 }
3159 }
3160
3161 return NULL;
3162 }
3163
3164 static inline const struct cmap_node *
3165 smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
3166 {
3167 struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
3168 struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
3169 uint16_t sig = hash >> 16;
3170 uint16_t index = UINT16_MAX;
3171
3172 for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3173 if (bucket->sig[i] == sig) {
3174 index = bucket->flow_idx[i];
3175 break;
3176 }
3177 }
3178 if (index != UINT16_MAX) {
3179 return cmap_find_by_index(&pmd->flow_table, index);
3180 }
3181 return NULL;
3182 }
3183
3184 static void
3185 smc_clear_entry(struct smc_bucket *b, int idx)
3186 {
3187 b->flow_idx[idx] = UINT16_MAX;
3188 }
3189
3190 /* Insert the flow_table index into SMC. Insertion may fail when 1) SMC is
3191 * turned off, or 2) the flow_table index is larger than a uint16_t can hold.
3192 * If an SMC entry with the same signature already exists, its index is
3193 * updated. Otherwise, if an empty entry is available, that entry is taken.
3194 * If there is neither an empty entry nor one with the same signature, a
3195 * random entry in the hashed bucket is overwritten. */
3196 static inline void
3197 smc_insert(struct dp_netdev_pmd_thread *pmd,
3198 const struct netdev_flow_key *key,
3199 uint32_t hash)
3200 {
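/* The bucket is selected by the low-order bits of the key hash; the
 * 16-bit signature stored in a bucket entry is the upper half of the
 * same hash. */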
3201 struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
3202 struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
3203 uint16_t index;
3204 uint32_t cmap_index;
3205 bool smc_enable_db;
3206 int i;
3207
3208 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
3209 if (!smc_enable_db) {
3210 return;
3211 }
3212
3213 cmap_index = cmap_find_index(&pmd->flow_table, hash);
3214 index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
3215
3216 /* If the index is larger than SMC can handle (uint16_t), we don't
3217 * insert */
3218 if (index == UINT16_MAX) {
3219 return;
3220 }
3221
3222 /* If an entry with same signature already exists, update the index */
3223 uint16_t sig = key->hash >> 16;
3224 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3225 if (bucket->sig[i] == sig) {
3226 bucket->flow_idx[i] = index;
3227 return;
3228 }
3229 }
3230 /* If there is an empty entry, occupy it. */
3231 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3232 if (bucket->flow_idx[i] == UINT16_MAX) {
3233 bucket->sig[i] = sig;
3234 bucket->flow_idx[i] = index;
3235 return;
3236 }
3237 }
3238 /* Otherwise, pick a random entry. */
3239 i = random_uint32() % SMC_ENTRY_PER_BUCKET;
3240 bucket->sig[i] = sig;
3241 bucket->flow_idx[i] = index;
3242 }
3243
3244 static struct dp_netdev_flow *
3245 dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
3246 const struct netdev_flow_key *key,
3247 int *lookup_num_p)
3248 {
3249 struct dpcls *cls;
3250 struct dpcls_rule *rule;
3251 odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
3252 in_port.odp_port));
3253 struct dp_netdev_flow *netdev_flow = NULL;
3254
3255 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
3256 if (OVS_LIKELY(cls)) {
3257 dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
3258 netdev_flow = dp_netdev_flow_cast(rule);
3259 }
3260 return netdev_flow;
3261 }
3262
3263 static struct dp_netdev_flow *
3264 dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
3265 const ovs_u128 *ufidp, const struct nlattr *key,
3266 size_t key_len)
3267 {
3268 struct dp_netdev_flow *netdev_flow;
3269 struct flow flow;
3270 ovs_u128 ufid;
3271
3272 /* If a UFID is not provided, determine one based on the key. */
3273 if (!ufidp && key && key_len
3274 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
3275 odp_flow_key_hash(&flow, sizeof flow, &ufid);
3276 ufidp = &ufid;
3277 }
3278
3279 if (ufidp) {
3280 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
3281 &pmd->flow_table) {
3282 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
3283 return netdev_flow;
3284 }
3285 }
3286 }
3287
3288 return NULL;
3289 }
3290
3291 static void
3292 dp_netdev_flow_set_last_stats_attrs(struct dp_netdev_flow *netdev_flow,
3293 const struct dpif_flow_stats *stats,
3294 const struct dpif_flow_attrs *attrs,
3295 int result)
3296 {
3297 struct dp_netdev_flow_stats *last_stats = &netdev_flow->last_stats;
3298 struct dp_netdev_flow_attrs *last_attrs = &netdev_flow->last_attrs;
3299
3300 atomic_store_relaxed(&netdev_flow->netdev_flow_get_result, result);
3301 if (result) {
3302 return;
3303 }
3304
3305 atomic_store_relaxed(&last_stats->used, stats->used);
3306 atomic_store_relaxed(&last_stats->packet_count, stats->n_packets);
3307 atomic_store_relaxed(&last_stats->byte_count, stats->n_bytes);
3308 atomic_store_relaxed(&last_stats->tcp_flags, stats->tcp_flags);
3309
3310 atomic_store_relaxed(&last_attrs->offloaded, attrs->offloaded);
3311 atomic_store_relaxed(&last_attrs->dp_layer, attrs->dp_layer);
3312
3313 }
3314
3315 static void
3316 dp_netdev_flow_get_last_stats_attrs(struct dp_netdev_flow *netdev_flow,
3317 struct dpif_flow_stats *stats,
3318 struct dpif_flow_attrs *attrs,
3319 int *result)
3320 {
3321 struct dp_netdev_flow_stats *last_stats = &netdev_flow->last_stats;
3322 struct dp_netdev_flow_attrs *last_attrs = &netdev_flow->last_attrs;
3323
3324 atomic_read_relaxed(&netdev_flow->netdev_flow_get_result, result);
3325 if (*result) {
3326 return;
3327 }
3328
3329 atomic_read_relaxed(&last_stats->used, &stats->used);
3330 atomic_read_relaxed(&last_stats->packet_count, &stats->n_packets);
3331 atomic_read_relaxed(&last_stats->byte_count, &stats->n_bytes);
3332 atomic_read_relaxed(&last_stats->tcp_flags, &stats->tcp_flags);
3333
3334 atomic_read_relaxed(&last_attrs->offloaded, &attrs->offloaded);
3335 atomic_read_relaxed(&last_attrs->dp_layer, &attrs->dp_layer);
3336 }
3337
3338 static bool
3339 dpif_netdev_get_flow_offload_status(const struct dp_netdev *dp,
3340 struct dp_netdev_flow *netdev_flow,
3341 struct dpif_flow_stats *stats,
3342 struct dpif_flow_attrs *attrs)
3343 {
3344 uint64_t act_buf[1024 / 8];
3345 struct nlattr *actions;
3346 struct netdev *netdev;
3347 struct match match;
3348 struct ofpbuf buf;
3349
3350 int ret = 0;
3351
3352 if (!netdev_is_flow_api_enabled()) {
3353 return false;
3354 }
3355
3356 netdev = netdev_ports_get(netdev_flow->flow.in_port.odp_port,
3357 dpif_normalize_type(dp->class->type));
3358 if (!netdev) {
3359 return false;
3360 }
3361 ofpbuf_use_stack(&buf, &act_buf, sizeof act_buf);
3362 /* Taking a global 'port_mutex' to fulfill thread safety
3363 * restrictions for the netdev-offload-dpdk module.
3364 *
3365 * XXX: Main thread will try to pause/stop all revalidators during datapath
3366 * reconfiguration via datapath purge callback (dp_purge_cb) while
3367 * holding 'dp->port_mutex'. So we do not wait for the mutex here.
3368 * Otherwise, deadlock is possible, because revalidators might sleep
3369 * waiting for the main thread to release the lock while the main
3370 * thread waits for them to stop processing.
3371 * This workaround might make statistics less accurate, especially
3372 * for the flow deletion case, since there will be no other attempt. */
3373 if (!ovs_mutex_trylock(&dp->port_mutex)) {
3374 ret = netdev_flow_get(netdev, &match, &actions,
3375 &netdev_flow->mega_ufid, stats, attrs, &buf);
3376 /* Store statistics and attributes from the last request for
3377 * later use in case of mutex contention. */
3378 dp_netdev_flow_set_last_stats_attrs(netdev_flow, stats, attrs, ret);
3379 ovs_mutex_unlock(&dp->port_mutex);
3380 } else {
3381 dp_netdev_flow_get_last_stats_attrs(netdev_flow, stats, attrs, &ret);
3382 if (!ret && !attrs->dp_layer) {
3383 /* Flow was never reported as 'offloaded' so it's harmless
3384 * to continue to think so. */
3385 ret = EAGAIN;
3386 }
3387 }
3388 netdev_close(netdev);
3389 if (ret) {
3390 return false;
3391 }
3392
3393 return true;
3394 }
3395
3396 static void
3397 get_dpif_flow_status(const struct dp_netdev *dp,
3398 const struct dp_netdev_flow *netdev_flow_,
3399 struct dpif_flow_stats *stats,
3400 struct dpif_flow_attrs *attrs)
3401 {
3402 struct dpif_flow_stats offload_stats;
3403 struct dpif_flow_attrs offload_attrs;
3404 struct dp_netdev_flow *netdev_flow;
3405 unsigned long long n;
3406 long long used;
3407 uint16_t flags;
3408
3409 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
3410
3411 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
3412 stats->n_packets = n;
3413 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
3414 stats->n_bytes = n;
3415 atomic_read_relaxed(&netdev_flow->stats.used, &used);
3416 stats->used = used;
3417 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3418 stats->tcp_flags = flags;
3419
3420 if (dpif_netdev_get_flow_offload_status(dp, netdev_flow,
3421 &offload_stats, &offload_attrs)) {
3422 stats->n_packets += offload_stats.n_packets;
3423 stats->n_bytes += offload_stats.n_bytes;
3424 stats->used = MAX(stats->used, offload_stats.used);
3425 stats->tcp_flags |= offload_stats.tcp_flags;
3426 if (attrs) {
3427 attrs->offloaded = offload_attrs.offloaded;
3428 attrs->dp_layer = offload_attrs.dp_layer;
3429 }
3430 } else if (attrs) {
3431 attrs->offloaded = false;
3432 attrs->dp_layer = "ovs";
3433 }
3434 }
3435
3436 /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
3437 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
3438 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
3439 * protect them. */
3440 static void
3441 dp_netdev_flow_to_dpif_flow(const struct dp_netdev *dp,
3442 const struct dp_netdev_flow *netdev_flow,
3443 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
3444 struct dpif_flow *flow, bool terse)
3445 {
3446 if (terse) {
3447 memset(flow, 0, sizeof *flow);
3448 } else {
3449 struct flow_wildcards wc;
3450 struct dp_netdev_actions *actions;
3451 size_t offset;
3452 struct odp_flow_key_parms odp_parms = {
3453 .flow = &netdev_flow->flow,
3454 .mask = &wc.masks,
3455 .support = dp_netdev_support,
3456 };
3457
3458 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
3459 /* in_port is exact-matched, but we have left it out of the mask for
3460 * optimization reasons. Add in_port back to the mask. */
3461 wc.masks.in_port.odp_port = ODPP_NONE;
3462
3463 /* Key */
3464 offset = key_buf->size;
3465 flow->key = ofpbuf_tail(key_buf);
3466 odp_flow_key_from_flow(&odp_parms, key_buf);
3467 flow->key_len = key_buf->size - offset;
3468
3469 /* Mask */
3470 offset = mask_buf->size;
3471 flow->mask = ofpbuf_tail(mask_buf);
3472 odp_parms.key_buf = key_buf;
3473 odp_flow_key_from_mask(&odp_parms, mask_buf);
3474 flow->mask_len = mask_buf->size - offset;
3475
3476 /* Actions */
3477 actions = dp_netdev_flow_get_actions(netdev_flow);
3478 flow->actions = actions->actions;
3479 flow->actions_len = actions->size;
3480 }
3481
3482 flow->ufid = netdev_flow->ufid;
3483 flow->ufid_present = true;
3484 flow->pmd_id = netdev_flow->pmd_id;
3485
3486 get_dpif_flow_status(dp, netdev_flow, &flow->stats, &flow->attrs);
3487 flow->attrs.dp_extra_info = netdev_flow->dp_extra_info;
3488 }
3489
3490 static int
3491 dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3492 const struct nlattr *mask_key,
3493 uint32_t mask_key_len, const struct flow *flow,
3494 struct flow_wildcards *wc, bool probe)
3495 {
3496 enum odp_key_fitness fitness;
3497
3498 fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL);
3499 if (fitness) {
3500 if (!probe) {
3501 /* This should not happen: it indicates that
3502 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
3503 * disagree on the acceptable form of a mask. Log the problem
3504 * as an error, with enough details to enable debugging. */
3505 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3506
3507 if (!VLOG_DROP_ERR(&rl)) {
3508 struct ds s;
3509
3510 ds_init(&s);
3511 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
3512 true);
3513 VLOG_ERR("internal error parsing flow mask %s (%s)",
3514 ds_cstr(&s), odp_key_fitness_to_string(fitness));
3515 ds_destroy(&s);
3516 }
3517 }
3518
3519 return EINVAL;
3520 }
3521
3522 return 0;
3523 }
3524
3525 static int
3526 dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3527 struct flow *flow, bool probe)
3528 {
3529 if (odp_flow_key_to_flow(key, key_len, flow, NULL)) {
3530 if (!probe) {
3531 /* This should not happen: it indicates that
3532 * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
3533 * the acceptable form of a flow. Log the problem as an error,
3534 * with enough details to enable debugging. */
3535 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3536
3537 if (!VLOG_DROP_ERR(&rl)) {
3538 struct ds s;
3539
3540 ds_init(&s);
3541 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
3542 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
3543 ds_destroy(&s);
3544 }
3545 }
3546
3547 return EINVAL;
3548 }
3549
3550 if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
3551 return EINVAL;
3552 }
3553
3554 return 0;
3555 }
3556
3557 static int
3558 dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
3559 {
3560 struct dp_netdev *dp = get_dp_netdev(dpif);
3561 struct dp_netdev_flow *netdev_flow;
3562 struct dp_netdev_pmd_thread *pmd;
3563 struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
3564 struct hmapx_node *node;
3565 int error = EINVAL;
3566
3567 if (get->pmd_id == PMD_ID_NULL) {
3568 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3569 if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
3570 dp_netdev_pmd_unref(pmd);
3571 }
3572 }
3573 } else {
3574 pmd = dp_netdev_get_pmd(dp, get->pmd_id);
3575 if (!pmd) {
3576 goto out;
3577 }
3578 hmapx_add(&to_find, pmd);
3579 }
3580
3581 if (!hmapx_count(&to_find)) {
3582 goto out;
3583 }
3584
3585 HMAPX_FOR_EACH (node, &to_find) {
3586 pmd = (struct dp_netdev_pmd_thread *) node->data;
3587 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
3588 get->key_len);
3589 if (netdev_flow) {
3590 dp_netdev_flow_to_dpif_flow(dp, netdev_flow, get->buffer,
3591 get->buffer, get->flow, false);
3592 error = 0;
3593 break;
3594 } else {
3595 error = ENOENT;
3596 }
3597 }
3598
3599 HMAPX_FOR_EACH (node, &to_find) {
3600 pmd = (struct dp_netdev_pmd_thread *) node->data;
3601 dp_netdev_pmd_unref(pmd);
3602 }
3603 out:
3604 hmapx_destroy(&to_find);
3605 return error;
3606 }
3607
3608 static void
3609 dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
3610 {
3611 struct flow masked_flow;
3612 size_t i;
3613
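/* Byte-wise AND the flow with its wildcard mask so that the hash below
 * depends only on the matched bits; all flows covered by the same megaflow
 * therefore share one mega ufid. */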
3614 for (i = 0; i < sizeof(struct flow); i++) {
3615 ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
3616 ((uint8_t *)&match->wc)[i];
3617 }
3618 odp_flow_key_hash(&masked_flow, sizeof masked_flow, mega_ufid);
3619 }
3620
3621 static struct dp_netdev_flow *
3622 dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
3623 struct match *match, const ovs_u128 *ufid,
3624 const struct nlattr *actions, size_t actions_len)
3625 OVS_REQUIRES(pmd->flow_mutex)
3626 {
3627 struct ds extra_info = DS_EMPTY_INITIALIZER;
3628 struct dp_netdev_flow *flow;
3629 struct netdev_flow_key mask;
3630 struct dpcls *cls;
3631 size_t unit;
3632
3633 /* Make sure in_port is an exact match before we read it. */
3634 ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
3635 odp_port_t in_port = match->flow.in_port.odp_port;
3636
3637 /* As we select the dpcls based on the port number, each netdev flow
3638 * belonging to the same dpcls will have the same odp_port value.
3639 * For performance reasons we wildcard odp_port here in the mask. In the
3640 * typical case dp_hash is also wildcarded, and the resulting 8-byte
3641 * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
3642 * will not be part of the subtable mask.
3643 * This will speed up the hash computation during dpcls_lookup() because
3644 * there is one less call to hash_add64() in this case. */
3645 match->wc.masks.in_port.odp_port = 0;
3646 netdev_flow_mask_init(&mask, match);
3647 match->wc.masks.in_port.odp_port = ODPP_NONE;
3648
3649 /* Make sure wc does not have metadata. */
3650 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
3651 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
3652
3653 /* Do not allocate extra space. */
3654 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
3655 memset(&flow->stats, 0, sizeof flow->stats);
3656 atomic_init(&flow->netdev_flow_get_result, 0);
3657 memset(&flow->last_stats, 0, sizeof flow->last_stats);
3658 memset(&flow->last_attrs, 0, sizeof flow->last_attrs);
3659 flow->dead = false;
3660 flow->batch = NULL;
3661 flow->mark = INVALID_FLOW_MARK;
3662 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
3663 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
3664 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
3665 ovs_refcount_init(&flow->ref_cnt);
3666 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
3667
3668 dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
3669 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
3670
3671 /* Select dpcls for in_port. Relies on in_port being an exact match. */
3672 cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
3673 dpcls_insert(cls, &flow->cr, &mask);
3674
3675 ds_put_cstr(&extra_info, "miniflow_bits(");
3676 FLOWMAP_FOR_EACH_UNIT (unit) {
3677 if (unit) {
3678 ds_put_char(&extra_info, ',');
3679 }
3680 ds_put_format(&extra_info, "%d",
3681 count_1bits(flow->cr.mask->mf.map.bits[unit]));
3682 }
3683 ds_put_char(&extra_info, ')');
3684 flow->dp_extra_info = ds_steal_cstr(&extra_info);
3685 ds_destroy(&extra_info);
3686
3687 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
3688 dp_netdev_flow_hash(&flow->ufid));
3689
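/* Queue the flow for netdev offload; the actual offload attempt is made
 * asynchronously by the offload thread. */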
3690 queue_netdev_flow_put(pmd, flow, match, actions, actions_len);
3691
3692 if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
3693 struct ds ds = DS_EMPTY_INITIALIZER;
3694 struct ofpbuf key_buf, mask_buf;
3695 struct odp_flow_key_parms odp_parms = {
3696 .flow = &match->flow,
3697 .mask = &match->wc.masks,
3698 .support = dp_netdev_support,
3699 };
3700
3701 ofpbuf_init(&key_buf, 0);
3702 ofpbuf_init(&mask_buf, 0);
3703
3704 odp_flow_key_from_flow(&odp_parms, &key_buf);
3705 odp_parms.key_buf = &key_buf;
3706 odp_flow_key_from_mask(&odp_parms, &mask_buf);
3707
3708 ds_put_cstr(&ds, "flow_add: ");
3709 odp_format_ufid(ufid, &ds);
3710 ds_put_cstr(&ds, " mega_");
3711 odp_format_ufid(&flow->mega_ufid, &ds);
3712 ds_put_cstr(&ds, " ");
3713 odp_flow_format(key_buf.data, key_buf.size,
3714 mask_buf.data, mask_buf.size,
3715 NULL, &ds, false);
3716 ds_put_cstr(&ds, ", actions:");
3717 format_odp_actions(&ds, actions, actions_len, NULL);
3718
3719 VLOG_DBG("%s", ds_cstr(&ds));
3720
3721 ofpbuf_uninit(&key_buf);
3722 ofpbuf_uninit(&mask_buf);
3723
3724 /* Add a printout of the actual match installed. */
3725 struct match m;
3726 ds_clear(&ds);
3727 ds_put_cstr(&ds, "flow match: ");
3728 miniflow_expand(&flow->cr.flow.mf, &m.flow);
3729 miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
3730 memset(&m.tun_md, 0, sizeof m.tun_md);
3731 match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
3732
3733 VLOG_DBG("%s", ds_cstr(&ds));
3734
3735 ds_destroy(&ds);
3736 }
3737
3738 return flow;
3739 }
3740
3741 static int
3742 flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
3743 struct netdev_flow_key *key,
3744 struct match *match,
3745 ovs_u128 *ufid,
3746 const struct dpif_flow_put *put,
3747 struct dpif_flow_stats *stats)
3748 {
3749 struct dp_netdev_flow *netdev_flow;
3750 int error = 0;
3751
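/* A missing flow may only be created when DPIF_FP_CREATE is set (ENOENT
 * otherwise); an existing flow may only be modified when DPIF_FP_MODIFY is
 * set (EEXIST if DPIF_FP_CREATE asks to re-create it, EINVAL otherwise). */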
3752 if (stats) {
3753 memset(stats, 0, sizeof *stats);
3754 }
3755
3756 ovs_mutex_lock(&pmd->flow_mutex);
3757 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
3758 if (!netdev_flow) {
3759 if (put->flags & DPIF_FP_CREATE) {
3760 dp_netdev_flow_add(pmd, match, ufid, put->actions,
3761 put->actions_len);
3762 } else {
3763 error = ENOENT;
3764 }
3765 } else {
3766 if (put->flags & DPIF_FP_MODIFY) {
3767 struct dp_netdev_actions *new_actions;
3768 struct dp_netdev_actions *old_actions;
3769
3770 new_actions = dp_netdev_actions_create(put->actions,
3771 put->actions_len);
3772
3773 old_actions = dp_netdev_flow_get_actions(netdev_flow);
3774 ovsrcu_set(&netdev_flow->actions, new_actions);
3775
3776 queue_netdev_flow_put(pmd, netdev_flow, match,
3777 put->actions, put->actions_len);
3778
3779 if (stats) {
3780 get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
3781 }
3782 if (put->flags & DPIF_FP_ZERO_STATS) {
3783 /* XXX: The userspace datapath uses thread local statistics
3784 * (for flows), which should be updated only by the owning
3785 * thread. Since we cannot write on stats memory here,
3786 * we choose not to support this flag. Please note:
3787 * - This feature is currently used only by dpctl commands with
3788 * option --clear.
3789 * - Should the need arise, this operation can be implemented
3790 * by keeping a base value (to be updated here) for each
3791 * counter, and subtracting it before outputting the stats. */
3792 error = EOPNOTSUPP;
3793 }
3794
3795 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
3796 } else if (put->flags & DPIF_FP_CREATE) {
3797 error = EEXIST;
3798 } else {
3799 /* Overlapping flow. */
3800 error = EINVAL;
3801 }
3802 }
3803 ovs_mutex_unlock(&pmd->flow_mutex);
3804 return error;
3805 }
3806
3807 static int
3808 dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
3809 {
3810 struct dp_netdev *dp = get_dp_netdev(dpif);
3811 struct netdev_flow_key key, mask;
3812 struct dp_netdev_pmd_thread *pmd;
3813 struct match match;
3814 ovs_u128 ufid;
3815 int error;
3816 bool probe = put->flags & DPIF_FP_PROBE;
3817
3818 if (put->stats) {
3819 memset(put->stats, 0, sizeof *put->stats);
3820 }
3821 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
3822 probe);
3823 if (error) {
3824 return error;
3825 }
3826 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
3827 put->mask, put->mask_len,
3828 &match.flow, &match.wc, probe);
3829 if (error) {
3830 return error;
3831 }
3832
3833 if (put->ufid) {
3834 ufid = *put->ufid;
3835 } else {
3836 odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
3837 }
3838
3839 /* The Netlink encoding of datapath flow keys cannot express
3840 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3841 * tag is interpreted as exact match on the fact that there is no
3842 * VLAN. Unless we refactor a lot of code that translates between
3843 * Netlink and struct flow representations, we have to do the same
3844 * here. This must be in sync with 'match' in handle_packet_upcall(). */
3845 if (!match.wc.masks.vlans[0].tci) {
3846 match.wc.masks.vlans[0].tci = htons(0xffff);
3847 }
3848
3849 /* Must produce a netdev_flow_key for lookup.
3850 * Use the same method as employed to create the key when adding
3851 * the flow to the dpcls to make sure they match. */
3852 netdev_flow_mask_init(&mask, &match);
3853 netdev_flow_key_init_masked(&key, &match.flow, &mask);
3854
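/* PMD_ID_NULL means the operation applies to every PMD thread; per-thread
 * statistics are then aggregated into 'put->stats'. */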
3855 if (put->pmd_id == PMD_ID_NULL) {
3856 if (cmap_count(&dp->poll_threads) == 0) {
3857 return EINVAL;
3858 }
3859 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3860 struct dpif_flow_stats pmd_stats;
3861 int pmd_error;
3862
3863 pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
3864 &pmd_stats);
3865 if (pmd_error) {
3866 error = pmd_error;
3867 } else if (put->stats) {
3868 put->stats->n_packets += pmd_stats.n_packets;
3869 put->stats->n_bytes += pmd_stats.n_bytes;
3870 put->stats->used = MAX(put->stats->used, pmd_stats.used);
3871 put->stats->tcp_flags |= pmd_stats.tcp_flags;
3872 }
3873 }
3874 } else {
3875 pmd = dp_netdev_get_pmd(dp, put->pmd_id);
3876 if (!pmd) {
3877 return EINVAL;
3878 }
3879 error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
3880 dp_netdev_pmd_unref(pmd);
3881 }
3882
3883 return error;
3884 }
3885
3886 static int
3887 flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3888 struct dpif_flow_stats *stats,
3889 const struct dpif_flow_del *del)
3890 {
3891 struct dp_netdev_flow *netdev_flow;
3892 int error = 0;
3893
3894 ovs_mutex_lock(&pmd->flow_mutex);
3895 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3896 del->key_len);
3897 if (netdev_flow) {
3898 if (stats) {
3899 get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
3900 }
3901 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
3902 } else {
3903 error = ENOENT;
3904 }
3905 ovs_mutex_unlock(&pmd->flow_mutex);
3906
3907 return error;
3908 }
3909
3910 static int
3911 dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
3912 {
3913 struct dp_netdev *dp = get_dp_netdev(dpif);
3914 struct dp_netdev_pmd_thread *pmd;
3915 int error = 0;
3916
3917 if (del->stats) {
3918 memset(del->stats, 0, sizeof *del->stats);
3919 }
3920
3921 if (del->pmd_id == PMD_ID_NULL) {
3922 if (cmap_count(&dp->poll_threads) == 0) {
3923 return EINVAL;
3924 }
3925 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3926 struct dpif_flow_stats pmd_stats;
3927 int pmd_error;
3928
3929 pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
3930 if (pmd_error) {
3931 error = pmd_error;
3932 } else if (del->stats) {
3933 del->stats->n_packets += pmd_stats.n_packets;
3934 del->stats->n_bytes += pmd_stats.n_bytes;
3935 del->stats->used = MAX(del->stats->used, pmd_stats.used);
3936 del->stats->tcp_flags |= pmd_stats.tcp_flags;
3937 }
3938 }
3939 } else {
3940 pmd = dp_netdev_get_pmd(dp, del->pmd_id);
3941 if (!pmd) {
3942 return EINVAL;
3943 }
3944 error = flow_del_on_pmd(pmd, del->stats, del);
3945 dp_netdev_pmd_unref(pmd);
3946 }
3947
3948
3949 return error;
3950 }
3951
3952 struct dpif_netdev_flow_dump {
3953 struct dpif_flow_dump up;
3954 struct cmap_position poll_thread_pos;
3955 struct cmap_position flow_pos;
3956 struct dp_netdev_pmd_thread *cur_pmd;
3957 int status;
3958 struct ovs_mutex mutex;
3959 };
3960
3961 static struct dpif_netdev_flow_dump *
3962 dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
3963 {
3964 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
3965 }
3966
3967 static struct dpif_flow_dump *
3968 dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
3969 struct dpif_flow_dump_types *types OVS_UNUSED)
3970 {
3971 struct dpif_netdev_flow_dump *dump;
3972
3973 dump = xzalloc(sizeof *dump);
3974 dpif_flow_dump_init(&dump->up, dpif_);
3975 dump->up.terse = terse;
3976 ovs_mutex_init(&dump->mutex);
3977
3978 return &dump->up;
3979 }
3980
3981 static int
3982 dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
3983 {
3984 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3985
3986 ovs_mutex_destroy(&dump->mutex);
3987 free(dump);
3988 return 0;
3989 }
3990
3991 struct dpif_netdev_flow_dump_thread {
3992 struct dpif_flow_dump_thread up;
3993 struct dpif_netdev_flow_dump *dump;
3994 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
3995 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
3996 };
3997
3998 static struct dpif_netdev_flow_dump_thread *
3999 dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
4000 {
4001 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
4002 }
4003
4004 static struct dpif_flow_dump_thread *
4005 dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
4006 {
4007 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
4008 struct dpif_netdev_flow_dump_thread *thread;
4009
4010 thread = xmalloc(sizeof *thread);
4011 dpif_flow_dump_thread_init(&thread->up, &dump->up);
4012 thread->dump = dump;
4013 return &thread->up;
4014 }
4015
4016 static void
4017 dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
4018 {
4019 struct dpif_netdev_flow_dump_thread *thread
4020 = dpif_netdev_flow_dump_thread_cast(thread_);
4021
4022 free(thread);
4023 }
4024
4025 static int
4026 dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
4027 struct dpif_flow *flows, int max_flows)
4028 {
4029 struct dpif_netdev_flow_dump_thread *thread
4030 = dpif_netdev_flow_dump_thread_cast(thread_);
4031 struct dpif_netdev_flow_dump *dump = thread->dump;
4032 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
4033 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
4034 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
4035 int n_flows = 0;
4036 int i;
4037
4038 ovs_mutex_lock(&dump->mutex);
4039 if (!dump->status) {
4040 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
4041 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
4042
4043 /* The first call to dump_next() extracts the first pmd thread.
4044 * If there is no pmd thread, return immediately. */
4045 if (!pmd) {
4046 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
4047 if (!pmd) {
4048 ovs_mutex_unlock(&dump->mutex);
4049 return n_flows;
4050
4051 }
4052 }
4053
4054 do {
4055 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
4056 struct cmap_node *node;
4057
4058 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
4059 if (!node) {
4060 break;
4061 }
4062 netdev_flows[n_flows] = CONTAINER_OF(node,
4063 struct dp_netdev_flow,
4064 node);
4065 }
4066 /* When the current pmd thread has been fully dumped, move on to
4067 * the next one. */
4068 if (n_flows < flow_limit) {
4069 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
4070 dp_netdev_pmd_unref(pmd);
4071 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
4072 if (!pmd) {
4073 dump->status = EOF;
4074 break;
4075 }
4076 }
4077 /* Keep the reference for the next caller. */
4078 dump->cur_pmd = pmd;
4079
4080 /* If the current dump is empty, do not exit the loop, since the
4081 * remaining pmds could have flows to be dumped. Just dump again
4082 * on the new 'pmd'. */
4083 } while (!n_flows);
4084 }
4085 ovs_mutex_unlock(&dump->mutex);
4086
4087 for (i = 0; i < n_flows; i++) {
4088 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
4089 struct odputil_keybuf *keybuf = &thread->keybuf[i];
4090 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
4091 struct dpif_flow *f = &flows[i];
4092 struct ofpbuf key, mask;
4093
4094 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
4095 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
4096 dp_netdev_flow_to_dpif_flow(dp, netdev_flow, &key, &mask, f,
4097 dump->up.terse);
4098 }
4099
4100 return n_flows;
4101 }
4102
4103 static int
4104 dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
4105 OVS_NO_THREAD_SAFETY_ANALYSIS
4106 {
4107 struct dp_netdev *dp = get_dp_netdev(dpif);
4108 struct dp_netdev_pmd_thread *pmd;
4109 struct dp_packet_batch pp;
4110
4111 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
4112 dp_packet_size(execute->packet) > UINT16_MAX) {
4113 return EINVAL;
4114 }
4115
4116 /* Try to find the 'pmd'. If NULL is returned, that means
4117 * the current thread is a non-pmd thread and should use
4118 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
4119 pmd = ovsthread_getspecific(dp->per_pmd_key);
4120 if (!pmd) {
4121 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
4122 if (!pmd) {
4123 return EBUSY;
4124 }
4125 }
4126
4127 if (execute->probe) {
4128 /* If this is part of a probe, drop the packet, since executing
4129 * the action may actually cause spurious packets to be sent into
4130 * the network. */
4131 if (pmd->core_id == NON_PMD_CORE_ID) {
4132 dp_netdev_pmd_unref(pmd);
4133 }
4134 return 0;
4135 }
4136
4137 /* If the current thread is a non-pmd thread, acquire
4138 * the 'non_pmd_mutex'. */
4139 if (pmd->core_id == NON_PMD_CORE_ID) {
4140 ovs_mutex_lock(&dp->non_pmd_mutex);
4141 }
4142
4143 /* Update current time in PMD context. We don't care about EMC insertion
4144 * probability, because we are on a slow path. */
4145 pmd_thread_ctx_time_update(pmd);
4146
4147 /* The action processing expects the RSS hash to be valid, because
4148 * it's always initialized at the beginning of datapath processing.
4149 * In this case, though, 'execute->packet' may not have gone through
4150 * the datapath at all, it may have been generated by the upper layer
4151 * (OpenFlow packet-out, BFD frame, ...). */
4152 if (!dp_packet_rss_valid(execute->packet)) {
4153 dp_packet_set_rss_hash(execute->packet,
4154 flow_hash_5tuple(execute->flow, 0));
4155 }
4156
4157 dp_packet_batch_init_packet(&pp, execute->packet);
4158 pp.do_not_steal = true;
4159 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
4160 execute->actions, execute->actions_len);
4161 dp_netdev_pmd_flush_output_packets(pmd, true);
4162
4163 if (pmd->core_id == NON_PMD_CORE_ID) {
4164 ovs_mutex_unlock(&dp->non_pmd_mutex);
4165 dp_netdev_pmd_unref(pmd);
4166 }
4167
4168 return 0;
4169 }
4170
4171 static void
4172 dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
4173 enum dpif_offload_type offload_type OVS_UNUSED)
4174 {
4175 size_t i;
4176
4177 for (i = 0; i < n_ops; i++) {
4178 struct dpif_op *op = ops[i];
4179
4180 switch (op->type) {
4181 case DPIF_OP_FLOW_PUT:
4182 op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
4183 break;
4184
4185 case DPIF_OP_FLOW_DEL:
4186 op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
4187 break;
4188
4189 case DPIF_OP_EXECUTE:
4190 op->error = dpif_netdev_execute(dpif, &op->execute);
4191 break;
4192
4193 case DPIF_OP_FLOW_GET:
4194 op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
4195 break;
4196 }
4197 }
4198 }
4199
4200 /* Enable or Disable PMD auto load balancing. */
4201 static void
4202 set_pmd_auto_lb(struct dp_netdev *dp)
4203 {
4204 unsigned int cnt = 0;
4205 struct dp_netdev_pmd_thread *pmd;
4206 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
4207
4208 bool enable_alb = false;
4209 bool multi_rxq = false;
4210 bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
4211
4212 /* Ensure that there are at least two non-isolated PMDs and that
4213 * at least one of them is polling more than one rxq. */
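/* Rebalancing is only useful when there is at least one rxq that could be
 * moved to another non-isolated PMD. */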
4214 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4215 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
4216 continue;
4217 }
4218
4219 if (hmap_count(&pmd->poll_list) > 1) {
4220 multi_rxq = true;
4221 }
4222 if (cnt && multi_rxq) {
4223 enable_alb = true;
4224 break;
4225 }
4226 cnt++;
4227 }
4228
4229 /* Enable auto LB only if it is requested and cycle-based assignment is in use. */
4230 enable_alb = enable_alb && pmd_rxq_assign_cyc &&
4231 pmd_alb->auto_lb_requested;
4232
4233 if (pmd_alb->is_enabled != enable_alb) {
4234 pmd_alb->is_enabled = enable_alb;
4235 if (pmd_alb->is_enabled) {
4236 VLOG_INFO("PMD auto load balance is enabled "
4237 "(with rebalance interval:%"PRIu64" msec)",
4238 pmd_alb->rebalance_intvl);
4239 } else {
4240 pmd_alb->rebalance_poll_timer = 0;
4241 VLOG_INFO("PMD auto load balance is disabled");
4242 }
4243 }
4244
4245 }
4246
4247 /* Applies datapath configuration from the database. Some of the changes are
4248 * actually applied in dpif_netdev_run(). */
4249 static int
4250 dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
4251 {
4252 struct dp_netdev *dp = get_dp_netdev(dpif);
4253 const char *cmask = smap_get(other_config, "pmd-cpu-mask");
4254 const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
4255 "cycles");
4256 unsigned long long insert_prob =
4257 smap_get_ullong(other_config, "emc-insert-inv-prob",
4258 DEFAULT_EM_FLOW_INSERT_INV_PROB);
4259 uint32_t insert_min, cur_min;
4260 uint32_t tx_flush_interval, cur_tx_flush_interval;
4261 uint64_t rebalance_intvl;
4262
4263 tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
4264 DEFAULT_TX_FLUSH_INTERVAL);
4265 atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
4266 if (tx_flush_interval != cur_tx_flush_interval) {
4267 atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
4268 VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
4269 tx_flush_interval);
4270 }
4271
4272 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
4273 free(dp->pmd_cmask);
4274 dp->pmd_cmask = nullable_xstrdup(cmask);
4275 dp_netdev_request_reconfigure(dp);
4276 }
4277
4278 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
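/* The stored minimum acts as a probability threshold for EMC insertion: an
 * insertion happens when a random 32-bit value falls at or below it, so
 * UINT32_MAX / insert_prob corresponds to a probability of roughly
 * 1 / insert_prob. */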
4279 if (insert_prob <= UINT32_MAX) {
4280 insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
4281 } else {
4282 insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
4283 insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
4284 }
4285
4286 if (insert_min != cur_min) {
4287 atomic_store_relaxed(&dp->emc_insert_min, insert_min);
4288 if (insert_min == 0) {
4289 VLOG_INFO("EMC insertion probability changed to zero");
4290 } else {
4291 VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
4292 insert_prob, (100 / (float)insert_prob));
4293 }
4294 }
4295
4296 bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
4297 bool cur_perf_enabled;
4298 atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
4299 if (perf_enabled != cur_perf_enabled) {
4300 atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
4301 if (perf_enabled) {
4302 VLOG_INFO("PMD performance metrics collection enabled");
4303 } else {
4304 VLOG_INFO("PMD performance metrics collection disabled");
4305 }
4306 }
4307
4308 bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
4309 bool cur_smc;
4310 atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
4311 if (smc_enable != cur_smc) {
4312 atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
4313 if (smc_enable) {
4314 VLOG_INFO("SMC cache is enabled");
4315 } else {
4316 VLOG_INFO("SMC cache is disabled");
4317 }
4318 }
4319
4320 bool pmd_rxq_assign_cyc = !strcmp(pmd_rxq_assign, "cycles");
4321 if (!pmd_rxq_assign_cyc && strcmp(pmd_rxq_assign, "roundrobin")) {
4322 VLOG_WARN("Unsupported Rxq to PMD assignment mode in pmd-rxq-assign. "
4323 "Defaulting to 'cycles'.");
4324 pmd_rxq_assign_cyc = true;
4325 pmd_rxq_assign = "cycles";
4326 }
4327 if (dp->pmd_rxq_assign_cyc != pmd_rxq_assign_cyc) {
4328 dp->pmd_rxq_assign_cyc = pmd_rxq_assign_cyc;
4329 VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
4330 pmd_rxq_assign);
4331 dp_netdev_request_reconfigure(dp);
4332 }
4333
4334 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
4335 pmd_alb->auto_lb_requested = smap_get_bool(other_config, "pmd-auto-lb",
4336 false);
4337
4338 rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebal-interval",
4339 ALB_PMD_REBALANCE_POLL_INTERVAL);
4340
4341 /* Input is in minutes; convert it to msec. */
4342 rebalance_intvl =
4343 rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
4344
4345 if (pmd_alb->rebalance_intvl != rebalance_intvl) {
4346 pmd_alb->rebalance_intvl = rebalance_intvl;
4347 }
4348
4349 set_pmd_auto_lb(dp);
4350 return 0;
4351 }
4352
4353 /* Parses affinity list and returns result in 'core_ids'. */
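/* The expected format is the one used by 'pmd-rxq-affinity', i.e. a list of
 * <rxq-id>:<core-id> pairs; for example "0:3,1:7" maps rxq 0 to core 3 and
 * rxq 1 to core 7. */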
4354 static int
4355 parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
4356 {
4357 unsigned i;
4358 char *list, *copy, *key, *value;
4359 int error = 0;
4360
4361 for (i = 0; i < n_rxq; i++) {
4362 core_ids[i] = OVS_CORE_UNSPEC;
4363 }
4364
4365 if (!affinity_list) {
4366 return 0;
4367 }
4368
4369 list = copy = xstrdup(affinity_list);
4370
4371 while (ofputil_parse_key_value(&list, &key, &value)) {
4372 int rxq_id, core_id;
4373
4374 if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
4375 || !str_to_int(value, 0, &core_id) || core_id < 0) {
4376 error = EINVAL;
4377 break;
4378 }
4379
4380 if (rxq_id < n_rxq) {
4381 core_ids[rxq_id] = core_id;
4382 }
4383 }
4384
4385 free(copy);
4386 return error;
4387 }
4388
4389 /* Parses 'affinity_list' and applies configuration if it is valid. */
4390 static int
4391 dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
4392 const char *affinity_list)
4393 {
4394 unsigned *core_ids, i;
4395 int error = 0;
4396
4397 core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
4398 if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
4399 error = EINVAL;
4400 goto exit;
4401 }
4402
4403 for (i = 0; i < port->n_rxq; i++) {
4404 port->rxqs[i].core_id = core_ids[i];
4405 }
4406
4407 exit:
4408 free(core_ids);
4409 return error;
4410 }
4411
4412 /* Returns 'true' if one of the 'port's RX queues exists in 'poll_list'
4413 * of the given PMD thread. */
4414 static bool
4415 dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
4416 struct dp_netdev_port *port)
4417 OVS_EXCLUDED(pmd->port_mutex)
4418 {
4419 struct rxq_poll *poll;
4420 bool found = false;
4421
4422 ovs_mutex_lock(&pmd->port_mutex);
4423 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4424 if (port == poll->rxq->port) {
4425 found = true;
4426 break;
4427 }
4428 }
4429 ovs_mutex_unlock(&pmd->port_mutex);
4430 return found;
4431 }
4432
4433 /* Updates port configuration from the database. The changes are actually
4434 * applied in dpif_netdev_run(). */
4435 static int
4436 dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
4437 const struct smap *cfg)
4438 {
4439 struct dp_netdev *dp = get_dp_netdev(dpif);
4440 struct dp_netdev_port *port;
4441 int error = 0;
4442 const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
4443 bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
4444
4445 ovs_mutex_lock(&dp->port_mutex);
4446 error = get_port_by_number(dp, port_no, &port);
4447 if (error) {
4448 goto unlock;
4449 }
4450
4451 if (emc_enabled != port->emc_enabled) {
4452 struct dp_netdev_pmd_thread *pmd;
4453 struct ds ds = DS_EMPTY_INITIALIZER;
4454 uint32_t cur_min, insert_prob;
4455
4456 port->emc_enabled = emc_enabled;
4457 /* Mark for reload all the threads that poll this port and request
4458 * a reconfiguration so that the threads are actually reloaded. */
4459 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4460 if (dpif_netdev_pmd_polls_port(pmd, port)) {
4461 pmd->need_reload = true;
4462 }
4463 }
4464 dp_netdev_request_reconfigure(dp);
4465
4466 ds_put_format(&ds, "%s: EMC has been %s.",
4467 netdev_get_name(port->netdev),
4468 (emc_enabled) ? "enabled" : "disabled");
4469 if (emc_enabled) {
4470 ds_put_cstr(&ds, " Current insertion probability is ");
4471 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4472 if (!cur_min) {
4473 ds_put_cstr(&ds, "zero.");
4474 } else {
4475 insert_prob = UINT32_MAX / cur_min;
4476 ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
4477 insert_prob, 100 / (float) insert_prob);
4478 }
4479 }
4480 VLOG_INFO("%s", ds_cstr(&ds));
4481 ds_destroy(&ds);
4482 }
4483
4484 /* Check for Rxq affinity changes. */
4485 if (!netdev_is_pmd(port->netdev)
4486 || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
4487 goto unlock;
4488 }
4489
4490 error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
4491 if (error) {
4492 goto unlock;
4493 }
4494 free(port->rxq_affinity_list);
4495 port->rxq_affinity_list = nullable_xstrdup(affinity_list);
4496
4497 dp_netdev_request_reconfigure(dp);
4498 unlock:
4499 ovs_mutex_unlock(&dp->port_mutex);
4500 return error;
4501 }
4502
4503 static int
4504 dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
4505 uint32_t queue_id, uint32_t *priority)
4506 {
4507 *priority = queue_id;
4508 return 0;
4509 }
4510
4511 \f
4512 /* Creates and returns a new 'struct dp_netdev_actions', whose actions are
4513 * a copy of the 'size' bytes of the 'actions' input parameter. */
4514 struct dp_netdev_actions *
4515 dp_netdev_actions_create(const struct nlattr *actions, size_t size)
4516 {
4517 struct dp_netdev_actions *netdev_actions;
4518
4519 netdev_actions = xmalloc(sizeof *netdev_actions + size);
4520 memcpy(netdev_actions->actions, actions, size);
4521 netdev_actions->size = size;
4522
4523 return netdev_actions;
4524 }
4525
4526 struct dp_netdev_actions *
4527 dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
4528 {
4529 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
4530 }
4531
4532 static void
4533 dp_netdev_actions_free(struct dp_netdev_actions *actions)
4534 {
4535 free(actions);
4536 }
4537 \f
4538 static void
4539 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
4540 enum rxq_cycles_counter_type type,
4541 unsigned long long cycles)
4542 {
4543 atomic_store_relaxed(&rx->cycles[type], cycles);
4544 }
4545
4546 static void
4547 dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
4548 enum rxq_cycles_counter_type type,
4549 unsigned long long cycles)
4550 {
4551 non_atomic_ullong_add(&rx->cycles[type], cycles);
4552 }
4553
4554 static uint64_t
4555 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
4556 enum rxq_cycles_counter_type type)
4557 {
4558 unsigned long long processing_cycles;
4559 atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
4560 return processing_cycles;
4561 }
4562
4563 static void
4564 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
4565 unsigned long long cycles)
4566 {
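/* 'cycles_intrvl' is a circular buffer: once PMD_RXQ_INTERVAL_MAX samples
 * have been stored, the oldest interval is overwritten. */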
4567 unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
4568 atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
4569 }
4570
4571 static uint64_t
4572 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
4573 {
4574 unsigned long long processing_cycles;
4575 atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
4576 return processing_cycles;
4577 }
4578
4579 #if ATOMIC_ALWAYS_LOCK_FREE_8B
4580 static inline bool
4581 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
4582 {
4583 bool pmd_perf_enabled;
4584 atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
4585 return pmd_perf_enabled;
4586 }
4587 #else
4588 /* If stores and reads of 64-bit integers are not atomic, the full PMD
4589 * performance metrics are not available as locked access to 64 bit
4590 * integers would be prohibitively expensive. */
4591 static inline bool
4592 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
4593 {
4594 return false;
4595 }
4596 #endif
4597
4598 static int
4599 dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
4600 struct tx_port *p)
4601 {
4602 int i;
4603 int tx_qid;
4604 int output_cnt;
4605 bool dynamic_txqs;
4606 struct cycle_timer timer;
4607 uint64_t cycles;
4608 uint32_t tx_flush_interval;
4609
4610 cycle_timer_start(&pmd->perf_stats, &timer);
4611
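/* With dynamic tx queues (the netdev has fewer tx queues than there are
 * datapath threads) a queue id is picked via XPS; otherwise every PMD
 * thread owns its own static tx queue id. */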
4612 dynamic_txqs = p->port->dynamic_txqs;
4613 if (dynamic_txqs) {
4614 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
4615 } else {
4616 tx_qid = pmd->static_tx_qid;
4617 }
4618
4619 output_cnt = dp_packet_batch_size(&p->output_pkts);
4620 ovs_assert(output_cnt > 0);
4621
4622 netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
4623 dp_packet_batch_init(&p->output_pkts);
4624
4625 /* Update time of the next flush. */
4626 atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
4627 p->flush_time = pmd->ctx.now + tx_flush_interval;
4628
4629 ovs_assert(pmd->n_output_batches > 0);
4630 pmd->n_output_batches--;
4631
4632 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
4633 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
4634
4635 /* Distribute send cycles evenly among transmitted packets and assign to
4636 * their respective rx queues. */
4637 cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
4638 for (i = 0; i < output_cnt; i++) {
4639 if (p->output_pkts_rxqs[i]) {
4640 dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
4641 RXQ_CYCLES_PROC_CURR, cycles);
4642 }
4643 }
4644
4645 return output_cnt;
4646 }
4647
4648 static int
4649 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
4650 bool force)
4651 {
4652 struct tx_port *p;
4653 int output_cnt = 0;
4654
4655 if (!pmd->n_output_batches) {
4656 return 0;
4657 }
4658
4659 HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
4660 if (!dp_packet_batch_is_empty(&p->output_pkts)
4661 && (force || pmd->ctx.now >= p->flush_time)) {
4662 output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
4663 }
4664 }
4665 return output_cnt;
4666 }
4667
4668 static int
4669 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
4670 struct dp_netdev_rxq *rxq,
4671 odp_port_t port_no)
4672 {
4673 struct pmd_perf_stats *s = &pmd->perf_stats;
4674 struct dp_packet_batch batch;
4675 struct cycle_timer timer;
4676 int error;
4677 int batch_cnt = 0;
4678 int rem_qlen = 0, *qlen_p = NULL;
4679 uint64_t cycles;
4680
4681 /* Measure duration for polling and processing rx burst. */
4682 cycle_timer_start(&pmd->perf_stats, &timer);
4683
4684 pmd->ctx.last_rxq = rxq;
4685 dp_packet_batch_init(&batch);
4686
4687 /* Fetch the rx queue length only for vhostuser ports. */
4688 if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
4689 qlen_p = &rem_qlen;
4690 }
4691
4692 error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
4693 if (!error) {
4694 /* At least one packet received. */
4695 *recirc_depth_get() = 0;
4696 pmd_thread_ctx_time_update(pmd);
4697 batch_cnt = dp_packet_batch_size(&batch);
4698 if (pmd_perf_metrics_enabled(pmd)) {
4699 /* Update batch histogram. */
4700 s->current.batches++;
4701 histogram_add_sample(&s->pkts_per_batch, batch_cnt);
4702 /* Update the maximum vhost rx queue fill level. */
4703 if (rxq->is_vhost && rem_qlen >= 0) {
4704 uint32_t qfill = batch_cnt + rem_qlen;
4705 if (qfill > s->current.max_vhost_qfill) {
4706 s->current.max_vhost_qfill = qfill;
4707 }
4708 }
4709 }
4710 /* Process packet batch. */
4711 dp_netdev_input(pmd, &batch, port_no);
4712
4713 /* Assign processing cycles to rx queue. */
4714 cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
4715 dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
4716
4717 dp_netdev_pmd_flush_output_packets(pmd, false);
4718 } else {
4719 /* Discard cycles. */
4720 cycle_timer_stop(&pmd->perf_stats, &timer);
4721 if (error != EAGAIN && error != EOPNOTSUPP) {
4722 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
4723
4724 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
4725 netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
4726 }
4727 }
4728
4729 pmd->ctx.last_rxq = NULL;
4730
4731 return batch_cnt;
4732 }
4733
4734 static struct tx_port *
4735 tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
4736 {
4737 struct tx_port *tx;
4738
4739 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
4740 if (tx->port->port_no == port_no) {
4741 return tx;
4742 }
4743 }
4744
4745 return NULL;
4746 }
4747
4748 static struct tx_bond *
4749 tx_bond_lookup(const struct cmap *tx_bonds, uint32_t bond_id)
4750 {
4751 uint32_t hash = hash_bond_id(bond_id);
4752 struct tx_bond *tx;
4753
4754 CMAP_FOR_EACH_WITH_HASH (tx, node, hash, tx_bonds) {
4755 if (tx->bond_id == bond_id) {
4756 return tx;
4757 }
4758 }
4759 return NULL;
4760 }
4761
4762 static int
4763 port_reconfigure(struct dp_netdev_port *port)
4764 {
4765 struct netdev *netdev = port->netdev;
4766 int i, err;
4767
4768 /* Closes the existing 'rxq's. */
4769 for (i = 0; i < port->n_rxq; i++) {
4770 netdev_rxq_close(port->rxqs[i].rx);
4771 port->rxqs[i].rx = NULL;
4772 }
4773 unsigned last_nrxq = port->n_rxq;
4774 port->n_rxq = 0;
4775
4776 /* Allows 'netdev' to apply the pending configuration changes. */
4777 if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
4778 err = netdev_reconfigure(netdev);
4779 if (err && (err != EOPNOTSUPP)) {
4780 VLOG_ERR("Failed to set interface %s new configuration",
4781 netdev_get_name(netdev));
4782 return err;
4783 }
4784 }
4785 /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
4786 port->rxqs = xrealloc(port->rxqs,
4787 sizeof *port->rxqs * netdev_n_rxq(netdev));
4788 /* Realloc 'used' counters for tx queues. */
4789 free(port->txq_used);
4790 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
4791
4792 for (i = 0; i < netdev_n_rxq(netdev); i++) {
4793 bool new_queue = i >= last_nrxq;
4794 if (new_queue) {
4795 memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
4796 }
4797
4798 port->rxqs[i].port = port;
4799 port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
4800
4801 err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
4802 if (err) {
4803 return err;
4804 }
4805 port->n_rxq++;
4806 }
4807
4808 /* Parse affinity list to apply configuration for new queues. */
4809 dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
4810
4811 /* If reconfiguration was successful, mark it as such so we can use it. */
4812 port->need_reconfigure = false;
4813
4814 return 0;
4815 }
4816
4817 struct rr_numa_list {
4818 struct hmap numas; /* Contains 'struct rr_numa' */
4819 };
4820
4821 struct rr_numa {
4822 struct hmap_node node;
4823
4824 int numa_id;
4825
4826 /* Non isolated pmds on numa node 'numa_id' */
4827 struct dp_netdev_pmd_thread **pmds;
4828 int n_pmds;
4829
4830 int cur_index;
4831 bool idx_inc;
4832 };
4833
4834 static struct rr_numa *
4835 rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
4836 {
4837 struct rr_numa *numa;
4838
4839 HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
4840 if (numa->numa_id == numa_id) {
4841 return numa;
4842 }
4843 }
4844
4845 return NULL;
4846 }
4847
4848 /* Returns the next node in numa list following 'numa' in round-robin fashion.
4849 * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
4850 * Returns NULL if 'rr' numa list is empty. */
4851 static struct rr_numa *
4852 rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
4853 {
4854 struct hmap_node *node = NULL;
4855
4856 if (numa) {
4857 node = hmap_next(&rr->numas, &numa->node);
4858 }
4859 if (!node) {
4860 node = hmap_first(&rr->numas);
4861 }
4862
4863 return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
4864 }
4865
4866 static void
4867 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
4868 {
4869 struct dp_netdev_pmd_thread *pmd;
4870 struct rr_numa *numa;
4871
4872 hmap_init(&rr->numas);
4873
4874 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4875 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
4876 continue;
4877 }
4878
4879 numa = rr_numa_list_lookup(rr, pmd->numa_id);
4880 if (!numa) {
4881 numa = xzalloc(sizeof *numa);
4882 numa->numa_id = pmd->numa_id;
4883 hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
4884 }
4885 numa->n_pmds++;
4886 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
4887 numa->pmds[numa->n_pmds - 1] = pmd;
4888 /* At least one pmd, so initialise 'cur_index' and 'idx_inc'. */
4889 numa->cur_index = 0;
4890 numa->idx_inc = true;
4891 }
4892 }
4893
4894 /*
4895 * Returns the next pmd from the numa node.
4896 *
4897 * If 'updown' is 'true' it will alternate between selecting the next pmd in
4898 * either an up or down walk, switching between up/down when the first or last
4899 * core is reached. e.g. 1,2,3,3,2,1,1,2...
4900 *
4901 * If 'updown' is 'false' it will select the next pmd wrapping around when last
4902 * core reached. e.g. 1,2,3,1,2,3,1,2...
4903 */
4904 static struct dp_netdev_pmd_thread *
4905 rr_numa_get_pmd(struct rr_numa *numa, bool updown)
4906 {
4907 int numa_idx = numa->cur_index;
4908
4909 if (numa->idx_inc == true) {
4910 /* Incrementing through list of pmds. */
4911 if (numa->cur_index == numa->n_pmds-1) {
4912 /* Reached the last pmd. */
4913 if (updown) {
4914 numa->idx_inc = false;
4915 } else {
4916 numa->cur_index = 0;
4917 }
4918 } else {
4919 numa->cur_index++;
4920 }
4921 } else {
4922 /* Decrementing through list of pmds. */
4923 if (numa->cur_index == 0) {
4924 /* Reached the first pmd. */
4925 numa->idx_inc = true;
4926 } else {
4927 numa->cur_index--;
4928 }
4929 }
4930 return numa->pmds[numa_idx];
4931 }
4932
4933 static void
4934 rr_numa_list_destroy(struct rr_numa_list *rr)
4935 {
4936 struct rr_numa *numa;
4937
4938 HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
4939 free(numa->pmds);
4940 free(numa);
4941 }
4942 hmap_destroy(&rr->numas);
4943 }
4944
4945 /* Sort Rx queues in descending order of the processing cycles they consume. */
4946 static int
4947 compare_rxq_cycles(const void *a, const void *b)
4948 {
4949 struct dp_netdev_rxq *qa;
4950 struct dp_netdev_rxq *qb;
4951 uint64_t cycles_qa, cycles_qb;
4952
4953 qa = *(struct dp_netdev_rxq **) a;
4954 qb = *(struct dp_netdev_rxq **) b;
4955
4956 cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
4957 cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
4958
4959 if (cycles_qa != cycles_qb) {
4960 return (cycles_qa < cycles_qb) ? 1 : -1;
4961 } else {
4962 /* Cycles are the same so tiebreak on port/queue id.
4963 * Tiebreaking (as opposed to return 0) ensures consistent
4964 * sort results across multiple OS's. */
4965 uint32_t port_qa = odp_to_u32(qa->port->port_no);
4966 uint32_t port_qb = odp_to_u32(qb->port->port_no);
4967 if (port_qa != port_qb) {
4968 return port_qa > port_qb ? 1 : -1;
4969 } else {
4970 return netdev_rxq_get_queue_id(qa->rx)
4971 - netdev_rxq_get_queue_id(qb->rx);
4972 }
4973 }
4974 }
4975
4976 /* Assign pmds to queues. If 'pinned' is true, assign pmds to pinned
4977 * queues and mark the pmds as isolated. Otherwise, assign non-isolated
4978 * pmds to unpinned queues.
4979 *
4980 * The function doesn't touch the pmd threads, it just stores the assignment
4981 * in the 'pmd' member of each rxq. */
4982 static void
4983 rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
4984 {
4985 struct dp_netdev_port *port;
4986 struct rr_numa_list rr;
4987 struct rr_numa *non_local_numa = NULL;
4988 struct dp_netdev_rxq ** rxqs = NULL;
4989 int n_rxqs = 0;
4990 struct rr_numa *numa = NULL;
4991 int numa_id;
4992 bool assign_cyc = dp->pmd_rxq_assign_cyc;
4993
4994 HMAP_FOR_EACH (port, node, &dp->ports) {
4995 if (!netdev_is_pmd(port->netdev)) {
4996 continue;
4997 }
4998
4999 for (int qid = 0; qid < port->n_rxq; qid++) {
5000 struct dp_netdev_rxq *q = &port->rxqs[qid];
5001
5002 if (pinned && q->core_id != OVS_CORE_UNSPEC) {
5003 struct dp_netdev_pmd_thread *pmd;
5004
5005 pmd = dp_netdev_get_pmd(dp, q->core_id);
5006 if (!pmd) {
5007 VLOG_WARN("There is no PMD thread on core %d. Queue "
5008 "%d on port \'%s\' will not be polled.",
5009 q->core_id, qid, netdev_get_name(port->netdev));
5010 } else {
5011 q->pmd = pmd;
5012 pmd->isolated = true;
5013 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
5014 "rx queue %d.", pmd->core_id, pmd->numa_id,
5015 netdev_rxq_get_name(q->rx),
5016 netdev_rxq_get_queue_id(q->rx));
5017 dp_netdev_pmd_unref(pmd);
5018 }
5019 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
5020 uint64_t cycle_hist = 0;
5021
5022 if (n_rxqs == 0) {
5023 rxqs = xmalloc(sizeof *rxqs);
5024 } else {
5025 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
5026 }
5027
5028 if (assign_cyc) {
5029 /* Sum the queue intervals and store the cycle history. */
5030 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5031 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
5032 }
5033 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
5034 cycle_hist);
5035 }
5036 /* Store the queue. */
5037 rxqs[n_rxqs++] = q;
5038 }
5039 }
5040 }
5041
5042 if (n_rxqs > 1 && assign_cyc) {
5043 /* Sort the queues in order of the processing cycles
5044 * they consumed during their last pmd interval. */
5045 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
5046 }
5047
5048 rr_numa_list_populate(dp, &rr);
5049 /* Assign the sorted queues to pmds in round robin. */
5050 for (int i = 0; i < n_rxqs; i++) {
5051 numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
5052 numa = rr_numa_list_lookup(&rr, numa_id);
5053 if (!numa) {
5054 /* There are no pmds on the queue's local NUMA node.
5055 * Round robin on the NUMA nodes that do have pmds. */
5056 non_local_numa = rr_numa_list_next(&rr, non_local_numa);
5057 if (!non_local_numa) {
5058 VLOG_ERR("There is no available (non-isolated) pmd "
5059 "thread for port \'%s\' queue %d. This queue "
5060 "will not be polled. Is pmd-cpu-mask set to "
5061 "zero? Or are all PMDs isolated to other "
5062 "queues?", netdev_rxq_get_name(rxqs[i]->rx),
5063 netdev_rxq_get_queue_id(rxqs[i]->rx));
5064 continue;
5065 }
5066 rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa, assign_cyc);
5067 VLOG_WARN("There's no available (non-isolated) pmd thread "
5068 "on numa node %d. Queue %d on port \'%s\' will "
5069 "be assigned to the pmd on core %d "
5070 "(numa node %d). Expect reduced performance.",
5071 numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
5072 netdev_rxq_get_name(rxqs[i]->rx),
5073 rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
5074 } else {
5075 rxqs[i]->pmd = rr_numa_get_pmd(numa, assign_cyc);
5076 if (assign_cyc) {
5077 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
5078 "rx queue %d "
5079 "(measured processing cycles %"PRIu64").",
5080 rxqs[i]->pmd->core_id, numa_id,
5081 netdev_rxq_get_name(rxqs[i]->rx),
5082 netdev_rxq_get_queue_id(rxqs[i]->rx),
5083 dp_netdev_rxq_get_cycles(rxqs[i],
5084 RXQ_CYCLES_PROC_HIST));
5085 } else {
5086 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
5087 "rx queue %d.", rxqs[i]->pmd->core_id, numa_id,
5088 netdev_rxq_get_name(rxqs[i]->rx),
5089 netdev_rxq_get_queue_id(rxqs[i]->rx));
5090 }
5091 }
5092 }
5093
5094 rr_numa_list_destroy(&rr);
5095 free(rxqs);
5096 }
5097
5098 static void
5099 reload_affected_pmds(struct dp_netdev *dp)
5100 {
5101 struct dp_netdev_pmd_thread *pmd;
5102
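/* First pass: flush the flow marks of every PMD thread that needs a reload
 * and ask it to reload. */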
5103 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5104 if (pmd->need_reload) {
5105 flow_mark_flush(pmd);
5106 dp_netdev_reload_pmd__(pmd);
5107 }
5108 }
5109
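/* Second pass: wait until each PMD thread has acknowledged its reload by
 * clearing 'pmd->reload', then clear 'need_reload'. */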
5110 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5111 if (pmd->need_reload) {
5112 if (pmd->core_id != NON_PMD_CORE_ID) {
5113 bool reload;
5114
5115 do {
5116 atomic_read_explicit(&pmd->reload, &reload,
5117 memory_order_acquire);
5118 } while (reload);
5119 }
5120 pmd->need_reload = false;
5121 }
5122 }
5123 }
5124
5125 static void
5126 reconfigure_pmd_threads(struct dp_netdev *dp)
5127 OVS_REQUIRES(dp->port_mutex)
5128 {
5129 struct dp_netdev_pmd_thread *pmd;
5130 struct ovs_numa_dump *pmd_cores;
5131 struct ovs_numa_info_core *core;
5132 struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
5133 struct hmapx_node *node;
5134 bool changed = false;
5135 bool need_to_adjust_static_tx_qids = false;
5136
5137 /* The pmd threads should be started only if there's a pmd port in the
5138 * datapath. If the user didn't provide any "pmd-cpu-mask", we start
5139 * NR_PMD_THREADS per numa node. */
5140 if (!has_pmd_port(dp)) {
5141 pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
5142 } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
5143 pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
5144 } else {
5145 pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
5146 }
5147
5148 /* We need to adjust 'static_tx_qid's only if we're reducing the number of
5149 * PMD threads. Otherwise, new threads will allocate all the freed ids. */
5150 if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
5151 /* Adjustment is required to keep 'static_tx_qid's sequential and
5152 * avoid possible issues, for example, imbalanced tx queue usage
5153 * and unnecessary locking caused by remapping on netdev level. */
5154 need_to_adjust_static_tx_qids = true;
5155 }
5156
5157 /* Check for unwanted pmd threads */
5158 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5159 if (pmd->core_id == NON_PMD_CORE_ID) {
5160 continue;
5161 }
5162 if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
5163 pmd->core_id)) {
5164 hmapx_add(&to_delete, pmd);
5165 } else if (need_to_adjust_static_tx_qids) {
5166 atomic_store_relaxed(&pmd->reload_tx_qid, true);
5167 pmd->need_reload = true;
5168 }
5169 }
5170
5171 HMAPX_FOR_EACH (node, &to_delete) {
5172 pmd = (struct dp_netdev_pmd_thread *) node->data;
5173 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
5174 pmd->numa_id, pmd->core_id);
5175 dp_netdev_del_pmd(dp, pmd);
5176 }
5177 changed = !hmapx_is_empty(&to_delete);
5178 hmapx_destroy(&to_delete);
5179
5180 if (need_to_adjust_static_tx_qids) {
5181 /* 'static_tx_qid's are not sequential now.
5182 * Reload remaining threads to fix this. */
5183 reload_affected_pmds(dp);
5184 }
5185
5186 /* Check for required new pmd threads */
5187 FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
5188 pmd = dp_netdev_get_pmd(dp, core->core_id);
5189 if (!pmd) {
5190 struct ds name = DS_EMPTY_INITIALIZER;
5191
5192 pmd = xzalloc(sizeof *pmd);
5193 dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
5194
5195 ds_put_format(&name, "pmd-c%02d/id:", core->core_id);
5196 pmd->thread = ovs_thread_create(ds_cstr(&name),
5197 pmd_thread_main, pmd);
5198 ds_destroy(&name);
5199
5200 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
5201 pmd->numa_id, pmd->core_id);
5202 changed = true;
5203 } else {
5204 dp_netdev_pmd_unref(pmd);
5205 }
5206 }
5207
5208 if (changed) {
5209 struct ovs_numa_info_numa *numa;
5210
5211 /* Log the number of pmd threads per numa node. */
5212 FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
5213 VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
5214 numa->n_cores, numa->numa_id);
5215 }
5216 }
5217
5218 ovs_numa_dump_destroy(pmd_cores);
5219 }
5220
5221 static void
5222 pmd_remove_stale_ports(struct dp_netdev *dp,
5223 struct dp_netdev_pmd_thread *pmd)
5224 OVS_EXCLUDED(pmd->port_mutex)
5225 OVS_REQUIRES(dp->port_mutex)
5226 {
5227 struct rxq_poll *poll, *poll_next;
5228 struct tx_port *tx, *tx_next;
5229
5230 ovs_mutex_lock(&pmd->port_mutex);
5231 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
5232 struct dp_netdev_port *port = poll->rxq->port;
5233
5234 if (port->need_reconfigure
5235 || !hmap_contains(&dp->ports, &port->node)) {
5236 dp_netdev_del_rxq_from_pmd(pmd, poll);
5237 }
5238 }
5239 HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
5240 struct dp_netdev_port *port = tx->port;
5241
5242 if (port->need_reconfigure
5243 || !hmap_contains(&dp->ports, &port->node)) {
5244 dp_netdev_del_port_tx_from_pmd(pmd, tx);
5245 }
5246 }
5247 ovs_mutex_unlock(&pmd->port_mutex);
5248 }
5249
5250 /* Must be called each time a port is added/removed or the cmask changes.
5251 * This creates and destroys pmd threads, reconfigures ports, opens their
5252 * rxqs and assigns all rxqs/txqs to pmd threads. */
5253 static void
5254 reconfigure_datapath(struct dp_netdev *dp)
5255 OVS_REQUIRES(dp->port_mutex)
5256 {
5257 struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads);
5258 struct dp_netdev_pmd_thread *pmd;
5259 struct dp_netdev_port *port;
5260 int wanted_txqs;
5261
5262 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
5263
5264 /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
5265 * on the system and the user configuration. */
5266 reconfigure_pmd_threads(dp);
5267
5268 wanted_txqs = cmap_count(&dp->poll_threads);
5269
5270 /* The number of pmd threads might have changed, or a port can be new:
5271 * adjust the txqs. */
5272 HMAP_FOR_EACH (port, node, &dp->ports) {
5273 netdev_set_tx_multiq(port->netdev, wanted_txqs);
5274 }
5275
5276 /* Step 2: Remove from the pmd threads ports that have been removed or
5277 * need reconfiguration. */
5278
5279 /* Check for all the ports that need reconfiguration. We cache this in
5280 * 'port->need_reconfigure', because netdev_is_reconf_required() can
5281 * change at any time.
5282 * Also mark for reconfiguration all ports which will likely change their
5283 * 'dynamic_txqs' parameter. It's required to stop using them before
5284 * changing this setting and it's simpler to mark ports here and allow
5285 * 'pmd_remove_stale_ports' to remove them from threads. There will be
5286 * no actual reconfiguration in 'port_reconfigure' because it's
5287 * unnecessary. */
5288 HMAP_FOR_EACH (port, node, &dp->ports) {
5289 if (netdev_is_reconf_required(port->netdev)
5290 || (port->dynamic_txqs
5291 != (netdev_n_txq(port->netdev) < wanted_txqs))) {
5292 port->need_reconfigure = true;
5293 }
5294 }
5295
5296 /* Remove from the pmd threads all the ports that have been deleted or
5297 * need reconfiguration. */
5298 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5299 pmd_remove_stale_ports(dp, pmd);
5300 }
5301
5302 /* Reload affected pmd threads. We must wait for the pmd threads before
5303 * reconfiguring the ports, because a port cannot be reconfigured while
5304 * it's being used. */
5305 reload_affected_pmds(dp);
5306
5307 /* Step 3: Reconfigure ports. */
5308
5309 /* We only reconfigure the ports that we determined above, because they're
5310 * not being used by any pmd thread at the moment. If a port fails to
5311 * reconfigure we remove it from the datapath. */
5312 struct dp_netdev_port *next_port;
5313 HMAP_FOR_EACH_SAFE (port, next_port, node, &dp->ports) {
5314 int err;
5315
5316 if (!port->need_reconfigure) {
5317 continue;
5318 }
5319
5320 err = port_reconfigure(port);
5321 if (err) {
5322 hmap_remove(&dp->ports, &port->node);
5323 seq_change(dp->port_seq);
5324 port_destroy(port);
5325 } else {
5326 port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
5327 }
5328 }
5329
5330 /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads
5331 * for now, we just update the 'pmd' pointer in each rxq to point to the
5332 * wanted thread according to the scheduling policy. */
5333
5334 /* Reset all the pmd threads to non isolated. */
5335 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5336 pmd->isolated = false;
5337 }
5338
5339 /* Reset all the queues to unassigned */
5340 HMAP_FOR_EACH (port, node, &dp->ports) {
5341 for (int i = 0; i < port->n_rxq; i++) {
5342 port->rxqs[i].pmd = NULL;
5343 }
5344 }
5345
5346 /* Add pinned queues and mark pmd threads isolated. */
5347 rxq_scheduling(dp, true);
5348
5349 /* Add non-pinned queues. */
5350 rxq_scheduling(dp, false);
5351
5352 /* Step 5: Remove queues not compliant with new scheduling. */
5353
5354 /* Count all the threads that will have at least one queue to poll. */
5355 HMAP_FOR_EACH (port, node, &dp->ports) {
5356 for (int qid = 0; qid < port->n_rxq; qid++) {
5357 struct dp_netdev_rxq *q = &port->rxqs[qid];
5358
5359 if (q->pmd) {
5360 hmapx_add(&busy_threads, q->pmd);
5361 }
5362 }
5363 }
5364
5365 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5366 struct rxq_poll *poll, *poll_next;
5367
5368 ovs_mutex_lock(&pmd->port_mutex);
5369 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
5370 if (poll->rxq->pmd != pmd) {
5371 dp_netdev_del_rxq_from_pmd(pmd, poll);
5372
5373 /* This pmd might sleep after this step if it has no rxq
5374 * remaining. Tell it to busy wait for new assignment if it
5375 * has at least one scheduled queue. */
5376 if (hmap_count(&pmd->poll_list) == 0 &&
5377 hmapx_contains(&busy_threads, pmd)) {
5378 atomic_store_relaxed(&pmd->wait_for_reload, true);
5379 }
5380 }
5381 }
5382 ovs_mutex_unlock(&pmd->port_mutex);
5383 }
5384
5385 hmapx_destroy(&busy_threads);
5386
5387 /* Reload affected pmd threads. We must wait for the pmd threads to remove
5388 * the old queues before re-adding them; otherwise a queue can be polled by
5389 * two threads at the same time. */
5390 reload_affected_pmds(dp);
5391
5392 /* Step 6: Add queues from scheduling, if they're not there already. */
5393 HMAP_FOR_EACH (port, node, &dp->ports) {
5394 if (!netdev_is_pmd(port->netdev)) {
5395 continue;
5396 }
5397
5398 for (int qid = 0; qid < port->n_rxq; qid++) {
5399 struct dp_netdev_rxq *q = &port->rxqs[qid];
5400
5401 if (q->pmd) {
5402 ovs_mutex_lock(&q->pmd->port_mutex);
5403 dp_netdev_add_rxq_to_pmd(q->pmd, q);
5404 ovs_mutex_unlock(&q->pmd->port_mutex);
5405 }
5406 }
5407 }
5408
5409 /* Add every port and bond to the tx port and bond caches of
5410 * every pmd thread, if it's not there already and if this pmd
5411 * has at least one rxq to poll.
5412 */
5413 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5414 ovs_mutex_lock(&pmd->port_mutex);
5415 if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
5416 struct tx_bond *bond;
5417
5418 HMAP_FOR_EACH (port, node, &dp->ports) {
5419 dp_netdev_add_port_tx_to_pmd(pmd, port);
5420 }
5421
5422 CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
5423 dp_netdev_add_bond_tx_to_pmd(pmd, bond, false);
5424 }
5425 }
5426 ovs_mutex_unlock(&pmd->port_mutex);
5427 }
5428
5429 /* Reload affected pmd threads. */
5430 reload_affected_pmds(dp);
5431
5432 /* Check if PMD Auto LB is to be enabled */
5433 set_pmd_auto_lb(dp);
5434 }
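
/* Recap of reconfigure_datapath() above (descriptive note, not upstream
 * documentation): stale or to-be-reconfigured ports are first removed from
 * the pmd threads and the threads are reloaded; only then are the ports
 * reconfigured, since a port cannot be reconfigured while in use.  Rxqs are
 * then rescheduled, queues that no longer match the new scheduling are
 * removed (another reload), the new assignments plus the tx port/bond caches
 * are added, and a final reload makes the pmd threads pick up the new
 * polling lists. */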
5435
5436 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
5437 static bool
5438 ports_require_restart(const struct dp_netdev *dp)
5439 OVS_REQUIRES(dp->port_mutex)
5440 {
5441 struct dp_netdev_port *port;
5442
5443 HMAP_FOR_EACH (port, node, &dp->ports) {
5444 if (netdev_is_reconf_required(port->netdev)) {
5445 return true;
5446 }
5447 }
5448
5449 return false;
5450 }
5451
5452 /* Calculates the variance of the values stored in array 'a'. 'n' is the
5453 * number of elements in the array to consider when calculating the variance.
5454 * Usage example: data array 'a' contains the processing load of each pmd and
5455 * 'n' is the number of PMDs. It returns the variance in the processing load
5456 * of the PMDs. */
5457 static uint64_t
5458 variance(uint64_t a[], int n)
5459 {
5460 /* Compute mean (average of elements). */
5461 uint64_t sum = 0;
5462 uint64_t mean = 0;
5463 uint64_t sqDiff = 0;
5464
5465 if (!n) {
5466 return 0;
5467 }
5468
5469 for (int i = 0; i < n; i++) {
5470 sum += a[i];
5471 }
5472
5473 if (sum) {
5474 mean = sum / n;
5475
5476 /* Compute sum squared differences with mean. */
5477 for (int i = 0; i < n; i++) {
5478 sqDiff += (a[i] - mean)*(a[i] - mean);
5479 }
5480 }
5481 return (sqDiff ? (sqDiff / n) : 0);
5482 }
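
/* Worked example for variance() (illustrative only, hypothetical values):
 * for two PMD loads a[] = { 50, 90 } and n = 2:
 *     sum    = 140, mean = 140 / 2 = 70
 *     sqDiff = (50 - 70)^2 + (90 - 70)^2 = 400 + 400 = 800
 *     result = sqDiff / n = 800 / 2 = 400
 * All arithmetic is integer arithmetic, so the mean and the result are
 * truncated rather than rounded. */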
5483
5484
5485 /* Returns the variance in PMD usage as part of a dry run of rxq
5486 * assignment to PMDs. */
5487 static bool
5488 get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
5489 uint32_t num_pmds, uint64_t *predicted_variance)
5490 OVS_REQUIRES(dp->port_mutex)
5491 {
5492 struct dp_netdev_port *port;
5493 struct dp_netdev_pmd_thread *pmd;
5494 struct dp_netdev_rxq **rxqs = NULL;
5495 struct rr_numa *numa = NULL;
5496 struct rr_numa_list rr;
5497 int n_rxqs = 0;
5498 bool ret = false;
5499 uint64_t *pmd_usage;
5500
5501 if (!predicted_variance) {
5502 return ret;
5503 }
5504
5505 pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5506
5507 HMAP_FOR_EACH (port, node, &dp->ports) {
5508 if (!netdev_is_pmd(port->netdev)) {
5509 continue;
5510 }
5511
5512 for (int qid = 0; qid < port->n_rxq; qid++) {
5513 struct dp_netdev_rxq *q = &port->rxqs[qid];
5514 uint64_t cycle_hist = 0;
5515
5516 if (q->pmd->isolated) {
5517 continue;
5518 }
5519
5520 if (n_rxqs == 0) {
5521 rxqs = xmalloc(sizeof *rxqs);
5522 } else {
5523 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
5524 }
5525
5526 /* Sum the queue intervals and store the cycle history. */
5527 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5528 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
5529 }
5530 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
5531 cycle_hist);
5532 /* Store the queue. */
5533 rxqs[n_rxqs++] = q;
5534 }
5535 }
5536 if (n_rxqs > 1) {
5537 /* Sort the queues in order of the processing cycles
5538 * they consumed during their last pmd interval. */
5539 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
5540 }
5541 rr_numa_list_populate(dp, &rr);
5542
5543 for (int i = 0; i < n_rxqs; i++) {
5544 int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
5545 numa = rr_numa_list_lookup(&rr, numa_id);
5546 if (!numa) {
5547 /* Abort if cross NUMA polling. */
5548 VLOG_DBG("PMD auto lb dry run."
5549 " Aborting due to cross-numa polling.");
5550 goto cleanup;
5551 }
5552
5553 pmd = rr_numa_get_pmd(numa, true);
5554 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d on numa node %d "
5555 "to be assigned port \'%s\' rx queue %d "
5556 "(measured processing cycles %"PRIu64").",
5557 pmd->core_id, numa_id,
5558 netdev_rxq_get_name(rxqs[i]->rx),
5559 netdev_rxq_get_queue_id(rxqs[i]->rx),
5560 dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
5561
5562 for (int id = 0; id < num_pmds; id++) {
5563 if (pmd->core_id == core_list[id]) {
5564 /* Add the processing cycles of rxq to pmd polling it. */
5565 pmd_usage[id] += dp_netdev_rxq_get_cycles(rxqs[i],
5566 RXQ_CYCLES_PROC_HIST);
5567 }
5568 }
5569 }
5570
5571 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5572 uint64_t total_cycles = 0;
5573
5574 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5575 continue;
5576 }
5577
5578 /* Get the total pmd cycles for an interval. */
5579 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5580 /* Estimate the cycles to cover all intervals. */
5581 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5582 for (int id = 0; id < num_pmds; id++) {
5583 if (pmd->core_id == core_list[id]) {
5584 if (pmd_usage[id]) {
5585 pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
5586 }
5587 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d, "
5588 "usage %"PRIu64"", pmd->core_id, pmd_usage[id]);
5589 }
5590 }
5591 }
5592 *predicted_variance = variance(pmd_usage, num_pmds);
5593 ret = true;
5594
5595 cleanup:
5596 rr_numa_list_destroy(&rr);
5597 free(rxqs);
5598 free(pmd_usage);
5599 return ret;
5600 }
5601
5602 /* Performs a dry run of rxq assignment to PMDs and returns true if it gives
5603 * a better distribution of load across the PMDs. */
5604 static bool
5605 pmd_rebalance_dry_run(struct dp_netdev *dp)
5606 OVS_REQUIRES(dp->port_mutex)
5607 {
5608 struct dp_netdev_pmd_thread *pmd;
5609 uint64_t *curr_pmd_usage;
5610
5611 uint64_t curr_variance;
5612 uint64_t new_variance;
5613 uint64_t improvement = 0;
5614 uint32_t num_pmds;
5615 uint32_t *pmd_corelist;
5616 struct rxq_poll *poll;
5617 bool ret;
5618
5619 num_pmds = cmap_count(&dp->poll_threads);
5620
5621 if (num_pmds > 1) {
5622 curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5623 pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
5624 } else {
5625 return false;
5626 }
5627
5628 num_pmds = 0;
5629 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5630 uint64_t total_cycles = 0;
5631 uint64_t total_proc = 0;
5632
5633 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5634 continue;
5635 }
5636
5637 /* Get the total pmd cycles for an interval. */
5638 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5639 /* Estimate the cycles to cover all intervals. */
5640 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5641
5642 ovs_mutex_lock(&pmd->port_mutex);
5643 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5644 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5645 total_proc += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
5646 }
5647 }
5648 ovs_mutex_unlock(&pmd->port_mutex);
5649
5650 if (total_proc) {
5651 curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
5652 }
5653
5654 VLOG_DBG("PMD auto lb dry run. Current: Core %d, usage %"PRIu64"",
5655 pmd->core_id, curr_pmd_usage[num_pmds]);
5656
5657 if (atomic_count_get(&pmd->pmd_overloaded)) {
5658 atomic_count_set(&pmd->pmd_overloaded, 0);
5659 }
5660
5661 pmd_corelist[num_pmds] = pmd->core_id;
5662 num_pmds++;
5663 }
5664
5665 curr_variance = variance(curr_pmd_usage, num_pmds);
5666 ret = get_dry_run_variance(dp, pmd_corelist, num_pmds, &new_variance);
5667
5668 if (ret) {
5669 VLOG_DBG("PMD auto lb dry run. Current PMD variance: %"PRIu64","
5670 " Predicted PMD variance: %"PRIu64"",
5671 curr_variance, new_variance);
5672
5673 if (new_variance < curr_variance) {
5674 improvement =
5675 ((curr_variance - new_variance) * 100) / curr_variance;
5676 }
5677 if (improvement < ALB_ACCEPTABLE_IMPROVEMENT) {
5678 ret = false;
5679 }
5680 }
5681
5682 free(curr_pmd_usage);
5683 free(pmd_corelist);
5684 return ret;
5685 }
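
/* Illustrative example for the dry run decision above (hypothetical
 * numbers): with a current variance of 400 and a predicted variance of 100,
 * improvement = ((400 - 100) * 100) / 400 = 75; with a predicted variance of
 * 350 it is only ((400 - 350) * 100) / 400 = 12.  The reassignment is
 * requested only when the improvement meets or exceeds
 * ALB_ACCEPTABLE_IMPROVEMENT. */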
5686
5687
5688 /* Return true if needs to revalidate datapath flows. */
5689 static bool
5690 dpif_netdev_run(struct dpif *dpif)
5691 {
5692 struct dp_netdev_port *port;
5693 struct dp_netdev *dp = get_dp_netdev(dpif);
5694 struct dp_netdev_pmd_thread *non_pmd;
5695 uint64_t new_tnl_seq;
5696 bool need_to_flush = true;
5697 bool pmd_rebalance = false;
5698 long long int now = time_msec();
5699 struct dp_netdev_pmd_thread *pmd;
5700
5701 ovs_mutex_lock(&dp->port_mutex);
5702 non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
5703 if (non_pmd) {
5704 ovs_mutex_lock(&dp->non_pmd_mutex);
5705 HMAP_FOR_EACH (port, node, &dp->ports) {
5706 if (!netdev_is_pmd(port->netdev)) {
5707 int i;
5708
5709 if (port->emc_enabled) {
5710 atomic_read_relaxed(&dp->emc_insert_min,
5711 &non_pmd->ctx.emc_insert_min);
5712 } else {
5713 non_pmd->ctx.emc_insert_min = 0;
5714 }
5715
5716 for (i = 0; i < port->n_rxq; i++) {
5717
5718 if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
5719 continue;
5720 }
5721
5722 if (dp_netdev_process_rxq_port(non_pmd,
5723 &port->rxqs[i],
5724 port->port_no)) {
5725 need_to_flush = false;
5726 }
5727 }
5728 }
5729 }
5730 if (need_to_flush) {
5731 /* We didn't receive anything in the process loop.
5732 * Check if we need to send something.
5733 * There were no time updates in the current iteration. */
5734 pmd_thread_ctx_time_update(non_pmd);
5735 dp_netdev_pmd_flush_output_packets(non_pmd, false);
5736 }
5737
5738 dpif_netdev_xps_revalidate_pmd(non_pmd, false);
5739 ovs_mutex_unlock(&dp->non_pmd_mutex);
5740
5741 dp_netdev_pmd_unref(non_pmd);
5742 }
5743
5744 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
5745 if (pmd_alb->is_enabled) {
5746 if (!pmd_alb->rebalance_poll_timer) {
5747 pmd_alb->rebalance_poll_timer = now;
5748 } else if ((pmd_alb->rebalance_poll_timer +
5749 pmd_alb->rebalance_intvl) < now) {
5750 pmd_alb->rebalance_poll_timer = now;
5751 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5752 if (atomic_count_get(&pmd->pmd_overloaded) >=
5753 PMD_RXQ_INTERVAL_MAX) {
5754 pmd_rebalance = true;
5755 break;
5756 }
5757 }
5758
5759 if (pmd_rebalance &&
5760 !dp_netdev_is_reconf_required(dp) &&
5761 !ports_require_restart(dp) &&
5762 pmd_rebalance_dry_run(dp)) {
5763 VLOG_INFO("PMD auto lb dry run."
5764 " requesting datapath reconfigure.");
5765 dp_netdev_request_reconfigure(dp);
5766 }
5767 }
5768 }
5769
5770 if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
5771 reconfigure_datapath(dp);
5772 }
5773 ovs_mutex_unlock(&dp->port_mutex);
5774
5775 tnl_neigh_cache_run();
5776 tnl_port_map_run();
5777 new_tnl_seq = seq_read(tnl_conf_seq);
5778
5779 if (dp->last_tnl_conf_seq != new_tnl_seq) {
5780 dp->last_tnl_conf_seq = new_tnl_seq;
5781 return true;
5782 }
5783 return false;
5784 }
5785
5786 static void
5787 dpif_netdev_wait(struct dpif *dpif)
5788 {
5789 struct dp_netdev_port *port;
5790 struct dp_netdev *dp = get_dp_netdev(dpif);
5791
5792 ovs_mutex_lock(&dp_netdev_mutex);
5793 ovs_mutex_lock(&dp->port_mutex);
5794 HMAP_FOR_EACH (port, node, &dp->ports) {
5795 netdev_wait_reconf_required(port->netdev);
5796 if (!netdev_is_pmd(port->netdev)) {
5797 int i;
5798
5799 for (i = 0; i < port->n_rxq; i++) {
5800 netdev_rxq_wait(port->rxqs[i].rx);
5801 }
5802 }
5803 }
5804 ovs_mutex_unlock(&dp->port_mutex);
5805 ovs_mutex_unlock(&dp_netdev_mutex);
5806 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
5807 }
5808
5809 static void
5810 pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
5811 {
5812 struct tx_port *tx_port_cached;
5813
5814 /* Flush all the queued packets. */
5815 dp_netdev_pmd_flush_output_packets(pmd, true);
5816 /* Free all used tx queue ids. */
5817 dpif_netdev_xps_revalidate_pmd(pmd, true);
5818
5819 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
5820 free(tx_port_cached);
5821 }
5822 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
5823 free(tx_port_cached);
5824 }
5825 }
5826
5827 /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
5828 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
5829 * device, otherwise to 'pmd->send_port_cache' if the port has at least
5830 * one txq. */
5831 static void
5832 pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
5833 OVS_REQUIRES(pmd->port_mutex)
5834 {
5835 struct tx_port *tx_port, *tx_port_cached;
5836
5837 pmd_free_cached_ports(pmd);
5838 hmap_shrink(&pmd->send_port_cache);
5839 hmap_shrink(&pmd->tnl_port_cache);
5840
5841 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
5842 if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
5843 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5844 hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
5845 hash_port_no(tx_port_cached->port->port_no));
5846 }
5847
5848 if (netdev_n_txq(tx_port->port->netdev)) {
5849 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5850 hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
5851 hash_port_no(tx_port_cached->port->port_no));
5852 }
5853 }
5854 }
5855
5856 static void
5857 pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5858 {
5859 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5860 if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
5861 VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
5862 ", numa_id %d.", pmd->core_id, pmd->numa_id);
5863 }
5864 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5865
5866 VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
5867 ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
5868 }
5869
5870 static void
5871 pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5872 {
5873 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5874 id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
5875 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5876 }
5877
5878 static int
5879 pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
5880 struct polled_queue **ppoll_list)
5881 {
5882 struct polled_queue *poll_list = *ppoll_list;
5883 struct rxq_poll *poll;
5884 int i;
5885
5886 ovs_mutex_lock(&pmd->port_mutex);
5887 poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
5888 * sizeof *poll_list);
5889
5890 i = 0;
5891 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5892 poll_list[i].rxq = poll->rxq;
5893 poll_list[i].port_no = poll->rxq->port->port_no;
5894 poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
5895 poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
5896 poll_list[i].change_seq =
5897 netdev_get_change_seq(poll->rxq->port->netdev);
5898 i++;
5899 }
5900
5901 pmd_load_cached_ports(pmd);
5902
5903 ovs_mutex_unlock(&pmd->port_mutex);
5904
5905 *ppoll_list = poll_list;
5906 return i;
5907 }
5908
5909 static void *
5910 pmd_thread_main(void *f_)
5911 {
5912 struct dp_netdev_pmd_thread *pmd = f_;
5913 struct pmd_perf_stats *s = &pmd->perf_stats;
5914 unsigned int lc = 0;
5915 struct polled_queue *poll_list;
5916 bool wait_for_reload = false;
5917 bool reload_tx_qid;
5918 bool exiting;
5919 bool reload;
5920 int poll_cnt;
5921 int i;
5922 int process_packets = 0;
5923
5924 poll_list = NULL;
5925
5926 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
5927 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
5928 ovs_numa_thread_setaffinity_core(pmd->core_id);
5929 dpdk_set_lcore_id(pmd->core_id);
5930 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
5931 dfc_cache_init(&pmd->flow_cache);
5932 pmd_alloc_static_tx_qid(pmd);
5933
5934 reload:
5935 atomic_count_init(&pmd->pmd_overloaded, 0);
5936
5937 /* List port/core affinity */
5938 for (i = 0; i < poll_cnt; i++) {
5939 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
5940 pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
5941 netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
5942 /* Reset the rxq current cycles counter. */
5943 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
5944 }
5945
5946 if (!poll_cnt) {
5947 if (wait_for_reload) {
5948 /* Don't sleep, control thread will ask for a reload shortly. */
5949 do {
5950 atomic_read_explicit(&pmd->reload, &reload,
5951 memory_order_acquire);
5952 } while (!reload);
5953 } else {
5954 while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
5955 seq_wait(pmd->reload_seq, pmd->last_reload_seq);
5956 poll_block();
5957 }
5958 }
5959 }
5960
5961 pmd->intrvl_tsc_prev = 0;
5962 atomic_store_relaxed(&pmd->intrvl_cycles, 0);
5963 cycles_counter_update(s);
5964
5965 pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
5966
5967 /* Protect pmd stats from external clearing while polling. */
5968 ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
5969 for (;;) {
5970 uint64_t rx_packets = 0, tx_packets = 0;
5971
5972 pmd_perf_start_iteration(s);
5973
5974 for (i = 0; i < poll_cnt; i++) {
5975
5976 if (!poll_list[i].rxq_enabled) {
5977 continue;
5978 }
5979
5980 if (poll_list[i].emc_enabled) {
5981 atomic_read_relaxed(&pmd->dp->emc_insert_min,
5982 &pmd->ctx.emc_insert_min);
5983 } else {
5984 pmd->ctx.emc_insert_min = 0;
5985 }
5986
5987 process_packets =
5988 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
5989 poll_list[i].port_no);
5990 rx_packets += process_packets;
5991 }
5992
5993 if (!rx_packets) {
5994 /* We didn't receive anything in the process loop.
5995 * Check if we need to send something.
5996 * There were no time updates in the current iteration. */
5997 pmd_thread_ctx_time_update(pmd);
5998 tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
5999 }
6000
6001 /* Do RCU synchronization at a fixed interval. This ensures that
6002 * synchronization is not delayed for long even under a heavy
6003 * packet-processing load. */
6004 if (pmd->ctx.now > pmd->next_rcu_quiesce) {
6005 if (!ovsrcu_try_quiesce()) {
6006 pmd->next_rcu_quiesce =
6007 pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6008 }
6009 }
6010
6011 if (lc++ > 1024) {
6012 lc = 0;
6013
6014 coverage_try_clear();
6015 dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
6016 if (!ovsrcu_try_quiesce()) {
6017 emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
6018 pmd->next_rcu_quiesce =
6019 pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6020 }
6021
6022 for (i = 0; i < poll_cnt; i++) {
6023 uint64_t current_seq =
6024 netdev_get_change_seq(poll_list[i].rxq->port->netdev);
6025 if (poll_list[i].change_seq != current_seq) {
6026 poll_list[i].change_seq = current_seq;
6027 poll_list[i].rxq_enabled =
6028 netdev_rxq_enabled(poll_list[i].rxq->rx);
6029 }
6030 }
6031 }
6032
6033 atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
6034 if (OVS_UNLIKELY(reload)) {
6035 break;
6036 }
6037
6038 pmd_perf_end_iteration(s, rx_packets, tx_packets,
6039 pmd_perf_metrics_enabled(pmd));
6040 }
6041 ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
6042
6043 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
6044 atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
6045 atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
6046 atomic_read_relaxed(&pmd->exit, &exiting);
6047 /* Signal here to make sure the pmd finishes
6048 * reloading the updated configuration. */
6049 dp_netdev_pmd_reload_done(pmd);
6050
6051 if (reload_tx_qid) {
6052 pmd_free_static_tx_qid(pmd);
6053 pmd_alloc_static_tx_qid(pmd);
6054 }
6055
6056 if (!exiting) {
6057 goto reload;
6058 }
6059
6060 pmd_free_static_tx_qid(pmd);
6061 dfc_cache_uninit(&pmd->flow_cache);
6062 free(poll_list);
6063 pmd_free_cached_ports(pmd);
6064 return NULL;
6065 }
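
/* Reload protocol recap for pmd_thread_main() (descriptive note): the main
 * thread sets 'pmd->reload' (and possibly 'wait_for_reload', 'reload_tx_qid'
 * or 'exit') and wakes the pmd; the polling loop above notices the flag with
 * an acquire load, breaks out, re-reads its queue and port lists,
 * acknowledges via dp_netdev_pmd_reload_done(), and then either jumps back
 * to the 'reload:' label or tears down its caches and exits. */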
6066
6067 static void
6068 dp_netdev_disable_upcall(struct dp_netdev *dp)
6069 OVS_ACQUIRES(dp->upcall_rwlock)
6070 {
6071 fat_rwlock_wrlock(&dp->upcall_rwlock);
6072 }
6073
6074 \f
6075 /* Meters */
6076 static void
6077 dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
6078 struct ofputil_meter_features *features)
6079 {
6080 features->max_meters = MAX_METERS;
6081 features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
6082 features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
6083 features->max_bands = MAX_BANDS;
6084 features->max_color = 0;
6085 }
6086
6087 /* Applies the meter identified by 'meter_id' to 'packets_'. Packets
6088 * that exceed a band are dropped in-place. */
6089 static void
6090 dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
6091 uint32_t meter_id, long long int now)
6092 {
6093 struct dp_meter *meter;
6094 struct dp_meter_band *band;
6095 struct dp_packet *packet;
6096 long long int long_delta_t; /* msec */
6097 uint32_t delta_t; /* msec */
6098 const size_t cnt = dp_packet_batch_size(packets_);
6099 uint32_t bytes, volume;
6100 int exceeded_band[NETDEV_MAX_BURST];
6101 uint32_t exceeded_rate[NETDEV_MAX_BURST];
6102 int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
6103
6104 if (meter_id >= MAX_METERS) {
6105 return;
6106 }
6107
6108 meter_lock(dp, meter_id);
6109 meter = dp->meters[meter_id];
6110 if (!meter) {
6111 goto out;
6112 }
6113
6114 /* Initialize as negative values. */
6115 memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
6116 /* Initialize as zeroes. */
6117 memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
6118
6119 /* All packets will hit the meter at the same time. */
6120 long_delta_t = now / 1000 - meter->used / 1000; /* msec */
6121
6122 if (long_delta_t < 0) {
6123 /* This condition means that several threads are fighting for the
6124 meter lock and the one that received its packets a bit later wins.
6125 We assume that all racing threads received their packets at the same
6126 time, to avoid overflow. */
6127 long_delta_t = 0;
6128 }
6129
6130 /* Make sure delta_t will not be too large, so that bucket will not
6131 * wrap around below. */
6132 delta_t = (long_delta_t > (long long int)meter->max_delta_t)
6133 ? meter->max_delta_t : (uint32_t)long_delta_t;
6134
6135 /* Update meter stats. */
6136 meter->used = now;
6137 meter->packet_count += cnt;
6138 bytes = 0;
6139 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6140 bytes += dp_packet_size(packet);
6141 }
6142 meter->byte_count += bytes;
6143
6144 /* Meters can operate in terms of packets per second or kilobits per
6145 * second. */
6146 if (meter->flags & OFPMF13_PKTPS) {
6147 /* Rate in packets/second, bucket 1/1000 packets. */
6148 /* msec * packets/sec = 1/1000 packets. */
6149 volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
6150 } else {
6151 /* Rate in kbps, bucket in bits. */
6152 /* msec * kbps = bits */
6153 volume = bytes * 8;
6154 }
6155
6156 /* Update all bands and find the one hit with the highest rate for each
6157 * packet (if any). */
6158 for (int m = 0; m < meter->n_bands; ++m) {
6159 band = &meter->bands[m];
6160
6161 /* Update band's bucket. */
6162 band->bucket += delta_t * band->up.rate;
6163 if (band->bucket > band->up.burst_size) {
6164 band->bucket = band->up.burst_size;
6165 }
6166
6167 /* Drain the bucket for all the packets, if possible. */
6168 if (band->bucket >= volume) {
6169 band->bucket -= volume;
6170 } else {
6171 int band_exceeded_pkt;
6172
6173 /* Band limit hit, must process packet-by-packet. */
6174 if (meter->flags & OFPMF13_PKTPS) {
6175 band_exceeded_pkt = band->bucket / 1000;
6176 band->bucket %= 1000; /* Remainder stays in bucket. */
6177
6178 /* Update the exceeding band for each exceeding packet.
6179 * (Only one band will be fired by a packet, and that
6180 * can be different for each packet.) */
6181 for (int i = band_exceeded_pkt; i < cnt; i++) {
6182 if (band->up.rate > exceeded_rate[i]) {
6183 exceeded_rate[i] = band->up.rate;
6184 exceeded_band[i] = m;
6185 }
6186 }
6187 } else {
6188 /* Packet sizes differ, must process one-by-one. */
6189 band_exceeded_pkt = cnt;
6190 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6191 uint32_t bits = dp_packet_size(packet) * 8;
6192
6193 if (band->bucket >= bits) {
6194 band->bucket -= bits;
6195 } else {
6196 if (i < band_exceeded_pkt) {
6197 band_exceeded_pkt = i;
6198 }
6199 /* Update the exceeding band for the exceeding packet.
6200 * (Only one band will be fired by a packet, and that
6201 * can be different for each packet.) */
6202 if (band->up.rate > exceeded_rate[i]) {
6203 exceeded_rate[i] = band->up.rate;
6204 exceeded_band[i] = m;
6205 }
6206 }
6207 }
6208 }
6209 /* Remember the first exceeding packet. */
6210 if (exceeded_pkt > band_exceeded_pkt) {
6211 exceeded_pkt = band_exceeded_pkt;
6212 }
6213 }
6214 }
6215
6216 /* Fire the highest rate band exceeded by each packet, and drop
6217 * packets if needed. */
6218 size_t j;
6219 DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
6220 if (exceeded_band[j] >= 0) {
6221 /* Meter drop packet. */
6222 band = &meter->bands[exceeded_band[j]];
6223 band->packet_count += 1;
6224 band->byte_count += dp_packet_size(packet);
6225 COVERAGE_INC(datapath_drop_meter);
6226 dp_packet_delete(packet);
6227 } else {
6228 /* Meter accepts packet. */
6229 dp_packet_batch_refill(packets_, packet, j);
6230 }
6231 }
6232 out:
6233 meter_unlock(dp, meter_id);
6234 }
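
/* Illustrative example for the kbps path above (hypothetical numbers): a
 * single DROP band with rate = 1000 (kbps) and a stored burst of 100000 bits
 * (after the x1000 conversion in dpif_netdev_meter_set() below).  With
 * delta_t = 50 msec the bucket gains 50 * 1000 = 50000 bits, capped at the
 * 100000-bit burst.  A batch of ten 1500-byte packets needs
 * volume = 15000 * 8 = 120000 bits, more than the bucket holds, so the
 * per-packet path runs: eight packets drain 8 * 12000 = 96000 bits, and the
 * remaining two exceed the band and are dropped. */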
6235
6236 /* Meter set/get/del processing is still single-threaded. */
6237 static int
6238 dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
6239 struct ofputil_meter_config *config)
6240 {
6241 struct dp_netdev *dp = get_dp_netdev(dpif);
6242 uint32_t mid = meter_id.uint32;
6243 struct dp_meter *meter;
6244 int i;
6245
6246 if (mid >= MAX_METERS) {
6247 return EFBIG; /* Meter_id out of range. */
6248 }
6249
6250 if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
6251 return EBADF; /* Unsupported flags set */
6252 }
6253
6254 if (config->n_bands > MAX_BANDS) {
6255 return EINVAL;
6256 }
6257
6258 for (i = 0; i < config->n_bands; ++i) {
6259 switch (config->bands[i].type) {
6260 case OFPMBT13_DROP:
6261 break;
6262 default:
6263 return ENODEV; /* Unsupported band type */
6264 }
6265 }
6266
6267 /* Allocate meter */
6268 meter = xzalloc(sizeof *meter
6269 + config->n_bands * sizeof(struct dp_meter_band));
6270
6271 meter->flags = config->flags;
6272 meter->n_bands = config->n_bands;
6273 meter->max_delta_t = 0;
6274 meter->used = time_usec();
6275
6276 /* set up bands */
6277 for (i = 0; i < config->n_bands; ++i) {
6278 uint32_t band_max_delta_t;
6279
6280 /* Set burst size to a workable value if none specified. */
6281 if (config->bands[i].burst_size == 0) {
6282 config->bands[i].burst_size = config->bands[i].rate;
6283 }
6284
6285 meter->bands[i].up = config->bands[i];
6286 /* Convert burst size to the bucket units: */
6287 /* pkts => 1/1000 packets, kilobits => bits. */
6288 meter->bands[i].up.burst_size *= 1000;
6289 /* Initialize bucket to empty. */
6290 meter->bands[i].bucket = 0;
6291
6292 /* Figure out max delta_t that is enough to fill any bucket. */
6293 band_max_delta_t
6294 = meter->bands[i].up.burst_size / meter->bands[i].up.rate;
6295 if (band_max_delta_t > meter->max_delta_t) {
6296 meter->max_delta_t = band_max_delta_t;
6297 }
6298 }
6299
6300 meter_lock(dp, mid);
6301 dp_delete_meter(dp, mid); /* Free existing meter, if any */
6302 dp->meters[mid] = meter;
6303 meter_unlock(dp, mid);
6304
6305 return 0;
6306 }
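
/* Illustrative example for the conversion above (hypothetical numbers): a
 * band configured with rate = 1000 (kbps) and burst_size = 100 (kilobits) is
 * stored with up.burst_size = 100 * 1000 = 100000 bits and contributes
 * band_max_delta_t = 100000 / 1000 = 100 msec, i.e. the time needed to
 * refill that bucket from empty; 'max_delta_t' keeps the largest such value
 * across all bands. */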
6307
6308 static int
6309 dpif_netdev_meter_get(const struct dpif *dpif,
6310 ofproto_meter_id meter_id_,
6311 struct ofputil_meter_stats *stats, uint16_t n_bands)
6312 {
6313 const struct dp_netdev *dp = get_dp_netdev(dpif);
6314 uint32_t meter_id = meter_id_.uint32;
6315 int retval = 0;
6316
6317 if (meter_id >= MAX_METERS) {
6318 return EFBIG;
6319 }
6320
6321 meter_lock(dp, meter_id);
6322 const struct dp_meter *meter = dp->meters[meter_id];
6323 if (!meter) {
6324 retval = ENOENT;
6325 goto done;
6326 }
6327 if (stats) {
6328 int i = 0;
6329
6330 stats->packet_in_count = meter->packet_count;
6331 stats->byte_in_count = meter->byte_count;
6332
6333 for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
6334 stats->bands[i].packet_count = meter->bands[i].packet_count;
6335 stats->bands[i].byte_count = meter->bands[i].byte_count;
6336 }
6337
6338 stats->n_bands = i;
6339 }
6340
6341 done:
6342 meter_unlock(dp, meter_id);
6343 return retval;
6344 }
6345
6346 static int
6347 dpif_netdev_meter_del(struct dpif *dpif,
6348 ofproto_meter_id meter_id_,
6349 struct ofputil_meter_stats *stats, uint16_t n_bands)
6350 {
6351 struct dp_netdev *dp = get_dp_netdev(dpif);
6352 int error;
6353
6354 error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
6355 if (!error) {
6356 uint32_t meter_id = meter_id_.uint32;
6357
6358 meter_lock(dp, meter_id);
6359 dp_delete_meter(dp, meter_id);
6360 meter_unlock(dp, meter_id);
6361 }
6362 return error;
6363 }
6364
6365 \f
6366 static void
6367 dpif_netdev_disable_upcall(struct dpif *dpif)
6368 OVS_NO_THREAD_SAFETY_ANALYSIS
6369 {
6370 struct dp_netdev *dp = get_dp_netdev(dpif);
6371 dp_netdev_disable_upcall(dp);
6372 }
6373
6374 static void
6375 dp_netdev_enable_upcall(struct dp_netdev *dp)
6376 OVS_RELEASES(dp->upcall_rwlock)
6377 {
6378 fat_rwlock_unlock(&dp->upcall_rwlock);
6379 }
6380
6381 static void
6382 dpif_netdev_enable_upcall(struct dpif *dpif)
6383 OVS_NO_THREAD_SAFETY_ANALYSIS
6384 {
6385 struct dp_netdev *dp = get_dp_netdev(dpif);
6386 dp_netdev_enable_upcall(dp);
6387 }
6388
6389 static void
6390 dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
6391 {
6392 atomic_store_relaxed(&pmd->wait_for_reload, false);
6393 atomic_store_relaxed(&pmd->reload_tx_qid, false);
6394 pmd->last_reload_seq = seq_read(pmd->reload_seq);
6395 atomic_store_explicit(&pmd->reload, false, memory_order_release);
6396 }
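
/* Note (descriptive, not upstream documentation): the release store to
 * 'pmd->reload' above is meant to pair with an acquire load in the thread
 * that requested the reload and waits for it to complete, so that the pmd's
 * updates to its polling and port lists become visible to that thread. */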
6397
6398 /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
6399 * the pointer if it succeeds, otherwise NULL (it can return NULL even if
6400 * 'core_id' is NON_PMD_CORE_ID).
6401 *
6402 * The caller must unref the returned reference. */
6403 static struct dp_netdev_pmd_thread *
6404 dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
6405 {
6406 struct dp_netdev_pmd_thread *pmd;
6407 const struct cmap_node *pnode;
6408
6409 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
6410 if (!pnode) {
6411 return NULL;
6412 }
6413 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
6414
6415 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
6416 }
6417
6418 /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
6419 static void
6420 dp_netdev_set_nonpmd(struct dp_netdev *dp)
6421 OVS_REQUIRES(dp->port_mutex)
6422 {
6423 struct dp_netdev_pmd_thread *non_pmd;
6424
6425 non_pmd = xzalloc(sizeof *non_pmd);
6426 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
6427 }
6428
6429 /* Caller must have valid pointer to 'pmd'. */
6430 static bool
6431 dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
6432 {
6433 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
6434 }
6435
6436 static void
6437 dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
6438 {
6439 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
6440 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
6441 }
6442 }
6443
6444 /* Given cmap position 'pos', tries to ref the next node. If try_ref()
6445 * fails, keeps checking for the next node until reaching the end of the cmap.
6446 *
6447 * The caller must unref the returned reference. */
6448 static struct dp_netdev_pmd_thread *
6449 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
6450 {
6451 struct dp_netdev_pmd_thread *next;
6452
6453 do {
6454 struct cmap_node *node;
6455
6456 node = cmap_next_position(&dp->poll_threads, pos);
6457 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
6458 : NULL;
6459 } while (next && !dp_netdev_pmd_try_ref(next));
6460
6461 return next;
6462 }
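
/* Minimal usage sketch (illustrative only): iterate over all pmd threads,
 * holding a reference on each in turn:
 *
 *     struct cmap_position pos;
 *     struct dp_netdev_pmd_thread *pmd;
 *
 *     memset(&pos, 0, sizeof pos);
 *     while ((pmd = dp_netdev_pmd_get_next(dp, &pos)) != NULL) {
 *         ... use 'pmd' ...
 *         dp_netdev_pmd_unref(pmd);
 *     }
 */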
6463
6464 /* Configures the 'pmd' based on the input argument. */
6465 static void
6466 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
6467 unsigned core_id, int numa_id)
6468 {
6469 pmd->dp = dp;
6470 pmd->core_id = core_id;
6471 pmd->numa_id = numa_id;
6472 pmd->need_reload = false;
6473 pmd->n_output_batches = 0;
6474
6475 ovs_refcount_init(&pmd->ref_cnt);
6476 atomic_init(&pmd->exit, false);
6477 pmd->reload_seq = seq_create();
6478 pmd->last_reload_seq = seq_read(pmd->reload_seq);
6479 atomic_init(&pmd->reload, false);
6480 ovs_mutex_init(&pmd->flow_mutex);
6481 ovs_mutex_init(&pmd->port_mutex);
6482 ovs_mutex_init(&pmd->bond_mutex);
6483 cmap_init(&pmd->flow_table);
6484 cmap_init(&pmd->classifiers);
6485 pmd->ctx.last_rxq = NULL;
6486 pmd_thread_ctx_time_update(pmd);
6487 pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
6488 pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6489 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
6490 hmap_init(&pmd->poll_list);
6491 hmap_init(&pmd->tx_ports);
6492 hmap_init(&pmd->tnl_port_cache);
6493 hmap_init(&pmd->send_port_cache);
6494 cmap_init(&pmd->tx_bonds);
6495 /* Initialize the 'flow_cache' since there is no
6496 * actual thread created for NON_PMD_CORE_ID. */
6497 if (core_id == NON_PMD_CORE_ID) {
6498 dfc_cache_init(&pmd->flow_cache);
6499 pmd_alloc_static_tx_qid(pmd);
6500 }
6501 pmd_perf_stats_init(&pmd->perf_stats);
6502 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
6503 hash_int(core_id, 0));
6504 }
6505
6506 static void
6507 dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
6508 {
6509 struct dpcls *cls;
6510
6511 dp_netdev_pmd_flow_flush(pmd);
6512 hmap_destroy(&pmd->send_port_cache);
6513 hmap_destroy(&pmd->tnl_port_cache);
6514 hmap_destroy(&pmd->tx_ports);
6515 cmap_destroy(&pmd->tx_bonds);
6516 hmap_destroy(&pmd->poll_list);
6517 /* All flows (including their dpcls_rules) have been deleted already */
6518 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
6519 dpcls_destroy(cls);
6520 ovsrcu_postpone(free, cls);
6521 }
6522 cmap_destroy(&pmd->classifiers);
6523 cmap_destroy(&pmd->flow_table);
6524 ovs_mutex_destroy(&pmd->flow_mutex);
6525 seq_destroy(pmd->reload_seq);
6526 ovs_mutex_destroy(&pmd->port_mutex);
6527 ovs_mutex_destroy(&pmd->bond_mutex);
6528 free(pmd);
6529 }
6530
6531 /* Stops the pmd thread, removes it from the 'dp->poll_threads',
6532 * and unrefs the struct. */
6533 static void
6534 dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
6535 {
6536 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
6537 * but extra cleanup is necessary */
6538 if (pmd->core_id == NON_PMD_CORE_ID) {
6539 ovs_mutex_lock(&dp->non_pmd_mutex);
6540 dfc_cache_uninit(&pmd->flow_cache);
6541 pmd_free_cached_ports(pmd);
6542 pmd_free_static_tx_qid(pmd);
6543 ovs_mutex_unlock(&dp->non_pmd_mutex);
6544 } else {
6545 atomic_store_relaxed(&pmd->exit, true);
6546 dp_netdev_reload_pmd__(pmd);
6547 xpthread_join(pmd->thread, NULL);
6548 }
6549
6550 dp_netdev_pmd_clear_ports(pmd);
6551
6552 /* Purges the 'pmd''s flows after stopping the thread, but before
6553 * destroying the flows, so that the flow stats can be collected. */
6554 if (dp->dp_purge_cb) {
6555 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
6556 }
6557 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
6558 dp_netdev_pmd_unref(pmd);
6559 }
6560
6561 /* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
6562 * thread. */
6563 static void
6564 dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
6565 {
6566 struct dp_netdev_pmd_thread *pmd;
6567 struct dp_netdev_pmd_thread **pmd_list;
6568 size_t k = 0, n_pmds;
6569
6570 n_pmds = cmap_count(&dp->poll_threads);
6571 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
6572
6573 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6574 if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
6575 continue;
6576 }
6577 /* We cannot call dp_netdev_del_pmd(), since it alters
6578 * 'dp->poll_threads' (while we're iterating it) and it
6579 * might quiesce. */
6580 ovs_assert(k < n_pmds);
6581 pmd_list[k++] = pmd;
6582 }
6583
6584 for (size_t i = 0; i < k; i++) {
6585 dp_netdev_del_pmd(dp, pmd_list[i]);
6586 }
6587 free(pmd_list);
6588 }
6589
6590 /* Deletes all rx queues from pmd->poll_list and all the ports from
6591 * pmd->tx_ports. */
6592 static void
6593 dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
6594 {
6595 struct rxq_poll *poll;
6596 struct tx_port *port;
6597 struct tx_bond *tx;
6598
6599 ovs_mutex_lock(&pmd->port_mutex);
6600 HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
6601 free(poll);
6602 }
6603 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
6604 free(port);
6605 }
6606 ovs_mutex_unlock(&pmd->port_mutex);
6607
6608 ovs_mutex_lock(&pmd->bond_mutex);
6609 CMAP_FOR_EACH (tx, node, &pmd->tx_bonds) {
6610 cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
6611 ovsrcu_postpone(free, tx);
6612 }
6613 ovs_mutex_unlock(&pmd->bond_mutex);
6614 }
6615
6616 /* Adds rx queue to poll_list of PMD thread, if it's not there already. */
6617 static void
6618 dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
6619 struct dp_netdev_rxq *rxq)
6620 OVS_REQUIRES(pmd->port_mutex)
6621 {
6622 int qid = netdev_rxq_get_queue_id(rxq->rx);
6623 uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
6624 struct rxq_poll *poll;
6625
6626 HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
6627 if (poll->rxq == rxq) {
6628 /* 'rxq' is already polled by this thread. Do nothing. */
6629 return;
6630 }
6631 }
6632
6633 poll = xmalloc(sizeof *poll);
6634 poll->rxq = rxq;
6635 hmap_insert(&pmd->poll_list, &poll->node, hash);
6636
6637 pmd->need_reload = true;
6638 }
6639
6640 /* Delete 'poll' from poll_list of PMD thread. */
6641 static void
6642 dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
6643 struct rxq_poll *poll)
6644 OVS_REQUIRES(pmd->port_mutex)
6645 {
6646 hmap_remove(&pmd->poll_list, &poll->node);
6647 free(poll);
6648
6649 pmd->need_reload = true;
6650 }
6651
6652 /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
6653 * changes to take effect. */
6654 static void
6655 dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6656 struct dp_netdev_port *port)
6657 OVS_REQUIRES(pmd->port_mutex)
6658 {
6659 struct tx_port *tx;
6660
6661 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
6662 if (tx) {
6663 /* 'port' is already in this thread's tx cache. Do nothing. */
6664 return;
6665 }
6666
6667 tx = xzalloc(sizeof *tx);
6668
6669 tx->port = port;
6670 tx->qid = -1;
6671 tx->flush_time = 0LL;
6672 dp_packet_batch_init(&tx->output_pkts);
6673
6674 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
6675 pmd->need_reload = true;
6676 }
6677
6678 /* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
6679 * changes to take effect. */
6680 static void
6681 dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6682 struct tx_port *tx)
6683 OVS_REQUIRES(pmd->port_mutex)
6684 {
6685 hmap_remove(&pmd->tx_ports, &tx->node);
6686 free(tx);
6687 pmd->need_reload = true;
6688 }
6689
6690 /* Add bond to the tx bond cmap of 'pmd'. */
6691 static void
6692 dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6693 struct tx_bond *bond, bool update)
6694 OVS_EXCLUDED(pmd->bond_mutex)
6695 {
6696 struct tx_bond *tx;
6697
6698 ovs_mutex_lock(&pmd->bond_mutex);
6699 tx = tx_bond_lookup(&pmd->tx_bonds, bond->bond_id);
6700
6701 if (tx && !update) {
6702 /* It's not an update and the entry already exists. Do nothing. */
6703 goto unlock;
6704 }
6705
6706 if (tx) {
6707 struct tx_bond *new_tx = xmemdup(bond, sizeof *bond);
6708
6709 /* Copy the stats for each bucket. */
6710 for (int i = 0; i < BOND_BUCKETS; i++) {
6711 uint64_t n_packets, n_bytes;
6712
6713 atomic_read_relaxed(&tx->member_buckets[i].n_packets, &n_packets);
6714 atomic_read_relaxed(&tx->member_buckets[i].n_bytes, &n_bytes);
6715 atomic_init(&new_tx->member_buckets[i].n_packets, n_packets);
6716 atomic_init(&new_tx->member_buckets[i].n_bytes, n_bytes);
6717 }
6718 cmap_replace(&pmd->tx_bonds, &tx->node, &new_tx->node,
6719 hash_bond_id(bond->bond_id));
6720 ovsrcu_postpone(free, tx);
6721 } else {
6722 tx = xmemdup(bond, sizeof *bond);
6723 cmap_insert(&pmd->tx_bonds, &tx->node, hash_bond_id(bond->bond_id));
6724 }
6725 unlock:
6726 ovs_mutex_unlock(&pmd->bond_mutex);
6727 }
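
/* Note on the update path above (descriptive): the old 'tx_bond' entry is
 * swapped out with cmap_replace() and freed via ovsrcu_postpone(), so a pmd
 * thread that is concurrently reading the old entry from 'pmd->tx_bonds'
 * (without taking 'bond_mutex') keeps a valid pointer until it quiesces. */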
6728
6729 /* Delete bond from the tx bond cmap of 'pmd'. */
6730 static void
6731 dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6732 uint32_t bond_id)
6733 OVS_EXCLUDED(pmd->bond_mutex)
6734 {
6735 struct tx_bond *tx;
6736
6737 ovs_mutex_lock(&pmd->bond_mutex);
6738 tx = tx_bond_lookup(&pmd->tx_bonds, bond_id);
6739 if (tx) {
6740 cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
6741 ovsrcu_postpone(free, tx);
6742 }
6743 ovs_mutex_unlock(&pmd->bond_mutex);
6744 }
6745 \f
6746 static char *
6747 dpif_netdev_get_datapath_version(void)
6748 {
6749 return xstrdup("<built-in>");
6750 }
6751
6752 static void
6753 dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
6754 uint16_t tcp_flags, long long now)
6755 {
6756 uint16_t flags;
6757
6758 atomic_store_relaxed(&netdev_flow->stats.used, now);
6759 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
6760 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
6761 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
6762 flags |= tcp_flags;
6763 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
6764 }
6765
6766 static int
6767 dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
6768 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
6769 enum dpif_upcall_type type, const struct nlattr *userdata,
6770 struct ofpbuf *actions, struct ofpbuf *put_actions)
6771 {
6772 struct dp_netdev *dp = pmd->dp;
6773
6774 if (OVS_UNLIKELY(!dp->upcall_cb)) {
6775 return ENODEV;
6776 }
6777
6778 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
6779 struct ds ds = DS_EMPTY_INITIALIZER;
6780 char *packet_str;
6781 struct ofpbuf key;
6782 struct odp_flow_key_parms odp_parms = {
6783 .flow = flow,
6784 .mask = wc ? &wc->masks : NULL,
6785 .support = dp_netdev_support,
6786 };
6787
6788 ofpbuf_init(&key, 0);
6789 odp_flow_key_from_flow(&odp_parms, &key);
6790 packet_str = ofp_dp_packet_to_string(packet_);
6791
6792 odp_flow_key_format(key.data, key.size, &ds);
6793
6794 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
6795 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
6796
6797 ofpbuf_uninit(&key);
6798 free(packet_str);
6799
6800 ds_destroy(&ds);
6801 }
6802
6803 return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
6804 actions, wc, put_actions, dp->upcall_aux);
6805 }
6806
6807 static inline uint32_t
6808 dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet,
6809 const struct miniflow *mf)
6810 {
6811 uint32_t hash;
6812
6813 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6814 hash = dp_packet_get_rss_hash(packet);
6815 } else {
6816 hash = miniflow_hash_5tuple(mf, 0);
6817 dp_packet_set_rss_hash(packet, hash);
6818 }
6819
6820 return hash;
6821 }
6822
6823 static inline uint32_t
6824 dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
6825 const struct miniflow *mf)
6826 {
6827 uint32_t hash, recirc_depth;
6828
6829 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6830 hash = dp_packet_get_rss_hash(packet);
6831 } else {
6832 hash = miniflow_hash_5tuple(mf, 0);
6833 dp_packet_set_rss_hash(packet, hash);
6834 }
6835
6836 /* The RSS hash must account for the recirculation depth to avoid
6837 * collisions in the exact match cache */
6838 recirc_depth = *recirc_depth_get_unsafe();
6839 if (OVS_UNLIKELY(recirc_depth)) {
6840 hash = hash_finish(hash, recirc_depth);
6841 }
6842 return hash;
6843 }
6844
6845 struct packet_batch_per_flow {
6846 unsigned int byte_count;
6847 uint16_t tcp_flags;
6848 struct dp_netdev_flow *flow;
6849
6850 struct dp_packet_batch array;
6851 };
6852
6853 static inline void
6854 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
6855 struct dp_packet *packet,
6856 uint16_t tcp_flags)
6857 {
6858 batch->byte_count += dp_packet_size(packet);
6859 batch->tcp_flags |= tcp_flags;
6860 dp_packet_batch_add(&batch->array, packet);
6861 }
6862
6863 static inline void
6864 packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
6865 struct dp_netdev_flow *flow)
6866 {
6867 flow->batch = batch;
6868
6869 batch->flow = flow;
6870 dp_packet_batch_init(&batch->array);
6871 batch->byte_count = 0;
6872 batch->tcp_flags = 0;
6873 }
6874
6875 static inline void
6876 packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
6877 struct dp_netdev_pmd_thread *pmd)
6878 {
6879 struct dp_netdev_actions *actions;
6880 struct dp_netdev_flow *flow = batch->flow;
6881
6882 dp_netdev_flow_used(flow, dp_packet_batch_size(&batch->array),
6883 batch->byte_count,
6884 batch->tcp_flags, pmd->ctx.now / 1000);
6885
6886 actions = dp_netdev_flow_get_actions(flow);
6887
6888 dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
6889 actions->actions, actions->size);
6890 }
6891
6892 static inline void
6893 dp_netdev_queue_batches(struct dp_packet *pkt,
6894 struct dp_netdev_flow *flow, uint16_t tcp_flags,
6895 struct packet_batch_per_flow *batches,
6896 size_t *n_batches)
6897 {
6898 struct packet_batch_per_flow *batch = flow->batch;
6899
6900 if (OVS_UNLIKELY(!batch)) {
6901 batch = &batches[(*n_batches)++];
6902 packet_batch_per_flow_init(batch, flow);
6903 }
6904
6905 packet_batch_per_flow_update(batch, pkt, tcp_flags);
6906 }
6907
6908 static inline void
6909 packet_enqueue_to_flow_map(struct dp_packet *packet,
6910 struct dp_netdev_flow *flow,
6911 uint16_t tcp_flags,
6912 struct dp_packet_flow_map *flow_map,
6913 size_t index)
6914 {
6915 struct dp_packet_flow_map *map = &flow_map[index];
6916 map->flow = flow;
6917 map->packet = packet;
6918 map->tcp_flags = tcp_flags;
6919 }
6920
6921 /* SMC lookup function for a batch of packets.
6922 * By batching SMC lookups, we can use prefetching
6923 * to hide memory access latency.
6924 */
6925 static inline void
6926 smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
6927 struct netdev_flow_key *keys,
6928 struct netdev_flow_key **missed_keys,
6929 struct dp_packet_batch *packets_,
6930 const int cnt,
6931 struct dp_packet_flow_map *flow_map,
6932 uint8_t *index_map)
6933 {
6934 int i;
6935 struct dp_packet *packet;
6936 size_t n_smc_hit = 0, n_missed = 0;
6937 struct dfc_cache *cache = &pmd->flow_cache;
6938 struct smc_cache *smc_cache = &cache->smc_cache;
6939 const struct cmap_node *flow_node;
6940 int recv_idx;
6941 uint16_t tcp_flags;
6942
6943 /* Prefetch buckets for all packets */
6944 for (i = 0; i < cnt; i++) {
6945 OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
6946 }
6947
6948 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
6949 struct dp_netdev_flow *flow = NULL;
6950 flow_node = smc_entry_get(pmd, keys[i].hash);
6951 bool hit = false;
6952 /* Get the original order of this packet in received batch. */
6953 recv_idx = index_map[i];
6954
6955 if (OVS_LIKELY(flow_node != NULL)) {
6956 CMAP_NODE_FOR_EACH (flow, node, flow_node) {
6957 /* Since we don't have a per-port megaflow to check the port
6958 * number, we need to verify that the input ports match. */
6959 if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
6960 flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
6961 tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
6962
6963 /* SMC hit and EMC miss: insert the flow into the EMC. */
6964 keys[i].len =
6965 netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
6966 emc_probabilistic_insert(pmd, &keys[i], flow);
6967 /* Add these packets into the flow map in the same order
6968 * as received.
6969 */
6970 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6971 flow_map, recv_idx);
6972 n_smc_hit++;
6973 hit = true;
6974 break;
6975 }
6976 }
6977 if (hit) {
6978 continue;
6979 }
6980 }
6981
6982 /* SMC missed. Group missed packets together at
6983 * the beginning of the 'packets' array. */
6984 dp_packet_batch_refill(packets_, packet, i);
6985
6986 /* Preserve the original packet order for flow batching. */
6987 index_map[n_missed] = recv_idx;
6988
6989 /* Put missed keys into the pointer array returned to the caller. */
6990 missed_keys[n_missed++] = &keys[i];
6991 }
6992
6993 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
6994 }
6995
6996 /* Tries to process all 'cnt' packets in 'packets_' using only the datapath
6997 * flow cache 'pmd->flow_cache'. If a flow is not found for a packet
6998 * 'packets[i]', its miniflow is copied into 'keys' and the packet pointer is
6999 * moved to the beginning of the 'packets' array. Pointers to the missed keys
7000 * are put in the 'missed_keys' array for further processing.
7001 *
7002 * The function returns the number of packets that need to be processed in the
7003 * 'packets' array (they have been moved to the beginning of the vector).
7004 *
7005 * For performance reasons a caller may choose not to initialize the metadata
7006 * in 'packets_'. If 'md_is_valid' is false, the metadata in 'packets'
7007 * is not valid and must be initialized by this function using 'port_no'.
7008 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
7009 * will be ignored.
7010 */
7011 static inline size_t
7012 dfc_processing(struct dp_netdev_pmd_thread *pmd,
7013 struct dp_packet_batch *packets_,
7014 struct netdev_flow_key *keys,
7015 struct netdev_flow_key **missed_keys,
7016 struct packet_batch_per_flow batches[], size_t *n_batches,
7017 struct dp_packet_flow_map *flow_map,
7018 size_t *n_flows, uint8_t *index_map,
7019 bool md_is_valid, odp_port_t port_no)
7020 {
7021 struct netdev_flow_key *key = &keys[0];
7022 size_t n_missed = 0, n_emc_hit = 0;
7023 struct dfc_cache *cache = &pmd->flow_cache;
7024 struct dp_packet *packet;
7025 const size_t cnt = dp_packet_batch_size(packets_);
7026 uint32_t cur_min = pmd->ctx.emc_insert_min;
7027 int i;
7028 uint16_t tcp_flags;
7029 bool smc_enable_db;
7030 size_t map_cnt = 0;
7031 bool batch_enable = true;
7032
7033 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
7034 pmd_perf_update_counter(&pmd->perf_stats,
7035 md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
7036 cnt);
7037
7038 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
7039 struct dp_netdev_flow *flow;
7040 uint32_t mark;
7041
7042 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
7043 dp_packet_delete(packet);
7044 COVERAGE_INC(datapath_drop_rx_invalid_packet);
7045 continue;
7046 }
7047
7048 if (i != cnt - 1) {
7049 struct dp_packet **packets = packets_->packets;
7050 /* Prefetch next packet data and metadata. */
7051 OVS_PREFETCH(dp_packet_data(packets[i+1]));
7052 pkt_metadata_prefetch_init(&packets[i+1]->md);
7053 }
7054
7055 if (!md_is_valid) {
7056 pkt_metadata_init(&packet->md, port_no);
7057 }
7058
7059 if ((*recirc_depth_get() == 0) &&
7060 dp_packet_has_flow_mark(packet, &mark)) {
7061 flow = mark_to_flow_find(pmd, mark);
7062 if (OVS_LIKELY(flow)) {
7063 tcp_flags = parse_tcp_flags(packet);
7064 if (OVS_LIKELY(batch_enable)) {
7065 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
7066 n_batches);
7067 } else {
7068 /* Flow batching should be performed only after fast-path
7069 * processing is also completed for packets with an EMC miss;
7070 * otherwise it would result in reordering of packets with
7071 * the same datapath flow. */
7072 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
7073 flow_map, map_cnt++);
7074 }
7075 continue;
7076 }
7077 }
7078
7079 miniflow_extract(packet, &key->mf);
7080 key->len = 0; /* Not computed yet. */
7081 key->hash =
7082 (md_is_valid == false)
7083 ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
7084 : dpif_netdev_packet_get_rss_hash(packet, &key->mf);
7085
7086 /* If the EMC is disabled, skip emc_lookup(). */
7087 flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
7088 if (OVS_LIKELY(flow)) {
7089 tcp_flags = miniflow_get_tcp_flags(&key->mf);
7090 n_emc_hit++;
7091 if (OVS_LIKELY(batch_enable)) {
7092 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
7093 n_batches);
7094 } else {
7095 /* Flow batching should be performed only after fast-path
7096 * processing is also completed for packets with an EMC miss;
7097 * otherwise it would result in reordering of packets with
7098 * the same datapath flow. */
7099 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
7100 flow_map, map_cnt++);
7101 }
7102 } else {
7103 /* Exact match cache missed. Group missed packets together at
7104 * the beginning of the 'packets' array. */
7105 dp_packet_batch_refill(packets_, packet, i);
7106
7107 /* Preserve the original packet order for flow batching. */
7108 index_map[n_missed] = map_cnt;
7109 flow_map[map_cnt++].flow = NULL;
7110
7111 /* 'key[n_missed]' contains the key of the current packet and it
7112 * will be passed to SMC lookup. The next key should be extracted
7113 * to 'keys[n_missed + 1]'.
7114 * We also maintain a pointer array for the keys that missed both SMC and
7115 * EMC, which will be returned to the caller for further processing. */
7116 missed_keys[n_missed] = key;
7117 key = &keys[++n_missed];
7118
7119 /* Skip batching for subsequent packets to avoid reordering. */
7120 batch_enable = false;
7121 }
7122 }
7123 /* Count of packets which are not flow batched. */
7124 *n_flows = map_cnt;
7125
7126 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
7127
7128 if (!smc_enable_db) {
7129 return dp_packet_batch_size(packets_);
7130 }
7131
7132 /* Packets that missed the EMC do a batch lookup in the SMC, if enabled. */
7133 smc_lookup_batch(pmd, keys, missed_keys, packets_,
7134 n_missed, flow_map, index_map);
7135
7136 return dp_packet_batch_size(packets_);
7137 }
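
/* Lookup order recap for the fast path (descriptive note): dfc_processing()
 * tries a hardware offload flow mark first (top-level packets only), then
 * the EMC, then a batched SMC lookup when enabled; whatever is still
 * unmatched is left at the front of 'packets_' for the dpcls lookup in
 * fast_path_processing() and, if that also misses, an upcall. */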
7138
7139 static inline int
7140 handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
7141 struct dp_packet *packet,
7142 const struct netdev_flow_key *key,
7143 struct ofpbuf *actions, struct ofpbuf *put_actions)
7144 {
7145 struct ofpbuf *add_actions;
7146 struct dp_packet_batch b;
7147 struct match match;
7148 ovs_u128 ufid;
7149 int error;
7150 uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
7151
7152 match.tun_md.valid = false;
7153 miniflow_expand(&key->mf, &match.flow);
7154 memset(&match.wc, 0, sizeof match.wc);
7155
7156 ofpbuf_clear(actions);
7157 ofpbuf_clear(put_actions);
7158
7159 odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
7160 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
7161 &ufid, DPIF_UC_MISS, NULL, actions,
7162 put_actions);
7163 if (OVS_UNLIKELY(error && error != ENOSPC)) {
7164 dp_packet_delete(packet);
7165 COVERAGE_INC(datapath_drop_upcall_error);
7166 return error;
7167 }
7168
7169 /* The Netlink encoding of datapath flow keys cannot express
7170 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
7171 * tag is interpreted as exact match on the fact that there is no
7172 * VLAN. Unless we refactor a lot of code that translates between
7173 * Netlink and struct flow representations, we have to do the same
7174 * here. This must be in sync with 'match' in dpif_netdev_flow_put(). */
7175 if (!match.wc.masks.vlans[0].tci) {
7176 match.wc.masks.vlans[0].tci = htons(0xffff);
7177 }
7178
7179 /* We can't allow the packet batching in the next loop to execute
7180 * the actions. Otherwise, if there are any slow path actions,
7181 * we'll send the packet up twice. */
7182 dp_packet_batch_init_packet(&b, packet);
7183 dp_netdev_execute_actions(pmd, &b, true, &match.flow,
7184 actions->data, actions->size);
7185
7186 add_actions = put_actions->size ? put_actions : actions;
7187 if (OVS_LIKELY(error != ENOSPC)) {
7188 struct dp_netdev_flow *netdev_flow;
7189
7190 /* XXX: There's a race window where a flow covering this packet
7191 * could have already been installed since we last did the flow
7192 * lookup before upcall. This could be solved by moving the
7193 * mutex lock outside the loop, but that's an awful long time
7194 * to be locking revalidators out of making flow modifications. */
7195 ovs_mutex_lock(&pmd->flow_mutex);
7196 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
7197 if (OVS_LIKELY(!netdev_flow)) {
7198 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
7199 add_actions->data,
7200 add_actions->size);
7201 }
7202 ovs_mutex_unlock(&pmd->flow_mutex);
7203 uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
7204 smc_insert(pmd, key, hash);
7205 emc_probabilistic_insert(pmd, key, netdev_flow);
7206 }
7207 if (pmd_perf_metrics_enabled(pmd)) {
7208 /* Update upcall stats. */
7209 cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
7210 struct pmd_perf_stats *s = &pmd->perf_stats;
7211 s->current.upcalls++;
7212 s->current.upcall_cycles += cycles;
7213 histogram_add_sample(&s->cycles_per_upcall, cycles);
7214 }
7215 return error;
7216 }
7217
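/* Illustrative example (not compiled): the TCI fix-up above turns a fully
 * wildcarded VLAN into an exact match on "no VLAN tag present".  Assuming the
 * upcall for an untagged packet hands back
 *
 *     match.flow.vlans[0].tci      == 0       // no tag seen
 *     match.wc.masks.vlans[0].tci  == 0       // TCI fully wildcarded
 *
 * the adjustment installs the megaflow with
 *
 *     match.wc.masks.vlans[0].tci  == htons(0xffff)
 *
 * so that a later tagged packet cannot incorrectly hit this flow. */
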
7218 static inline void
7219 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
7220 struct dp_packet_batch *packets_,
7221 struct netdev_flow_key **keys,
7222 struct dp_packet_flow_map *flow_map,
7223 uint8_t *index_map,
7224 odp_port_t in_port)
7225 {
7226 const size_t cnt = dp_packet_batch_size(packets_);
7227 #if !defined(__CHECKER__) && !defined(_WIN32)
7228 const size_t PKT_ARRAY_SIZE = cnt;
7229 #else
7230 /* Sparse or MSVC doesn't like variable-length arrays. */
7231 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
7232 #endif
7233 struct dp_packet *packet;
7234 struct dpcls *cls;
7235 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
7236 struct dp_netdev *dp = pmd->dp;
7237 int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
7238 int lookup_cnt = 0, add_lookup_cnt;
7239 bool any_miss;
7240
7241 for (size_t i = 0; i < cnt; i++) {
7242 /* Key length is needed in all cases; the hash is computed on demand. */
7243 keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
7244 }
7245 /* Get the classifier for the in_port. */
7246 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
7247 if (OVS_LIKELY(cls)) {
7248 any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
7249 rules, cnt, &lookup_cnt);
7250 } else {
7251 any_miss = true;
7252 memset(rules, 0, sizeof(rules));
7253 }
7254 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
7255 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
7256 struct ofpbuf actions, put_actions;
7257
7258 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
7259 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
7260
7261 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7262 struct dp_netdev_flow *netdev_flow;
7263
7264 if (OVS_LIKELY(rules[i])) {
7265 continue;
7266 }
7267
7268 /* It's possible that an earlier slow path execution installed
7269 * a rule covering this flow. In this case, it's a lot cheaper
7270 * to catch it here than to execute a miss. */
7271 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
7272 &add_lookup_cnt);
7273 if (netdev_flow) {
7274 lookup_cnt += add_lookup_cnt;
7275 rules[i] = &netdev_flow->cr;
7276 continue;
7277 }
7278
7279 int error = handle_packet_upcall(pmd, packet, keys[i],
7280 &actions, &put_actions);
7281
7282 if (OVS_UNLIKELY(error)) {
7283 upcall_fail_cnt++;
7284 } else {
7285 upcall_ok_cnt++;
7286 }
7287 }
7288
7289 ofpbuf_uninit(&actions);
7290 ofpbuf_uninit(&put_actions);
7291 fat_rwlock_unlock(&dp->upcall_rwlock);
7292 } else if (OVS_UNLIKELY(any_miss)) {
7293 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7294 if (OVS_UNLIKELY(!rules[i])) {
7295 dp_packet_delete(packet);
7296 COVERAGE_INC(datapath_drop_lock_error);
7297 upcall_fail_cnt++;
7298 }
7299 }
7300 }
7301
7302 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7303 struct dp_netdev_flow *flow;
7304 /* Get the original order of this packet in received batch. */
7305 int recv_idx = index_map[i];
7306 uint16_t tcp_flags;
7307
7308 if (OVS_UNLIKELY(!rules[i])) {
7309 continue;
7310 }
7311
7312 flow = dp_netdev_flow_cast(rules[i]);
7313 uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
7314 smc_insert(pmd, keys[i], hash);
7315
7316 emc_probabilistic_insert(pmd, keys[i], flow);
7317 /* Add these packets into the flow map in the same order
7318 * as received.
7319 */
7320 tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
7321 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
7322 flow_map, recv_idx);
7323 }
7324
7325 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
7326 cnt - upcall_ok_cnt - upcall_fail_cnt);
7327 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
7328 lookup_cnt);
7329 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
7330 upcall_ok_cnt);
7331 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
7332 upcall_fail_cnt);
7333 }
7334
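/* Worked example (illustrative, based on the code above): suppose a received
 * batch [p0, p1, p2, p3] where p1 hit the EMC in dfc_processing().  Only the
 * misses reach this function, so here:
 *
 *     packets_  = [p0, p2, p3]
 *     index_map = { 0,  2,  3 }   // original positions in the received batch
 *
 * Each matched packet is enqueued into flow_map[] at its recv_idx, so the
 * flow_map consumed by dp_netdev_input__() preserves the original receive
 * order even though EMC hits and dpcls hits were resolved separately. */
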
7335 /* Packets enter the datapath from a port (or from recirculation) here.
7336 *
7337 * When 'md_is_valid' is true, the metadata in 'packets' is already valid.
7338 * When false, the metadata in 'packets' needs to be initialized. */
7339 static void
7340 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
7341 struct dp_packet_batch *packets,
7342 bool md_is_valid, odp_port_t port_no)
7343 {
7344 #if !defined(__CHECKER__) && !defined(_WIN32)
7345 const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
7346 #else
7347 /* Sparse or MSVC doesn't like variable-length arrays. */
7348 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
7349 #endif
7350 OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
7351 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
7352 struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
7353 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
7354 size_t n_batches;
7355 struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
7356 uint8_t index_map[PKT_ARRAY_SIZE];
7357 size_t n_flows, i;
7358
7359 odp_port_t in_port;
7360
7361 n_batches = 0;
7362 dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
7363 flow_map, &n_flows, index_map, md_is_valid, port_no);
7364
7365 if (!dp_packet_batch_is_empty(packets)) {
7366 /* Get ingress port from first packet's metadata. */
7367 in_port = packets->packets[0]->md.in_port.odp_port;
7368 fast_path_processing(pmd, packets, missed_keys,
7369 flow_map, index_map, in_port);
7370 }
7371
7372 /* Batch rest of packets which are in flow map. */
7373 for (i = 0; i < n_flows; i++) {
7374 struct dp_packet_flow_map *map = &flow_map[i];
7375
7376 if (OVS_UNLIKELY(!map->flow)) {
7377 continue;
7378 }
7379 dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
7380 batches, &n_batches);
7381 }
7382
7383 /* All the flow batches need to be reset before any call to
7384 * packet_batch_per_flow_execute() as it could potentially trigger
7385 * recirculation. When a packet matching flow 'j' happens to be
7386 * recirculated, the nested call to dp_netdev_input__() could potentially
7387 * classify the packet as matching another flow - say 'k'. It could happen
7388 * that in the previous call to dp_netdev_input__() that same flow 'k'
7389 * already had its own batches[k] still waiting to be served. So if its
7390 * 'batch' member is not reset, the recirculated packet would be wrongly
7391 * appended to batches[k] of the 1st call to dp_netdev_input__(). */
7392 for (i = 0; i < n_batches; i++) {
7393 batches[i].flow->batch = NULL;
7394 }
7395
7396 for (i = 0; i < n_batches; i++) {
7397 packet_batch_per_flow_execute(&batches[i], pmd);
7398 }
7399 }
7400
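/* Illustrative call chain (a rough sketch, not code from this file): a packet
 * that matches a flow whose actions include recirc(id) re-enters the datapath
 * while its original batch is still being executed, roughly:
 *
 *     dp_netdev_input__()
 *       packet_batch_per_flow_execute(batches[j])
 *         dp_execute_cb(OVS_ACTION_ATTR_RECIRC)
 *           dp_netdev_recirculate()
 *             dp_netdev_input__()          // nested, depth-limited by
 *                                          // MAX_RECIRC_DEPTH
 *
 * Resetting every batches[i].flow->batch pointer before execution (see the
 * comment above) keeps the nested call from appending packets to a batch
 * that the outer call still owns. */
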
7401 static void
7402 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
7403 struct dp_packet_batch *packets,
7404 odp_port_t port_no)
7405 {
7406 dp_netdev_input__(pmd, packets, false, port_no);
7407 }
7408
7409 static void
7410 dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
7411 struct dp_packet_batch *packets)
7412 {
7413 dp_netdev_input__(pmd, packets, true, 0);
7414 }
7415
7416 struct dp_netdev_execute_aux {
7417 struct dp_netdev_pmd_thread *pmd;
7418 const struct flow *flow;
7419 };
7420
7421 static void
7422 dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
7423 void *aux)
7424 {
7425 struct dp_netdev *dp = get_dp_netdev(dpif);
7426 dp->dp_purge_aux = aux;
7427 dp->dp_purge_cb = cb;
7428 }
7429
7430 static void
7431 dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
7432 void *aux)
7433 {
7434 struct dp_netdev *dp = get_dp_netdev(dpif);
7435 dp->upcall_aux = aux;
7436 dp->upcall_cb = cb;
7437 }
7438
7439 static void
7440 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
7441 bool purge)
7442 {
7443 struct tx_port *tx;
7444 struct dp_netdev_port *port;
7445 long long interval;
7446
7447 HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
7448 if (!tx->port->dynamic_txqs) {
7449 continue;
7450 }
7451 interval = pmd->ctx.now - tx->last_used;
7452 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
7453 port = tx->port;
7454 ovs_mutex_lock(&port->txq_used_mutex);
7455 port->txq_used[tx->qid]--;
7456 ovs_mutex_unlock(&port->txq_used_mutex);
7457 tx->qid = -1;
7458 }
7459 }
7460 }
7461
7462 static int
7463 dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
7464 struct tx_port *tx)
7465 {
7466 struct dp_netdev_port *port;
7467 long long interval;
7468 int i, min_cnt, min_qid;
7469
7470 interval = pmd->ctx.now - tx->last_used;
7471 tx->last_used = pmd->ctx.now;
7472
7473 if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
7474 return tx->qid;
7475 }
7476
7477 port = tx->port;
7478
7479 ovs_mutex_lock(&port->txq_used_mutex);
7480 if (tx->qid >= 0) {
7481 port->txq_used[tx->qid]--;
7482 tx->qid = -1;
7483 }
7484
7485 min_cnt = -1;
7486 min_qid = 0;
7487 for (i = 0; i < netdev_n_txq(port->netdev); i++) {
7488 if (port->txq_used[i] < min_cnt || min_cnt == -1) {
7489 min_cnt = port->txq_used[i];
7490 min_qid = i;
7491 }
7492 }
7493
7494 port->txq_used[min_qid]++;
7495 tx->qid = min_qid;
7496
7497 ovs_mutex_unlock(&port->txq_used_mutex);
7498
7499 dpif_netdev_xps_revalidate_pmd(pmd, false);
7500
7501 VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
7502 pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
7503 return min_qid;
7504 }
7505
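/* Worked example (illustrative): assume a port with 3 TX queues and
 * txq_used = {2, 0, 1}.  The scan above yields min_qid = 1, so this PMD is
 * assigned queue 1 and txq_used becomes {2, 1, 1}.  The assignment is cached
 * in tx->qid and only revisited after XPS_TIMEOUT of inactivity (or when
 * dpif_netdev_xps_revalidate_pmd() purges it), so the mutex is not taken on
 * every packet. */
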
7506 static struct tx_port *
7507 pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7508 odp_port_t port_no)
7509 {
7510 return tx_port_lookup(&pmd->tnl_port_cache, port_no);
7511 }
7512
7513 static struct tx_port *
7514 pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7515 odp_port_t port_no)
7516 {
7517 return tx_port_lookup(&pmd->send_port_cache, port_no);
7518 }
7519
7520 static int
7521 push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
7522 const struct nlattr *attr,
7523 struct dp_packet_batch *batch)
7524 {
7525 struct tx_port *tun_port;
7526 const struct ovs_action_push_tnl *data;
7527 int err;
7528
7529 data = nl_attr_get(attr);
7530
7531 tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
7532 if (!tun_port) {
7533 err = -EINVAL;
7534 goto error;
7535 }
7536 err = netdev_push_header(tun_port->port->netdev, batch, data);
7537 if (!err) {
7538 return 0;
7539 }
7540 error:
7541 dp_packet_delete_batch(batch, true);
7542 return err;
7543 }
7544
7545 static void
7546 dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
7547 struct dp_packet *packet, bool should_steal,
7548 struct flow *flow, ovs_u128 *ufid,
7549 struct ofpbuf *actions,
7550 const struct nlattr *userdata)
7551 {
7552 struct dp_packet_batch b;
7553 int error;
7554
7555 ofpbuf_clear(actions);
7556
7557 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
7558 DPIF_UC_ACTION, userdata, actions,
7559 NULL);
7560 if (!error || error == ENOSPC) {
7561 dp_packet_batch_init_packet(&b, packet);
7562 dp_netdev_execute_actions(pmd, &b, should_steal, flow,
7563 actions->data, actions->size);
7564 } else if (should_steal) {
7565 dp_packet_delete(packet);
7566 COVERAGE_INC(datapath_drop_userspace_action_error);
7567 }
7568 }
7569
7570 static bool
7571 dp_execute_output_action(struct dp_netdev_pmd_thread *pmd,
7572 struct dp_packet_batch *packets_,
7573 bool should_steal, odp_port_t port_no)
7574 {
7575 struct tx_port *p = pmd_send_port_cache_lookup(pmd, port_no);
7576 struct dp_packet_batch out;
7577
7578 if (!OVS_LIKELY(p)) {
7579 COVERAGE_ADD(datapath_drop_invalid_port,
7580 dp_packet_batch_size(packets_));
7581 dp_packet_delete_batch(packets_, should_steal);
7582 return false;
7583 }
7584 if (!should_steal) {
7585 dp_packet_batch_clone(&out, packets_);
7586 dp_packet_batch_reset_cutlen(packets_);
7587 packets_ = &out;
7588 }
7589 dp_packet_batch_apply_cutlen(packets_);
7590 #ifdef DPDK_NETDEV
7591 if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
7592 && packets_->packets[0]->source
7593 != p->output_pkts.packets[0]->source)) {
7594 /* XXX: netdev-dpdk assumes that all packets in a single
7595 * output batch have the same source. Flush here to
7596 * avoid memory access issues. */
7597 dp_netdev_pmd_flush_output_on_port(pmd, p);
7598 }
7599 #endif
7600 if (dp_packet_batch_size(&p->output_pkts)
7601 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
7602 /* Flush here to avoid overflow. */
7603 dp_netdev_pmd_flush_output_on_port(pmd, p);
7604 }
7605 if (dp_packet_batch_is_empty(&p->output_pkts)) {
7606 pmd->n_output_batches++;
7607 }
7608
7609 struct dp_packet *packet;
7610 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7611 p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
7612 pmd->ctx.last_rxq;
7613 dp_packet_batch_add(&p->output_pkts, packet);
7614 }
7615 return true;
7616 }
7617
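/* Worked example (illustrative, assuming NETDEV_MAX_BURST is 32): if
 * p->output_pkts already holds 30 packets and this call adds 8 more, the size
 * check above flushes the 30 queued packets first and only then queues the
 * new 8, so a flushed batch never exceeds NETDEV_MAX_BURST.  Packets are not
 * sent here; they sit in p->output_pkts until
 * dp_netdev_pmd_flush_output_on_port() runs. */
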
7618 static void
7619 dp_execute_lb_output_action(struct dp_netdev_pmd_thread *pmd,
7620 struct dp_packet_batch *packets_,
7621 bool should_steal, uint32_t bond)
7622 {
7623 struct tx_bond *p_bond = tx_bond_lookup(&pmd->tx_bonds, bond);
7624 struct dp_packet_batch out;
7625 struct dp_packet *packet;
7626
7627 if (!p_bond) {
7628 COVERAGE_ADD(datapath_drop_invalid_bond,
7629 dp_packet_batch_size(packets_));
7630 dp_packet_delete_batch(packets_, should_steal);
7631 return;
7632 }
7633 if (!should_steal) {
7634 dp_packet_batch_clone(&out, packets_);
7635 dp_packet_batch_reset_cutlen(packets_);
7636 packets_ = &out;
7637 }
7638 dp_packet_batch_apply_cutlen(packets_);
7639
7640 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7641 /*
7642 * Look up the bond-hash table using the packet's RSS hash to get the member.
7643 */
7644 uint32_t hash = dp_packet_get_rss_hash(packet);
7645 struct member_entry *s_entry
7646 = &p_bond->member_buckets[hash & BOND_MASK];
7647 odp_port_t bond_member = s_entry->member_id;
7648 uint32_t size = dp_packet_size(packet);
7649 struct dp_packet_batch output_pkt;
7650
7651 dp_packet_batch_init_packet(&output_pkt, packet);
7652 if (OVS_LIKELY(dp_execute_output_action(pmd, &output_pkt, true,
7653 bond_member))) {
7654 /* Update member stats. */
7655 non_atomic_ullong_add(&s_entry->n_packets, 1);
7656 non_atomic_ullong_add(&s_entry->n_bytes, size);
7657 }
7658 }
7659 }
7660
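/* Worked example (illustrative; assumes BOND_MASK == BOND_BUCKETS - 1): for a
 * packet with RSS hash 0x00f3a741 and 256 buckets, the bucket index is
 * 0x00f3a741 & 0xff == 0x41, so member_buckets[0x41].member_id names the
 * output port and that bucket's n_packets/n_bytes counters are credited.
 * The counters can be updated non-atomically because each PMD thread keeps
 * its own copy of the tx_bond entry. */
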
7661 static void
7662 dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
7663 const struct nlattr *a, bool should_steal)
7664 OVS_NO_THREAD_SAFETY_ANALYSIS
7665 {
7666 struct dp_netdev_execute_aux *aux = aux_;
7667 uint32_t *depth = recirc_depth_get();
7668 struct dp_netdev_pmd_thread *pmd = aux->pmd;
7669 struct dp_netdev *dp = pmd->dp;
7670 int type = nl_attr_type(a);
7671 struct tx_port *p;
7672 uint32_t packet_count, packets_dropped;
7673
7674 switch ((enum ovs_action_attr)type) {
7675 case OVS_ACTION_ATTR_OUTPUT:
7676 dp_execute_output_action(pmd, packets_, should_steal,
7677 nl_attr_get_odp_port(a));
7678 return;
7679
7680 case OVS_ACTION_ATTR_LB_OUTPUT:
7681 dp_execute_lb_output_action(pmd, packets_, should_steal,
7682 nl_attr_get_u32(a));
7683 return;
7684
7685 case OVS_ACTION_ATTR_TUNNEL_PUSH:
7686 if (should_steal) {
7687 /* We're requested to push a tunnel header, but we also need to take
7688 * ownership of these packets. Thus, we can skip performing the
7689 * action, because the caller will not use the result anyway.
7690 * Just break to free the batch. */
7691 break;
7692 }
7693 dp_packet_batch_apply_cutlen(packets_);
7694 packet_count = dp_packet_batch_size(packets_);
7695 if (push_tnl_action(pmd, a, packets_)) {
7696 COVERAGE_ADD(datapath_drop_tunnel_push_error,
7697 packet_count);
7698 }
7699 return;
7700
7701 case OVS_ACTION_ATTR_TUNNEL_POP:
7702 if (*depth < MAX_RECIRC_DEPTH) {
7703 struct dp_packet_batch *orig_packets_ = packets_;
7704 odp_port_t portno = nl_attr_get_odp_port(a);
7705
7706 p = pmd_tnl_port_cache_lookup(pmd, portno);
7707 if (p) {
7708 struct dp_packet_batch tnl_pkt;
7709
7710 if (!should_steal) {
7711 dp_packet_batch_clone(&tnl_pkt, packets_);
7712 packets_ = &tnl_pkt;
7713 dp_packet_batch_reset_cutlen(orig_packets_);
7714 }
7715
7716 dp_packet_batch_apply_cutlen(packets_);
7717
7718 packet_count = dp_packet_batch_size(packets_);
7719 netdev_pop_header(p->port->netdev, packets_);
7720 packets_dropped =
7721 packet_count - dp_packet_batch_size(packets_);
7722 if (packets_dropped) {
7723 COVERAGE_ADD(datapath_drop_tunnel_pop_error,
7724 packets_dropped);
7725 }
7726 if (dp_packet_batch_is_empty(packets_)) {
7727 return;
7728 }
7729
7730 struct dp_packet *packet;
7731 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7732 packet->md.in_port.odp_port = portno;
7733 }
7734
7735 (*depth)++;
7736 dp_netdev_recirculate(pmd, packets_);
7737 (*depth)--;
7738 return;
7739 }
7740 COVERAGE_ADD(datapath_drop_invalid_tnl_port,
7741 dp_packet_batch_size(packets_));
7742 } else {
7743 COVERAGE_ADD(datapath_drop_recirc_error,
7744 dp_packet_batch_size(packets_));
7745 }
7746 break;
7747
7748 case OVS_ACTION_ATTR_USERSPACE:
7749 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
7750 struct dp_packet_batch *orig_packets_ = packets_;
7751 const struct nlattr *userdata;
7752 struct dp_packet_batch usr_pkt;
7753 struct ofpbuf actions;
7754 struct flow flow;
7755 ovs_u128 ufid;
7756 bool clone = false;
7757
7758 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
7759 ofpbuf_init(&actions, 0);
7760
7761 if (packets_->trunc) {
7762 if (!should_steal) {
7763 dp_packet_batch_clone(&usr_pkt, packets_);
7764 packets_ = &usr_pkt;
7765 clone = true;
7766 dp_packet_batch_reset_cutlen(orig_packets_);
7767 }
7768
7769 dp_packet_batch_apply_cutlen(packets_);
7770 }
7771
7772 struct dp_packet *packet;
7773 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7774 flow_extract(packet, &flow);
7775 odp_flow_key_hash(&flow, sizeof flow, &ufid);
7776 dp_execute_userspace_action(pmd, packet, should_steal, &flow,
7777 &ufid, &actions, userdata);
7778 }
7779
7780 if (clone) {
7781 dp_packet_delete_batch(packets_, true);
7782 }
7783
7784 ofpbuf_uninit(&actions);
7785 fat_rwlock_unlock(&dp->upcall_rwlock);
7786
7787 return;
7788 }
7789 COVERAGE_ADD(datapath_drop_lock_error,
7790 dp_packet_batch_size(packets_));
7791 break;
7792
7793 case OVS_ACTION_ATTR_RECIRC:
7794 if (*depth < MAX_RECIRC_DEPTH) {
7795 struct dp_packet_batch recirc_pkts;
7796
7797 if (!should_steal) {
7798 dp_packet_batch_clone(&recirc_pkts, packets_);
7799 packets_ = &recirc_pkts;
7800 }
7801
7802 struct dp_packet *packet;
7803 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7804 packet->md.recirc_id = nl_attr_get_u32(a);
7805 }
7806
7807 (*depth)++;
7808 dp_netdev_recirculate(pmd, packets_);
7809 (*depth)--;
7810
7811 return;
7812 }
7813
7814 COVERAGE_ADD(datapath_drop_recirc_error,
7815 dp_packet_batch_size(packets_));
7816 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
7817 break;
7818
7819 case OVS_ACTION_ATTR_CT: {
7820 const struct nlattr *b;
7821 bool force = false;
7822 bool commit = false;
7823 unsigned int left;
7824 uint16_t zone = 0;
7825 uint32_t tp_id = 0;
7826 const char *helper = NULL;
7827 const uint32_t *setmark = NULL;
7828 const struct ovs_key_ct_labels *setlabel = NULL;
7829 struct nat_action_info_t nat_action_info;
7830 struct nat_action_info_t *nat_action_info_ref = NULL;
7831 bool nat_config = false;
7832
7833 NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
7834 nl_attr_get_size(a)) {
7835 enum ovs_ct_attr sub_type = nl_attr_type(b);
7836
7837 switch (sub_type) {
7838 case OVS_CT_ATTR_FORCE_COMMIT:
7839 force = true;
7840 /* fall through. */
7841 case OVS_CT_ATTR_COMMIT:
7842 commit = true;
7843 break;
7844 case OVS_CT_ATTR_ZONE:
7845 zone = nl_attr_get_u16(b);
7846 break;
7847 case OVS_CT_ATTR_HELPER:
7848 helper = nl_attr_get_string(b);
7849 break;
7850 case OVS_CT_ATTR_MARK:
7851 setmark = nl_attr_get(b);
7852 break;
7853 case OVS_CT_ATTR_LABELS:
7854 setlabel = nl_attr_get(b);
7855 break;
7856 case OVS_CT_ATTR_EVENTMASK:
7857 /* Silently ignored, as the userspace datapath does not generate
7858 * netlink events. */
7859 break;
7860 case OVS_CT_ATTR_TIMEOUT:
7861 if (!str_to_uint(nl_attr_get_string(b), 10, &tp_id)) {
7862 VLOG_WARN("Invalid Timeout Policy ID: %s.",
7863 nl_attr_get_string(b));
7864 tp_id = DEFAULT_TP_ID;
7865 }
7866 break;
7867 case OVS_CT_ATTR_NAT: {
7868 const struct nlattr *b_nest;
7869 unsigned int left_nest;
7870 bool ip_min_specified = false;
7871 bool proto_num_min_specified = false;
7872 bool ip_max_specified = false;
7873 bool proto_num_max_specified = false;
7874 memset(&nat_action_info, 0, sizeof nat_action_info);
7875 nat_action_info_ref = &nat_action_info;
7876
7877 NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
7878 enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
7879
7880 switch (sub_type_nest) {
7881 case OVS_NAT_ATTR_SRC:
7882 case OVS_NAT_ATTR_DST:
7883 nat_config = true;
7884 nat_action_info.nat_action |=
7885 ((sub_type_nest == OVS_NAT_ATTR_SRC)
7886 ? NAT_ACTION_SRC : NAT_ACTION_DST);
7887 break;
7888 case OVS_NAT_ATTR_IP_MIN:
7889 memcpy(&nat_action_info.min_addr,
7890 nl_attr_get(b_nest),
7891 nl_attr_get_size(b_nest));
7892 ip_min_specified = true;
7893 break;
7894 case OVS_NAT_ATTR_IP_MAX:
7895 memcpy(&nat_action_info.max_addr,
7896 nl_attr_get(b_nest),
7897 nl_attr_get_size(b_nest));
7898 ip_max_specified = true;
7899 break;
7900 case OVS_NAT_ATTR_PROTO_MIN:
7901 nat_action_info.min_port =
7902 nl_attr_get_u16(b_nest);
7903 proto_num_min_specified = true;
7904 break;
7905 case OVS_NAT_ATTR_PROTO_MAX:
7906 nat_action_info.max_port =
7907 nl_attr_get_u16(b_nest);
7908 proto_num_max_specified = true;
7909 break;
7910 case OVS_NAT_ATTR_PERSISTENT:
7911 case OVS_NAT_ATTR_PROTO_HASH:
7912 case OVS_NAT_ATTR_PROTO_RANDOM:
7913 break;
7914 case OVS_NAT_ATTR_UNSPEC:
7915 case __OVS_NAT_ATTR_MAX:
7916 OVS_NOT_REACHED();
7917 }
7918 }
7919
7920 if (ip_min_specified && !ip_max_specified) {
7921 nat_action_info.max_addr = nat_action_info.min_addr;
7922 }
7923 if (proto_num_min_specified && !proto_num_max_specified) {
7924 nat_action_info.max_port = nat_action_info.min_port;
7925 }
7926 if (proto_num_min_specified || proto_num_max_specified) {
7927 if (nat_action_info.nat_action & NAT_ACTION_SRC) {
7928 nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
7929 } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
7930 nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
7931 }
7932 }
7933 break;
7934 }
7935 case OVS_CT_ATTR_UNSPEC:
7936 case __OVS_CT_ATTR_MAX:
7937 OVS_NOT_REACHED();
7938 }
7939 }
7940
7941 /* We won't be able to function properly in this case, hence
7942 * complain loudly. */
7943 if (nat_config && !commit) {
7944 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
7945 VLOG_WARN_RL(&rl, "NAT specified without commit.");
7946 }
7947
7948 conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
7949 commit, zone, setmark, setlabel, aux->flow->tp_src,
7950 aux->flow->tp_dst, helper, nat_action_info_ref,
7951 pmd->ctx.now / 1000, tp_id);
7952 break;
7953 }
7954
7955 case OVS_ACTION_ATTR_METER:
7956 dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
7957 pmd->ctx.now);
7958 break;
7959
7960 case OVS_ACTION_ATTR_PUSH_VLAN:
7961 case OVS_ACTION_ATTR_POP_VLAN:
7962 case OVS_ACTION_ATTR_PUSH_MPLS:
7963 case OVS_ACTION_ATTR_POP_MPLS:
7964 case OVS_ACTION_ATTR_SET:
7965 case OVS_ACTION_ATTR_SET_MASKED:
7966 case OVS_ACTION_ATTR_SAMPLE:
7967 case OVS_ACTION_ATTR_HASH:
7968 case OVS_ACTION_ATTR_UNSPEC:
7969 case OVS_ACTION_ATTR_TRUNC:
7970 case OVS_ACTION_ATTR_PUSH_ETH:
7971 case OVS_ACTION_ATTR_POP_ETH:
7972 case OVS_ACTION_ATTR_CLONE:
7973 case OVS_ACTION_ATTR_PUSH_NSH:
7974 case OVS_ACTION_ATTR_POP_NSH:
7975 case OVS_ACTION_ATTR_CT_CLEAR:
7976 case OVS_ACTION_ATTR_CHECK_PKT_LEN:
7977 case OVS_ACTION_ATTR_DROP:
7978 case __OVS_ACTION_ATTR_MAX:
7979 OVS_NOT_REACHED();
7980 }
7981
7982 dp_packet_delete_batch(packets_, should_steal);
7983 }
7984
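/* Illustrative note on the 'should_steal' convention used throughout
 * dp_execute_cb() (a summary, not normative): when should_steal is false the
 * caller still owns 'packets_', so any action that consumes or modifies the
 * batch first clones it, e.g.:
 *
 *     struct dp_packet_batch copy;
 *     dp_packet_batch_clone(&copy, packets_);
 *     packets_ = &copy;                  // work on the clone, caller keeps
 *                                        // the originals
 *
 * When should_steal is true the action owns the packets and must either
 * forward or free them, which is why the fall-through at the end of the
 * switch calls dp_packet_delete_batch(packets_, should_steal). */
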
7985 static void
7986 dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
7987 struct dp_packet_batch *packets,
7988 bool should_steal, const struct flow *flow,
7989 const struct nlattr *actions, size_t actions_len)
7990 {
7991 struct dp_netdev_execute_aux aux = { pmd, flow };
7992
7993 odp_execute_actions(&aux, packets, should_steal, actions,
7994 actions_len, dp_execute_cb);
7995 }
7996
7997 struct dp_netdev_ct_dump {
7998 struct ct_dpif_dump_state up;
7999 struct conntrack_dump dump;
8000 struct conntrack *ct;
8001 struct dp_netdev *dp;
8002 };
8003
8004 static int
8005 dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
8006 const uint16_t *pzone, int *ptot_bkts)
8007 {
8008 struct dp_netdev *dp = get_dp_netdev(dpif);
8009 struct dp_netdev_ct_dump *dump;
8010
8011 dump = xzalloc(sizeof *dump);
8012 dump->dp = dp;
8013 dump->ct = dp->conntrack;
8014
8015 conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
8016
8017 *dump_ = &dump->up;
8018
8019 return 0;
8020 }
8021
8022 static int
8023 dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
8024 struct ct_dpif_dump_state *dump_,
8025 struct ct_dpif_entry *entry)
8026 {
8027 struct dp_netdev_ct_dump *dump;
8028
8029 INIT_CONTAINER(dump, dump_, up);
8030
8031 return conntrack_dump_next(&dump->dump, entry);
8032 }
8033
8034 static int
8035 dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
8036 struct ct_dpif_dump_state *dump_)
8037 {
8038 struct dp_netdev_ct_dump *dump;
8039 int err;
8040
8041 INIT_CONTAINER(dump, dump_, up);
8042
8043 err = conntrack_dump_done(&dump->dump);
8044
8045 free(dump);
8046
8047 return err;
8048 }
8049
8050 static int
8051 dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
8052 const struct ct_dpif_tuple *tuple)
8053 {
8054 struct dp_netdev *dp = get_dp_netdev(dpif);
8055
8056 if (tuple) {
8057 return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
8058 }
8059 return conntrack_flush(dp->conntrack, zone);
8060 }
8061
8062 static int
8063 dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
8064 {
8065 struct dp_netdev *dp = get_dp_netdev(dpif);
8066
8067 return conntrack_set_maxconns(dp->conntrack, maxconns);
8068 }
8069
8070 static int
8071 dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
8072 {
8073 struct dp_netdev *dp = get_dp_netdev(dpif);
8074
8075 return conntrack_get_maxconns(dp->conntrack, maxconns);
8076 }
8077
8078 static int
8079 dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
8080 {
8081 struct dp_netdev *dp = get_dp_netdev(dpif);
8082
8083 return conntrack_get_nconns(dp->conntrack, nconns);
8084 }
8085
8086 static int
8087 dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled)
8088 {
8089 struct dp_netdev *dp = get_dp_netdev(dpif);
8090
8091 return conntrack_set_tcp_seq_chk(dp->conntrack, enabled);
8092 }
8093
8094 static int
8095 dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled)
8096 {
8097 struct dp_netdev *dp = get_dp_netdev(dpif);
8098 *enabled = conntrack_get_tcp_seq_chk(dp->conntrack);
8099 return 0;
8100 }
8101
8102 static int
8103 dpif_netdev_ct_set_limits(struct dpif *dpif OVS_UNUSED,
8104 const uint32_t *default_limits,
8105 const struct ovs_list *zone_limits)
8106 {
8107 int err = 0;
8108 struct dp_netdev *dp = get_dp_netdev(dpif);
8109 if (default_limits) {
8110 err = zone_limit_update(dp->conntrack, DEFAULT_ZONE, *default_limits);
8111 if (err != 0) {
8112 return err;
8113 }
8114 }
8115
8116 struct ct_dpif_zone_limit *zone_limit;
8117 LIST_FOR_EACH (zone_limit, node, zone_limits) {
8118 err = zone_limit_update(dp->conntrack, zone_limit->zone,
8119 zone_limit->limit);
8120 if (err != 0) {
8121 break;
8122 }
8123 }
8124 return err;
8125 }
8126
8127 static int
8128 dpif_netdev_ct_get_limits(struct dpif *dpif OVS_UNUSED,
8129 uint32_t *default_limit,
8130 const struct ovs_list *zone_limits_request,
8131 struct ovs_list *zone_limits_reply)
8132 {
8133 struct dp_netdev *dp = get_dp_netdev(dpif);
8134 struct conntrack_zone_limit czl;
8135
8136 czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE);
8137 if (czl.zone == DEFAULT_ZONE) {
8138 *default_limit = czl.limit;
8139 } else {
8140 return EINVAL;
8141 }
8142
8143 if (!ovs_list_is_empty(zone_limits_request)) {
8144 struct ct_dpif_zone_limit *zone_limit;
8145 LIST_FOR_EACH (zone_limit, node, zone_limits_request) {
8146 czl = zone_limit_get(dp->conntrack, zone_limit->zone);
8147 if (czl.zone == zone_limit->zone || czl.zone == DEFAULT_ZONE) {
8148 ct_dpif_push_zone_limit(zone_limits_reply, zone_limit->zone,
8149 czl.limit, czl.count);
8150 } else {
8151 return EINVAL;
8152 }
8153 }
8154 } else {
8155 for (int z = MIN_ZONE; z <= MAX_ZONE; z++) {
8156 czl = zone_limit_get(dp->conntrack, z);
8157 if (czl.zone == z) {
8158 ct_dpif_push_zone_limit(zone_limits_reply, z, czl.limit,
8159 czl.count);
8160 }
8161 }
8162 }
8163
8164 return 0;
8165 }
8166
8167 static int
8168 dpif_netdev_ct_del_limits(struct dpif *dpif OVS_UNUSED,
8169 const struct ovs_list *zone_limits)
8170 {
8171 int err = 0;
8172 struct dp_netdev *dp = get_dp_netdev(dpif);
8173 struct ct_dpif_zone_limit *zone_limit;
8174 LIST_FOR_EACH (zone_limit, node, zone_limits) {
8175 err = zone_limit_delete(dp->conntrack, zone_limit->zone);
8176 if (err != 0) {
8177 break;
8178 }
8179 }
8180
8181 return err;
8182 }
8183
8184 static int
8185 dpif_netdev_ct_set_timeout_policy(struct dpif *dpif,
8186 const struct ct_dpif_timeout_policy *dpif_tp)
8187 {
8188 struct timeout_policy tp;
8189 struct dp_netdev *dp;
8190
8191 dp = get_dp_netdev(dpif);
8192 memcpy(&tp.policy, dpif_tp, sizeof tp.policy);
8193 return timeout_policy_update(dp->conntrack, &tp);
8194 }
8195
8196 static int
8197 dpif_netdev_ct_get_timeout_policy(struct dpif *dpif, uint32_t tp_id,
8198 struct ct_dpif_timeout_policy *dpif_tp)
8199 {
8200 struct timeout_policy *tp;
8201 struct dp_netdev *dp;
8202 int err = 0;
8203
8204 dp = get_dp_netdev(dpif);
8205 tp = timeout_policy_get(dp->conntrack, tp_id);
8206 if (!tp) {
8207 return ENOENT;
8208 }
8209 memcpy(dpif_tp, &tp->policy, sizeof tp->policy);
8210 return err;
8211 }
8212
8213 static int
8214 dpif_netdev_ct_del_timeout_policy(struct dpif *dpif,
8215 uint32_t tp_id)
8216 {
8217 struct dp_netdev *dp;
8218 int err = 0;
8219
8220 dp = get_dp_netdev(dpif);
8221 err = timeout_policy_delete(dp->conntrack, tp_id);
8222 return err;
8223 }
8224
8225 static int
8226 dpif_netdev_ct_get_timeout_policy_name(struct dpif *dpif OVS_UNUSED,
8227 uint32_t tp_id,
8228 uint16_t dl_type OVS_UNUSED,
8229 uint8_t nw_proto OVS_UNUSED,
8230 char **tp_name, bool *is_generic)
8231 {
8232 struct ds ds = DS_EMPTY_INITIALIZER;
8233
8234 ds_put_format(&ds, "%"PRIu32, tp_id);
8235 *tp_name = ds_steal_cstr(&ds);
8236 *is_generic = true;
8237 return 0;
8238 }
8239
8240 static int
8241 dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
8242 {
8243 struct dp_netdev *dp = get_dp_netdev(dpif);
8244 return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
8245 }
8246
8247 static int
8248 dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
8249 {
8250 struct dp_netdev *dp = get_dp_netdev(dpif);
8251 return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
8252 }
8253
8254 static int
8255 dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
8256 {
8257 struct dp_netdev *dp = get_dp_netdev(dpif);
8258 return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
8259 }
8260
8261 /* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
8262 * diverge. */
8263 static int
8264 dpif_netdev_ipf_get_status(struct dpif *dpif,
8265 struct dpif_ipf_status *dpif_ipf_status)
8266 {
8267 struct dp_netdev *dp = get_dp_netdev(dpif);
8268 ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
8269 (struct ipf_status *) dpif_ipf_status);
8270 return 0;
8271 }
8272
8273 static int
8274 dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
8275 struct ipf_dump_ctx **ipf_dump_ctx)
8276 {
8277 return ipf_dump_start(ipf_dump_ctx);
8278 }
8279
8280 static int
8281 dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
8282 {
8283 struct dp_netdev *dp = get_dp_netdev(dpif);
8284 return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
8285 dump);
8286 }
8287
8288 static int
8289 dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
8290 {
8291 return ipf_dump_done(ipf_dump_ctx);
8292
8293 }
8294
8295 static int
8296 dpif_netdev_bond_add(struct dpif *dpif, uint32_t bond_id,
8297 odp_port_t *member_map)
8298 {
8299 struct tx_bond *new_tx = xzalloc(sizeof *new_tx);
8300 struct dp_netdev *dp = get_dp_netdev(dpif);
8301 struct dp_netdev_pmd_thread *pmd;
8302
8303 /* Prepare new bond mapping. */
8304 new_tx->bond_id = bond_id;
8305 for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
8306 new_tx->member_buckets[bucket].member_id = member_map[bucket];
8307 }
8308
8309 ovs_mutex_lock(&dp->bond_mutex);
8310 /* Check if the bond already exists. */
8311 struct tx_bond *old_tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
8312 if (old_tx) {
8313 cmap_replace(&dp->tx_bonds, &old_tx->node, &new_tx->node,
8314 hash_bond_id(bond_id));
8315 ovsrcu_postpone(free, old_tx);
8316 } else {
8317 cmap_insert(&dp->tx_bonds, &new_tx->node, hash_bond_id(bond_id));
8318 }
8319 ovs_mutex_unlock(&dp->bond_mutex);
8320
8321 /* Update all PMDs with new bond mapping. */
8322 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8323 dp_netdev_add_bond_tx_to_pmd(pmd, new_tx, true);
8324 }
8325 return 0;
8326 }
8327
8328 static int
8329 dpif_netdev_bond_del(struct dpif *dpif, uint32_t bond_id)
8330 {
8331 struct dp_netdev *dp = get_dp_netdev(dpif);
8332 struct dp_netdev_pmd_thread *pmd;
8333 struct tx_bond *tx;
8334
8335 ovs_mutex_lock(&dp->bond_mutex);
8336 /* Check if the bond exists. */
8337 tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
8338 if (tx) {
8339 cmap_remove(&dp->tx_bonds, &tx->node, hash_bond_id(bond_id));
8340 ovsrcu_postpone(free, tx);
8341 } else {
8342 /* Bond is not present. */
8343 ovs_mutex_unlock(&dp->bond_mutex);
8344 return ENOENT;
8345 }
8346 ovs_mutex_unlock(&dp->bond_mutex);
8347
8348 /* Remove the bond mapping from all PMDs. */
8349 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8350 dp_netdev_del_bond_tx_from_pmd(pmd, bond_id);
8351 }
8352 return 0;
8353 }
8354
8355 static int
8356 dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id,
8357 uint64_t *n_bytes)
8358 {
8359 struct dp_netdev *dp = get_dp_netdev(dpif);
8360 struct dp_netdev_pmd_thread *pmd;
8361
8362 if (!tx_bond_lookup(&dp->tx_bonds, bond_id)) {
8363 return ENOENT;
8364 }
8365
8366 /* Search for the bond in all PMDs. */
8367 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8368 struct tx_bond *pmd_bond_entry
8369 = tx_bond_lookup(&pmd->tx_bonds, bond_id);
8370
8371 if (!pmd_bond_entry) {
8372 continue;
8373 }
8374
8375 /* Read bond stats. */
8376 for (int i = 0; i < BOND_BUCKETS; i++) {
8377 uint64_t pmd_n_bytes;
8378
8379 atomic_read_relaxed(&pmd_bond_entry->member_buckets[i].n_bytes,
8380 &pmd_n_bytes);
8381 n_bytes[i] += pmd_n_bytes;
8382 }
8383 }
8384 return 0;
8385 }
8386
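/* Usage sketch (illustrative): dpif_netdev_bond_stats_get() only accumulates
 * into 'n_bytes', so the caller is expected to pass a zeroed array with one
 * slot per bucket, along the lines of:
 *
 *     uint64_t n_bytes[BOND_BUCKETS];
 *
 *     memset(n_bytes, 0, sizeof n_bytes);
 *     if (!dpif_netdev_bond_stats_get(dpif, bond_id, n_bytes)) {
 *         // n_bytes[i] now holds the byte count of bucket i summed over
 *         // all PMD threads.
 *     }
 */
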
8387 const struct dpif_class dpif_netdev_class = {
8388 "netdev",
8389 true, /* cleanup_required */
8390 dpif_netdev_init,
8391 dpif_netdev_enumerate,
8392 dpif_netdev_port_open_type,
8393 dpif_netdev_open,
8394 dpif_netdev_close,
8395 dpif_netdev_destroy,
8396 dpif_netdev_run,
8397 dpif_netdev_wait,
8398 dpif_netdev_get_stats,
8399 NULL, /* set_features */
8400 dpif_netdev_port_add,
8401 dpif_netdev_port_del,
8402 dpif_netdev_port_set_config,
8403 dpif_netdev_port_query_by_number,
8404 dpif_netdev_port_query_by_name,
8405 NULL, /* port_get_pid */
8406 dpif_netdev_port_dump_start,
8407 dpif_netdev_port_dump_next,
8408 dpif_netdev_port_dump_done,
8409 dpif_netdev_port_poll,
8410 dpif_netdev_port_poll_wait,
8411 dpif_netdev_flow_flush,
8412 dpif_netdev_flow_dump_create,
8413 dpif_netdev_flow_dump_destroy,
8414 dpif_netdev_flow_dump_thread_create,
8415 dpif_netdev_flow_dump_thread_destroy,
8416 dpif_netdev_flow_dump_next,
8417 dpif_netdev_operate,
8418 NULL, /* recv_set */
8419 NULL, /* handlers_set */
8420 dpif_netdev_set_config,
8421 dpif_netdev_queue_to_priority,
8422 NULL, /* recv */
8423 NULL, /* recv_wait */
8424 NULL, /* recv_purge */
8425 dpif_netdev_register_dp_purge_cb,
8426 dpif_netdev_register_upcall_cb,
8427 dpif_netdev_enable_upcall,
8428 dpif_netdev_disable_upcall,
8429 dpif_netdev_get_datapath_version,
8430 dpif_netdev_ct_dump_start,
8431 dpif_netdev_ct_dump_next,
8432 dpif_netdev_ct_dump_done,
8433 dpif_netdev_ct_flush,
8434 dpif_netdev_ct_set_maxconns,
8435 dpif_netdev_ct_get_maxconns,
8436 dpif_netdev_ct_get_nconns,
8437 dpif_netdev_ct_set_tcp_seq_chk,
8438 dpif_netdev_ct_get_tcp_seq_chk,
8439 dpif_netdev_ct_set_limits,
8440 dpif_netdev_ct_get_limits,
8441 dpif_netdev_ct_del_limits,
8442 dpif_netdev_ct_set_timeout_policy,
8443 dpif_netdev_ct_get_timeout_policy,
8444 dpif_netdev_ct_del_timeout_policy,
8445 NULL, /* ct_timeout_policy_dump_start */
8446 NULL, /* ct_timeout_policy_dump_next */
8447 NULL, /* ct_timeout_policy_dump_done */
8448 dpif_netdev_ct_get_timeout_policy_name,
8449 dpif_netdev_ipf_set_enabled,
8450 dpif_netdev_ipf_set_min_frag,
8451 dpif_netdev_ipf_set_max_nfrags,
8452 dpif_netdev_ipf_get_status,
8453 dpif_netdev_ipf_dump_start,
8454 dpif_netdev_ipf_dump_next,
8455 dpif_netdev_ipf_dump_done,
8456 dpif_netdev_meter_get_features,
8457 dpif_netdev_meter_set,
8458 dpif_netdev_meter_get,
8459 dpif_netdev_meter_del,
8460 dpif_netdev_bond_add,
8461 dpif_netdev_bond_del,
8462 dpif_netdev_bond_stats_get,
8463 };
8464
8465 static void
8466 dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
8467 const char *argv[], void *aux OVS_UNUSED)
8468 {
8469 struct dp_netdev_port *port;
8470 struct dp_netdev *dp;
8471 odp_port_t port_no;
8472
8473 ovs_mutex_lock(&dp_netdev_mutex);
8474 dp = shash_find_data(&dp_netdevs, argv[1]);
8475 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
8476 ovs_mutex_unlock(&dp_netdev_mutex);
8477 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
8478 return;
8479 }
8480 ovs_refcount_ref(&dp->ref_cnt);
8481 ovs_mutex_unlock(&dp_netdev_mutex);
8482
8483 ovs_mutex_lock(&dp->port_mutex);
8484 if (get_port_by_name(dp, argv[2], &port)) {
8485 unixctl_command_reply_error(conn, "unknown port");
8486 goto exit;
8487 }
8488
8489 port_no = u32_to_odp(atoi(argv[3]));
8490 if (!port_no || port_no == ODPP_NONE) {
8491 unixctl_command_reply_error(conn, "bad port number");
8492 goto exit;
8493 }
8494 if (dp_netdev_lookup_port(dp, port_no)) {
8495 unixctl_command_reply_error(conn, "port number already in use");
8496 goto exit;
8497 }
8498
8499 /* Remove port. */
8500 hmap_remove(&dp->ports, &port->node);
8501 reconfigure_datapath(dp);
8502
8503 /* Reinsert with new port number. */
8504 port->port_no = port_no;
8505 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
8506 reconfigure_datapath(dp);
8507
8508 seq_change(dp->port_seq);
8509 unixctl_command_reply(conn, NULL);
8510
8511 exit:
8512 ovs_mutex_unlock(&dp->port_mutex);
8513 dp_netdev_unref(dp);
8514 }
8515
8516 static void
8517 dpif_dummy_register__(const char *type)
8518 {
8519 struct dpif_class *class;
8520
8521 class = xmalloc(sizeof *class);
8522 *class = dpif_netdev_class;
8523 class->type = xstrdup(type);
8524 dp_register_provider(class);
8525 }
8526
8527 static void
8528 dpif_dummy_override(const char *type)
8529 {
8530 int error;
8531
8532 /*
8533 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
8534 * a userland-only build. It's useful for the testsuite.
8535 */
8536 error = dp_unregister_provider(type);
8537 if (error == 0 || error == EAFNOSUPPORT) {
8538 dpif_dummy_register__(type);
8539 }
8540 }
8541
8542 void
8543 dpif_dummy_register(enum dummy_level level)
8544 {
8545 if (level == DUMMY_OVERRIDE_ALL) {
8546 struct sset types;
8547 const char *type;
8548
8549 sset_init(&types);
8550 dp_enumerate_types(&types);
8551 SSET_FOR_EACH (type, &types) {
8552 dpif_dummy_override(type);
8553 }
8554 sset_destroy(&types);
8555 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
8556 dpif_dummy_override("system");
8557 }
8558
8559 dpif_dummy_register__("dummy");
8560
8561 unixctl_command_register("dpif-dummy/change-port-number",
8562 "dp port new-number",
8563 3, 3, dpif_dummy_change_port_number, NULL);
8564 }
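
/* Usage sketch (illustrative): with dummy datapaths enabled, the command
 * registered above can be driven from the shell, e.g.
 *
 *     ovs-appctl dpif-dummy/change-port-number dp0 p1 5
 *
 * which renumbers port "p1" of datapath "dp0" to odp port 5.  The datapath
 * and port names here are made up for the example. */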
8565 \f
8566 /* Datapath Classifier. */
8567
8568 static void
8569 dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
8570 {
8571 cmap_destroy(&subtable->rules);
8572 ovsrcu_postpone(free, subtable->mf_masks);
8573 ovsrcu_postpone(free, subtable);
8574 }
8575
8576 /* Initializes 'cls' as a classifier that initially contains no classification
8577 * rules. */
8578 static void
8579 dpcls_init(struct dpcls *cls)
8580 {
8581 cmap_init(&cls->subtables_map);
8582 pvector_init(&cls->subtables);
8583 }
8584
8585 static void
8586 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
8587 {
8588 VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
8589 pvector_remove(&cls->subtables, subtable);
8590 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
8591 subtable->mask.hash);
8592 ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable);
8593 }
8594
8595 /* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
8596 * caller's responsibility.
8597 * May only be called after all the readers have been terminated. */
8598 static void
8599 dpcls_destroy(struct dpcls *cls)
8600 {
8601 if (cls) {
8602 struct dpcls_subtable *subtable;
8603
8604 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
8605 ovs_assert(cmap_count(&subtable->rules) == 0);
8606 dpcls_destroy_subtable(cls, subtable);
8607 }
8608 cmap_destroy(&cls->subtables_map);
8609 pvector_destroy(&cls->subtables);
8610 }
8611 }
8612
8613 static struct dpcls_subtable *
8614 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
8615 {
8616 struct dpcls_subtable *subtable;
8617
8618 /* Need to add one. */
8619 subtable = xmalloc(sizeof *subtable
8620 - sizeof subtable->mask.mf + mask->len);
8621 cmap_init(&subtable->rules);
8622 subtable->hit_cnt = 0;
8623 netdev_flow_key_clone(&subtable->mask, mask);
8624
8625 /* The count of bits set in the mask defines the space required for the masks.
8626 * Then call netdev_flow_key_gen_masks() to create the appropriate masks up
8627 * front, avoiding the cost of computing them at lookup time. */
8628 uint32_t unit0 = count_1bits(mask->mf.map.bits[0]);
8629 uint32_t unit1 = count_1bits(mask->mf.map.bits[1]);
8630 subtable->mf_bits_set_unit0 = unit0;
8631 subtable->mf_bits_set_unit1 = unit1;
8632 subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
8633 netdev_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
8634
8635 /* Get the preferred subtable search function for this (u0,u1) subtable.
8636 * The function is guaranteed to return a valid implementation, possibly an
8637 * ISA-optimized and/or specialized one.
8638 */
8639 subtable->lookup_func = dpcls_subtable_get_best_impl(unit0, unit1);
8640
8641 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
8642 /* Add the new subtable at the end of the pvector (with no hits yet). */
8643 pvector_insert(&cls->subtables, subtable, 0);
8644 VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
8645 cmap_count(&cls->subtables_map), subtable, cls->in_port);
8646 pvector_publish(&cls->subtables);
8647
8648 return subtable;
8649 }
8650
8651 static inline struct dpcls_subtable *
8652 dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
8653 {
8654 struct dpcls_subtable *subtable;
8655
8656 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
8657 &cls->subtables_map) {
8658 if (netdev_flow_key_equal(&subtable->mask, mask)) {
8659 return subtable;
8660 }
8661 }
8662 return dpcls_create_subtable(cls, mask);
8663 }
8664
8665 /* Checks for the best available implementation for each subtable lookup
8666 * function, and assigns it as the lookup function pointer for each subtable.
8667 * Returns the number of subtables whose lookup implementation has changed.
8668 */
8669 static uint32_t
8670 dpcls_subtable_lookup_reprobe(struct dpcls *cls)
8671 {
8672 struct pvector *pvec = &cls->subtables;
8673 uint32_t subtables_changed = 0;
8674 struct dpcls_subtable *subtable = NULL;
8675
8676 PVECTOR_FOR_EACH (subtable, pvec) {
8677 uint32_t u0_bits = subtable->mf_bits_set_unit0;
8678 uint32_t u1_bits = subtable->mf_bits_set_unit1;
8679 void *old_func = subtable->lookup_func;
8680 subtable->lookup_func = dpcls_subtable_get_best_impl(u0_bits, u1_bits);
8681 subtables_changed += (old_func != subtable->lookup_func);
8682 }
8683 pvector_publish(pvec);
8684
8685 return subtables_changed;
8686 }
8687
8688 /* Periodically sort the dpcls subtable vectors according to hit counts. */
8689 static void
8690 dpcls_sort_subtable_vector(struct dpcls *cls)
8691 {
8692 struct pvector *pvec = &cls->subtables;
8693 struct dpcls_subtable *subtable;
8694
8695 PVECTOR_FOR_EACH (subtable, pvec) {
8696 pvector_change_priority(pvec, subtable, subtable->hit_cnt);
8697 subtable->hit_cnt = 0;
8698 }
8699 pvector_publish(pvec);
8700 }
8701
8702 static inline void
8703 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
8704 struct polled_queue *poll_list, int poll_cnt)
8705 {
8706 struct dpcls *cls;
8707 uint64_t tot_idle = 0, tot_proc = 0;
8708 unsigned int pmd_load = 0;
8709
8710 if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
8711 uint64_t curr_tsc;
8712 struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
8713 if (pmd_alb->is_enabled && !pmd->isolated
8714 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
8715 pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
8716 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
8717 pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
8718 {
8719 tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
8720 pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
8721 tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
8722 pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
8723
8724 if (tot_proc) {
8725 pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
8726 }
8727
8728 if (pmd_load >= ALB_PMD_LOAD_THRESHOLD) {
8729 atomic_count_inc(&pmd->pmd_overloaded);
8730 } else {
8731 atomic_count_set(&pmd->pmd_overloaded, 0);
8732 }
8733 }
8734
8735 pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
8736 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
8737 pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
8738 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
8739
8740 /* Get the cycles that were used to process each queue and store them. */
8741 for (unsigned i = 0; i < poll_cnt; i++) {
8742 uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
8743 RXQ_CYCLES_PROC_CURR);
8744 dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
8745 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
8746 0);
8747 }
8748 curr_tsc = cycles_counter_update(&pmd->perf_stats);
8749 if (pmd->intrvl_tsc_prev) {
8750 /* There is a prev timestamp, store a new intrvl cycle count. */
8751 atomic_store_relaxed(&pmd->intrvl_cycles,
8752 curr_tsc - pmd->intrvl_tsc_prev);
8753 }
8754 pmd->intrvl_tsc_prev = curr_tsc;
8755 /* Start a new measurement interval. */
8756 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
8757 }
8758
8759 if (pmd->ctx.now > pmd->next_optimization) {
8760 /* Try to obtain the flow lock to block out revalidator threads.
8761 * If not possible, just try next time. */
8762 if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
8763 /* Optimize each classifier */
8764 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
8765 dpcls_sort_subtable_vector(cls);
8766 }
8767 ovs_mutex_unlock(&pmd->flow_mutex);
8768 /* Start a new measurement interval. */
8769 pmd->next_optimization = pmd->ctx.now
8770 + DPCLS_OPTIMIZATION_INTERVAL;
8771 }
8772 }
8773 }
8774
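/* Worked example (illustrative): if, since the last interval, tot_idle = 400
 * and tot_proc = 9600 cycles, then
 *
 *     pmd_load = (9600 * 100) / (400 + 9600) = 96
 *
 * which is >= ALB_PMD_LOAD_THRESHOLD, so pmd_overloaded is incremented; the
 * auto load balancer later uses this count to decide whether an rxq rebalance
 * is worthwhile.  A single interval below the threshold resets the count to
 * zero. */
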
8775 /* Insert 'rule' into 'cls'. */
8776 static void
8777 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
8778 const struct netdev_flow_key *mask)
8779 {
8780 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
8781
8782 /* Refer to the subtable's mask; it is also needed for later removal. */
8783 rule->mask = &subtable->mask;
8784 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
8785 }
8786
8787 /* Removes 'rule' from 'cls', also destructing the 'rule'. */
8788 static void
8789 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
8790 {
8791 struct dpcls_subtable *subtable;
8792
8793 ovs_assert(rule->mask);
8794
8795 /* Get subtable from reference in rule->mask. */
8796 INIT_CONTAINER(subtable, rule->mask, mask);
8797 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
8798 == 0) {
8799 /* Delete empty subtable. */
8800 dpcls_destroy_subtable(cls, subtable);
8801 pvector_publish(&cls->subtables);
8802 }
8803 }
8804
8805 /* Inner loop for mask generation of a unit, see netdev_flow_key_gen_masks. */
8806 static inline void
8807 netdev_flow_key_gen_mask_unit(uint64_t iter,
8808 const uint64_t count,
8809 uint64_t *mf_masks)
8810 {
8811 int i;
8812 for (i = 0; i < count; i++) {
8813 uint64_t lowest_bit = (iter & -iter);
8814 iter &= ~lowest_bit;
8815 mf_masks[i] = (lowest_bit - 1);
8816 }
8817 /* Checks that count has covered all bits in the iter bitmap. */
8818 ovs_assert(iter == 0);
8819 }
8820
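/* Worked example (illustrative) for netdev_flow_key_gen_mask_unit(): with
 * iter = 0b10100 (bits 2 and 4 set) and count = 2, the loop peels off the
 * lowest set bit each time:
 *
 *     iteration 0: lowest_bit = 1 << 2, mf_masks[0] = 0b00011
 *     iteration 1: lowest_bit = 1 << 4, mf_masks[1] = 0b01111
 *
 * i.e. each entry is an "all lower blocks" mask for one set bit, in ascending
 * bit order, which is what the subtable lookup implementations consume. */
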
8821 /* Generate a mask for each block in the miniflow, based on the bits set. This
8822 * allows packets to be masked with the pre-generated array, without doing the
8823 * calculations at lookup time.
8824 * @param tbl The flow key to generate the mf_masks for
8825 * @param mf_masks Pointer to a u64 array of at least mf_bits_u0 + mf_bits_u1 elements
8826 * @param mf_bits_u0 Number of bits set in unit 0 of the miniflow
8827 * @param mf_bits_u1 Number of bits set in unit 1 of the miniflow
8828 */
8829 void
8830 netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
8831 uint64_t *mf_masks,
8832 const uint32_t mf_bits_u0,
8833 const uint32_t mf_bits_u1)
8834 {
8835 uint64_t iter_u0 = tbl->mf.map.bits[0];
8836 uint64_t iter_u1 = tbl->mf.map.bits[1];
8837
8838 netdev_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
8839 netdev_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
8840 }
8841
8842 /* Returns true if 'target' satisfies 'rule', that is, if for each 1-bit in the
8843 * rule's mask the corresponding values in 'rule' and 'target' are the same. */
8844 bool
8845 dpcls_rule_matches_key(const struct dpcls_rule *rule,
8846 const struct netdev_flow_key *target)
8847 {
8848 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
8849 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
8850 uint64_t value;
8851
8852 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
8853 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
8854 return false;
8855 }
8856 }
8857 return true;
8858 }
8859
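/* Worked example (illustrative): dpcls rules store their values already ANDed
 * with the subtable mask, so the comparison above is a single mask-and-compare
 * per 64-bit block.  For one block:
 *
 *     mask  = 0x00000000ffffffff
 *     rule  = 0x000000000a000001          // pre-masked value in rule->flow
 *     value = 0xdeadbeef0a000001          // block from 'target'
 *
 *     (value & mask) == rule              // -> match for this block
 *
 * The first block that fails the test rejects the rule. */
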
8860 /* For each miniflow in 'keys', performs a classifier lookup, writing the result
8861 * into the corresponding slot in 'rules'. If a particular entry in 'keys' is
8862 * NULL it is skipped.
8863 *
8864 * This function is optimized for use in the userspace datapath and therefore
8865 * does not implement a lot of features available in the standard
8866 * classifier_lookup() function. Specifically, it does not implement
8867 * priorities, instead returning any rule which matches the flow.
8868 *
8869 * Returns true if all miniflows found a corresponding rule. */
8870 static bool
8871 dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
8872 struct dpcls_rule **rules, const size_t cnt,
8873 int *num_lookups_p)
8874 {
8875 /* The received 'cnt' miniflows are the search-keys that will be processed
8876 * to find a matching entry in the available subtables.
8877 * The 'keys_map' bitmap must have at least NETDEV_MAX_BURST bits. */
8878 #define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
8879 BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
8880
8881 struct dpcls_subtable *subtable;
8882 uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
8883
8884 if (cnt != MAP_BITS) {
8885 keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
8886 }
8887 memset(rules, 0, cnt * sizeof *rules);
8888
8889 int lookups_match = 0, subtable_pos = 1;
8890 uint32_t found_map;
8891
8892 /* The Datapath classifier - aka dpcls - is composed of subtables.
8893 * Subtables are dynamically created as needed when new rules are inserted.
8894 * Each subtable collects rules with matches on a specific subset of packet
8895 * fields as defined by the subtable's mask. We proceed to process every
8896 * search-key against each subtable, but when a match is found for a
8897 * search-key, the search for that key can stop because the rules are
8898 * non-overlapping. */
8899 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
8900 /* Call the subtable specific lookup function. */
8901 found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
8902
8903 /* Count the number of subtables searched for this packet match. This
8904 * estimates the "spread" of subtables looked at per matched packet. */
8905 uint32_t pkts_matched = count_1bits(found_map);
8906 lookups_match += pkts_matched * subtable_pos;
8907
8908 /* Clear the found rules, and return early if all packets are found. */
8909 keys_map &= ~found_map;
8910 if (!keys_map) {
8911 if (num_lookups_p) {
8912 *num_lookups_p = lookups_match;
8913 }
8914 return true;
8915 }
8916 subtable_pos++;
8917 }
8918
8919 if (num_lookups_p) {
8920 *num_lookups_p = lookups_match;
8921 }
8922 return false;
8923 }
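
/* Worked example (illustrative) for dpcls_lookup(): with cnt = 4 the bitmap
 * starts as keys_map = 0b1111.  Suppose the first subtable matches keys 0 and
 * 2 (found_map = 0b0101) and the second matches key 3 (found_map = 0b1000):
 *
 *     after subtable 1: keys_map = 0b1010, lookups_match = 2 * 1
 *     after subtable 2: keys_map = 0b0010, lookups_match = 2 * 1 + 1 * 2
 *
 * Key 1 never matches, so the function returns false with rules[1] left NULL
 * and *num_lookups_p = 4, the weighted count fed into the
 * PMD_STAT_MASKED_LOOKUP statistic by fast_path_processing(). */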