lib/dpif-netdev.c

   1 /*
   2  * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016, 2017 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <config.h>
  18 #include "dpif-netdev.h"
  19
  20 #include <ctype.h>
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <inttypes.h>
  24 #include <net/if.h>
  25 #include <sys/types.h>
  26 #include <netinet/in.h>
  27 #include <stdint.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <sys/ioctl.h>
  31 #include <sys/socket.h>
  32 #include <sys/stat.h>
  33 #include <unistd.h>
  34
  35 #include "bitmap.h"
  36 #include "cmap.h"
  37 #include "conntrack.h"
  38 #include "coverage.h"
  39 #include "ct-dpif.h"
  40 #include "csum.h"
  41 #include "dp-packet.h"
  42 #include "dpif.h"
  43 #include "dpif-netdev-perf.h"
  44 #include "dpif-provider.h"
  45 #include "dummy.h"
  46 #include "fat-rwlock.h"
  47 #include "flow.h"
  48 #include "hmapx.h"
  49 #include "id-pool.h"
  50 #include "latch.h"
  51 #include "netdev.h"
  52 #include "netdev-provider.h"
  53 #include "netdev-vport.h"
  54 #include "netlink.h"
  55 #include "odp-execute.h"
  56 #include "odp-util.h"
  57 #include "openvswitch/dynamic-string.h"
  58 #include "openvswitch/list.h"
  59 #include "openvswitch/match.h"
  60 #include "openvswitch/ofp-parse.h"
  61 #include "openvswitch/ofp-print.h"
  62 #include "openvswitch/ofpbuf.h"
  63 #include "openvswitch/shash.h"
  64 #include "openvswitch/vlog.h"
  65 #include "ovs-numa.h"
  66 #include "ovs-rcu.h"
  67 #include "packets.h"
  68 #include "openvswitch/poll-loop.h"
  69 #include "pvector.h"
  70 #include "random.h"
  71 #include "seq.h"
  72 #include "smap.h"
  73 #include "sset.h"
  74 #include "timeval.h"
  75 #include "tnl-neigh-cache.h"
  76 #include "tnl-ports.h"
  77 #include "unixctl.h"
  78 #include "util.h"
  79 #include "uuid.h"
  80
  81 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
  82
  83 #define FLOW_DUMP_MAX_BATCH 50
  84 /* Use a per-thread 'recirc_depth' to prevent recirculation loops.
      * NOTE(review): presumably MAX_RECIRC_DEPTH is the limit 'recirc_depth' is
      * checked against; the check itself is outside this part of the file. */
  85 #define MAX_RECIRC_DEPTH 6
  86 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
  87
  88 /* Use instant packet send by default (a flush interval of 0). */
  89 #define DEFAULT_TX_FLUSH_INTERVAL 0
  90
  91 /* Configuration parameters. */
  92 enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */
  93 enum { MAX_METERS = 65536 };    /* Maximum number of meters. */
  94 enum { MAX_BANDS = 8 };         /* Maximum number of bands / meter. */
  95 enum { N_METER_LOCKS = 64 };    /* Number of meter locks; see meter_lock(). */
  96
  97 /* Protects against changes to 'dp_netdevs'.  This is the outermost lock in
      * the acquisition order documented on 'struct dp_netdev'. */
  98 static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
  99
 100 /* Contains all 'struct dp_netdev's. */
 101 static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
 102     = SHASH_INITIALIZER(&dp_netdevs);
 103
     /* Log rate limiter.  NOTE(review): presumably applied to upcall-related
      * messages, going by the name; its users are outside this part of the
      * file. */
 104 static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
 105
     /* CS_* flag bits that this datapath supports, and the 32-bit complement of
      * that set (every other bit). */
 106 #define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
 107                                      | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
 108                                      | CS_SRC_NAT | CS_DST_NAT)
 109 #define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
 110
 111 static struct odp_support dp_netdev_support = {
         /* NOTE(review): presumably the feature set this datapath reports to the
          * layers above it; where it is consumed is not shown in this part of
          * the file.  SIZE_MAX for the two depth limits is the largest value a
          * size_t can hold, i.e. no practical cap is declared. */
 112     .max_vlan_headers = SIZE_MAX,
 113     .max_mpls_depth = SIZE_MAX,
 114     .recirc = true,
 115     .ct_state = true,
 116     .ct_zone = true,
 117     .ct_mark = true,
 118     .ct_label = true,
 119     .ct_state_nat = true,
 120     .ct_orig_tuple = true,
 121     .ct_orig_tuple6 = true,
 122 };
 123
 124 /* Stores a miniflow with inline values.
      *
      * NOTE(review): 'buf' directly follows 'mf', so it presumably provides the
      * inline storage for the miniflow's values; confirm against the miniflow
      * accessors. */
 125
 126 struct netdev_flow_key {
 127     uint32_t hash;       /* Hash function differs for different users. */
 128     uint32_t len;        /* Length of the following miniflow (incl. map). */
 129     struct miniflow mf;
 130     uint64_t buf[FLOW_MAX_PACKET_U64S];
 131 };
 132
 133 /* EMC cache and SMC cache compose the datapath flow cache (DFC)
 134  *
 135  * Exact match cache for frequently used flows
 136  *
 137  * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
 138  * search its entries for a miniflow that matches exactly the miniflow of the
 139  * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
 140  *
 141  * A cache entry holds a reference to its 'dp_netdev_flow'.
 142  *
 143  * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
 144  * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
 145  * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
 146  * value is the index of a cache entry where the miniflow could be.
 147  *
 148  *
 149  * Signature match cache (SMC)
 150  *
 151  * This cache stores a 16-bit signature for each flow without storing keys, and
 152  * stores the corresponding 16-bit flow_table index of the 'dp_netdev_flow'.
 153  * Each flow thus occupies only 32 bits, far less memory than an EMC entry.
 154  * SMC uses a set-associative design in which each bucket contains
 155  * SMC_ENTRY_PER_BUCKET entries.
 156  * Since a 16-bit flow_table index is used, if there are more than 2^16
 157  * dp_netdev_flows, SMC will miss those that a 16-bit value cannot index.
 158  *
 159  *
 160  * Thread-safety
 161  * =============
 162  *
 163  * Each pmd_thread has its own private exact match cache.
 164  * If dp_netdev_input is not called from a pmd thread, a mutex is used.
 165  */
 166
 167 #define EM_FLOW_HASH_SHIFT 13     /* Hash bits consumed per EMC index. */
 168 #define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)   /* 8192. */
 169 #define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
 170 #define EM_FLOW_HASH_SEGS 2       /* Candidate EMC entries per hash. */
 171
 172 /* SMC uses a set-associative design.  A bucket contains the set of entries
 173  * that a flow item can occupy.  For now, it uses one hash function rather
 174  * than two as in the EMC design. */
 175 #define SMC_ENTRY_PER_BUCKET 4
 176 #define SMC_ENTRIES (1u << 20)    /* Total entries: 1048576. */
 177 #define SMC_BUCKET_CNT (SMC_ENTRIES / SMC_ENTRY_PER_BUCKET)   /* 262144. */
 178 #define SMC_MASK (SMC_BUCKET_CNT - 1)
 179
 180 /* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB,
      * i.e. 1%.  DEFAULT_EM_FLOW_INSERT_MIN is that probability scaled to the
      * uint32_t range. */
 181 #define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
 182 #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX /                     \
 183                                     DEFAULT_EM_FLOW_INSERT_INV_PROB)
 184
 185 struct emc_entry {
 186     struct dp_netdev_flow *flow;  /* Cached flow, held by reference. */
 187     struct netdev_flow_key key;   /* key.hash used for emc hash value. */
 188 };
 189
 190 struct emc_cache {
 191     struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
 192     int sweep_idx;                /* For emc_cache_slow_sweep(). */
 193 };
 194
 195 struct smc_bucket {
 196     uint16_t sig[SMC_ENTRY_PER_BUCKET];       /* 16-bit flow signatures. */
 197     uint16_t flow_idx[SMC_ENTRY_PER_BUCKET];  /* flow_table indexes. */
 198 };
 199
 200 /* Signature match cache (SMC), as distinct from the exact match cache. */
 201 struct smc_cache {
 202     struct smc_bucket buckets[SMC_BUCKET_CNT];
 203 };
 204
     /* Datapath flow cache (DFC): the EMC and the SMC together. */
 205 struct dfc_cache {
 206     struct emc_cache emc_cache;
 207     struct smc_cache smc_cache;
 208 };
 209
 210 /* Iterate in the exact match cache through every entry that might contain a
 211  * miniflow with hash 'HASH'.
      *
      * 'HASH' is evaluated once.  Each of the EM_FLOW_HASH_SEGS iterations uses
      * the low EM_FLOW_HASH_SHIFT bits of the remaining hash as the index into
      * 'EMC'->entries and then shifts those bits out.
      *
      * 'CURRENT_ENTRY' is assigned before the loop bound is tested, so when the
      * loop runs to completion it points at one more (in-range) entry rather
      * than being NULL; do not use it as a "found" indicator. */
 212 #define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH)                 \
 213     for (uint32_t i__ = 0, srch_hash__ = (HASH);                             \
 214          (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
 215          i__ < EM_FLOW_HASH_SEGS;                                            \
 216          i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
 217 \f
 218 /* Simple non-wildcarding single-priority classifier. */
 219
 220 /* Time in microseconds between successive optimizations of the dpcls
 221  * subtable vector (1 second). */
 222 #define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
 223
 224 /* Length in microseconds of the interval over which the rxq processing
 225  * cycles used in rxq to pmd assignments are measured and stored (10 seconds). */
 226 #define PMD_RXQ_INTERVAL_LEN 10000000LL
 227
 228 /* Number of intervals for which cycles are stored
 229  * and used during rxq to pmd assignment. */
 230 #define PMD_RXQ_INTERVAL_MAX 6
 231
 232 struct dpcls {
 233     struct cmap_node node;      /* Within dp_netdev_pmd_thread.classifiers */
 234     odp_port_t in_port;         /* Ingress port this classifier is for. */
 235     struct cmap subtables_map;
 236     struct pvector subtables;   /* See dpcls_sort_subtable_vector(). */
 237 };
 238
 239 /* A rule to be inserted into the classifier. */
 240 struct dpcls_rule {
 241     struct cmap_node cmap_node;   /* Within struct dpcls_subtable 'rules'. */
 242     struct netdev_flow_key *mask; /* Subtable's mask. */
 243     struct netdev_flow_key flow;  /* Matching key. */
 244     /* 'flow' must be the last field; additional space is allocated here. */
 245 };
 246
 247 /* Data structure to keep packet order till fastpath processing. */
 248 struct dp_packet_flow_map {
 249     struct dp_packet *packet;
 250     struct dp_netdev_flow *flow;
 251     uint16_t tcp_flags;
 252 };
 253
 254 static void dpcls_init(struct dpcls *);
 255 static void dpcls_destroy(struct dpcls *);
 256 static void dpcls_sort_subtable_vector(struct dpcls *);
 257 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
 258                          const struct netdev_flow_key *mask);
 259 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
 260 static bool dpcls_lookup(struct dpcls *cls,
 261                          const struct netdev_flow_key *keys[],
 262                          struct dpcls_rule **rules, size_t cnt,
 263                          int *num_lookups_p);
 264 static bool dpcls_rule_matches_key(const struct dpcls_rule *rule,
 265                             const struct netdev_flow_key *target);
 266 /* Set of supported meter flags */
 267 #define DP_SUPPORTED_METER_FLAGS_MASK \
 268     (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
 269
 270 /* Set of supported meter band types */
 271 #define DP_SUPPORTED_METER_BAND_TYPES           \
 272     ( 1 << OFPMBT13_DROP )
 273
 274 struct dp_meter_band {
         /* Per-band state of a meter. */
 275     struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
 276     uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
 277     uint64_t packet_count;
 278     uint64_t byte_count;
 279 };
 280
 281 struct dp_meter {
 282     uint16_t flags;           /* Presumably OFPMF13_* bits; TODO confirm. */
 283     uint16_t n_bands;         /* Presumably the length of 'bands'; confirm. */
 284     uint32_t max_delta_t;
 285     uint64_t used;
 286     uint64_t packet_count;
 287     uint64_t byte_count;
 288     struct dp_meter_band bands[]; /* Flexible array member; keep it last. */
 289 };
 290
 291 /* Datapath based on the network device interface from netdev.h.
 292  *
 293  *
 294  * Thread-safety
 295  * =============
 296  *
 297  * Some members, marked 'const', are immutable.  Accessing other members
 298  * requires synchronization, as noted in more detail below.
 299  *
 300  * Acquisition order is, from outermost to innermost:
 301  *
 302  *    dp_netdev_mutex (global)
 303  *    port_mutex
 304  *    non_pmd_mutex
 305  */
 306 struct dp_netdev {
 307     const struct dpif_class *const class;
 308     const char *const name;
 309     struct dpif *dpif;
 310     struct ovs_refcount ref_cnt;
 311     atomic_flag destroyed;
 312
 313     /* Ports.
 314      *
 315      * Any lookup into 'ports' or any access to the dp_netdev_ports found
 316      * through 'ports' requires taking 'port_mutex'. */
 317     struct ovs_mutex port_mutex;
 318     struct hmap ports;
 319     struct seq *port_seq;       /* Incremented whenever a port changes. */
 320
 321     /* The time that a packet can wait in an output batch before being sent. */
 322     atomic_uint32_t tx_flush_interval;
 323
 324     /* Meters.  meter_lock() and meter_unlock() map a meter id to
          * meter_locks[id % N_METER_LOCKS]. */
 325     struct ovs_mutex meter_locks[N_METER_LOCKS];
 326     struct dp_meter *meters[MAX_METERS]; /* Meters (each holds its bands). */
 327
 328     /* Probability of EMC insertions is a factor of 'emc_insert_min'. */
 329     OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
 330     /* Enable collection of PMD performance metrics. */
 331     atomic_bool pmd_perf_metrics;
 332     /* Enable the SMC cache from ovsdb config. */
 333     atomic_bool smc_enable_db;
 334
 335     /* Protects access to ofproto-dpif-upcall interface during revalidator
 336      * thread synchronization. */
 337     struct fat_rwlock upcall_rwlock;
 338     upcall_callback *upcall_cb;  /* Callback function for executing upcalls. */
 339     void *upcall_aux;
 340
 341     /* Callback function for notifying the purging of dp flows (during
 342      * pmd reset or deletion). */
 343     dp_purge_callback *dp_purge_cb;
 344     void *dp_purge_aux;
 345
 346     /* Stores all 'struct dp_netdev_pmd_thread's. */
 347     struct cmap poll_threads;
 348     /* Id pool for the per-thread 'static_tx_qid'. */
 349     struct id_pool *tx_qid_pool;
 350     struct ovs_mutex tx_qid_pool_mutex;
 351
 352     /* Protects the access of the 'struct dp_netdev_pmd_thread'
 353      * instance for non-pmd threads. */
 354     struct ovs_mutex non_pmd_mutex;
 355
 356     /* Each pmd thread will store its pointer to
 357      * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
 358     ovsthread_key_t per_pmd_key;
 359
 360     struct seq *reconfigure_seq;
 361     uint64_t last_reconfigure_seq;
 362
 363     /* CPU mask for pinning pmd threads. */
 364     char *pmd_cmask;
 365
 366     uint64_t last_tnl_conf_seq;
 367
 368     struct conntrack conntrack;
 369 };
 370
     /* Locks the mutex that covers meter 'meter_id' in 'dp'.  Locks are shared:
      * meter ids that are equal modulo N_METER_LOCKS use the same mutex.  Pair
      * each call with meter_unlock() for the same 'meter_id'. */
 371 static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
 372     OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
 373 {
 374     ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
 375 }
 376
     /* Unlocks the mutex that covers meter 'meter_id' in 'dp', i.e.
      * dp->meter_locks[meter_id % N_METER_LOCKS].  Counterpart of meter_lock(). */
 377 static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
 378     OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
 379 {
 380     ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
 381 }
 382
 383
 384 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
 385                                                     odp_port_t)
 386     OVS_REQUIRES(dp->port_mutex);
 387
 388 enum rxq_cycles_counter_type {
 389     RXQ_CYCLES_PROC_CURR,       /* Cycles spent successfully polling and
 390                                    processing packets during the current
 391                                    interval. */
 392     RXQ_CYCLES_PROC_HIST,       /* Total cycles of all intervals that are used
 393                                    during rxq to pmd assignment. */
 394     RXQ_N_CYCLES                /* Number of counter types; sizes the 'cycles'
                                        array of struct dp_netdev_rxq.  Keep last. */
 395 };
 396
 397 enum {
         /* Flow offload operations.  NOTE(review): presumably the values stored
          * in dp_flow_offload_item's 'op'; confirm against the code that fills
          * it in. */
 398     DP_NETDEV_FLOW_OFFLOAD_OP_ADD,
 399     DP_NETDEV_FLOW_OFFLOAD_OP_MOD,
 400     DP_NETDEV_FLOW_OFFLOAD_OP_DEL,
 401 };
 402
 403 struct dp_flow_offload_item {
 404     struct dp_netdev_pmd_thread *pmd;
 405     struct dp_netdev_flow *flow;
 406     int op;                   /* Presumably a DP_NETDEV_FLOW_OFFLOAD_OP_*. */
 407     struct match match;
 408     struct nlattr *actions;
 409     size_t actions_len;       /* Presumably the size of 'actions'; confirm. */
 410
 411     struct ovs_list node;     /* Presumably in dp_flow_offload's 'list'. */
 412 };
 413
 414 struct dp_flow_offload {
 415     struct ovs_mutex mutex;
 416     struct ovs_list list;
 417     pthread_cond_t cond;
 418 };
 419
     /* Only 'mutex' and 'list' are set by this initializer; 'cond' is left
      * zero-initialized.  NOTE(review): 'cond' therefore has to be initialized
      * at run time before it is waited on or signaled -- confirm where that
      * happens (possibly under 'offload_thread_once' below). */
 420 static struct dp_flow_offload dp_flow_offload = {
 421     .mutex = OVS_MUTEX_INITIALIZER,
 422     .list  = OVS_LIST_INITIALIZER(&dp_flow_offload.list),
 423 };
 424
 425 static struct ovsthread_once offload_thread_once
 426     = OVSTHREAD_ONCE_INITIALIZER;
 427
 428 #define XPS_TIMEOUT 500000LL    /* In microseconds (0.5 seconds). */
 429
 430 /* Contained by struct dp_netdev_port's 'rxqs' member.  */
 431 struct dp_netdev_rxq {
 432     struct dp_netdev_port *port;
 433     struct netdev_rxq *rx;
 434     unsigned core_id;                  /* Core to which this queue should be
 435                                           pinned. OVS_CORE_UNSPEC if the
 436                                           queue doesn't need to be pinned to a
 437                                           particular core. */
 438     unsigned intrvl_idx;               /* Write index for 'cycles_intrvl'. */
 439     struct dp_netdev_pmd_thread *pmd;  /* pmd thread that polls this queue. */
 440     bool is_vhost;                     /* Is rxq of a vhost port. */
 441
 442     /* Counters of cycles spent successfully polling and processing pkts,
          * one per 'enum rxq_cycles_counter_type' value. */
 443     atomic_ullong cycles[RXQ_N_CYCLES];
 444     /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
 445        sum them to yield the cycles used for an rxq. */
 446     atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
 447 };
 448
 449 /* A port in a netdev-based datapath. */
 450 struct dp_netdev_port {
 451     odp_port_t port_no;
 452     bool dynamic_txqs;          /* If true XPS will be used. */
 453     bool need_reconfigure;      /* True if we should reconfigure netdev. */
 454     struct netdev *netdev;
 455     struct hmap_node node;      /* Node in dp_netdev's 'ports'. */
 456     struct netdev_saved_flags *sf;
 457     struct dp_netdev_rxq *rxqs;
 458     unsigned n_rxq;             /* Number of elements in 'rxqs' */
 459     unsigned *txq_used;         /* Number of threads that use each tx queue. */
 460     struct ovs_mutex txq_used_mutex;
 461     char *type;                 /* Port type as requested by user. */
 462     char *rxq_affinity_list;    /* Requested affinity of rx queues. */
 463 };
 464
 465 /* Contained by struct dp_netdev_flow's 'stats' member.  */
 466 struct dp_netdev_flow_stats {
 467     atomic_llong used;             /* Last used time, in monotonic msecs. */
 468     atomic_ullong packet_count;    /* Number of packets matched. */
 469     atomic_ullong byte_count;      /* Number of bytes matched. */
 470     atomic_uint16_t tcp_flags;     /* Bitwise-OR of seen tcp_flags values. */
 471 };
 472
 473 /* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
 474  *
 475  *
 476  * Thread-safety
 477  * =============
 478  *
 479  * Except near the beginning or ending of its lifespan, the flow's rule 'cr'
 480  * belongs to its pmd thread's classifier.  The text below calls this
      * classifier 'cls'.
 481  *
 482  * Motivation
 483  * ----------
 484  *
 485  * The thread safety rules described here for "struct dp_netdev_flow" are
 486  * motivated by two goals:
 487  *
 488  *    - Prevent threads that read members of "struct dp_netdev_flow" from
 489  *      reading bad data due to changes by some thread concurrently modifying
 490  *      those members.
 491  *
 492  *    - Prevent two threads making changes to members of a given "struct
 493  *      dp_netdev_flow" from interfering with each other.
 494  *
 495  *
 496  * Rules
 497  * -----
 498  *
 499  * A flow 'flow' may be accessed without a risk of being freed during an RCU
 500  * grace period.  Code that needs to hold onto a flow for a while
 501  * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref() (which
      * returns bool, so presumably the attempt can fail and must be checked).
 502  *
 503  * 'flow->ref_cnt' protects 'flow' from being freed.  It doesn't protect the
 504  * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
 505  * from modification.
 506  *
 507  * Some members, marked 'const', are immutable.  Accessing other members
 508  * requires synchronization, as noted in more detail below.
 509  */
 510 struct dp_netdev_flow {
 511     const struct flow flow;      /* Unmasked flow that created this entry. */
 512     /* Hash table index by unmasked flow. */
 513     const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
 514                                  /* 'flow_table'. */
 515     const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
 516     const ovs_u128 ufid;         /* Unique flow identifier. */
 517     const ovs_u128 mega_ufid;    /* Unique mega flow identifier. */
 518     const unsigned pmd_id;       /* The 'core_id' of pmd thread owning this */
 519                                  /* flow. */
 520
 521     /* Number of references.
 522      * The classifier owns one reference.
 523      * Any thread trying to keep a rule from being freed should hold its own
 524      * reference. */
 525     struct ovs_refcount ref_cnt;
 526
 527     bool dead;
 528     uint32_t mark;               /* Unique flow mark assigned to a flow. */
 529
 530     /* Statistics. */
 531     struct dp_netdev_flow_stats stats;
 532
 533     /* Actions. */
 534     OVSRCU_TYPE(struct dp_netdev_actions *) actions;
 535
 536     /* While processing a group of input packets, the datapath uses the next
 537      * member to store a pointer to the output batch for the flow.  It is
 538      * reset after the batch has been sent out (See dp_netdev_queue_batches(),
 539      * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
 540     struct packet_batch_per_flow *batch;
 541
 542     /* Packet classification. */
 543     struct dpcls_rule cr;        /* In the owning pmd's classifier. */
 544     /* 'cr' must be the last member: the last field of struct dpcls_rule,
          * 'flow', is followed by additional allocated space. */
 545 };
 546
 547 static void dp_netdev_flow_unref(struct dp_netdev_flow *);
 548 static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
 549 static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
 550                                          struct flow *, bool);
 551
 552 /* A set of datapath actions within a "struct dp_netdev_flow".
 553  *
 554  *
 555  * Thread-safety
 556  * =============
 557  *
 558  * A struct dp_netdev_actions 'actions' is protected with RCU. */
 559 struct dp_netdev_actions {
 560     /* These members are immutable: they do not change during the struct's
 561      * lifetime.  */
 562     unsigned int size;          /* Size of 'actions', in bytes. */
 563     struct nlattr actions[];    /* Sequence of OVS_ACTION_ATTR_* attributes. */
 564 };
 565
 566 struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
 567                                                    size_t);
 568 struct dp_netdev_actions *dp_netdev_flow_get_actions(
 569     const struct dp_netdev_flow *);
 570 static void dp_netdev_actions_free(struct dp_netdev_actions *);
 571
 572 struct polled_queue {
 573     struct dp_netdev_rxq *rxq;
 574     odp_port_t port_no;
 575 };
 576
 577 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
 578 struct rxq_poll {
 579     struct dp_netdev_rxq *rxq;
 580     struct hmap_node node;
 581 };
 582
 583 /* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
 584  * 'tnl_port_cache' or 'tx_ports'. */
 585 struct tx_port {
 586     struct dp_netdev_port *port;
 587     int qid;
 588     long long last_used;
 589     struct hmap_node node;
 590     long long flush_time;
 591     struct dp_packet_batch output_pkts;
 592     struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
 593 };
 594
 595 /* A set of properties for the current processing loop that is not directly
 596  * associated with the pmd thread itself, but with the packets being
 597  * processed or the short-term system configuration (for example, time).
 598  * Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
 599 struct dp_netdev_pmd_thread_ctx {
 600     /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
 601     long long now;
 602     /* RX queue from which last packet was received. */
 603     struct dp_netdev_rxq *last_rxq;
 604 };
 605
 606 /* PMD: Poll mode drivers.  A PMD accesses devices via polling to eliminate
 607  * the performance overhead of interrupt processing.  Therefore netdev can
 608  * not implement rx-wait for these devices.  dpif-netdev needs to poll
 609  * these devices to check for recv buffer.  pmd-thread does polling for
 610  * devices assigned to itself.
 611  *
 612  * DPDK uses PMDs for accessing NICs.
 613  *
 614  * Note, the instance with cpu core id NON_PMD_CORE_ID will be reserved for
 615  * I/O of all non-pmd threads.  There will be no actual thread created
 616  * for the instance.
 617  *
 618  * Each struct has its own flow cache and classifier per managed ingress port.
 619  * For packets received on an ingress port, a lookup is done in the
 620  * corresponding PMD thread's flow cache and, in case of a miss, a lookup is
 621  * performed in the corresponding classifier of the port.  Packets are
 622  * executed with the found actions in either case.
 623  */
 624 struct dp_netdev_pmd_thread {
 625     struct dp_netdev *dp;
 626     struct ovs_refcount ref_cnt;    /* Every reference must be refcount'ed. */
 627     struct cmap_node node;          /* In 'dp->poll_threads'. */
 628
 629     pthread_cond_t cond;            /* For synchronizing pmd thread reload. */
 630     struct ovs_mutex cond_mutex;    /* Mutex for condition variable. */
 631
 632     /* Per thread datapath flow cache (EMC and SMC, see struct dfc_cache).
 633      * Note, the instance for cpu core NON_PMD_CORE_ID can be accessed by
 634      * multiple threads, and thus needs to be protected by 'non_pmd_mutex'.
 635      * Every other instance will only be accessed by its own pmd thread. */
 636     OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct dfc_cache flow_cache;
 637
 638     /* Flow-Table and classifiers
 639      *
 640      * Writers of 'flow_table' must take the 'flow_mutex'.  Corresponding
 641      * changes to 'classifiers' must be made while still holding the
 642      * 'flow_mutex'.
 643      */
 644     struct ovs_mutex flow_mutex;
 645     struct cmap flow_table OVS_GUARDED; /* Flow table. */
 646
 647     /* One classifier per in_port polled by the pmd. */
 648     struct cmap classifiers;
 649     /* Subtable vectors are periodically sorted according to hit frequencies;
          * presumably this is the time of the next such pass. */
 650     long long int next_optimization;
 651     /* End of the next time interval for which processing cycles
 652        are stored for each polled rxq. */
 653     long long int rxq_next_cycle_store;
 654
 655     /* Last interval timestamp. */
 656     uint64_t intrvl_tsc_prev;
 657     /* Last interval cycles. */
 658     atomic_ullong intrvl_cycles;
 659
 660     /* Current context of the PMD thread. */
 661     struct dp_netdev_pmd_thread_ctx ctx;
 662
 663     struct latch exit_latch;        /* For terminating the pmd thread. */
 664     struct seq *reload_seq;
 665     uint64_t last_reload_seq;
 666     atomic_bool reload;             /* Do we need to reload ports? */
 667     pthread_t thread;
 668     unsigned core_id;               /* CPU core id of this pmd thread. */
 669     int numa_id;                    /* numa node id of this pmd thread. */
 670     bool isolated;
 671
 672     /* Queue id used by this pmd thread to send packets on all netdevs if
 673      * XPS is disabled for this netdev.  All static_tx_qid's are unique and
 674      * less than 'cmap_count(dp->poll_threads)'. */
 675     uint32_t static_tx_qid;
 676
 677     /* Number of filled output batches. */
 678     int n_output_batches;
 679
 680     struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
 681     /* List of rx queues to poll. */
 682     struct hmap poll_list OVS_GUARDED;
 683     /* Map of 'tx_port's used for transmission.  Written by the main thread,
 684      * read by the pmd thread. */
 685     struct hmap tx_ports OVS_GUARDED;
 686
 687     /* These are thread-local copies of 'tx_ports'.  One contains only tunnel
 688      * ports (that support push_tunnel/pop_tunnel), the other contains ports
 689      * with at least one txq (that support send).  A port can be in both.
 690      *
 691      * There are two separate maps to make sure that we don't try to execute
 692      * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
 693      *
 694      * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
 695      * threads, and thus need to be protected by 'non_pmd_mutex'.  Every
 696      * other instance will only be accessed by its own pmd thread. */
 697     struct hmap tnl_port_cache;
 698     struct hmap send_port_cache;
 699
 700     /* Keep track of detailed PMD performance statistics. */
 701     struct pmd_perf_stats perf_stats;
 702
 703     /* Set to true if the pmd thread needs to be reloaded. */
 704     bool need_reload;
 705 };
 706
 707 /* Interface to netdev-based datapath. */
 708 struct dpif_netdev {
 709     struct dpif dpif;
 710     struct dp_netdev *dp;
 711     uint64_t last_port_seq;
 712 };
 713
 714 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
 715                               struct dp_netdev_port **portp)
 716     OVS_REQUIRES(dp->port_mutex);
 717 static int get_port_by_name(struct dp_netdev *dp, const char *devname,
 718                             struct dp_netdev_port **portp)
 719     OVS_REQUIRES(dp->port_mutex);
 720 static void dp_netdev_free(struct dp_netdev *)
 721     OVS_REQUIRES(dp_netdev_mutex);
 722 static int do_add_port(struct dp_netdev *dp, const char *devname,
 723                        const char *type, odp_port_t port_no)
 724     OVS_REQUIRES(dp->port_mutex);
 725 static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
 726     OVS_REQUIRES(dp->port_mutex);
 727 static int dpif_netdev_open(const struct dpif_class *, const char *name,
 728                             bool create, struct dpif **);
 729 static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
 730                                       struct dp_packet_batch *,
 731                                       bool should_steal,
 732                                       const struct flow *flow,
 733                                       const struct nlattr *actions,
 734                                       size_t actions_len);
/* Forward declarations of file-local ('static') functions, so that they can
 * be called before their definitions appear. */

/* Functions that take a PMD thread together with a packet batch. */
static void dp_netdev_input(struct dp_netdev_pmd_thread *,
                            struct dp_packet_batch *, odp_port_t port_no);
static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
                                  struct dp_packet_batch *);

/* Upcall control and PMD thread configuration/teardown.
 * dp_netdev_set_nonpmd() is annotated as requiring dp->port_mutex. */
static void dp_netdev_disable_upcall(struct dp_netdev *);
static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
                                    struct dp_netdev *dp, unsigned core_id,
                                    int numa_id);
static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
    OVS_REQUIRES(dp->port_mutex);

/* PMD thread lookup and iteration, plus management of the ports and rx
 * queues attached to a PMD thread.  The add/del helpers are annotated as
 * requiring pmd->port_mutex. */
static void *pmd_thread_main(void *);
static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
                                                      unsigned core_id);
static struct dp_netdev_pmd_thread *
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
static void dp_netdev_del_pmd(struct dp_netdev *dp,
                              struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
                                         struct dp_netdev_port *port)
    OVS_REQUIRES(pmd->port_mutex);
static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
                                           struct tx_port *tx)
    OVS_REQUIRES(pmd->port_mutex);
static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
                                     struct dp_netdev_rxq *rxq)
    OVS_REQUIRES(pmd->port_mutex);
static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
                                       struct rxq_poll *poll)
    OVS_REQUIRES(pmd->port_mutex);
static int
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
                                   bool force);

/* Datapath reconfiguration, PMD reference counting, rx queue cycle
 * accounting and XPS (tx queue id) helpers. */
static void reconfigure_datapath(struct dp_netdev *dp)
    OVS_REQUIRES(dp->port_mutex);
static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
    OVS_REQUIRES(pmd->port_mutex);
static inline void
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
                           struct polled_queue *poll_list, int poll_cnt);
static void
dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
                         enum rxq_cycles_counter_type type,
                         unsigned long long cycles);
static uint64_t
dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
                         enum rxq_cycles_counter_type type);
static void
dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
                           unsigned long long cycles);
static uint64_t
dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
static void
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
                               bool purge);
static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
                                      struct tx_port *tx);

/* Per-entry helpers for the EMC and SMC flow caches. */
static inline bool emc_entry_alive(struct emc_entry *ce);
static void emc_clear_entry(struct emc_entry *ce);
static void smc_clear_entry(struct smc_bucket *b, int idx);

/* Miscellaneous helpers used by the code below. */
static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
static inline bool
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
                                  struct dp_netdev_flow *flow);
 811
 812 static void
 813 emc_cache_init(struct emc_cache *flow_cache)
 814 {
 815     int i;
 816
 817     flow_cache->sweep_idx = 0;
 818     for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
 819         flow_cache->entries[i].flow = NULL;
 820         flow_cache->entries[i].key.hash = 0;
 821         flow_cache->entries[i].key.len = sizeof(struct miniflow);
 822         flowmap_init(&flow_cache->entries[i].key.mf.map);
 823     }
 824 }
 825
 826 static void
 827 smc_cache_init(struct smc_cache *smc_cache)
 828 {
 829     int i, j;
 830     for (i = 0; i < SMC_BUCKET_CNT; i++) {
 831         for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
 832             smc_cache->buckets[i].flow_idx[j] = UINT16_MAX;
 833         }
 834     }
 835 }
 836
 837 static void
 838 dfc_cache_init(struct dfc_cache *flow_cache)
 839 {
 840     emc_cache_init(&flow_cache->emc_cache);
 841     smc_cache_init(&flow_cache->smc_cache);
 842 }
 843
 844 static void
 845 emc_cache_uninit(struct emc_cache *flow_cache)
 846 {
 847     int i;
 848
 849     for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
 850         emc_clear_entry(&flow_cache->entries[i]);
 851     }
 852 }
 853
 854 static void
 855 smc_cache_uninit(struct smc_cache *smc)
 856 {
 857     int i, j;
 858
 859     for (i = 0; i < SMC_BUCKET_CNT; i++) {
 860         for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
 861             smc_clear_entry(&(smc->buckets[i]), j);
 862         }
 863     }
 864 }
 865
 866 static void
 867 dfc_cache_uninit(struct dfc_cache *flow_cache)
 868 {
 869     smc_cache_uninit(&flow_cache->smc_cache);
 870     emc_cache_uninit(&flow_cache->emc_cache);
 871 }
 872
 873 /* Check and clear dead flow references slowly (one entry at each
 874  * invocation).  */
 875 static void
 876 emc_cache_slow_sweep(struct emc_cache *flow_cache)
 877 {
 878     struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
 879
 880     if (!emc_entry_alive(entry)) {
 881         emc_clear_entry(entry);
 882     }
 883     flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
 884 }
 885
 886 /* Updates the time in PMD threads context and should be called in three cases:
 887  *
 888  *     1. PMD structure initialization:
 889  *         - dp_netdev_configure_pmd()
 890  *
 891  *     2. Before processing of the new packet batch:
 892  *         - dpif_netdev_execute()
 893  *         - dp_netdev_process_rxq_port()
 894  *
 895  *     3. At least once per polling iteration in main polling threads if no
 896  *        packets received on current iteration:
 897  *         - dpif_netdev_run()
 898  *         - pmd_thread_main()
 899  *
 900  * 'pmd->ctx.now' should be used without update in all other cases if possible.
 901  */
 902 static inline void
 903 pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
 904 {
 905     pmd->ctx.now = time_usec();
 906 }
 907
 908 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
 909 bool
 910 dpif_is_netdev(const struct dpif *dpif)
 911 {
 912     return dpif->dpif_class->open == dpif_netdev_open;
 913 }
 914
 915 static struct dpif_netdev *
 916 dpif_netdev_cast(const struct dpif *dpif)
 917 {
 918     ovs_assert(dpif_is_netdev(dpif));
 919     return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
 920 }
 921
 922 static struct dp_netdev *
 923 get_dp_netdev(const struct dpif *dpif)
 924 {
 925     return dpif_netdev_cast(dpif)->dp;
 926 }
 927 \f
 928 enum pmd_info_type {
 929     PMD_INFO_SHOW_STATS,  /* Show how cpu cycles are spent. */
 930     PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
 931     PMD_INFO_SHOW_RXQ,    /* Show poll lists of pmd threads. */
 932     PMD_INFO_PERF_SHOW,   /* Show pmd performance details. */
 933 };
 934
 935 static void
 936 format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
 937 {
 938     ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
 939                         ? "main thread" : "pmd thread");
 940     if (pmd->numa_id != OVS_NUMA_UNSPEC) {
 941         ds_put_format(reply, " numa_id %d", pmd->numa_id);
 942     }
 943     if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
 944         ds_put_format(reply, " core_id %u", pmd->core_id);
 945     }
 946     ds_put_cstr(reply, ":\n");
 947 }
 948
 949 static void
 950 pmd_info_show_stats(struct ds *reply,
 951                     struct dp_netdev_pmd_thread *pmd)
 952 {
 953     uint64_t stats[PMD_N_STATS];
 954     uint64_t total_cycles, total_packets;
 955     double passes_per_pkt = 0;
 956     double lookups_per_hit = 0;
 957     double packets_per_batch = 0;
 958
 959     pmd_perf_read_counters(&pmd->perf_stats, stats);
 960     total_cycles = stats[PMD_CYCLES_ITER_IDLE]
 961                          + stats[PMD_CYCLES_ITER_BUSY];
 962     total_packets = stats[PMD_STAT_RECV];
 963
 964     format_pmd_thread(reply, pmd);
 965
 966     if (total_packets > 0) {
 967         passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
 968                             / (double) total_packets;
 969     }
 970     if (stats[PMD_STAT_MASKED_HIT] > 0) {
 971         lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
 972                             / (double) stats[PMD_STAT_MASKED_HIT];
 973     }
 974     if (stats[PMD_STAT_SENT_BATCHES] > 0) {
 975         packets_per_batch = stats[PMD_STAT_SENT_PKTS]
 976                             / (double) stats[PMD_STAT_SENT_BATCHES];
 977     }
 978
 979     ds_put_format(reply,
 980                   "  packets received: %"PRIu64"\n"
 981                   "  packet recirculations: %"PRIu64"\n"
 982                   "  avg. datapath passes per packet: %.02f\n"
 983                   "  emc hits: %"PRIu64"\n"
 984                   "  smc hits: %"PRIu64"\n"
 985                   "  megaflow hits: %"PRIu64"\n"
 986                   "  avg. subtable lookups per megaflow hit: %.02f\n"
 987                   "  miss with success upcall: %"PRIu64"\n"
 988                   "  miss with failed upcall: %"PRIu64"\n"
 989                   "  avg. packets per output batch: %.02f\n",
 990                   total_packets, stats[PMD_STAT_RECIRC],
 991                   passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
 992                   stats[PMD_STAT_SMC_HIT],
 993                   stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
 994                   stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
 995                   packets_per_batch);
 996
 997     if (total_cycles == 0) {
 998         return;
 999     }
1000
1001     ds_put_format(reply,
1002                   "  idle cycles: %"PRIu64" (%.02f%%)\n"
1003                   "  processing cycles: %"PRIu64" (%.02f%%)\n",
1004                   stats[PMD_CYCLES_ITER_IDLE],
1005                   stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
1006                   stats[PMD_CYCLES_ITER_BUSY],
1007                   stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
1008
1009     if (total_packets == 0) {
1010         return;
1011     }
1012
1013     ds_put_format(reply,
1014                   "  avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
1015                   total_cycles / (double) total_packets,
1016                   total_cycles, total_packets);
1017
1018     ds_put_format(reply,
1019                   "  avg processing cycles per packet: "
1020                   "%.02f (%"PRIu64"/%"PRIu64")\n",
1021                   stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
1022                   stats[PMD_CYCLES_ITER_BUSY], total_packets);
1023 }
1024
1025 static void
1026 pmd_info_show_perf(struct ds *reply,
1027                    struct dp_netdev_pmd_thread *pmd,
1028                    struct pmd_perf_params *par)
1029 {
1030     if (pmd->core_id != NON_PMD_CORE_ID) {
1031         char *time_str =
1032                 xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
1033         long long now = time_msec();
1034         double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
1035
1036         ds_put_cstr(reply, "\n");
1037         ds_put_format(reply, "Time: %s\n", time_str);
1038         ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
1039         ds_put_cstr(reply, "\n");
1040         format_pmd_thread(reply, pmd);
1041         ds_put_cstr(reply, "\n");
1042         pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
1043         if (pmd_perf_metrics_enabled(pmd)) {
1044             /* Prevent parallel clearing of perf metrics. */
1045             ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
1046             if (par->histograms) {
1047                 ds_put_cstr(reply, "\n");
1048                 pmd_perf_format_histograms(reply, &pmd->perf_stats);
1049             }
1050             if (par->iter_hist_len > 0) {
1051                 ds_put_cstr(reply, "\n");
1052                 pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
1053                         par->iter_hist_len);
1054             }
1055             if (par->ms_hist_len > 0) {
1056                 ds_put_cstr(reply, "\n");
1057                 pmd_perf_format_ms_history(reply, &pmd->perf_stats,
1058                         par->ms_hist_len);
1059             }
1060             ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
1061         }
1062         free(time_str);
1063     }
1064 }
1065
1066 static int
1067 compare_poll_list(const void *a_, const void *b_)
1068 {
1069     const struct rxq_poll *a = a_;
1070     const struct rxq_poll *b = b_;
1071
1072     const char *namea = netdev_rxq_get_name(a->rxq->rx);
1073     const char *nameb = netdev_rxq_get_name(b->rxq->rx);
1074
1075     int cmp = strcmp(namea, nameb);
1076     if (!cmp) {
1077         return netdev_rxq_get_queue_id(a->rxq->rx)
1078                - netdev_rxq_get_queue_id(b->rxq->rx);
1079     } else {
1080         return cmp;
1081     }
1082 }
1083
1084 static void
1085 sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
1086                  size_t *n)
1087 {
1088     struct rxq_poll *ret, *poll;
1089     size_t i;
1090
1091     *n = hmap_count(&pmd->poll_list);
1092     if (!*n) {
1093         ret = NULL;
1094     } else {
1095         ret = xcalloc(*n, sizeof *ret);
1096         i = 0;
1097         HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
1098             ret[i] = *poll;
1099             i++;
1100         }
1101         ovs_assert(i == *n);
1102         qsort(ret, *n, sizeof *ret, compare_poll_list);
1103     }
1104
1105     *list = ret;
1106 }
1107
1108 static void
1109 pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
1110 {
1111     if (pmd->core_id != NON_PMD_CORE_ID) {
1112         struct rxq_poll *list;
1113         size_t n_rxq;
1114         uint64_t total_cycles = 0;
1115
1116         ds_put_format(reply,
1117                       "pmd thread numa_id %d core_id %u:\n  isolated : %s\n",
1118                       pmd->numa_id, pmd->core_id, (pmd->isolated)
1119                                                   ? "true" : "false");
1120
1121         ovs_mutex_lock(&pmd->port_mutex);
1122         sorted_poll_list(pmd, &list, &n_rxq);
1123
1124         /* Get the total pmd cycles for an interval. */
1125         atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
1126         /* Estimate the cycles to cover all intervals. */
1127         total_cycles *= PMD_RXQ_INTERVAL_MAX;
1128
1129         for (int i = 0; i < n_rxq; i++) {
1130             struct dp_netdev_rxq *rxq = list[i].rxq;
1131             const char *name = netdev_rxq_get_name(rxq->rx);
1132             uint64_t proc_cycles = 0;
1133
1134             for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
1135                 proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
1136             }
1137             ds_put_format(reply, "  port: %-16s  queue-id: %2d", name,
1138                           netdev_rxq_get_queue_id(list[i].rxq->rx));
1139             ds_put_format(reply, "  pmd usage: ");
1140             if (total_cycles) {
1141                 ds_put_format(reply, "%2"PRIu64"",
1142                               proc_cycles * 100 / total_cycles);
1143                 ds_put_cstr(reply, " %");
1144             } else {
1145                 ds_put_format(reply, "%s", "NOT AVAIL");
1146             }
1147             ds_put_cstr(reply, "\n");
1148         }
1149         ovs_mutex_unlock(&pmd->port_mutex);
1150         free(list);
1151     }
1152 }
1153
1154 static int
1155 compare_poll_thread_list(const void *a_, const void *b_)
1156 {
1157     const struct dp_netdev_pmd_thread *a, *b;
1158
1159     a = *(struct dp_netdev_pmd_thread **)a_;
1160     b = *(struct dp_netdev_pmd_thread **)b_;
1161
1162     if (a->core_id < b->core_id) {
1163         return -1;
1164     }
1165     if (a->core_id > b->core_id) {
1166         return 1;
1167     }
1168     return 0;
1169 }
1170
1171 /* Create a sorted list of pmd's from the dp->poll_threads cmap. We can use
1172  * this list, as long as we do not go to quiescent state. */
1173 static void
1174 sorted_poll_thread_list(struct dp_netdev *dp,
1175                         struct dp_netdev_pmd_thread ***list,
1176                         size_t *n)
1177 {
1178     struct dp_netdev_pmd_thread *pmd;
1179     struct dp_netdev_pmd_thread **pmd_list;
1180     size_t k = 0, n_pmds;
1181
1182     n_pmds = cmap_count(&dp->poll_threads);
1183     pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
1184
1185     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1186         if (k >= n_pmds) {
1187             break;
1188         }
1189         pmd_list[k++] = pmd;
1190     }
1191
1192     qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
1193
1194     *list = pmd_list;
1195     *n = k;
1196 }
1197
1198 static void
1199 dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1200                           const char *argv[], void *aux OVS_UNUSED)
1201 {
1202     struct ds reply = DS_EMPTY_INITIALIZER;
1203     struct dp_netdev *dp = NULL;
1204
1205     ovs_mutex_lock(&dp_netdev_mutex);
1206
1207     if (argc == 2) {
1208         dp = shash_find_data(&dp_netdevs, argv[1]);
1209     } else if (shash_count(&dp_netdevs) == 1) {
1210         /* There's only one datapath */
1211         dp = shash_first(&dp_netdevs)->data;
1212     }
1213
1214     if (!dp) {
1215         ovs_mutex_unlock(&dp_netdev_mutex);
1216         unixctl_command_reply_error(conn,
1217                                     "please specify an existing datapath");
1218         return;
1219     }
1220
1221     dp_netdev_request_reconfigure(dp);
1222     ovs_mutex_unlock(&dp_netdev_mutex);
1223     ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1224     unixctl_command_reply(conn, ds_cstr(&reply));
1225     ds_destroy(&reply);
1226 }
1227
1228 static void
1229 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1230                      void *aux)
1231 {
1232     struct ds reply = DS_EMPTY_INITIALIZER;
1233     struct dp_netdev_pmd_thread **pmd_list;
1234     struct dp_netdev *dp = NULL;
1235     enum pmd_info_type type = *(enum pmd_info_type *) aux;
1236     unsigned int core_id;
1237     bool filter_on_pmd = false;
1238     size_t n;
1239
1240     ovs_mutex_lock(&dp_netdev_mutex);
1241
1242     while (argc > 1) {
1243         if (!strcmp(argv[1], "-pmd") && argc > 2) {
1244             if (str_to_uint(argv[2], 10, &core_id)) {
1245                 filter_on_pmd = true;
1246             }
1247             argc -= 2;
1248             argv += 2;
1249         } else {
1250             dp = shash_find_data(&dp_netdevs, argv[1]);
1251             argc -= 1;
1252             argv += 1;
1253         }
1254     }
1255
1256     if (!dp) {
1257         if (shash_count(&dp_netdevs) == 1) {
1258             /* There's only one datapath */
1259             dp = shash_first(&dp_netdevs)->data;
1260         } else {
1261             ovs_mutex_unlock(&dp_netdev_mutex);
1262             unixctl_command_reply_error(conn,
1263                                         "please specify an existing datapath");
1264             return;
1265         }
1266     }
1267
1268     sorted_poll_thread_list(dp, &pmd_list, &n);
1269     for (size_t i = 0; i < n; i++) {
1270         struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1271         if (!pmd) {
1272             break;
1273         }
1274         if (filter_on_pmd && pmd->core_id != core_id) {
1275             continue;
1276         }
1277         if (type == PMD_INFO_SHOW_RXQ) {
1278             pmd_info_show_rxq(&reply, pmd);
1279         } else if (type == PMD_INFO_CLEAR_STATS) {
1280             pmd_perf_stats_clear(&pmd->perf_stats);
1281         } else if (type == PMD_INFO_SHOW_STATS) {
1282             pmd_info_show_stats(&reply, pmd);
1283         } else if (type == PMD_INFO_PERF_SHOW) {
1284             pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
1285         }
1286     }
1287     free(pmd_list);
1288
1289     ovs_mutex_unlock(&dp_netdev_mutex);
1290
1291     unixctl_command_reply(conn, ds_cstr(&reply));
1292     ds_destroy(&reply);
1293 }
1294
1295 static void
1296 pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1297                           const char *argv[],
1298                           void *aux OVS_UNUSED)
1299 {
1300     struct pmd_perf_params par;
1301     long int it_hist = 0, ms_hist = 0;
1302     par.histograms = true;
1303
1304     while (argc > 1) {
1305         if (!strcmp(argv[1], "-nh")) {
1306             par.histograms = false;
1307             argc -= 1;
1308             argv += 1;
1309         } else if (!strcmp(argv[1], "-it") && argc > 2) {
1310             it_hist = strtol(argv[2], NULL, 10);
1311             if (it_hist < 0) {
1312                 it_hist = 0;
1313             } else if (it_hist > HISTORY_LEN) {
1314                 it_hist = HISTORY_LEN;
1315             }
1316             argc -= 2;
1317             argv += 2;
1318         } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1319             ms_hist = strtol(argv[2], NULL, 10);
1320             if (ms_hist < 0) {
1321                 ms_hist = 0;
1322             } else if (ms_hist > HISTORY_LEN) {
1323                 ms_hist = HISTORY_LEN;
1324             }
1325             argc -= 2;
1326             argv += 2;
1327         } else {
1328             break;
1329         }
1330     }
1331     par.iter_hist_len = it_hist;
1332     par.ms_hist_len = ms_hist;
1333     par.command_type = PMD_INFO_PERF_SHOW;
1334     dpif_netdev_pmd_info(conn, argc, argv, &par);
1335 }
1336 \f
1337 static int
1338 dpif_netdev_init(void)
1339 {
1340     static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
1341                               clear_aux = PMD_INFO_CLEAR_STATS,
1342                               poll_aux = PMD_INFO_SHOW_RXQ;
1343
1344     unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1345                              0, 3, dpif_netdev_pmd_info,
1346                              (void *)&show_aux);
1347     unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1348                              0, 3, dpif_netdev_pmd_info,
1349                              (void *)&clear_aux);
1350     unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
1351                              0, 3, dpif_netdev_pmd_info,
1352                              (void *)&poll_aux);
1353     unixctl_command_register("dpif-netdev/pmd-perf-show",
1354                              "[-nh] [-it iter-history-len]"
1355                              " [-ms ms-history-len]"
1356                              " [-pmd core] [dp]",
1357                              0, 8, pmd_perf_show_cmd,
1358                              NULL);
1359     unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1360                              0, 1, dpif_netdev_pmd_rebalance,
1361                              NULL);
1362     unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1363                              "on|off [-b before] [-a after] [-e|-ne] "
1364                              "[-us usec] [-q qlen]",
1365                              0, 10, pmd_perf_log_set_cmd,
1366                              NULL);
1367     return 0;
1368 }
1369
1370 static int
1371 dpif_netdev_enumerate(struct sset *all_dps,
1372                       const struct dpif_class *dpif_class)
1373 {
1374     struct shash_node *node;
1375
1376     ovs_mutex_lock(&dp_netdev_mutex);
1377     SHASH_FOR_EACH(node, &dp_netdevs) {
1378         struct dp_netdev *dp = node->data;
1379         if (dpif_class != dp->class) {
1380             /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1381              * If the class doesn't match, skip this dpif. */
1382              continue;
1383         }
1384         sset_add(all_dps, node->name);
1385     }
1386     ovs_mutex_unlock(&dp_netdev_mutex);
1387
1388     return 0;
1389 }
1390
1391 static bool
1392 dpif_netdev_class_is_dummy(const struct dpif_class *class)
1393 {
1394     return class != &dpif_netdev_class;
1395 }
1396
/* Maps a port 'type' to the netdev type to open for it.  Every type except
 * "internal" is returned unchanged; "internal" becomes "dummy-internal" for a
 * dummy 'class' and "tap" otherwise. */
static const char *
dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
{
    if (strcmp(type, "internal") != 0) {
        return type;
    }
    if (dpif_netdev_class_is_dummy(class)) {
        return "dummy-internal";
    }
    return "tap";
}
1404
1405 static struct dpif *
1406 create_dpif_netdev(struct dp_netdev *dp)
1407 {
1408     uint16_t netflow_id = hash_string(dp->name, 0);
1409     struct dpif_netdev *dpif;
1410
1411     ovs_refcount_ref(&dp->ref_cnt);
1412
1413     dpif = xmalloc(sizeof *dpif);
1414     dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1415     dpif->dp = dp;
1416     dpif->last_port_seq = seq_read(dp->port_seq);
1417
1418     return &dpif->dpif;
1419 }
1420
1421 /* Choose an unused, non-zero port number and return it on success.
1422  * Return ODPP_NONE on failure. */
1423 static odp_port_t
1424 choose_port(struct dp_netdev *dp, const char *name)
1425     OVS_REQUIRES(dp->port_mutex)
1426 {
1427     uint32_t port_no;
1428
1429     if (dp->class != &dpif_netdev_class) {
1430         const char *p;
1431         int start_no = 0;
1432
1433         /* If the port name begins with "br", start the number search at
1434          * 100 to make writing tests easier. */
1435         if (!strncmp(name, "br", 2)) {
1436             start_no = 100;
1437         }
1438
1439         /* If the port name contains a number, try to assign that port number.
1440          * This can make writing unit tests easier because port numbers are
1441          * predictable. */
1442         for (p = name; *p != '\0'; p++) {
1443             if (isdigit((unsigned char) *p)) {
1444                 port_no = start_no + strtol(p, NULL, 10);
1445                 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1446                     && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1447                     return u32_to_odp(port_no);
1448                 }
1449                 break;
1450             }
1451         }
1452     }
1453
1454     for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1455         if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1456             return u32_to_odp(port_no);
1457         }
1458     }
1459
1460     return ODPP_NONE;
1461 }
1462
1463 static int
1464 create_dp_netdev(const char *name, const struct dpif_class *class,
1465                  struct dp_netdev **dpp)
1466     OVS_REQUIRES(dp_netdev_mutex)
1467 {
1468     struct dp_netdev *dp;
1469     int error;
1470
1471     dp = xzalloc(sizeof *dp);
1472     shash_add(&dp_netdevs, name, dp);
1473
1474     *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1475     *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1476     ovs_refcount_init(&dp->ref_cnt);
1477     atomic_flag_clear(&dp->destroyed);
1478
1479     ovs_mutex_init(&dp->port_mutex);
1480     hmap_init(&dp->ports);
1481     dp->port_seq = seq_create();
1482     fat_rwlock_init(&dp->upcall_rwlock);
1483
1484     dp->reconfigure_seq = seq_create();
1485     dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1486
1487     for (int i = 0; i < N_METER_LOCKS; ++i) {
1488         ovs_mutex_init_adaptive(&dp->meter_locks[i]);
1489     }
1490
1491     /* Disable upcalls by default. */
1492     dp_netdev_disable_upcall(dp);
1493     dp->upcall_aux = NULL;
1494     dp->upcall_cb = NULL;
1495
1496     conntrack_init(&dp->conntrack);
1497
1498     atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1499     atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
1500
1501     cmap_init(&dp->poll_threads);
1502
1503     ovs_mutex_init(&dp->tx_qid_pool_mutex);
1504     /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1505     dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1506
1507     ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1508     ovsthread_key_create(&dp->per_pmd_key, NULL);
1509
1510     ovs_mutex_lock(&dp->port_mutex);
1511     /* non-PMD will be created before all other threads and will
1512      * allocate static_tx_qid = 0. */
1513     dp_netdev_set_nonpmd(dp);
1514
1515     error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1516                                                              "internal"),
1517                         ODPP_LOCAL);
1518     ovs_mutex_unlock(&dp->port_mutex);
1519     if (error) {
1520         dp_netdev_free(dp);
1521         return error;
1522     }
1523
1524     dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1525     *dpp = dp;
1526     return 0;
1527 }
1528
1529 static void
1530 dp_netdev_request_reconfigure(struct dp_netdev *dp)
1531 {
1532     seq_change(dp->reconfigure_seq);
1533 }
1534
1535 static bool
1536 dp_netdev_is_reconf_required(struct dp_netdev *dp)
1537 {
1538     return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1539 }
1540
1541 static int
1542 dpif_netdev_open(const struct dpif_class *class, const char *name,
1543                  bool create, struct dpif **dpifp)
1544 {
1545     struct dp_netdev *dp;
1546     int error;
1547
1548     ovs_mutex_lock(&dp_netdev_mutex);
1549     dp = shash_find_data(&dp_netdevs, name);
1550     if (!dp) {
1551         error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1552     } else {
1553         error = (dp->class != class ? EINVAL
1554                  : create ? EEXIST
1555                  : 0);
1556     }
1557     if (!error) {
1558         *dpifp = create_dpif_netdev(dp);
1559         dp->dpif = *dpifp;
1560     }
1561     ovs_mutex_unlock(&dp_netdev_mutex);
1562
1563     return error;
1564 }
1565
1566 static void
1567 dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1568     OVS_NO_THREAD_SAFETY_ANALYSIS
1569 {
1570     /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1571     ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1572
1573     /* Before freeing a lock we should release it */
1574     fat_rwlock_unlock(&dp->upcall_rwlock);
1575     fat_rwlock_destroy(&dp->upcall_rwlock);
1576 }
1577
1578 static void
1579 dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
1580     OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
1581 {
1582     if (dp->meters[meter_id]) {
1583         free(dp->meters[meter_id]);
1584         dp->meters[meter_id] = NULL;
1585     }
1586 }
1587
1588 /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1589  * through the 'dp_netdevs' shash while freeing 'dp'. */
1590 static void
1591 dp_netdev_free(struct dp_netdev *dp)
1592     OVS_REQUIRES(dp_netdev_mutex)
1593 {
1594     struct dp_netdev_port *port, *next;
1595
1596     shash_find_and_delete(&dp_netdevs, dp->name);
1597
1598     ovs_mutex_lock(&dp->port_mutex);
1599     HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
1600         do_del_port(dp, port);
1601     }
1602     ovs_mutex_unlock(&dp->port_mutex);
1603
1604     dp_netdev_destroy_all_pmds(dp, true);
1605     cmap_destroy(&dp->poll_threads);
1606
1607     ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1608     id_pool_destroy(dp->tx_qid_pool);
1609
1610     ovs_mutex_destroy(&dp->non_pmd_mutex);
1611     ovsthread_key_delete(dp->per_pmd_key);
1612
1613     conntrack_destroy(&dp->conntrack);
1614
1615
1616     seq_destroy(dp->reconfigure_seq);
1617
1618     seq_destroy(dp->port_seq);
1619     hmap_destroy(&dp->ports);
1620     ovs_mutex_destroy(&dp->port_mutex);
1621
1622     /* Upcalls must be disabled at this point */
1623     dp_netdev_destroy_upcall_lock(dp);
1624
1625     int i;
1626
1627     for (i = 0; i < MAX_METERS; ++i) {
1628         meter_lock(dp, i);
1629         dp_delete_meter(dp, i);
1630         meter_unlock(dp, i);
1631     }
1632     for (i = 0; i < N_METER_LOCKS; ++i) {
1633         ovs_mutex_destroy(&dp->meter_locks[i]);
1634     }
1635
1636     free(dp->pmd_cmask);
1637     free(CONST_CAST(char *, dp->name));
1638     free(dp);
1639 }
1640
1641 static void
1642 dp_netdev_unref(struct dp_netdev *dp)
1643 {
1644     if (dp) {
1645         /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1646          * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1647         ovs_mutex_lock(&dp_netdev_mutex);
1648         if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1649             dp_netdev_free(dp);
1650         }
1651         ovs_mutex_unlock(&dp_netdev_mutex);
1652     }
1653 }
1654
/* Closes 'dpif': releases its reference to the underlying datapath and frees
 * the handle itself. */
static void
dpif_netdev_close(struct dpif *dpif)
{
    dp_netdev_unref(get_dp_netdev(dpif));
    free(dpif);
}
1663
1664 static int
1665 dpif_netdev_destroy(struct dpif *dpif)
1666 {
1667     struct dp_netdev *dp = get_dp_netdev(dpif);
1668
1669     if (!atomic_flag_test_and_set(&dp->destroyed)) {
1670         if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1671             /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1672             OVS_NOT_REACHED();
1673         }
1674     }
1675
1676     return 0;
1677 }
1678
1679 /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1680  * load/store semantics.  While the increment is not atomic, the load and
1681  * store operations are, making it impossible to read inconsistent values.
1682  *
1683  * This is used to update thread local stats counters. */
1684 static void
1685 non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1686 {
1687     unsigned long long tmp;
1688
1689     atomic_read_relaxed(var, &tmp);
1690     tmp += n;
1691     atomic_store_relaxed(var, tmp);
1692 }
1693
1694 static int
1695 dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
1696 {
1697     struct dp_netdev *dp = get_dp_netdev(dpif);
1698     struct dp_netdev_pmd_thread *pmd;
1699     uint64_t pmd_stats[PMD_N_STATS];
1700
1701     stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1702     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1703         stats->n_flows += cmap_count(&pmd->flow_table);
1704         pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
1705         stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
1706         stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
1707         stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
1708         stats->n_missed += pmd_stats[PMD_STAT_MISS];
1709         stats->n_lost += pmd_stats[PMD_STAT_LOST];
1710     }
1711     stats->n_masks = UINT32_MAX;
1712     stats->n_mask_hit = UINT64_MAX;
1713
1714     return 0;
1715 }
1716
1717 static void
1718 dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
1719 {
1720     if (pmd->core_id == NON_PMD_CORE_ID) {
1721         ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1722         ovs_mutex_lock(&pmd->port_mutex);
1723         pmd_load_cached_ports(pmd);
1724         ovs_mutex_unlock(&pmd->port_mutex);
1725         ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
1726         return;
1727     }
1728
1729     ovs_mutex_lock(&pmd->cond_mutex);
1730     seq_change(pmd->reload_seq);
1731     atomic_store_relaxed(&pmd->reload, true);
1732     ovs_mutex_cond_wait(&pmd->cond, &pmd->cond_mutex);
1733     ovs_mutex_unlock(&pmd->cond_mutex);
1734 }
1735
1736 static uint32_t
1737 hash_port_no(odp_port_t port_no)
1738 {
1739     return hash_int(odp_to_u32(port_no), 0);
1740 }
1741
1742 static int
1743 port_create(const char *devname, const char *type,
1744             odp_port_t port_no, struct dp_netdev_port **portp)
1745 {
1746     struct netdev_saved_flags *sf;
1747     struct dp_netdev_port *port;
1748     enum netdev_flags flags;
1749     struct netdev *netdev;
1750     int error;
1751
1752     *portp = NULL;
1753
1754     /* Open and validate network device. */
1755     error = netdev_open(devname, type, &netdev);
1756     if (error) {
1757         return error;
1758     }
1759     /* XXX reject non-Ethernet devices */
1760
1761     netdev_get_flags(netdev, &flags);
1762     if (flags & NETDEV_LOOPBACK) {
1763         VLOG_ERR("%s: cannot add a loopback device", devname);
1764         error = EINVAL;
1765         goto out;
1766     }
1767
1768     error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
1769     if (error) {
1770         VLOG_ERR("%s: cannot set promisc flag", devname);
1771         goto out;
1772     }
1773
1774     port = xzalloc(sizeof *port);
1775     port->port_no = port_no;
1776     port->netdev = netdev;
1777     port->type = xstrdup(type);
1778     port->sf = sf;
1779     port->need_reconfigure = true;
1780     ovs_mutex_init(&port->txq_used_mutex);
1781
1782     *portp = port;
1783
1784     return 0;
1785
1786 out:
1787     netdev_close(netdev);
1788     return error;
1789 }
1790
1791 static int
1792 do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1793             odp_port_t port_no)
1794     OVS_REQUIRES(dp->port_mutex)
1795 {
1796     struct dp_netdev_port *port;
1797     int error;
1798
1799     /* Reject devices already in 'dp'. */
1800     if (!get_port_by_name(dp, devname, &port)) {
1801         return EEXIST;
1802     }
1803
1804     error = port_create(devname, type, port_no, &port);
1805     if (error) {
1806         return error;
1807     }
1808
1809     hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1810     seq_change(dp->port_seq);
1811
1812     reconfigure_datapath(dp);
1813
1814     return 0;
1815 }
1816
1817 static int
1818 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
1819                      odp_port_t *port_nop)
1820 {
1821     struct dp_netdev *dp = get_dp_netdev(dpif);
1822     char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1823     const char *dpif_port;
1824     odp_port_t port_no;
1825     int error;
1826
1827     ovs_mutex_lock(&dp->port_mutex);
1828     dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
1829     if (*port_nop != ODPP_NONE) {
1830         port_no = *port_nop;
1831         error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
1832     } else {
1833         port_no = choose_port(dp, dpif_port);
1834         error = port_no == ODPP_NONE ? EFBIG : 0;
1835     }
1836     if (!error) {
1837         *port_nop = port_no;
1838         error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
1839     }
1840     ovs_mutex_unlock(&dp->port_mutex);
1841
1842     return error;
1843 }
1844
1845 static int
1846 dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
1847 {
1848     struct dp_netdev *dp = get_dp_netdev(dpif);
1849     int error;
1850
1851     ovs_mutex_lock(&dp->port_mutex);
1852     if (port_no == ODPP_LOCAL) {
1853         error = EINVAL;
1854     } else {
1855         struct dp_netdev_port *port;
1856
1857         error = get_port_by_number(dp, port_no, &port);
1858         if (!error) {
1859             do_del_port(dp, port);
1860         }
1861     }
1862     ovs_mutex_unlock(&dp->port_mutex);
1863
1864     return error;
1865 }
1866
1867 static bool
1868 is_valid_port_number(odp_port_t port_no)
1869 {
1870     return port_no != ODPP_NONE;
1871 }
1872
1873 static struct dp_netdev_port *
1874 dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
1875     OVS_REQUIRES(dp->port_mutex)
1876 {
1877     struct dp_netdev_port *port;
1878
1879     HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
1880         if (port->port_no == port_no) {
1881             return port;
1882         }
1883     }
1884     return NULL;
1885 }
1886
1887 static int
1888 get_port_by_number(struct dp_netdev *dp,
1889                    odp_port_t port_no, struct dp_netdev_port **portp)
1890     OVS_REQUIRES(dp->port_mutex)
1891 {
1892     if (!is_valid_port_number(port_no)) {
1893         *portp = NULL;
1894         return EINVAL;
1895     } else {
1896         *portp = dp_netdev_lookup_port(dp, port_no);
1897         return *portp ? 0 : ENODEV;
1898     }
1899 }
1900
1901 static void
1902 port_destroy(struct dp_netdev_port *port)
1903 {
1904     if (!port) {
1905         return;
1906     }
1907
1908     netdev_close(port->netdev);
1909     netdev_restore_flags(port->sf);
1910
1911     for (unsigned i = 0; i < port->n_rxq; i++) {
1912         netdev_rxq_close(port->rxqs[i].rx);
1913     }
1914     ovs_mutex_destroy(&port->txq_used_mutex);
1915     free(port->rxq_affinity_list);
1916     free(port->txq_used);
1917     free(port->rxqs);
1918     free(port->type);
1919     free(port);
1920 }
1921
1922 static int
1923 get_port_by_name(struct dp_netdev *dp,
1924                  const char *devname, struct dp_netdev_port **portp)
1925     OVS_REQUIRES(dp->port_mutex)
1926 {
1927     struct dp_netdev_port *port;
1928
1929     HMAP_FOR_EACH (port, node, &dp->ports) {
1930         if (!strcmp(netdev_get_name(port->netdev), devname)) {
1931             *portp = port;
1932             return 0;
1933         }
1934     }
1935
1936     /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a non
1937      * existing port. */
1938     return ENODEV;
1939 }
1940
1941 /* Returns 'true' if there is a port with pmd netdev. */
1942 static bool
1943 has_pmd_port(struct dp_netdev *dp)
1944     OVS_REQUIRES(dp->port_mutex)
1945 {
1946     struct dp_netdev_port *port;
1947
1948     HMAP_FOR_EACH (port, node, &dp->ports) {
1949         if (netdev_is_pmd(port->netdev)) {
1950             return true;
1951         }
1952     }
1953
1954     return false;
1955 }
1956
1957 static void
1958 do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
1959     OVS_REQUIRES(dp->port_mutex)
1960 {
1961     hmap_remove(&dp->ports, &port->node);
1962     seq_change(dp->port_seq);
1963
1964     reconfigure_datapath(dp);
1965
1966     port_destroy(port);
1967 }
1968
1969 static void
1970 answer_port_query(const struct dp_netdev_port *port,
1971                   struct dpif_port *dpif_port)
1972 {
1973     dpif_port->name = xstrdup(netdev_get_name(port->netdev));
1974     dpif_port->type = xstrdup(port->type);
1975     dpif_port->port_no = port->port_no;
1976 }
1977
1978 static int
1979 dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
1980                                  struct dpif_port *dpif_port)
1981 {
1982     struct dp_netdev *dp = get_dp_netdev(dpif);
1983     struct dp_netdev_port *port;
1984     int error;
1985
1986     ovs_mutex_lock(&dp->port_mutex);
1987     error = get_port_by_number(dp, port_no, &port);
1988     if (!error && dpif_port) {
1989         answer_port_query(port, dpif_port);
1990     }
1991     ovs_mutex_unlock(&dp->port_mutex);
1992
1993     return error;
1994 }
1995
1996 static int
1997 dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
1998                                struct dpif_port *dpif_port)
1999 {
2000     struct dp_netdev *dp = get_dp_netdev(dpif);
2001     struct dp_netdev_port *port;
2002     int error;
2003
2004     ovs_mutex_lock(&dp->port_mutex);
2005     error = get_port_by_name(dp, devname, &port);
2006     if (!error && dpif_port) {
2007         answer_port_query(port, dpif_port);
2008     }
2009     ovs_mutex_unlock(&dp->port_mutex);
2010
2011     return error;
2012 }
2013
/* Frees 'flow' together with the actions currently attached to it. */
static void
dp_netdev_flow_free(struct dp_netdev_flow *flow)
{
    /* Release the actions before the flow that refers to them. */
    dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));

    free(flow);
}
2020
2021 static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
2022 {
2023     if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
2024         ovsrcu_postpone(dp_netdev_flow_free, flow);
2025     }
2026 }
2027
2028 static uint32_t
2029 dp_netdev_flow_hash(const ovs_u128 *ufid)
2030 {
2031     return ufid->u32[0];
2032 }
2033
2034 static inline struct dpcls *
2035 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
2036                            odp_port_t in_port)
2037 {
2038     struct dpcls *cls;
2039     uint32_t hash = hash_port_no(in_port);
2040     CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
2041         if (cls->in_port == in_port) {
2042             /* Port classifier exists already */
2043             return cls;
2044         }
2045     }
2046     return NULL;
2047 }
2048
2049 static inline struct dpcls *
2050 dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
2051                          odp_port_t in_port)
2052     OVS_REQUIRES(pmd->flow_mutex)
2053 {
2054     struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2055     uint32_t hash = hash_port_no(in_port);
2056
2057     if (!cls) {
2058         /* Create new classifier for in_port */
2059         cls = xmalloc(sizeof(*cls));
2060         dpcls_init(cls);
2061         cls->in_port = in_port;
2062         cmap_insert(&pmd->classifiers, &cls->node, hash);
2063         VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
2064     }
2065     return cls;
2066 }
2067
2068 #define MAX_FLOW_MARK       (UINT32_MAX - 1)
2069 #define INVALID_FLOW_MARK   (UINT32_MAX)
2070
2071 struct megaflow_to_mark_data {
2072     const struct cmap_node node;
2073     ovs_u128 mega_ufid;
2074     uint32_t mark;
2075 };
2076
2077 struct flow_mark {
2078     struct cmap megaflow_to_mark;
2079     struct cmap mark_to_flow;
2080     struct id_pool *pool;
2081 };
2082
2083 static struct flow_mark flow_mark = {
2084     .megaflow_to_mark = CMAP_INITIALIZER,
2085     .mark_to_flow = CMAP_INITIALIZER,
2086 };
2087
2088 static uint32_t
2089 flow_mark_alloc(void)
2090 {
2091     uint32_t mark;
2092
2093     if (!flow_mark.pool) {
2094         /* Haven't initiated yet, do it here */
2095         flow_mark.pool = id_pool_create(0, MAX_FLOW_MARK);
2096     }
2097
2098     if (id_pool_alloc_id(flow_mark.pool, &mark)) {
2099         return mark;
2100     }
2101
2102     return INVALID_FLOW_MARK;
2103 }
2104
2105 static void
2106 flow_mark_free(uint32_t mark)
2107 {
2108     id_pool_free_id(flow_mark.pool, mark);
2109 }
2110
2111 /* associate megaflow with a mark, which is a 1:1 mapping */
2112 static void
2113 megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
2114 {
2115     size_t hash = dp_netdev_flow_hash(mega_ufid);
2116     struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
2117
2118     data->mega_ufid = *mega_ufid;
2119     data->mark = mark;
2120
2121     cmap_insert(&flow_mark.megaflow_to_mark,
2122                 CONST_CAST(struct cmap_node *, &data->node), hash);
2123 }
2124
2125 /* disassociate meagaflow with a mark */
2126 static void
2127 megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
2128 {
2129     size_t hash = dp_netdev_flow_hash(mega_ufid);
2130     struct megaflow_to_mark_data *data;
2131
2132     CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2133         if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2134             cmap_remove(&flow_mark.megaflow_to_mark,
2135                         CONST_CAST(struct cmap_node *, &data->node), hash);
2136             free(data);
2137             return;
2138         }
2139     }
2140
2141     VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
2142               UUID_ARGS((struct uuid *)mega_ufid));
2143 }
2144
2145 static inline uint32_t
2146 megaflow_to_mark_find(const ovs_u128 *mega_ufid)
2147 {
2148     size_t hash = dp_netdev_flow_hash(mega_ufid);
2149     struct megaflow_to_mark_data *data;
2150
2151     CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2152         if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2153             return data->mark;
2154         }
2155     }
2156
2157     VLOG_WARN("Mark id for ufid "UUID_FMT" was not found\n",
2158               UUID_ARGS((struct uuid *)mega_ufid));
2159     return INVALID_FLOW_MARK;
2160 }
2161
2162 /* associate mark with a flow, which is 1:N mapping */
2163 static void
2164 mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
2165 {
2166     dp_netdev_flow_ref(flow);
2167
2168     cmap_insert(&flow_mark.mark_to_flow,
2169                 CONST_CAST(struct cmap_node *, &flow->mark_node),
2170                 hash_int(mark, 0));
2171     flow->mark = mark;
2172
2173     VLOG_DBG("Associated dp_netdev flow %p with mark %u\n", flow, mark);
2174 }
2175
2176 static bool
2177 flow_mark_has_no_ref(uint32_t mark)
2178 {
2179     struct dp_netdev_flow *flow;
2180
2181     CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2182                              &flow_mark.mark_to_flow) {
2183         if (flow->mark == mark) {
2184             return false;
2185         }
2186     }
2187
2188     return true;
2189 }
2190
2191 static int
2192 mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd,
2193                           struct dp_netdev_flow *flow)
2194 {
2195     int ret = 0;
2196     uint32_t mark = flow->mark;
2197     struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
2198                                              &flow->mark_node);
2199
2200     cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0));
2201     flow->mark = INVALID_FLOW_MARK;
2202
2203     /*
2204      * no flow is referencing the mark any more? If so, let's
2205      * remove the flow from hardware and free the mark.
2206      */
2207     if (flow_mark_has_no_ref(mark)) {
2208         struct dp_netdev_port *port;
2209         odp_port_t in_port = flow->flow.in_port.odp_port;
2210
2211         ovs_mutex_lock(&pmd->dp->port_mutex);
2212         port = dp_netdev_lookup_port(pmd->dp, in_port);
2213         if (port) {
2214             ret = netdev_flow_del(port->netdev, &flow->mega_ufid, NULL);
2215         }
2216         ovs_mutex_unlock(&pmd->dp->port_mutex);
2217
2218         flow_mark_free(mark);
2219         VLOG_DBG("Freed flow mark %u\n", mark);
2220
2221         megaflow_to_mark_disassociate(&flow->mega_ufid);
2222     }
2223     dp_netdev_flow_unref(flow);
2224
2225     return ret;
2226 }
2227
2228 static void
2229 flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
2230 {
2231     struct dp_netdev_flow *flow;
2232
2233     CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) {
2234         if (flow->pmd_id == pmd->core_id) {
2235             queue_netdev_flow_del(pmd, flow);
2236         }
2237     }
2238 }
2239
2240 static struct dp_netdev_flow *
2241 mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
2242                   const uint32_t mark)
2243 {
2244     struct dp_netdev_flow *flow;
2245
2246     CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2247                              &flow_mark.mark_to_flow) {
2248         if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
2249             flow->dead == false) {
2250             return flow;
2251         }
2252     }
2253
2254     return NULL;
2255 }
2256
2257 static struct dp_flow_offload_item *
2258 dp_netdev_alloc_flow_offload(struct dp_netdev_pmd_thread *pmd,
2259                              struct dp_netdev_flow *flow,
2260                              int op)
2261 {
2262     struct dp_flow_offload_item *offload;
2263
2264     offload = xzalloc(sizeof(*offload));
2265     offload->pmd = pmd;
2266     offload->flow = flow;
2267     offload->op = op;
2268
2269     dp_netdev_flow_ref(flow);
2270     dp_netdev_pmd_try_ref(pmd);
2271
2272     return offload;
2273 }
2274
2275 static void
2276 dp_netdev_free_flow_offload(struct dp_flow_offload_item *offload)
2277 {
2278     dp_netdev_pmd_unref(offload->pmd);
2279     dp_netdev_flow_unref(offload->flow);
2280
2281     free(offload->actions);
2282     free(offload);
2283 }
2284
2285 static void
2286 dp_netdev_append_flow_offload(struct dp_flow_offload_item *offload)
2287 {
2288     ovs_mutex_lock(&dp_flow_offload.mutex);
2289     ovs_list_push_back(&dp_flow_offload.list, &offload->node);
2290     xpthread_cond_signal(&dp_flow_offload.cond);
2291     ovs_mutex_unlock(&dp_flow_offload.mutex);
2292 }
2293
2294 static int
2295 dp_netdev_flow_offload_del(struct dp_flow_offload_item *offload)
2296 {
2297     return mark_to_flow_disassociate(offload->pmd, offload->flow);
2298 }
2299
2300 /*
2301  * There are two flow offload operations here: addition and modification.
2302  *
2303  * For flow addition, this function does:
2304  * - allocate a new flow mark id
2305  * - perform hardware flow offload
2306  * - associate the flow mark with flow and mega flow
2307  *
2308  * For flow modification, both flow mark and the associations are still
2309  * valid, thus only item 2 needed.
2310  */
2311 static int
2312 dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
2313 {
2314     struct dp_netdev_port *port;
2315     struct dp_netdev_pmd_thread *pmd = offload->pmd;
2316     struct dp_netdev_flow *flow = offload->flow;
2317     odp_port_t in_port = flow->flow.in_port.odp_port;
2318     bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2319     struct offload_info info;
2320     uint32_t mark;
2321     int ret;
2322
2323     if (flow->dead) {
2324         return -1;
2325     }
2326
2327     if (modification) {
2328         mark = flow->mark;
2329         ovs_assert(mark != INVALID_FLOW_MARK);
2330     } else {
2331         /*
2332          * If a mega flow has already been offloaded (from other PMD
2333          * instances), do not offload it again.
2334          */
2335         mark = megaflow_to_mark_find(&flow->mega_ufid);
2336         if (mark != INVALID_FLOW_MARK) {
2337             VLOG_DBG("Flow has already been offloaded with mark %u\n", mark);
2338             if (flow->mark != INVALID_FLOW_MARK) {
2339                 ovs_assert(flow->mark == mark);
2340             } else {
2341                 mark_to_flow_associate(mark, flow);
2342             }
2343             return 0;
2344         }
2345
2346         mark = flow_mark_alloc();
2347         if (mark == INVALID_FLOW_MARK) {
2348             VLOG_ERR("Failed to allocate flow mark!\n");
2349         }
2350     }
2351     info.flow_mark = mark;
2352
2353     ovs_mutex_lock(&pmd->dp->port_mutex);
2354     port = dp_netdev_lookup_port(pmd->dp, in_port);
2355     if (!port) {
2356         ovs_mutex_unlock(&pmd->dp->port_mutex);
2357         return -1;
2358     }
2359     ret = netdev_flow_put(port->netdev, &offload->match,
2360                           CONST_CAST(struct nlattr *, offload->actions),
2361                           offload->actions_len, &flow->mega_ufid, &info,
2362                           NULL);
2363     ovs_mutex_unlock(&pmd->dp->port_mutex);
2364
2365     if (ret) {
2366         if (!modification) {
2367             flow_mark_free(mark);
2368         } else {
2369             mark_to_flow_disassociate(pmd, flow);
2370         }
2371         return -1;
2372     }
2373
2374     if (!modification) {
2375         megaflow_to_mark_associate(&flow->mega_ufid, mark);
2376         mark_to_flow_associate(mark, flow);
2377     }
2378
2379     return 0;
2380 }
2381
2382 static void *
2383 dp_netdev_flow_offload_main(void *data OVS_UNUSED)
2384 {
2385     struct dp_flow_offload_item *offload;
2386     struct ovs_list *list;
2387     const char *op;
2388     int ret;
2389
2390     for (;;) {
2391         ovs_mutex_lock(&dp_flow_offload.mutex);
2392         if (ovs_list_is_empty(&dp_flow_offload.list)) {
2393             ovsrcu_quiesce_start();
2394             ovs_mutex_cond_wait(&dp_flow_offload.cond,
2395                                 &dp_flow_offload.mutex);
2396         }
2397         list = ovs_list_pop_front(&dp_flow_offload.list);
2398         offload = CONTAINER_OF(list, struct dp_flow_offload_item, node);
2399         ovs_mutex_unlock(&dp_flow_offload.mutex);
2400
2401         switch (offload->op) {
2402         case DP_NETDEV_FLOW_OFFLOAD_OP_ADD:
2403             op = "add";
2404             ret = dp_netdev_flow_offload_put(offload);
2405             break;
2406         case DP_NETDEV_FLOW_OFFLOAD_OP_MOD:
2407             op = "modify";
2408             ret = dp_netdev_flow_offload_put(offload);
2409             break;
2410         case DP_NETDEV_FLOW_OFFLOAD_OP_DEL:
2411             op = "delete";
2412             ret = dp_netdev_flow_offload_del(offload);
2413             break;
2414         default:
2415             OVS_NOT_REACHED();
2416         }
2417
2418         VLOG_DBG("%s to %s netdev flow\n",
2419                  ret == 0 ? "succeed" : "failed", op);
2420         dp_netdev_free_flow_offload(offload);
2421     }
2422
2423     return NULL;
2424 }
2425
2426 static void
2427 queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
2428                       struct dp_netdev_flow *flow)
2429 {
2430     struct dp_flow_offload_item *offload;
2431
2432     if (ovsthread_once_start(&offload_thread_once)) {
2433         xpthread_cond_init(&dp_flow_offload.cond, NULL);
2434         ovs_thread_create("dp_netdev_flow_offload",
2435                           dp_netdev_flow_offload_main, NULL);
2436         ovsthread_once_done(&offload_thread_once);
2437     }
2438
2439     offload = dp_netdev_alloc_flow_offload(pmd, flow,
2440                                            DP_NETDEV_FLOW_OFFLOAD_OP_DEL);
2441     dp_netdev_append_flow_offload(offload);
2442 }
2443
2444 static void
2445 queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd,
2446                       struct dp_netdev_flow *flow, struct match *match,
2447                       const struct nlattr *actions, size_t actions_len)
2448 {
2449     struct dp_flow_offload_item *offload;
2450     int op;
2451
2452     if (!netdev_is_flow_api_enabled()) {
2453         return;
2454     }
2455
2456     if (ovsthread_once_start(&offload_thread_once)) {
2457         xpthread_cond_init(&dp_flow_offload.cond, NULL);
2458         ovs_thread_create("dp_netdev_flow_offload",
2459                           dp_netdev_flow_offload_main, NULL);
2460         ovsthread_once_done(&offload_thread_once);
2461     }
2462
2463     if (flow->mark != INVALID_FLOW_MARK) {
2464         op = DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2465     } else {
2466         op = DP_NETDEV_FLOW_OFFLOAD_OP_ADD;
2467     }
2468     offload = dp_netdev_alloc_flow_offload(pmd, flow, op);
2469     offload->match = *match;
2470     offload->actions = xmalloc(actions_len);
2471     memcpy(offload->actions, actions, actions_len);
2472     offload->actions_len = actions_len;
2473
2474     dp_netdev_append_flow_offload(offload);
2475 }
2476
2477 static void
2478 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
2479                           struct dp_netdev_flow *flow)
2480     OVS_REQUIRES(pmd->flow_mutex)
2481 {
2482     struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2483     struct dpcls *cls;
2484     odp_port_t in_port = flow->flow.in_port.odp_port;
2485
2486     cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2487     ovs_assert(cls != NULL);
2488     dpcls_remove(cls, &flow->cr);
2489     cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
2490     if (flow->mark != INVALID_FLOW_MARK) {
2491         queue_netdev_flow_del(pmd, flow);
2492     }
2493     flow->dead = true;
2494
2495     dp_netdev_flow_unref(flow);
2496 }
2497
2498 static void
2499 dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
2500 {
2501     struct dp_netdev_flow *netdev_flow;
2502
2503     ovs_mutex_lock(&pmd->flow_mutex);
2504     CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2505         dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2506     }
2507     ovs_mutex_unlock(&pmd->flow_mutex);
2508 }
2509
2510 static int
2511 dpif_netdev_flow_flush(struct dpif *dpif)
2512 {
2513     struct dp_netdev *dp = get_dp_netdev(dpif);
2514     struct dp_netdev_pmd_thread *pmd;
2515
2516     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2517         dp_netdev_pmd_flow_flush(pmd);
2518     }
2519
2520     return 0;
2521 }
2522
2523 struct dp_netdev_port_state {
2524     struct hmap_position position;
2525     char *name;
2526 };
2527
2528 static int
2529 dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2530 {
2531     *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2532     return 0;
2533 }
2534
2535 static int
2536 dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
2537                            struct dpif_port *dpif_port)
2538 {
2539     struct dp_netdev_port_state *state = state_;
2540     struct dp_netdev *dp = get_dp_netdev(dpif);
2541     struct hmap_node *node;
2542     int retval;
2543
2544     ovs_mutex_lock(&dp->port_mutex);
2545     node = hmap_at_position(&dp->ports, &state->position);
2546     if (node) {
2547         struct dp_netdev_port *port;
2548
2549         port = CONTAINER_OF(node, struct dp_netdev_port, node);
2550
2551         free(state->name);
2552         state->name = xstrdup(netdev_get_name(port->netdev));
2553         dpif_port->name = state->name;
2554         dpif_port->type = port->type;
2555         dpif_port->port_no = port->port_no;
2556
2557         retval = 0;
2558     } else {
2559         retval = EOF;
2560     }
2561     ovs_mutex_unlock(&dp->port_mutex);
2562
2563     return retval;
2564 }
2565
2566 static int
2567 dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
2568 {
2569     struct dp_netdev_port_state *state = state_;
2570     free(state->name);
2571     free(state);
2572     return 0;
2573 }
2574
2575 static int
2576 dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
2577 {
2578     struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2579     uint64_t new_port_seq;
2580     int error;
2581
2582     new_port_seq = seq_read(dpif->dp->port_seq);
2583     if (dpif->last_port_seq != new_port_seq) {
2584         dpif->last_port_seq = new_port_seq;
2585         error = ENOBUFS;
2586     } else {
2587         error = EAGAIN;
2588     }
2589
2590     return error;
2591 }
2592
2593 static void
2594 dpif_netdev_port_poll_wait(const struct dpif *dpif_)
2595 {
2596     struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2597
2598     seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
2599 }
2600
2601 static struct dp_netdev_flow *
2602 dp_netdev_flow_cast(const struct dpcls_rule *cr)
2603 {
2604     return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
2605 }
2606
2607 static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
2608 {
2609     return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
2610 }
2611
2612 /* netdev_flow_key utilities.
2613  *
2614  * netdev_flow_key is basically a miniflow.  We use these functions
2615  * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2616  * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2617  *
2618  * - Since we are dealing exclusively with miniflows created by
2619  *   miniflow_extract(), if the map is different the miniflow is different.
2620  *   Therefore we can be faster by comparing the map and the miniflow in a
2621  *   single memcmp().
2622  * - These functions can be inlined by the compiler. */
2623
2624 /* Given the number of bits set in miniflow's maps, returns the size of the
2625  * 'netdev_flow_key.mf' */
2626 static inline size_t
2627 netdev_flow_key_size(size_t flow_u64s)
2628 {
2629     return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
2630 }
2631
2632 static inline bool
2633 netdev_flow_key_equal(const struct netdev_flow_key *a,
2634                       const struct netdev_flow_key *b)
2635 {
2636     /* 'b->len' may be not set yet. */
2637     return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
2638 }
2639
2640 /* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
2641  * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
2642  * generated by miniflow_extract. */
2643 static inline bool
2644 netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
2645                          const struct miniflow *mf)
2646 {
2647     return !memcmp(&key->mf, mf, key->len);
2648 }
2649
2650 static inline void
2651 netdev_flow_key_clone(struct netdev_flow_key *dst,
2652                       const struct netdev_flow_key *src)
2653 {
2654     memcpy(dst, src,
2655            offsetof(struct netdev_flow_key, mf) + src->len);
2656 }
2657
2658 /* Initialize a netdev_flow_key 'mask' from 'match'. */
2659 static inline void
2660 netdev_flow_mask_init(struct netdev_flow_key *mask,
2661                       const struct match *match)
2662 {
2663     uint64_t *dst = miniflow_values(&mask->mf);
2664     struct flowmap fmap;
2665     uint32_t hash = 0;
2666     size_t idx;
2667
2668     /* Only check masks that make sense for the flow. */
2669     flow_wc_map(&match->flow, &fmap);
2670     flowmap_init(&mask->mf.map);
2671
2672     FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
2673         uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
2674
2675         if (mask_u64) {
2676             flowmap_set(&mask->mf.map, idx, 1);
2677             *dst++ = mask_u64;
2678             hash = hash_add64(hash, mask_u64);
2679         }
2680     }
2681
2682     map_t map;
2683
2684     FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
2685         hash = hash_add64(hash, map);
2686     }
2687
2688     size_t n = dst - miniflow_get_values(&mask->mf);
2689
2690     mask->hash = hash_finish(hash, n * 8);
2691     mask->len = netdev_flow_key_size(n);
2692 }
2693
2694 /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
2695 static inline void
2696 netdev_flow_key_init_masked(struct netdev_flow_key *dst,
2697                             const struct flow *flow,
2698                             const struct netdev_flow_key *mask)
2699 {
2700     uint64_t *dst_u64 = miniflow_values(&dst->mf);
2701     const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
2702     uint32_t hash = 0;
2703     uint64_t value;
2704
2705     dst->len = mask->len;
2706     dst->mf = mask->mf;   /* Copy maps. */
2707
2708     FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
2709         *dst_u64 = value & *mask_u64++;
2710         hash = hash_add64(hash, *dst_u64++);
2711     }
2712     dst->hash = hash_finish(hash,
2713                             (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
2714 }
2715
/* Iterates over the u64 values of the miniflow embedded in netdev_flow_key
 * 'KEY_' that are selected by 'FLOWMAP_', assigning each one to 'VALUE_'. */
#define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE_, KEY_, FLOWMAP_) \
    MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE_, &(KEY_)->mf, FLOWMAP_)
2719
2720 /* Returns a hash value for the bits of 'key' where there are 1-bits in
2721  * 'mask'. */
2722 static inline uint32_t
2723 netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
2724                              const struct netdev_flow_key *mask)
2725 {
2726     const uint64_t *p = miniflow_get_values(&mask->mf);
2727     uint32_t hash = 0;
2728     uint64_t value;
2729
2730     NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) {
2731         hash = hash_add64(hash, value & *p++);
2732     }
2733
2734     return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
2735 }
2736
2737 static inline bool
2738 emc_entry_alive(struct emc_entry *ce)
2739 {
2740     return ce->flow && !ce->flow->dead;
2741 }
2742
2743 static void
2744 emc_clear_entry(struct emc_entry *ce)
2745 {
2746     if (ce->flow) {
2747         dp_netdev_flow_unref(ce->flow);
2748         ce->flow = NULL;
2749     }
2750 }
2751
2752 static inline void
2753 emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
2754                  const struct netdev_flow_key *key)
2755 {
2756     if (ce->flow != flow) {
2757         if (ce->flow) {
2758             dp_netdev_flow_unref(ce->flow);
2759         }
2760
2761         if (dp_netdev_flow_ref(flow)) {
2762             ce->flow = flow;
2763         } else {
2764             ce->flow = NULL;
2765         }
2766     }
2767     if (key) {
2768         netdev_flow_key_clone(&ce->key, key);
2769     }
2770 }
2771
2772 static inline void
2773 emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
2774            struct dp_netdev_flow *flow)
2775 {
2776     struct emc_entry *to_be_replaced = NULL;
2777     struct emc_entry *current_entry;
2778
2779     EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2780         if (netdev_flow_key_equal(&current_entry->key, key)) {
2781             /* We found the entry with the 'mf' miniflow */
2782             emc_change_entry(current_entry, flow, NULL);
2783             return;
2784         }
2785
2786         /* Replacement policy: put the flow in an empty (not alive) entry, or
2787          * in the first entry where it can be */
2788         if (!to_be_replaced
2789             || (emc_entry_alive(to_be_replaced)
2790                 && !emc_entry_alive(current_entry))
2791             || current_entry->key.hash < to_be_replaced->key.hash) {
2792             to_be_replaced = current_entry;
2793         }
2794     }
2795     /* We didn't find the miniflow in the cache.
2796      * The 'to_be_replaced' entry is where the new flow will be stored */
2797
2798     emc_change_entry(to_be_replaced, flow, key);
2799 }
2800
2801 static inline void
2802 emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2803                          const struct netdev_flow_key *key,
2804                          struct dp_netdev_flow *flow)
2805 {
2806     /* Insert an entry into the EMC based on probability value 'min'. By
2807      * default the value is UINT32_MAX / 100 which yields an insertion
2808      * probability of 1/100 ie. 1% */
2809
2810     uint32_t min;
2811
2812     atomic_read_relaxed(&pmd->dp->emc_insert_min, &min);
2813
2814     if (min && random_uint32() <= min) {
2815         emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
2816     }
2817 }
2818
2819 static inline struct dp_netdev_flow *
2820 emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
2821 {
2822     struct emc_entry *current_entry;
2823
2824     EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2825         if (current_entry->key.hash == key->hash
2826             && emc_entry_alive(current_entry)
2827             && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
2828
2829             /* We found the entry with the 'key->mf' miniflow */
2830             return current_entry->flow;
2831         }
2832     }
2833
2834     return NULL;
2835 }
2836
2837 static inline const struct cmap_node *
2838 smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
2839 {
2840     struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
2841     struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
2842     uint16_t sig = hash >> 16;
2843     uint16_t index = UINT16_MAX;
2844
2845     for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2846         if (bucket->sig[i] == sig) {
2847             index = bucket->flow_idx[i];
2848             break;
2849         }
2850     }
2851     if (index != UINT16_MAX) {
2852         return cmap_find_by_index(&pmd->flow_table, index);
2853     }
2854     return NULL;
2855 }
2856
2857 static void
2858 smc_clear_entry(struct smc_bucket *b, int idx)
2859 {
2860     b->flow_idx[idx] = UINT16_MAX;
2861 }
2862
2863 /* Insert the flow_table index into SMC. Insertion may fail when 1) SMC is
2864  * turned off, 2) the flow_table index is larger than uint16_t can handle.
2865  * If there is already an SMC entry having same signature, the index will be
2866  * updated. If there is no existing entry, but an empty entry is available,
2867  * the empty entry will be taken. If no empty entry or existing same signature,
2868  * a random entry from the hashed bucket will be picked. */
2869 static inline void
2870 smc_insert(struct dp_netdev_pmd_thread *pmd,
2871            const struct netdev_flow_key *key,
2872            uint32_t hash)
2873 {
2874     struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
2875     struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
2876     uint16_t index;
2877     uint32_t cmap_index;
2878     bool smc_enable_db;
2879     int i;
2880
2881     atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
2882     if (!smc_enable_db) {
2883         return;
2884     }
2885
2886     cmap_index = cmap_find_index(&pmd->flow_table, hash);
2887     index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
2888
2889     /* If the index is larger than SMC can handle (uint16_t), we don't
2890      * insert */
2891     if (index == UINT16_MAX) {
2892         return;
2893     }
2894
2895     /* If an entry with same signature already exists, update the index */
2896     uint16_t sig = key->hash >> 16;
2897     for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2898         if (bucket->sig[i] == sig) {
2899             bucket->flow_idx[i] = index;
2900             return;
2901         }
2902     }
2903     /* If there is an empty entry, occupy it. */
2904     for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2905         if (bucket->flow_idx[i] == UINT16_MAX) {
2906             bucket->sig[i] = sig;
2907             bucket->flow_idx[i] = index;
2908             return;
2909         }
2910     }
2911     /* Otherwise, pick a random entry. */
2912     i = random_uint32() % SMC_ENTRY_PER_BUCKET;
2913     bucket->sig[i] = sig;
2914     bucket->flow_idx[i] = index;
2915 }
2916
2917 static struct dp_netdev_flow *
2918 dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
2919                           const struct netdev_flow_key *key,
2920                           int *lookup_num_p)
2921 {
2922     struct dpcls *cls;
2923     struct dpcls_rule *rule;
2924     odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
2925                                                      in_port.odp_port));
2926     struct dp_netdev_flow *netdev_flow = NULL;
2927
2928     cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2929     if (OVS_LIKELY(cls)) {
2930         dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
2931         netdev_flow = dp_netdev_flow_cast(rule);
2932     }
2933     return netdev_flow;
2934 }
2935
2936 static struct dp_netdev_flow *
2937 dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
2938                         const ovs_u128 *ufidp, const struct nlattr *key,
2939                         size_t key_len)
2940 {
2941     struct dp_netdev_flow *netdev_flow;
2942     struct flow flow;
2943     ovs_u128 ufid;
2944
2945     /* If a UFID is not provided, determine one based on the key. */
2946     if (!ufidp && key && key_len
2947         && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
2948         dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
2949         ufidp = &ufid;
2950     }
2951
2952     if (ufidp) {
2953         CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
2954                                  &pmd->flow_table) {
2955             if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
2956                 return netdev_flow;
2957             }
2958         }
2959     }
2960
2961     return NULL;
2962 }
2963
2964 static void
2965 get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
2966                     struct dpif_flow_stats *stats)
2967 {
2968     struct dp_netdev_flow *netdev_flow;
2969     unsigned long long n;
2970     long long used;
2971     uint16_t flags;
2972
2973     netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
2974
2975     atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
2976     stats->n_packets = n;
2977     atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
2978     stats->n_bytes = n;
2979     atomic_read_relaxed(&netdev_flow->stats.used, &used);
2980     stats->used = used;
2981     atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
2982     stats->tcp_flags = flags;
2983 }
2984
2985 /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
2986  * storing the netlink-formatted key/mask. 'key_buf' may be the same as
2987  * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
2988  * protect them. */
2989 static void
2990 dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
2991                             struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
2992                             struct dpif_flow *flow, bool terse)
2993 {
2994     if (terse) {
2995         memset(flow, 0, sizeof *flow);
2996     } else {
2997         struct flow_wildcards wc;
2998         struct dp_netdev_actions *actions;
2999         size_t offset;
3000         struct odp_flow_key_parms odp_parms = {
3001             .flow = &netdev_flow->flow,
3002             .mask = &wc.masks,
3003             .support = dp_netdev_support,
3004         };
3005
3006         miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
3007         /* in_port is exact matched, but we have left it out from the mask for
3008          * optimnization reasons. Add in_port back to the mask. */
3009         wc.masks.in_port.odp_port = ODPP_NONE;
3010
3011         /* Key */
3012         offset = key_buf->size;
3013         flow->key = ofpbuf_tail(key_buf);
3014         odp_flow_key_from_flow(&odp_parms, key_buf);
3015         flow->key_len = key_buf->size - offset;
3016
3017         /* Mask */
3018         offset = mask_buf->size;
3019         flow->mask = ofpbuf_tail(mask_buf);
3020         odp_parms.key_buf = key_buf;
3021         odp_flow_key_from_mask(&odp_parms, mask_buf);
3022         flow->mask_len = mask_buf->size - offset;
3023
3024         /* Actions */
3025         actions = dp_netdev_flow_get_actions(netdev_flow);
3026         flow->actions = actions->actions;
3027         flow->actions_len = actions->size;
3028     }
3029
3030     flow->ufid = netdev_flow->ufid;
3031     flow->ufid_present = true;
3032     flow->pmd_id = netdev_flow->pmd_id;
3033     get_dpif_flow_stats(netdev_flow, &flow->stats);
3034
3035     flow->attrs.offloaded = false;
3036     flow->attrs.dp_layer = "ovs";
3037 }
3038
3039 static int
3040 dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3041                               const struct nlattr *mask_key,
3042                               uint32_t mask_key_len, const struct flow *flow,
3043                               struct flow_wildcards *wc, bool probe)
3044 {
3045     enum odp_key_fitness fitness;
3046
3047     fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow);
3048     if (fitness) {
3049         if (!probe) {
3050             /* This should not happen: it indicates that
3051              * odp_flow_key_from_mask() and odp_flow_key_to_mask()
3052              * disagree on the acceptable form of a mask.  Log the problem
3053              * as an error, with enough details to enable debugging. */
3054             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3055
3056             if (!VLOG_DROP_ERR(&rl)) {
3057                 struct ds s;
3058
3059                 ds_init(&s);
3060                 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
3061                                 true);
3062                 VLOG_ERR("internal error parsing flow mask %s (%s)",
3063                 ds_cstr(&s), odp_key_fitness_to_string(fitness));
3064                 ds_destroy(&s);
3065             }
3066         }
3067
3068         return EINVAL;
3069     }
3070
3071     return 0;
3072 }
3073
3074 static int
3075 dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3076                               struct flow *flow, bool probe)
3077 {
3078     if (odp_flow_key_to_flow(key, key_len, flow)) {
3079         if (!probe) {
3080             /* This should not happen: it indicates that
3081              * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
3082              * the acceptable form of a flow.  Log the problem as an error,
3083              * with enough details to enable debugging. */
3084             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3085
3086             if (!VLOG_DROP_ERR(&rl)) {
3087                 struct ds s;
3088
3089                 ds_init(&s);
3090                 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
3091                 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
3092                 ds_destroy(&s);
3093             }
3094         }
3095
3096         return EINVAL;
3097     }
3098
3099     if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
3100         return EINVAL;
3101     }
3102
3103     return 0;
3104 }
3105
3106 static int
3107 dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
3108 {
3109     struct dp_netdev *dp = get_dp_netdev(dpif);
3110     struct dp_netdev_flow *netdev_flow;
3111     struct dp_netdev_pmd_thread *pmd;
3112     struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
3113     struct hmapx_node *node;
3114     int error = EINVAL;
3115
3116     if (get->pmd_id == PMD_ID_NULL) {
3117         CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3118             if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
3119                 dp_netdev_pmd_unref(pmd);
3120             }
3121         }
3122     } else {
3123         pmd = dp_netdev_get_pmd(dp, get->pmd_id);
3124         if (!pmd) {
3125             goto out;
3126         }
3127         hmapx_add(&to_find, pmd);
3128     }
3129
3130     if (!hmapx_count(&to_find)) {
3131         goto out;
3132     }
3133
3134     HMAPX_FOR_EACH (node, &to_find) {
3135         pmd = (struct dp_netdev_pmd_thread *) node->data;
3136         netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
3137                                               get->key_len);
3138         if (netdev_flow) {
3139             dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
3140                                         get->flow, false);
3141             error = 0;
3142             break;
3143         } else {
3144             error = ENOENT;
3145         }
3146     }
3147
3148     HMAPX_FOR_EACH (node, &to_find) {
3149         pmd = (struct dp_netdev_pmd_thread *) node->data;
3150         dp_netdev_pmd_unref(pmd);
3151     }
3152 out:
3153     hmapx_destroy(&to_find);
3154     return error;
3155 }
3156
3157 static void
3158 dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
3159 {
3160     struct flow masked_flow;
3161     size_t i;
3162
3163     for (i = 0; i < sizeof(struct flow); i++) {
3164         ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
3165                                        ((uint8_t *)&match->wc)[i];
3166     }
3167     dpif_flow_hash(NULL, &masked_flow, sizeof(struct flow), mega_ufid);
3168 }
3169
3170 static struct dp_netdev_flow *
3171 dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
3172                    struct match *match, const ovs_u128 *ufid,
3173                    const struct nlattr *actions, size_t actions_len)
3174     OVS_REQUIRES(pmd->flow_mutex)
3175 {
3176     struct dp_netdev_flow *flow;
3177     struct netdev_flow_key mask;
3178     struct dpcls *cls;
3179
3180     /* Make sure in_port is exact matched before we read it. */
3181     ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
3182     odp_port_t in_port = match->flow.in_port.odp_port;
3183
3184     /* As we select the dpcls based on the port number, each netdev flow
3185      * belonging to the same dpcls will have the same odp_port value.
3186      * For performance reasons we wildcard odp_port here in the mask.  In the
3187      * typical case dp_hash is also wildcarded, and the resulting 8-byte
3188      * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
3189      * will not be part of the subtable mask.
3190      * This will speed up the hash computation during dpcls_lookup() because
3191      * there is one less call to hash_add64() in this case. */
3192     match->wc.masks.in_port.odp_port = 0;
3193     netdev_flow_mask_init(&mask, match);
3194     match->wc.masks.in_port.odp_port = ODPP_NONE;
3195
3196     /* Make sure wc does not have metadata. */
3197     ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
3198                && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
3199
3200     /* Do not allocate extra space. */
3201     flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
3202     memset(&flow->stats, 0, sizeof flow->stats);
3203     flow->dead = false;
3204     flow->batch = NULL;
3205     flow->mark = INVALID_FLOW_MARK;
3206     *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
3207     *CONST_CAST(struct flow *, &flow->flow) = match->flow;
3208     *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
3209     ovs_refcount_init(&flow->ref_cnt);
3210     ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
3211
3212     dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
3213     netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
3214
3215     /* Select dpcls for in_port. Relies on in_port to be exact match. */
3216     cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
3217     dpcls_insert(cls, &flow->cr, &mask);
3218
3219     cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
3220                 dp_netdev_flow_hash(&flow->ufid));
3221
3222     queue_netdev_flow_put(pmd, flow, match, actions, actions_len);
3223
3224     if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
3225         struct ds ds = DS_EMPTY_INITIALIZER;
3226         struct ofpbuf key_buf, mask_buf;
3227         struct odp_flow_key_parms odp_parms = {
3228             .flow = &match->flow,
3229             .mask = &match->wc.masks,
3230             .support = dp_netdev_support,
3231         };
3232
3233         ofpbuf_init(&key_buf, 0);
3234         ofpbuf_init(&mask_buf, 0);
3235
3236         odp_flow_key_from_flow(&odp_parms, &key_buf);
3237         odp_parms.key_buf = &key_buf;
3238         odp_flow_key_from_mask(&odp_parms, &mask_buf);
3239
3240         ds_put_cstr(&ds, "flow_add: ");
3241         odp_format_ufid(ufid, &ds);
3242         ds_put_cstr(&ds, " ");
3243         odp_flow_format(key_buf.data, key_buf.size,
3244                         mask_buf.data, mask_buf.size,
3245                         NULL, &ds, false);
3246         ds_put_cstr(&ds, ", actions:");
3247         format_odp_actions(&ds, actions, actions_len, NULL);
3248
3249         VLOG_DBG("%s", ds_cstr(&ds));
3250
3251         ofpbuf_uninit(&key_buf);
3252         ofpbuf_uninit(&mask_buf);
3253
3254         /* Add a printout of the actual match installed. */
3255         struct match m;
3256         ds_clear(&ds);
3257         ds_put_cstr(&ds, "flow match: ");
3258         miniflow_expand(&flow->cr.flow.mf, &m.flow);
3259         miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
3260         memset(&m.tun_md, 0, sizeof m.tun_md);
3261         match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
3262
3263         VLOG_DBG("%s", ds_cstr(&ds));
3264
3265         ds_destroy(&ds);
3266     }
3267
3268     return flow;
3269 }
3270
3271 static int
3272 flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
3273                 struct netdev_flow_key *key,
3274                 struct match *match,
3275                 ovs_u128 *ufid,
3276                 const struct dpif_flow_put *put,
3277                 struct dpif_flow_stats *stats)
3278 {
3279     struct dp_netdev_flow *netdev_flow;
3280     int error = 0;
3281
3282     if (stats) {
3283         memset(stats, 0, sizeof *stats);
3284     }
3285
3286     ovs_mutex_lock(&pmd->flow_mutex);
3287     netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
3288     if (!netdev_flow) {
3289         if (put->flags & DPIF_FP_CREATE) {
3290             if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
3291                 dp_netdev_flow_add(pmd, match, ufid, put->actions,
3292                                    put->actions_len);
3293                 error = 0;
3294             } else {
3295                 error = EFBIG;
3296             }
3297         } else {
3298             error = ENOENT;
3299         }
3300     } else {
3301         if (put->flags & DPIF_FP_MODIFY) {
3302             struct dp_netdev_actions *new_actions;
3303             struct dp_netdev_actions *old_actions;
3304
3305             new_actions = dp_netdev_actions_create(put->actions,
3306                                                    put->actions_len);
3307
3308             old_actions = dp_netdev_flow_get_actions(netdev_flow);
3309             ovsrcu_set(&netdev_flow->actions, new_actions);
3310
3311             queue_netdev_flow_put(pmd, netdev_flow, match,
3312                                   put->actions, put->actions_len);
3313
3314             if (stats) {
3315                 get_dpif_flow_stats(netdev_flow, stats);
3316             }
3317             if (put->flags & DPIF_FP_ZERO_STATS) {
3318                 /* XXX: The userspace datapath uses thread local statistics
3319                  * (for flows), which should be updated only by the owning
3320                  * thread.  Since we cannot write on stats memory here,
3321                  * we choose not to support this flag.  Please note:
3322                  * - This feature is currently used only by dpctl commands with
3323                  *   option --clear.
3324                  * - Should the need arise, this operation can be implemented
3325                  *   by keeping a base value (to be update here) for each
3326                  *   counter, and subtracting it before outputting the stats */
3327                 error = EOPNOTSUPP;
3328             }
3329
3330             ovsrcu_postpone(dp_netdev_actions_free, old_actions);
3331         } else if (put->flags & DPIF_FP_CREATE) {
3332             error = EEXIST;
3333         } else {
3334             /* Overlapping flow. */
3335             error = EINVAL;
3336         }
3337     }
3338     ovs_mutex_unlock(&pmd->flow_mutex);
3339     return error;
3340 }
3341
3342 static int
3343 dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
3344 {
3345     struct dp_netdev *dp = get_dp_netdev(dpif);
3346     struct netdev_flow_key key, mask;
3347     struct dp_netdev_pmd_thread *pmd;
3348     struct match match;
3349     ovs_u128 ufid;
3350     int error;
3351     bool probe = put->flags & DPIF_FP_PROBE;
3352
3353     if (put->stats) {
3354         memset(put->stats, 0, sizeof *put->stats);
3355     }
3356     error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
3357                                           probe);
3358     if (error) {
3359         return error;
3360     }
3361     error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
3362                                           put->mask, put->mask_len,
3363                                           &match.flow, &match.wc, probe);
3364     if (error) {
3365         return error;
3366     }
3367
3368     if (put->ufid) {
3369         ufid = *put->ufid;
3370     } else {
3371         dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
3372     }
3373
3374     /* Must produce a netdev_flow_key for lookup.
3375      * Use the same method as employed to create the key when adding
3376      * the flow to the dplcs to make sure they match. */
3377     netdev_flow_mask_init(&mask, &match);
3378     netdev_flow_key_init_masked(&key, &match.flow, &mask);
3379
3380     if (put->pmd_id == PMD_ID_NULL) {
3381         if (cmap_count(&dp->poll_threads) == 0) {
3382             return EINVAL;
3383         }
3384         CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3385             struct dpif_flow_stats pmd_stats;
3386             int pmd_error;
3387
3388             pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
3389                                         &pmd_stats);
3390             if (pmd_error) {
3391                 error = pmd_error;
3392             } else if (put->stats) {
3393                 put->stats->n_packets += pmd_stats.n_packets;
3394                 put->stats->n_bytes += pmd_stats.n_bytes;
3395                 put->stats->used = MAX(put->stats->used, pmd_stats.used);
3396                 put->stats->tcp_flags |= pmd_stats.tcp_flags;
3397             }
3398         }
3399     } else {
3400         pmd = dp_netdev_get_pmd(dp, put->pmd_id);
3401         if (!pmd) {
3402             return EINVAL;
3403         }
3404         error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
3405         dp_netdev_pmd_unref(pmd);
3406     }
3407
3408     return error;
3409 }
3410
3411 static int
3412 flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3413                 struct dpif_flow_stats *stats,
3414                 const struct dpif_flow_del *del)
3415 {
3416     struct dp_netdev_flow *netdev_flow;
3417     int error = 0;
3418
3419     ovs_mutex_lock(&pmd->flow_mutex);
3420     netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3421                                           del->key_len);
3422     if (netdev_flow) {
3423         if (stats) {
3424             get_dpif_flow_stats(netdev_flow, stats);
3425         }
3426         dp_netdev_pmd_remove_flow(pmd, netdev_flow);
3427     } else {
3428         error = ENOENT;
3429     }
3430     ovs_mutex_unlock(&pmd->flow_mutex);
3431
3432     return error;
3433 }
3434
3435 static int
3436 dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
3437 {
3438     struct dp_netdev *dp = get_dp_netdev(dpif);
3439     struct dp_netdev_pmd_thread *pmd;
3440     int error = 0;
3441
3442     if (del->stats) {
3443         memset(del->stats, 0, sizeof *del->stats);
3444     }
3445
3446     if (del->pmd_id == PMD_ID_NULL) {
3447         if (cmap_count(&dp->poll_threads) == 0) {
3448             return EINVAL;
3449         }
3450         CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3451             struct dpif_flow_stats pmd_stats;
3452             int pmd_error;
3453
3454             pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
3455             if (pmd_error) {
3456                 error = pmd_error;
3457             } else if (del->stats) {
3458                 del->stats->n_packets += pmd_stats.n_packets;
3459                 del->stats->n_bytes += pmd_stats.n_bytes;
3460                 del->stats->used = MAX(del->stats->used, pmd_stats.used);
3461                 del->stats->tcp_flags |= pmd_stats.tcp_flags;
3462             }
3463         }
3464     } else {
3465         pmd = dp_netdev_get_pmd(dp, del->pmd_id);
3466         if (!pmd) {
3467             return EINVAL;
3468         }
3469         error = flow_del_on_pmd(pmd, del->stats, del);
3470         dp_netdev_pmd_unref(pmd);
3471     }
3472
3473
3474     return error;
3475 }
3476
3477 struct dpif_netdev_flow_dump {
3478     struct dpif_flow_dump up;
3479     struct cmap_position poll_thread_pos;
3480     struct cmap_position flow_pos;
3481     struct dp_netdev_pmd_thread *cur_pmd;
3482     int status;
3483     struct ovs_mutex mutex;
3484 };
3485
3486 static struct dpif_netdev_flow_dump *
3487 dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
3488 {
3489     return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
3490 }
3491
3492 static struct dpif_flow_dump *
3493 dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
3494                              struct dpif_flow_dump_types *types OVS_UNUSED)
3495 {
3496     struct dpif_netdev_flow_dump *dump;
3497
3498     dump = xzalloc(sizeof *dump);
3499     dpif_flow_dump_init(&dump->up, dpif_);
3500     dump->up.terse = terse;
3501     ovs_mutex_init(&dump->mutex);
3502
3503     return &dump->up;
3504 }
3505
3506 static int
3507 dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
3508 {
3509     struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3510
3511     ovs_mutex_destroy(&dump->mutex);
3512     free(dump);
3513     return 0;
3514 }
3515
3516 struct dpif_netdev_flow_dump_thread {
3517     struct dpif_flow_dump_thread up;
3518     struct dpif_netdev_flow_dump *dump;
3519     struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
3520     struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
3521 };
3522
3523 static struct dpif_netdev_flow_dump_thread *
3524 dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
3525 {
3526     return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
3527 }
3528
3529 static struct dpif_flow_dump_thread *
3530 dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
3531 {
3532     struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3533     struct dpif_netdev_flow_dump_thread *thread;
3534
3535     thread = xmalloc(sizeof *thread);
3536     dpif_flow_dump_thread_init(&thread->up, &dump->up);
3537     thread->dump = dump;
3538     return &thread->up;
3539 }
3540
/* Frees the dump thread that embeds 'thread_'.  The dump it points to is not
 * touched. */
static void
dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
{
    free(dpif_netdev_flow_dump_thread_cast(thread_));
}
3549
/* Fetches the next batch of flows for the dump that 'thread_' belongs to.
 *
 * Up to MIN('max_flows', FLOW_DUMP_MAX_BATCH) flows are collected from the
 * pmd threads' flow tables and converted into 'flows'[0..n-1], using the
 * key/mask scratch buffers owned by 'thread_'.  Returns the number of flows
 * stored; 0 is returned when there is no pmd thread or when the dump has
 * already reached EOF.
 *
 * The shared iteration state in the dump is only touched with 'dump->mutex'
 * held; the conversion into 'flows' happens after the mutex is released.
 *
 * NOTE(review): with 'max_flows' <= 0 the do-while loop below can neither
 * collect a flow nor advance to another pmd, so it would spin with the mutex
 * held.  Callers are presumably required to pass a positive value -- confirm.
 */
static int
dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
                           struct dpif_flow *flows, int max_flows)
{
    struct dpif_netdev_flow_dump_thread *thread
        = dpif_netdev_flow_dump_thread_cast(thread_);
    struct dpif_netdev_flow_dump *dump = thread->dump;
    struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
    int n_flows = 0;
    int i;

    ovs_mutex_lock(&dump->mutex);
    /* A nonzero status (EOF) means a previous call exhausted every pmd. */
    if (!dump->status) {
        struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
        struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
        struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
        int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);

        /* First call to dump_next(), extracts the first pmd thread.
         * If there is no pmd thread, returns immediately. */
        if (!pmd) {
            pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
            if (!pmd) {
                ovs_mutex_unlock(&dump->mutex);
                return n_flows;

            }
        }

        do {
            /* Collect flows from the current pmd, resuming at 'flow_pos'. */
            for (n_flows = 0; n_flows < flow_limit; n_flows++) {
                struct cmap_node *node;

                node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
                if (!node) {
                    break;
                }
                netdev_flows[n_flows] = CONTAINER_OF(node,
                                                     struct dp_netdev_flow,
                                                     node);
            }
            /* When finishing dumping the current pmd thread, moves to
             * the next. */
            if (n_flows < flow_limit) {
                /* Restart the flow cursor for the next pmd's table. */
                memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
                dp_netdev_pmd_unref(pmd);
                pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
                if (!pmd) {
                    /* No pmd left: flows collected so far are still returned
                     * below, later calls return 0. */
                    dump->status = EOF;
                    break;
                }
            }
            /* Keeps the reference to next caller. */
            dump->cur_pmd = pmd;

            /* If the current dump is empty, do not exit the loop, since the
             * remaining pmds could have flows to be dumped.  Just dumps again
             * on the new 'pmd'. */
        } while (!n_flows);
    }
    ovs_mutex_unlock(&dump->mutex);

    /* Convert the collected flows without holding the dump mutex.  Slot 'i'
     * of the thread's key and mask buffers backs flow 'i'. */
    for (i = 0; i < n_flows; i++) {
        struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
        struct odputil_keybuf *keybuf = &thread->keybuf[i];
        struct dp_netdev_flow *netdev_flow = netdev_flows[i];
        struct dpif_flow *f = &flows[i];
        struct ofpbuf key, mask;

        ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
        ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
        dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
                                    dump->up.terse);
    }

    return n_flows;
}
3627
/* Executes 'execute->actions' on 'execute->packet' in the context of the
 * calling thread's pmd, or of the non-pmd pmd when the caller has none.
 *
 * Returns 0 on success (including probes, for which nothing is executed),
 * EINVAL if the packet is shorter than ETH_HEADER_LEN or longer than
 * UINT16_MAX bytes, and EBUSY if the non-pmd pmd cannot be obtained.
 *
 * NOTE(review): the reference taken by dp_netdev_get_pmd() is released based
 * on 'pmd->core_id == NON_PMD_CORE_ID' rather than on which lookup produced
 * 'pmd'; this presumes the thread-specific pmd never has that core id --
 * confirm. */
static int
dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_pmd_thread *pmd;
    struct dp_packet_batch pp;

    if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
        dp_packet_size(execute->packet) > UINT16_MAX) {
        return EINVAL;
    }

    /* Tries finding the 'pmd'.  If NULL is returned, that means
     * the current thread is a non-pmd thread and should use
     * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
    pmd = ovsthread_getspecific(dp->per_pmd_key);
    if (!pmd) {
        pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
        if (!pmd) {
            return EBUSY;
        }
    }

    if (execute->probe) {
        /* If this is part of a probe, Drop the packet, since executing
         * the action may actually cause spurious packets be sent into
         * the network. */
        if (pmd->core_id == NON_PMD_CORE_ID) {
            dp_netdev_pmd_unref(pmd);
        }
        return 0;
    }

    /* If the current thread is non-pmd thread, acquires
     * the 'non_pmd_mutex'. */
    if (pmd->core_id == NON_PMD_CORE_ID) {
        ovs_mutex_lock(&dp->non_pmd_mutex);
    }

    /* Update current time in PMD context. */
    pmd_thread_ctx_time_update(pmd);

    /* The action processing expects the RSS hash to be valid, because
     * it's always initialized at the beginning of datapath processing.
     * In this case, though, 'execute->packet' may not have gone through
     * the datapath at all, it may have been generated by the upper layer
     * (OpenFlow packet-out, BFD frame, ...). */
    if (!dp_packet_rss_valid(execute->packet)) {
        dp_packet_set_rss_hash(execute->packet,
                               flow_hash_5tuple(execute->flow, 0));
    }

    /* Run the actions on a single-packet batch, then force out whatever
     * they queued for transmission on this pmd. */
    dp_packet_batch_init_packet(&pp, execute->packet);
    dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
                              execute->actions, execute->actions_len);
    dp_netdev_pmd_flush_output_packets(pmd, true);

    /* Undo the lock and the reference taken above for the non-pmd case. */
    if (pmd->core_id == NON_PMD_CORE_ID) {
        ovs_mutex_unlock(&dp->non_pmd_mutex);
        dp_netdev_pmd_unref(pmd);
    }

    return 0;
}
3693
3694 static void
3695 dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
3696 {
3697     size_t i;
3698
3699     for (i = 0; i < n_ops; i++) {
3700         struct dpif_op *op = ops[i];
3701
3702         switch (op->type) {
3703         case DPIF_OP_FLOW_PUT:
3704             op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
3705             break;
3706
3707         case DPIF_OP_FLOW_DEL:
3708             op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
3709             break;
3710
3711         case DPIF_OP_EXECUTE:
3712             op->error = dpif_netdev_execute(dpif, &op->execute);
3713             break;
3714
3715         case DPIF_OP_FLOW_GET:
3716             op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
3717             break;
3718         }
3719     }
3720 }
3721
3722 /* Applies datapath configuration from the database. Some of the changes are
3723  * actually applied in dpif_netdev_run(). */
3724 static int
3725 dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
3726 {
3727     struct dp_netdev *dp = get_dp_netdev(dpif);
3728     const char *cmask = smap_get(other_config, "pmd-cpu-mask");
3729     unsigned long long insert_prob =
3730         smap_get_ullong(other_config, "emc-insert-inv-prob",
3731                         DEFAULT_EM_FLOW_INSERT_INV_PROB);
3732     uint32_t insert_min, cur_min;
3733     uint32_t tx_flush_interval, cur_tx_flush_interval;
3734
3735     tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
3736                                      DEFAULT_TX_FLUSH_INTERVAL);
3737     atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
3738     if (tx_flush_interval != cur_tx_flush_interval) {
3739         atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
3740         VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
3741                   tx_flush_interval);
3742     }
3743
3744     if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
3745         free(dp->pmd_cmask);
3746         dp->pmd_cmask = nullable_xstrdup(cmask);
3747         dp_netdev_request_reconfigure(dp);
3748     }
3749
3750     atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
3751     if (insert_prob <= UINT32_MAX) {
3752         insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
3753     } else {
3754         insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
3755         insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
3756     }
3757
3758     if (insert_min != cur_min) {
3759         atomic_store_relaxed(&dp->emc_insert_min, insert_min);
3760         if (insert_min == 0) {
3761             VLOG_INFO("EMC has been disabled");
3762         } else {
3763             VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
3764                       insert_prob, (100 / (float)insert_prob));
3765         }
3766     }
3767
3768     bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
3769     bool cur_perf_enabled;
3770     atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
3771     if (perf_enabled != cur_perf_enabled) {
3772         atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
3773         if (perf_enabled) {
3774             VLOG_INFO("PMD performance metrics collection enabled");
3775         } else {
3776             VLOG_INFO("PMD performance metrics collection disabled");
3777         }
3778     }
3779
3780     bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
3781     bool cur_smc;
3782     atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
3783     if (smc_enable != cur_smc) {
3784         atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
3785         if (smc_enable) {
3786             VLOG_INFO("SMC cache is enabled");
3787         } else {
3788             VLOG_INFO("SMC cache is disabled");
3789         }
3790     }
3791     return 0;
3792 }
3793
3794 /* Parses affinity list and returns result in 'core_ids'. */
3795 static int
3796 parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
3797 {
3798     unsigned i;
3799     char *list, *copy, *key, *value;
3800     int error = 0;
3801
3802     for (i = 0; i < n_rxq; i++) {
3803         core_ids[i] = OVS_CORE_UNSPEC;
3804     }
3805
3806     if (!affinity_list) {
3807         return 0;
3808     }
3809
3810     list = copy = xstrdup(affinity_list);
3811
3812     while (ofputil_parse_key_value(&list, &key, &value)) {
3813         int rxq_id, core_id;
3814
3815         if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
3816             || !str_to_int(value, 0, &core_id) || core_id < 0) {
3817             error = EINVAL;
3818             break;
3819         }
3820
3821         if (rxq_id < n_rxq) {
3822             core_ids[rxq_id] = core_id;
3823         }
3824     }
3825
3826     free(copy);
3827     return error;
3828 }
3829
3830 /* Parses 'affinity_list' and applies configuration if it is valid. */
3831 static int
3832 dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
3833                                   const char *affinity_list)
3834 {
3835     unsigned *core_ids, i;
3836     int error = 0;
3837
3838     core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
3839     if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
3840         error = EINVAL;
3841         goto exit;
3842     }
3843
3844     for (i = 0; i < port->n_rxq; i++) {
3845         port->rxqs[i].core_id = core_ids[i];
3846     }
3847
3848 exit:
3849     free(core_ids);
3850     return error;
3851 }
3852
3853 /* Changes the affinity of port's rx queues.  The changes are actually applied
3854  * in dpif_netdev_run(). */
3855 static int
3856 dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
3857                             const struct smap *cfg)
3858 {
3859     struct dp_netdev *dp = get_dp_netdev(dpif);
3860     struct dp_netdev_port *port;
3861     int error = 0;
3862     const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
3863
3864     ovs_mutex_lock(&dp->port_mutex);
3865     error = get_port_by_number(dp, port_no, &port);
3866     if (error || !netdev_is_pmd(port->netdev)
3867         || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
3868         goto unlock;
3869     }
3870
3871     error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
3872     if (error) {
3873         goto unlock;
3874     }
3875     free(port->rxq_affinity_list);
3876     port->rxq_affinity_list = nullable_xstrdup(affinity_list);
3877
3878     dp_netdev_request_reconfigure(dp);
3879 unlock:
3880     ovs_mutex_unlock(&dp->port_mutex);
3881     return error;
3882 }
3883
3884 static int
3885 dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
3886                               uint32_t queue_id, uint32_t *priority)
3887 {
3888     *priority = queue_id;
3889     return 0;
3890 }
3891
3892 \f
3893 /* Creates and returns a new 'struct dp_netdev_actions', whose actions are
3894  * a copy of the 'size' bytes of 'actions' input parameters. */
3895 struct dp_netdev_actions *
3896 dp_netdev_actions_create(const struct nlattr *actions, size_t size)
3897 {
3898     struct dp_netdev_actions *netdev_actions;
3899
3900     netdev_actions = xmalloc(sizeof *netdev_actions + size);
3901     memcpy(netdev_actions->actions, actions, size);
3902     netdev_actions->size = size;
3903
3904     return netdev_actions;
3905 }
3906
3907 struct dp_netdev_actions *
3908 dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
3909 {
3910     return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
3911 }
3912
/* Releases the single allocation backing 'actions'. */
static void
dp_netdev_actions_free(struct dp_netdev_actions *actions)
{
    void *storage = actions;

    free(storage);
}
3918 \f
3919 static void
3920 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
3921                          enum rxq_cycles_counter_type type,
3922                          unsigned long long cycles)
3923 {
3924    atomic_store_relaxed(&rx->cycles[type], cycles);
3925 }
3926
3927 static void
3928 dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
3929                          enum rxq_cycles_counter_type type,
3930                          unsigned long long cycles)
3931 {
3932     non_atomic_ullong_add(&rx->cycles[type], cycles);
3933 }
3934
3935 static uint64_t
3936 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
3937                          enum rxq_cycles_counter_type type)
3938 {
3939     unsigned long long processing_cycles;
3940     atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
3941     return processing_cycles;
3942 }
3943
3944 static void
3945 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
3946                                 unsigned long long cycles)
3947 {
3948     unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
3949     atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
3950 }
3951
3952 static uint64_t
3953 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
3954 {
3955     unsigned long long processing_cycles;
3956     atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
3957     return processing_cycles;
3958 }
3959
3960 #if ATOMIC_ALWAYS_LOCK_FREE_8B
3961 static inline bool
3962 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
3963 {
3964     bool pmd_perf_enabled;
3965     atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
3966     return pmd_perf_enabled;
3967 }
3968 #else
3969 /* If stores and reads of 64-bit integers are not atomic, the full PMD
3970  * performance metrics are not available as locked access to 64 bit
3971  * integers would be prohibitively expensive. */
3972 static inline bool
3973 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
3974 {
3975     return false;
3976 }
3977 #endif
3978
3979 static int
3980 dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
3981                                    struct tx_port *p)
3982 {
3983     int i;
3984     int tx_qid;
3985     int output_cnt;
3986     bool dynamic_txqs;
3987     struct cycle_timer timer;
3988     uint64_t cycles;
3989     uint32_t tx_flush_interval;
3990
3991     cycle_timer_start(&pmd->perf_stats, &timer);
3992
3993     dynamic_txqs = p->port->dynamic_txqs;
3994     if (dynamic_txqs) {
3995         tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
3996     } else {
3997         tx_qid = pmd->static_tx_qid;
3998     }
3999
4000     output_cnt = dp_packet_batch_size(&p->output_pkts);
4001     ovs_assert(output_cnt > 0);
4002
4003     netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
4004     dp_packet_batch_init(&p->output_pkts);
4005
4006     /* Update time of the next flush. */
4007     atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
4008     p->flush_time = pmd->ctx.now + tx_flush_interval;
4009
4010     ovs_assert(pmd->n_output_batches > 0);
4011     pmd->n_output_batches--;
4012
4013     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
4014     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
4015
4016     /* Distribute send cycles evenly among transmitted packets and assign to
4017      * their respective rx queues. */
4018     cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
4019     for (i = 0; i < output_cnt; i++) {
4020         if (p->output_pkts_rxqs[i]) {
4021             dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
4022                                      RXQ_CYCLES_PROC_CURR, cycles);
4023         }
4024     }
4025
4026     return output_cnt;
4027 }
4028
4029 static int
4030 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
4031                                    bool force)
4032 {
4033     struct tx_port *p;
4034     int output_cnt = 0;
4035
4036     if (!pmd->n_output_batches) {
4037         return 0;
4038     }
4039
4040     HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
4041         if (!dp_packet_batch_is_empty(&p->output_pkts)
4042             && (force || pmd->ctx.now >= p->flush_time)) {
4043             output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
4044         }
4045     }
4046     return output_cnt;
4047 }
4048
/* Polls 'rxq' once on behalf of 'pmd' and, if packets were received, runs
 * them through the datapath as input from 'port_no'.
 *
 * Returns the number of packets received, 0 if the receive failed.  Receive
 * errors other than EAGAIN and EOPNOTSUPP are logged (rate-limited).
 *
 * The cycles spent polling and processing are added to the rx queue's
 * RXQ_CYCLES_PROC_CURR counter only when packets were received; otherwise
 * the measurement is discarded.  'pmd->ctx.last_rxq' is set to 'rxq' for the
 * duration of the call and reset to NULL before returning. */
static int
dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
                           struct dp_netdev_rxq *rxq,
                           odp_port_t port_no)
{
    struct pmd_perf_stats *s = &pmd->perf_stats;
    struct dp_packet_batch batch;
    struct cycle_timer timer;
    int error;
    int batch_cnt = 0;
    int rem_qlen = 0, *qlen_p = NULL;
    uint64_t cycles;

    /* Measure duration for polling and processing rx burst. */
    cycle_timer_start(&pmd->perf_stats, &timer);

    pmd->ctx.last_rxq = rxq;
    dp_packet_batch_init(&batch);

    /* Fetch the rx queue length only for vhostuser ports. */
    if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
        qlen_p = &rem_qlen;
    }

    error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
    if (!error) {
        /* At least one packet received. */
        *recirc_depth_get() = 0;
        pmd_thread_ctx_time_update(pmd);
        batch_cnt = batch.count;
        if (pmd_perf_metrics_enabled(pmd)) {
            /* Update batch histogram. */
            s->current.batches++;
            histogram_add_sample(&s->pkts_per_batch, batch_cnt);
            /* Update the maximum vhost rx queue fill level.  'rem_qlen'
             * keeps its initial 0 when the queue length was not requested
             * above, in which case the fill level is just the batch size. */
            if (rxq->is_vhost && rem_qlen >= 0) {
                uint32_t qfill = batch_cnt + rem_qlen;
                if (qfill > s->current.max_vhost_qfill) {
                    s->current.max_vhost_qfill = qfill;
                }
            }
        }
        /* Process packet batch. */
        dp_netdev_input(pmd, &batch, port_no);

        /* Assign processing cycles to rx queue. */
        cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
        dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);

        /* Send only the output batches whose flush time has come; this
         * happens after the timer was stopped above. */
        dp_netdev_pmd_flush_output_packets(pmd, false);
    } else {
        /* Discard cycles. */
        cycle_timer_stop(&pmd->perf_stats, &timer);
        if (error != EAGAIN && error != EOPNOTSUPP) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);

            VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
                    netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
        }
    }

    pmd->ctx.last_rxq = NULL;

    return batch_cnt;
}
4114
4115 static struct tx_port *
4116 tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
4117 {
4118     struct tx_port *tx;
4119
4120     HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
4121         if (tx->port->port_no == port_no) {
4122             return tx;
4123         }
4124     }
4125
4126     return NULL;
4127 }
4128
/* Closes every rx queue of 'port', lets the netdev apply its pending
 * configuration if needed, then reopens one rx queue per netdev rx queue and
 * re-applies the stored rxq affinity list.
 *
 * Returns 0 on success.  Otherwise returns the error from
 * netdev_reconfigure() (EOPNOTSUPP is tolerated) or from netdev_rxq_open().
 * On an error return 'port->n_rxq' only counts the queues reopened so far
 * (0 if netdev_reconfigure() failed) and 'port->need_reconfigure' is left
 * as it was. */
static int
port_reconfigure(struct dp_netdev_port *port)
{
    struct netdev *netdev = port->netdev;
    int i, err;

    /* Closes the existing 'rxq's. */
    for (i = 0; i < port->n_rxq; i++) {
        netdev_rxq_close(port->rxqs[i].rx);
        port->rxqs[i].rx = NULL;
    }
    /* Remember how many queues existed so that only the additional ones get
     * zeroed below. */
    unsigned last_nrxq = port->n_rxq;
    port->n_rxq = 0;

    /* Allows 'netdev' to apply the pending configuration changes. */
    if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
        err = netdev_reconfigure(netdev);
        if (err && (err != EOPNOTSUPP)) {
            VLOG_ERR("Failed to set interface %s new configuration",
                     netdev_get_name(netdev));
            return err;
        }
    }
    /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
    port->rxqs = xrealloc(port->rxqs,
                          sizeof *port->rxqs * netdev_n_rxq(netdev));
    /* Realloc 'used' counters for tx queues. */
    free(port->txq_used);
    port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);

    for (i = 0; i < netdev_n_rxq(netdev); i++) {
        /* Queues that existed before keep their previous contents; slots
         * beyond the old count start out zeroed. */
        bool new_queue = i >= last_nrxq;
        if (new_queue) {
            memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
        }

        port->rxqs[i].port = port;
        /* A port counts as vhost when its type starts with "dpdkvhost". */
        port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);

        err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
        if (err) {
            return err;
        }
        /* Count the queue only once it has been opened. */
        port->n_rxq++;
    }

    /* Parse affinity list to apply configuration for new queues.  The
     * result is not checked here. */
    dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);

    /* If reconfiguration was successful mark it as such, so we can use it */
    port->need_reconfigure = false;

    return 0;
}
4183
/* Set of NUMA nodes that have at least one usable pmd thread, as built by
 * rr_numa_list_populate() and released by rr_numa_list_destroy(). */
struct rr_numa_list {
    struct hmap numas;  /* Contains 'struct rr_numa' */
};
4187
/* One NUMA node of a 'struct rr_numa_list', together with the cursor that
 * rr_numa_get_pmd() uses to hand out its pmd threads. */
struct rr_numa {
    /* In 'struct rr_numa_list''s 'numas', hashed on 'numa_id'. */
    struct hmap_node node;

    int numa_id;

    /* Non isolated pmds on numa node 'numa_id' */
    struct dp_netdev_pmd_thread **pmds;
    /* Number of entries in 'pmds'. */
    int n_pmds;

    /* Index in 'pmds' of the pmd that rr_numa_get_pmd() returns next. */
    int cur_index;
    /* True while 'cur_index' walks upwards through 'pmds', false while it
     * walks back down. */
    bool idx_inc;
};
4200
4201 static struct rr_numa *
4202 rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
4203 {
4204     struct rr_numa *numa;
4205
4206     HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
4207         if (numa->numa_id == numa_id) {
4208             return numa;
4209         }
4210     }
4211
4212     return NULL;
4213 }
4214
4215 /* Returns the next node in numa list following 'numa' in round-robin fashion.
4216  * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
4217  * Returns NULL if 'rr' numa list is empty. */
4218 static struct rr_numa *
4219 rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
4220 {
4221     struct hmap_node *node = NULL;
4222
4223     if (numa) {
4224         node = hmap_next(&rr->numas, &numa->node);
4225     }
4226     if (!node) {
4227         node = hmap_first(&rr->numas);
4228     }
4229
4230     return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
4231 }
4232
4233 static void
4234 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
4235 {
4236     struct dp_netdev_pmd_thread *pmd;
4237     struct rr_numa *numa;
4238
4239     hmap_init(&rr->numas);
4240
4241     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4242         if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
4243             continue;
4244         }
4245
4246         numa = rr_numa_list_lookup(rr, pmd->numa_id);
4247         if (!numa) {
4248             numa = xzalloc(sizeof *numa);
4249             numa->numa_id = pmd->numa_id;
4250             hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
4251         }
4252         numa->n_pmds++;
4253         numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
4254         numa->pmds[numa->n_pmds - 1] = pmd;
4255         /* At least one pmd so initialise curr_idx and idx_inc. */
4256         numa->cur_index = 0;
4257         numa->idx_inc = true;
4258     }
4259 }
4260
4261 /* Returns the next pmd from the numa node in
4262  * incrementing or decrementing order. */
4263 static struct dp_netdev_pmd_thread *
4264 rr_numa_get_pmd(struct rr_numa *numa)
4265 {
4266     int numa_idx = numa->cur_index;
4267
4268     if (numa->idx_inc == true) {
4269         /* Incrementing through list of pmds. */
4270         if (numa->cur_index == numa->n_pmds-1) {
4271             /* Reached the last pmd. */
4272             numa->idx_inc = false;
4273         } else {
4274             numa->cur_index++;
4275         }
4276     } else {
4277         /* Decrementing through list of pmds. */
4278         if (numa->cur_index == 0) {
4279             /* Reached the first pmd. */
4280             numa->idx_inc = true;
4281         } else {
4282             numa->cur_index--;
4283         }
4284     }
4285     return numa->pmds[numa_idx];
4286 }
4287
4288 static void
4289 rr_numa_list_destroy(struct rr_numa_list *rr)
4290 {
4291     struct rr_numa *numa;
4292
4293     HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
4294         free(numa->pmds);
4295         free(numa);
4296     }
4297     hmap_destroy(&rr->numas);
4298 }
4299
4300 /* Sort Rx Queues by the processing cycles they are consuming. */
4301 static int
4302 compare_rxq_cycles(const void *a, const void *b)
4303 {
4304     struct dp_netdev_rxq *qa;
4305     struct dp_netdev_rxq *qb;
4306     uint64_t cycles_qa, cycles_qb;
4307
4308     qa = *(struct dp_netdev_rxq **) a;
4309     qb = *(struct dp_netdev_rxq **) b;
4310
4311     cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
4312     cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
4313
4314     if (cycles_qa != cycles_qb) {
4315         return (cycles_qa < cycles_qb) ? 1 : -1;
4316     } else {
4317         /* Cycles are the same so tiebreak on port/queue id.
4318          * Tiebreaking (as opposed to return 0) ensures consistent
4319          * sort results across multiple OS's. */
4320         uint32_t port_qa = odp_to_u32(qa->port->port_no);
4321         uint32_t port_qb = odp_to_u32(qb->port->port_no);
4322         if (port_qa != port_qb) {
4323             return port_qa > port_qb ? 1 : -1;
4324         } else {
4325             return netdev_rxq_get_queue_id(qa->rx)
4326                     - netdev_rxq_get_queue_id(qb->rx);
4327         }
4328     }
4329 }
4330
4331 /* Assign pmds to queues.  If 'pinned' is true, assign pmds to pinned
4332  * queues and marks the pmds as isolated.  Otherwise, assign non isolated
4333  * pmds to unpinned queues.
4334  *
4335  * If 'pinned' is false queues will be sorted by processing cycles they are
4336  * consuming and then assigned to pmds in round robin order.
4337  *
4338  * The function doesn't touch the pmd threads, it just stores the assignment
4339  * in the 'pmd' member of each rxq. */
4340 static void
4341 rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
4342 {
4343     struct dp_netdev_port *port;
4344     struct rr_numa_list rr;
4345     struct rr_numa *non_local_numa = NULL;
4346     struct dp_netdev_rxq ** rxqs = NULL;
4347     int n_rxqs = 0;
4348     struct rr_numa *numa = NULL;
4349     int numa_id;
4350
4351     HMAP_FOR_EACH (port, node, &dp->ports) {
4352         if (!netdev_is_pmd(port->netdev)) {
4353             continue;
4354         }
4355
4356         for (int qid = 0; qid < port->n_rxq; qid++) {
4357             struct dp_netdev_rxq *q = &port->rxqs[qid];
4358
4359             if (pinned && q->core_id != OVS_CORE_UNSPEC) {
4360                 struct dp_netdev_pmd_thread *pmd;
4361
4362                 pmd = dp_netdev_get_pmd(dp, q->core_id);
4363                 if (!pmd) {
4364                     VLOG_WARN("There is no PMD thread on core %d. Queue "
4365                               "%d on port \'%s\' will not be polled.",
4366                               q->core_id, qid, netdev_get_name(port->netdev));
4367                 } else {
4368                     q->pmd = pmd;
4369                     pmd->isolated = true;
4370                     dp_netdev_pmd_unref(pmd);
4371                 }
4372             } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
4373                 uint64_t cycle_hist = 0;
4374
4375                 if (n_rxqs == 0) {
4376                     rxqs = xmalloc(sizeof *rxqs);
4377                 } else {
4378                     rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
4379                 }
4380                 /* Sum the queue intervals and store the cycle history. */
4381                 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
4382                     cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
4383                 }
4384                 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST, cycle_hist);
4385
4386                 /* Store the queue. */
4387                 rxqs[n_rxqs++] = q;
4388             }
4389         }
4390     }
4391
4392     if (n_rxqs > 1) {
4393         /* Sort the queues in order of the processing cycles
4394          * they consumed during their last pmd interval. */
4395         qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
4396     }
4397
4398     rr_numa_list_populate(dp, &rr);
4399     /* Assign the sorted queues to pmds in round robin. */
4400     for (int i = 0; i < n_rxqs; i++) {
4401         numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
4402         numa = rr_numa_list_lookup(&rr, numa_id);
4403         if (!numa) {
4404             /* There are no pmds on the queue's local NUMA node.
4405                Round robin on the NUMA nodes that do have pmds. */
4406             non_local_numa = rr_numa_list_next(&rr, non_local_numa);
4407             if (!non_local_numa) {
4408                 VLOG_ERR("There is no available (non-isolated) pmd "
4409                          "thread for port \'%s\' queue %d. This queue "
4410                          "will not be polled. Is pmd-cpu-mask set to "
4411                          "zero? Or are all PMDs isolated to other "
4412                          "queues?", netdev_rxq_get_name(rxqs[i]->rx),
4413                          netdev_rxq_get_queue_id(rxqs[i]->rx));
4414                 continue;
4415             }
4416             rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa);
4417             VLOG_WARN("There's no available (non-isolated) pmd thread "
4418                       "on numa node %d. Queue %d on port \'%s\' will "
4419                       "be assigned to the pmd on core %d "
4420                       "(numa node %d). Expect reduced performance.",
4421                       numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
4422                       netdev_rxq_get_name(rxqs[i]->rx),
4423                       rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
4424         } else {
4425         rxqs[i]->pmd = rr_numa_get_pmd(numa);
4426         VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4427                   "rx queue %d (measured processing cycles %"PRIu64").",
4428                   rxqs[i]->pmd->core_id, numa_id,
4429                   netdev_rxq_get_name(rxqs[i]->rx),
4430                   netdev_rxq_get_queue_id(rxqs[i]->rx),
4431                   dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
4432         }
4433     }
4434
4435     rr_numa_list_destroy(&rr);
4436     free(rxqs);
4437 }
4438
4439 static void
4440 reload_affected_pmds(struct dp_netdev *dp)
4441 {
4442     struct dp_netdev_pmd_thread *pmd;
4443
4444     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4445         if (pmd->need_reload) {
4446             flow_mark_flush(pmd);
4447             dp_netdev_reload_pmd__(pmd);
4448             pmd->need_reload = false;
4449         }
4450     }
4451 }
4452
4453 static void
4454 reconfigure_pmd_threads(struct dp_netdev *dp)
4455     OVS_REQUIRES(dp->port_mutex)
4456 {
4457     struct dp_netdev_pmd_thread *pmd;
4458     struct ovs_numa_dump *pmd_cores;
4459     struct ovs_numa_info_core *core;
4460     struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
4461     struct hmapx_node *node;
4462     bool changed = false;
4463     bool need_to_adjust_static_tx_qids = false;
4464
4465     /* The pmd threads should be started only if there's a pmd port in the
4466      * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
4467      * NR_PMD_THREADS per numa node. */
4468     if (!has_pmd_port(dp)) {
4469         pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
4470     } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
4471         pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
4472     } else {
4473         pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
4474     }
4475
4476     /* We need to adjust 'static_tx_qid's only if we're reducing number of
4477      * PMD threads. Otherwise, new threads will allocate all the freed ids. */
4478     if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
4479         /* Adjustment is required to keep 'static_tx_qid's sequential and
4480          * avoid possible issues, for example, imbalanced tx queue usage
4481          * and unnecessary locking caused by remapping on netdev level. */
4482         need_to_adjust_static_tx_qids = true;
4483     }
4484
4485     /* Check for unwanted pmd threads */
4486     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4487         if (pmd->core_id == NON_PMD_CORE_ID) {
4488             continue;
4489         }
4490         if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
4491                                                     pmd->core_id)) {
4492             hmapx_add(&to_delete, pmd);
4493         } else if (need_to_adjust_static_tx_qids) {
4494             pmd->need_reload = true;
4495         }
4496     }
4497
4498     HMAPX_FOR_EACH (node, &to_delete) {
4499         pmd = (struct dp_netdev_pmd_thread *) node->data;
4500         VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
4501                   pmd->numa_id, pmd->core_id);
4502         dp_netdev_del_pmd(dp, pmd);
4503     }
4504     changed = !hmapx_is_empty(&to_delete);
4505     hmapx_destroy(&to_delete);
4506
4507     if (need_to_adjust_static_tx_qids) {
4508         /* 'static_tx_qid's are not sequential now.
4509          * Reload remaining threads to fix this. */
4510         reload_affected_pmds(dp);
4511     }
4512
4513     /* Check for required new pmd threads */
4514     FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
4515         pmd = dp_netdev_get_pmd(dp, core->core_id);
4516         if (!pmd) {
4517             pmd = xzalloc(sizeof *pmd);
4518             dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
4519             pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
4520             VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
4521                       pmd->numa_id, pmd->core_id);
4522             changed = true;
4523         } else {
4524             dp_netdev_pmd_unref(pmd);
4525         }
4526     }
4527
4528     if (changed) {
4529         struct ovs_numa_info_numa *numa;
4530
4531         /* Log the number of pmd threads per numa node. */
4532         FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
4533             VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
4534                       numa->n_cores, numa->numa_id);
4535         }
4536     }
4537
4538     ovs_numa_dump_destroy(pmd_cores);
4539 }
4540
4541 static void
4542 pmd_remove_stale_ports(struct dp_netdev *dp,
4543                        struct dp_netdev_pmd_thread *pmd)
4544     OVS_EXCLUDED(pmd->port_mutex)
4545     OVS_REQUIRES(dp->port_mutex)
4546 {
4547     struct rxq_poll *poll, *poll_next;
4548     struct tx_port *tx, *tx_next;
4549
4550     ovs_mutex_lock(&pmd->port_mutex);
4551     HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
4552         struct dp_netdev_port *port = poll->rxq->port;
4553
4554         if (port->need_reconfigure
4555             || !hmap_contains(&dp->ports, &port->node)) {
4556             dp_netdev_del_rxq_from_pmd(pmd, poll);
4557         }
4558     }
4559     HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
4560         struct dp_netdev_port *port = tx->port;
4561
4562         if (port->need_reconfigure
4563             || !hmap_contains(&dp->ports, &port->node)) {
4564             dp_netdev_del_port_tx_from_pmd(pmd, tx);
4565         }
4566     }
4567     ovs_mutex_unlock(&pmd->port_mutex);
4568 }
4569
/* Must be called each time a port is added/removed or the cmask changes.
 * This creates and destroys pmd threads, reconfigures ports, opens their
 * rxqs and assigns all rxqs/txqs to pmd threads.
 *
 * The work is done in six steps:
 *
 *   1. Adjust the set of pmd threads and request one txq per poll thread on
 *      every port.
 *   2. Detach from the threads the ports that were removed or that need
 *      reconfiguration, then reload the affected threads.
 *   3. Reconfigure the ports flagged in step 2; a port that fails to
 *      reconfigure is removed from the datapath and destroyed.
 *   4. Compute the new rxq -> pmd assignment (pinned queues first).
 *   5. Detach rxqs whose assignment changed, then reload the affected
 *      threads.
 *   6. Attach rxqs according to the new assignment, refresh the tx port
 *      lists and reload the affected threads one last time.
 *
 * The statement order matters: every reload_affected_pmds() call separates a
 * "remove" phase from the phase that depends on the removal having taken
 * effect. */
static void
reconfigure_datapath(struct dp_netdev *dp)
    OVS_REQUIRES(dp->port_mutex)
{
    struct dp_netdev_pmd_thread *pmd;
    struct dp_netdev_port *port;
    int wanted_txqs;

    /* Remember the value of 'reconfigure_seq' this reconfiguration is
     * based on. */
    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);

    /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
     * on the system and the user configuration. */
    reconfigure_pmd_threads(dp);

    /* One tx queue is requested for each thread in 'dp->poll_threads'. */
    wanted_txqs = cmap_count(&dp->poll_threads);

    /* The number of pmd threads might have changed, or a port can be new:
     * adjust the txqs. */
    HMAP_FOR_EACH (port, node, &dp->ports) {
        netdev_set_tx_multiq(port->netdev, wanted_txqs);
    }

    /* Step 2: Remove from the pmd threads ports that have been removed or
     * need reconfiguration. */

    /* Check for all the ports that need reconfiguration.  We cache this in
     * 'port->need_reconfigure', because netdev_is_reconf_required() can
     * change at any time. */
    HMAP_FOR_EACH (port, node, &dp->ports) {
        if (netdev_is_reconf_required(port->netdev)) {
            port->need_reconfigure = true;
        }
    }

    /* Remove from the pmd threads all the ports that have been deleted or
     * need reconfiguration. */
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        pmd_remove_stale_ports(dp, pmd);
    }

    /* Reload affected pmd threads.  We must wait for the pmd threads before
     * reconfiguring the ports, because a port cannot be reconfigured while
     * it's being used. */
    reload_affected_pmds(dp);

    /* Step 3: Reconfigure ports. */

    /* We only reconfigure the ports that we determined above, because they're
     * not being used by any pmd thread at the moment.  If a port fails to
     * reconfigure we remove it from the datapath. */
    struct dp_netdev_port *next_port;
    HMAP_FOR_EACH_SAFE (port, next_port, node, &dp->ports) {
        int err;

        if (!port->need_reconfigure) {
            continue;
        }

        err = port_reconfigure(port);
        if (err) {
            /* Drop the port: unlink it, announce the change through
             * 'port_seq' and free it. */
            hmap_remove(&dp->ports, &port->node);
            seq_change(dp->port_seq);
            port_destroy(port);
        } else {
            /* The port has fewer txqs than were requested above. */
            port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
        }
    }

    /* Step 4: Compute new rxq scheduling.  We don't touch the pmd threads
     * for now, we just update the 'pmd' pointer in each rxq to point to the
     * wanted thread according to the scheduling policy. */

    /* Reset all the pmd threads to non isolated. */
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        pmd->isolated = false;
    }

    /* Reset all the queues to unassigned */
    HMAP_FOR_EACH (port, node, &dp->ports) {
        for (int i = 0; i < port->n_rxq; i++) {
            port->rxqs[i].pmd = NULL;
        }
    }

    /* Add pinned queues and mark pmd threads isolated. */
    rxq_scheduling(dp, true);

    /* Add non-pinned queues. */
    rxq_scheduling(dp, false);

    /* Step 5: Remove queues not compliant with new scheduling. */
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        struct rxq_poll *poll, *poll_next;

        ovs_mutex_lock(&pmd->port_mutex);
        HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
            /* The queue is now scheduled on another thread (or on none). */
            if (poll->rxq->pmd != pmd) {
                dp_netdev_del_rxq_from_pmd(pmd, poll);
            }
        }
        ovs_mutex_unlock(&pmd->port_mutex);
    }

    /* Reload affected pmd threads.  We must wait for the pmd threads to remove
     * the old queues before readding them, otherwise a queue can be polled by
     * two threads at the same time. */
    reload_affected_pmds(dp);

    /* Step 6: Add queues from scheduling, if they're not there already. */
    HMAP_FOR_EACH (port, node, &dp->ports) {
        /* Only the rxqs of pmd ports are handed to the scheduled threads. */
        if (!netdev_is_pmd(port->netdev)) {
            continue;
        }

        for (int qid = 0; qid < port->n_rxq; qid++) {
            struct dp_netdev_rxq *q = &port->rxqs[qid];

            /* Queues left without a thread by step 4 are skipped. */
            if (q->pmd) {
                ovs_mutex_lock(&q->pmd->port_mutex);
                dp_netdev_add_rxq_to_pmd(q->pmd, q);
                ovs_mutex_unlock(&q->pmd->port_mutex);
            }
        }
    }

    /* Add every port to the tx cache of every pmd thread, if it's not
     * there already and if this pmd has at least one rxq to poll. */
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        ovs_mutex_lock(&pmd->port_mutex);
        /* The thread with NON_PMD_CORE_ID gets the ports even when its poll
         * list is empty. */
        if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
            HMAP_FOR_EACH (port, node, &dp->ports) {
                dp_netdev_add_port_tx_to_pmd(pmd, port);
            }
        }
        ovs_mutex_unlock(&pmd->port_mutex);
    }

    /* Reload affected pmd threads. */
    reload_affected_pmds(dp);
}
4713
4714 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
4715 static bool
4716 ports_require_restart(const struct dp_netdev *dp)
4717     OVS_REQUIRES(dp->port_mutex)
4718 {
4719     struct dp_netdev_port *port;
4720
4721     HMAP_FOR_EACH (port, node, &dp->ports) {
4722         if (netdev_is_reconf_required(port->netdev)) {
4723             return true;
4724         }
4725     }
4726
4727     return false;
4728 }
4729
4730 /* Return true if needs to revalidate datapath flows. */
4731 static bool
4732 dpif_netdev_run(struct dpif *dpif)
4733 {
4734     struct dp_netdev_port *port;
4735     struct dp_netdev *dp = get_dp_netdev(dpif);
4736     struct dp_netdev_pmd_thread *non_pmd;
4737     uint64_t new_tnl_seq;
4738     bool need_to_flush = true;
4739
4740     ovs_mutex_lock(&dp->port_mutex);
4741     non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
4742     if (non_pmd) {
4743         ovs_mutex_lock(&dp->non_pmd_mutex);
4744         HMAP_FOR_EACH (port, node, &dp->ports) {
4745             if (!netdev_is_pmd(port->netdev)) {
4746                 int i;
4747
4748                 for (i = 0; i < port->n_rxq; i++) {
4749                     if (dp_netdev_process_rxq_port(non_pmd,
4750                                                    &port->rxqs[i],
4751                                                    port->port_no)) {
4752                         need_to_flush = false;
4753                     }
4754                 }
4755             }
4756         }
4757         if (need_to_flush) {
4758             /* We didn't receive anything in the process loop.
4759              * Check if we need to send something.
4760              * There was no time updates on current iteration. */
4761             pmd_thread_ctx_time_update(non_pmd);
4762             dp_netdev_pmd_flush_output_packets(non_pmd, false);
4763         }
4764
4765         dpif_netdev_xps_revalidate_pmd(non_pmd, false);
4766         ovs_mutex_unlock(&dp->non_pmd_mutex);
4767
4768         dp_netdev_pmd_unref(non_pmd);
4769     }
4770
4771     if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
4772         reconfigure_datapath(dp);
4773     }
4774     ovs_mutex_unlock(&dp->port_mutex);
4775
4776     tnl_neigh_cache_run();
4777     tnl_port_map_run();
4778     new_tnl_seq = seq_read(tnl_conf_seq);
4779
4780     if (dp->last_tnl_conf_seq != new_tnl_seq) {
4781         dp->last_tnl_conf_seq = new_tnl_seq;
4782         return true;
4783     }
4784     return false;
4785 }
4786
4787 static void
4788 dpif_netdev_wait(struct dpif *dpif)
4789 {
4790     struct dp_netdev_port *port;
4791     struct dp_netdev *dp = get_dp_netdev(dpif);
4792
4793     ovs_mutex_lock(&dp_netdev_mutex);
4794     ovs_mutex_lock(&dp->port_mutex);
4795     HMAP_FOR_EACH (port, node, &dp->ports) {
4796         netdev_wait_reconf_required(port->netdev);
4797         if (!netdev_is_pmd(port->netdev)) {
4798             int i;
4799
4800             for (i = 0; i < port->n_rxq; i++) {
4801                 netdev_rxq_wait(port->rxqs[i].rx);
4802             }
4803         }
4804     }
4805     ovs_mutex_unlock(&dp->port_mutex);
4806     ovs_mutex_unlock(&dp_netdev_mutex);
4807     seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
4808 }
4809
4810 static void
4811 pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
4812 {
4813     struct tx_port *tx_port_cached;
4814
4815     /* Flush all the queued packets. */
4816     dp_netdev_pmd_flush_output_packets(pmd, true);
4817     /* Free all used tx queue ids. */
4818     dpif_netdev_xps_revalidate_pmd(pmd, true);
4819
4820     HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
4821         free(tx_port_cached);
4822     }
4823     HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
4824         free(tx_port_cached);
4825     }
4826 }
4827
4828 /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
4829  * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
4830  * device, otherwise to 'pmd->send_port_cache' if the port has at least
4831  * one txq. */
4832 static void
4833 pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
4834     OVS_REQUIRES(pmd->port_mutex)
4835 {
4836     struct tx_port *tx_port, *tx_port_cached;
4837
4838     pmd_free_cached_ports(pmd);
4839     hmap_shrink(&pmd->send_port_cache);
4840     hmap_shrink(&pmd->tnl_port_cache);
4841
4842     HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
4843         if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
4844             tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
4845             hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
4846                         hash_port_no(tx_port_cached->port->port_no));
4847         }
4848
4849         if (netdev_n_txq(tx_port->port->netdev)) {
4850             tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
4851             hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
4852                         hash_port_no(tx_port_cached->port->port_no));
4853         }
4854     }
4855 }
4856
4857 static void
4858 pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
4859 {
4860     ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
4861     if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
4862         VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
4863                    ", numa_id %d.", pmd->core_id, pmd->numa_id);
4864     }
4865     ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
4866
4867     VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
4868              ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
4869 }
4870
4871 static void
4872 pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
4873 {
4874     ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
4875     id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
4876     ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
4877 }
4878
4879 static int
4880 pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
4881                           struct polled_queue **ppoll_list)
4882 {
4883     struct polled_queue *poll_list = *ppoll_list;
4884     struct rxq_poll *poll;
4885     int i;
4886
4887     ovs_mutex_lock(&pmd->port_mutex);
4888     poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
4889                                     * sizeof *poll_list);
4890
4891     i = 0;
4892     HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4893         poll_list[i].rxq = poll->rxq;
4894         poll_list[i].port_no = poll->rxq->port->port_no;
4895         i++;
4896     }
4897
4898     pmd_load_cached_ports(pmd);
4899
4900     ovs_mutex_unlock(&pmd->port_mutex);
4901
4902     *ppoll_list = poll_list;
4903     return i;
4904 }
4905
/* Start routine of a pmd thread; 'f_' is the thread's
 * 'struct dp_netdev_pmd_thread'.
 *
 * After one-time setup the thread runs in passes.  Each pass starts at the
 * 'reload' label: it allocates a static tx queue id, polls the rx queues
 * loaded for this thread until a reload is requested, reloads its queues
 * and ports, signals that the reload is done and frees the static tx queue
 * id.  Unless the exit latch was set the next pass begins; otherwise the
 * thread cleans up and returns NULL. */
static void *
pmd_thread_main(void *f_)
{
    struct dp_netdev_pmd_thread *pmd = f_;
    struct pmd_perf_stats *s = &pmd->perf_stats;
    /* Iteration counter: housekeeping runs once it exceeds 1024. */
    unsigned int lc = 0;
    struct polled_queue *poll_list;
    bool exiting;
    int poll_cnt;
    int i;
    int process_packets = 0;

    poll_list = NULL;

    /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
    ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
    ovs_numa_thread_setaffinity_core(pmd->core_id);
    dpdk_set_lcore_id(pmd->core_id);
    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
    dfc_cache_init(&pmd->flow_cache);
reload:
    /* The id allocated here is freed again at the end of this pass, before
     * jumping back to 'reload'. */
    pmd_alloc_static_tx_qid(pmd);

    /* List port/core affinity */
    for (i = 0; i < poll_cnt; i++) {
       VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
                pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
                netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
       /* Reset the rxq current cycles counter. */
       dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
    }

    if (!poll_cnt) {
        /* Nothing to poll: block until 'reload_seq' changes. */
        while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
            seq_wait(pmd->reload_seq, pmd->last_reload_seq);
            poll_block();
        }
        /* Makes the 'lc++ > 1024' test below succeed on the very first
         * iteration, so the reload flag is checked right away. */
        lc = UINT_MAX;
    }

    pmd->intrvl_tsc_prev = 0;
    atomic_store_relaxed(&pmd->intrvl_cycles, 0);
    cycles_counter_update(s);
    /* Protect pmd stats from external clearing while polling. */
    ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
    for (;;) {
        uint64_t rx_packets = 0, tx_packets = 0;

        pmd_perf_start_iteration(s);

        /* Poll every queue once and sum up what was processed. */
        for (i = 0; i < poll_cnt; i++) {
            process_packets =
                dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
                                           poll_list[i].port_no);
            rx_packets += process_packets;
        }

        if (!rx_packets) {
            /* We didn't receive anything in the process loop.
             * Check if we need to send something.
             * There was no time updates on current iteration. */
            pmd_thread_ctx_time_update(pmd);
            tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
        }

        /* Periodic housekeeping; this is also the only place where the
         * polling loop checks for a reload request. */
        if (lc++ > 1024) {
            bool reload;

            lc = 0;

            coverage_try_clear();
            dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
            if (!ovsrcu_try_quiesce()) {
                emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
            }

            /* Leaves the loop without calling pmd_perf_end_iteration() for
             * the current iteration. */
            atomic_read_relaxed(&pmd->reload, &reload);
            if (reload) {
                break;
            }
        }
        pmd_perf_end_iteration(s, rx_packets, tx_packets,
                               pmd_perf_metrics_enabled(pmd));
    }
    ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);

    /* Pick up the new queue/port configuration and sample the exit latch
     * before reporting that the reload is done. */
    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
    exiting = latch_is_set(&pmd->exit_latch);
    /* Signal here to make sure the pmd finishes
     * reloading the updated configuration. */
    dp_netdev_pmd_reload_done(pmd);

    pmd_free_static_tx_qid(pmd);

    if (!exiting) {
        goto reload;
    }

    /* Exit requested: release everything this thread set up. */
    dfc_cache_uninit(&pmd->flow_cache);
    free(poll_list);
    pmd_free_cached_ports(pmd);
    return NULL;
}
5009
5010 static void
5011 dp_netdev_disable_upcall(struct dp_netdev *dp)
5012     OVS_ACQUIRES(dp->upcall_rwlock)
5013 {
5014     fat_rwlock_wrlock(&dp->upcall_rwlock);
5015 }
5016
5017 \f
5018 /* Meters */
5019 static void
5020 dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
5021                                struct ofputil_meter_features *features)
5022 {
5023     features->max_meters = MAX_METERS;
5024     features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
5025     features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
5026     features->max_bands = MAX_BANDS;
5027     features->max_color = 0;
5028 }
5029
5030 /* Applies the meter identified by 'meter_id' to 'packets_'.  Packets
5031  * that exceed a band are dropped in-place. */
5032 static void
5033 dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
5034                     uint32_t meter_id, long long int now)
5035 {
5036     struct dp_meter *meter;
5037     struct dp_meter_band *band;
5038     struct dp_packet *packet;
5039     long long int long_delta_t; /* msec */
5040     uint32_t delta_t; /* msec */
5041     const size_t cnt = dp_packet_batch_size(packets_);
5042     uint32_t bytes, volume;
5043     int exceeded_band[NETDEV_MAX_BURST];
5044     uint32_t exceeded_rate[NETDEV_MAX_BURST];
5045     int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
5046
5047     if (meter_id >= MAX_METERS) {
5048         return;
5049     }
5050
5051     meter_lock(dp, meter_id);
5052     meter = dp->meters[meter_id];
5053     if (!meter) {
5054         goto out;
5055     }
5056
5057     /* Initialize as negative values. */
5058     memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
5059     /* Initialize as zeroes. */
5060     memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
5061
5062     /* All packets will hit the meter at the same time. */
5063     long_delta_t = (now - meter->used) / 1000; /* msec */
5064
5065     /* Make sure delta_t will not be too large, so that bucket will not
5066      * wrap around below. */
5067     delta_t = (long_delta_t > (long long int)meter->max_delta_t)
5068         ? meter->max_delta_t : (uint32_t)long_delta_t;
5069
5070     /* Update meter stats. */
5071     meter->used = now;
5072     meter->packet_count += cnt;
5073     bytes = 0;
5074     DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5075         bytes += dp_packet_size(packet);
5076     }
5077     meter->byte_count += bytes;
5078
5079     /* Meters can operate in terms of packets per second or kilobits per
5080      * second. */
5081     if (meter->flags & OFPMF13_PKTPS) {
5082         /* Rate in packets/second, bucket 1/1000 packets. */
5083         /* msec * packets/sec = 1/1000 packets. */
5084         volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
5085     } else {
5086         /* Rate in kbps, bucket in bits. */
5087         /* msec * kbps = bits */
5088         volume = bytes * 8;
5089     }
5090
5091     /* Update all bands and find the one hit with the highest rate for each
5092      * packet (if any). */
5093     for (int m = 0; m < meter->n_bands; ++m) {
5094         band = &meter->bands[m];
5095
5096         /* Update band's bucket. */
5097         band->bucket += delta_t * band->up.rate;
5098         if (band->bucket > band->up.burst_size) {
5099             band->bucket = band->up.burst_size;
5100         }
5101
5102         /* Drain the bucket for all the packets, if possible. */
5103         if (band->bucket >= volume) {
5104             band->bucket -= volume;
5105         } else {
5106             int band_exceeded_pkt;
5107
5108             /* Band limit hit, must process packet-by-packet. */
5109             if (meter->flags & OFPMF13_PKTPS) {
5110                 band_exceeded_pkt = band->bucket / 1000;
5111                 band->bucket %= 1000; /* Remainder stays in bucket. */
5112
5113                 /* Update the exceeding band for each exceeding packet.
5114                  * (Only one band will be fired by a packet, and that
5115                  * can be different for each packet.) */
5116                 for (int i = band_exceeded_pkt; i < cnt; i++) {
5117                     if (band->up.rate > exceeded_rate[i]) {
5118                         exceeded_rate[i] = band->up.rate;
5119                         exceeded_band[i] = m;
5120                     }
5121                 }
5122             } else {
5123                 /* Packet sizes differ, must process one-by-one. */
5124                 band_exceeded_pkt = cnt;
5125                 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5126                     uint32_t bits = dp_packet_size(packet) * 8;
5127
5128                     if (band->bucket >= bits) {
5129                         band->bucket -= bits;
5130                     } else {
5131                         if (i < band_exceeded_pkt) {
5132                             band_exceeded_pkt = i;
5133                         }
5134                         /* Update the exceeding band for the exceeding packet.
5135                          * (Only one band will be fired by a packet, and that
5136                          * can be different for each packet.) */
5137                         if (band->up.rate > exceeded_rate[i]) {
5138                             exceeded_rate[i] = band->up.rate;
5139                             exceeded_band[i] = m;
5140                         }
5141                     }
5142                 }
5143             }
5144             /* Remember the first exceeding packet. */
5145             if (exceeded_pkt > band_exceeded_pkt) {
5146                 exceeded_pkt = band_exceeded_pkt;
5147             }
5148         }
5149     }
5150
5151     /* Fire the highest rate band exceeded by each packet, and drop
5152      * packets if needed. */
5153     size_t j;
5154     DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
5155         if (exceeded_band[j] >= 0) {
5156             /* Meter drop packet. */
5157             band = &meter->bands[exceeded_band[j]];
5158             band->packet_count += 1;
5159             band->byte_count += dp_packet_size(packet);
5160
5161             dp_packet_delete(packet);
5162         } else {
5163             /* Meter accepts packet. */
5164             dp_packet_batch_refill(packets_, packet, j);
5165         }
5166     }
5167  out:
5168     meter_unlock(dp, meter_id);
5169 }
5170
5171 /* Meter set/get/del processing is still single-threaded. */
5172 static int
5173 dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
5174                       struct ofputil_meter_config *config)
5175 {
5176     struct dp_netdev *dp = get_dp_netdev(dpif);
5177     uint32_t mid = meter_id.uint32;
5178     struct dp_meter *meter;
5179     int i;
5180
5181     if (mid >= MAX_METERS) {
5182         return EFBIG; /* Meter_id out of range. */
5183     }
5184
5185     if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
5186         return EBADF; /* Unsupported flags set */
5187     }
5188
5189     if (config->n_bands > MAX_BANDS) {
5190         return EINVAL;
5191     }
5192
5193     for (i = 0; i < config->n_bands; ++i) {
5194         switch (config->bands[i].type) {
5195         case OFPMBT13_DROP:
5196             break;
5197         default:
5198             return ENODEV; /* Unsupported band type */
5199         }
5200     }
5201
5202     /* Allocate meter */
5203     meter = xzalloc(sizeof *meter
5204                     + config->n_bands * sizeof(struct dp_meter_band));
5205
5206     meter->flags = config->flags;
5207     meter->n_bands = config->n_bands;
5208     meter->max_delta_t = 0;
5209     meter->used = time_usec();
5210
5211     /* set up bands */
5212     for (i = 0; i < config->n_bands; ++i) {
5213         uint32_t band_max_delta_t;
5214
5215         /* Set burst size to a workable value if none specified. */
5216         if (config->bands[i].burst_size == 0) {
5217             config->bands[i].burst_size = config->bands[i].rate;
5218         }
5219
5220         meter->bands[i].up = config->bands[i];
5221         /* Convert burst size to the bucket units: */
5222         /* pkts => 1/1000 packets, kilobits => bits. */
5223         meter->bands[i].up.burst_size *= 1000;
5224         /* Initialize bucket to empty. */
5225         meter->bands[i].bucket = 0;
5226
5227         /* Figure out max delta_t that is enough to fill any bucket. */
5228         band_max_delta_t
5229             = meter->bands[i].up.burst_size / meter->bands[i].up.rate;
5230         if (band_max_delta_t > meter->max_delta_t) {
5231             meter->max_delta_t = band_max_delta_t;
5232         }
5233     }
5234
5235     meter_lock(dp, mid);
5236     dp_delete_meter(dp, mid); /* Free existing meter, if any */
5237     dp->meters[mid] = meter;
5238     meter_unlock(dp, mid);
5239
5240     return 0;
5241 }
5242
5243 static int
5244 dpif_netdev_meter_get(const struct dpif *dpif,
5245                       ofproto_meter_id meter_id_,
5246                       struct ofputil_meter_stats *stats, uint16_t n_bands)
5247 {
5248     const struct dp_netdev *dp = get_dp_netdev(dpif);
5249     uint32_t meter_id = meter_id_.uint32;
5250     int retval = 0;
5251
5252     if (meter_id >= MAX_METERS) {
5253         return EFBIG;
5254     }
5255
5256     meter_lock(dp, meter_id);
5257     const struct dp_meter *meter = dp->meters[meter_id];
5258     if (!meter) {
5259         retval = ENOENT;
5260         goto done;
5261     }
5262     if (stats) {
5263         int i = 0;
5264
5265         stats->packet_in_count = meter->packet_count;
5266         stats->byte_in_count = meter->byte_count;
5267
5268         for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
5269             stats->bands[i].packet_count = meter->bands[i].packet_count;
5270             stats->bands[i].byte_count = meter->bands[i].byte_count;
5271         }
5272
5273         stats->n_bands = i;
5274     }
5275
5276 done:
5277     meter_unlock(dp, meter_id);
5278     return retval;
5279 }
5280
5281 static int
5282 dpif_netdev_meter_del(struct dpif *dpif,
5283                       ofproto_meter_id meter_id_,
5284                       struct ofputil_meter_stats *stats, uint16_t n_bands)
5285 {
5286     struct dp_netdev *dp = get_dp_netdev(dpif);
5287     int error;
5288
5289     error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
5290     if (!error) {
5291         uint32_t meter_id = meter_id_.uint32;
5292
5293         meter_lock(dp, meter_id);
5294         dp_delete_meter(dp, meter_id);
5295         meter_unlock(dp, meter_id);
5296     }
5297     return error;
5298 }
5299
5300 \f
5301 static void
5302 dpif_netdev_disable_upcall(struct dpif *dpif)
5303     OVS_NO_THREAD_SAFETY_ANALYSIS
5304 {
5305     struct dp_netdev *dp = get_dp_netdev(dpif);
5306     dp_netdev_disable_upcall(dp);
5307 }
5308
/* Unlocks 'dp->upcall_rwlock'.  Per the OVS_RELEASES annotation, the caller
 * is expected to hold that lock on entry.
 * NOTE(review): presumably the lock was taken by dp_netdev_disable_upcall()
 * -- confirm against that function. */
static void
dp_netdev_enable_upcall(struct dp_netdev *dp)
    OVS_RELEASES(dp->upcall_rwlock)
{
    fat_rwlock_unlock(&dp->upcall_rwlock);
}
5315
5316 static void
5317 dpif_netdev_enable_upcall(struct dpif *dpif)
5318     OVS_NO_THREAD_SAFETY_ANALYSIS
5319 {
5320     struct dp_netdev *dp = get_dp_netdev(dpif);
5321     dp_netdev_enable_upcall(dp);
5322 }
5323
/* Marks a reload of 'pmd' as complete.
 *
 * With 'pmd->cond_mutex' held, clears the 'pmd->reload' flag, records the
 * current value of 'pmd->reload_seq' in 'pmd->last_reload_seq', and signals
 * 'pmd->cond'.
 * NOTE(review): presumably the signal wakes the thread that requested the
 * reload and is waiting on 'pmd->cond' -- confirm against the waiter. */
static void
dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
{
    ovs_mutex_lock(&pmd->cond_mutex);
    atomic_store_relaxed(&pmd->reload, false);
    pmd->last_reload_seq = seq_read(pmd->reload_seq);
    xpthread_cond_signal(&pmd->cond);
    ovs_mutex_unlock(&pmd->cond_mutex);
}
5333
5334 /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'.  Returns
5335  * the pointer if succeeds, otherwise, NULL (it can return NULL even if
5336  * 'core_id' is NON_PMD_CORE_ID).
5337  *
5338  * Caller must unrefs the returned reference.  */
5339 static struct dp_netdev_pmd_thread *
5340 dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
5341 {
5342     struct dp_netdev_pmd_thread *pmd;
5343     const struct cmap_node *pnode;
5344
5345     pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
5346     if (!pnode) {
5347         return NULL;
5348     }
5349     pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
5350
5351     return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
5352 }
5353
5354 /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
5355 static void
5356 dp_netdev_set_nonpmd(struct dp_netdev *dp)
5357     OVS_REQUIRES(dp->port_mutex)
5358 {
5359     struct dp_netdev_pmd_thread *non_pmd;
5360
5361     non_pmd = xzalloc(sizeof *non_pmd);
5362     dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
5363 }
5364
5365 /* Caller must have valid pointer to 'pmd'. */
5366 static bool
5367 dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
5368 {
5369     return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
5370 }
5371
5372 static void
5373 dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
5374 {
5375     if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
5376         ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
5377     }
5378 }
5379
5380 /* Given cmap position 'pos', tries to ref the next node.  If try_ref()
5381  * fails, keeps checking for next node until reaching the end of cmap.
5382  *
5383  * Caller must unrefs the returned reference. */
5384 static struct dp_netdev_pmd_thread *
5385 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
5386 {
5387     struct dp_netdev_pmd_thread *next;
5388
5389     do {
5390         struct cmap_node *node;
5391
5392         node = cmap_next_position(&dp->poll_threads, pos);
5393         next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
5394             : NULL;
5395     } while (next && !dp_netdev_pmd_try_ref(next));
5396
5397     return next;
5398 }
5399
/* Initializes 'pmd' as a poll thread of 'dp' for core 'core_id' on NUMA
 * node 'numa_id' and registers it in 'dp->poll_threads'.
 *
 * Sets up the identity fields, reference count, exit latch, reload
 * sequence/flag/condition, mutexes, flow table, classifiers, timing fields
 * and port maps.  For NON_PMD_CORE_ID the flow cache and static tx queue id
 * are set up here as well.  The cmap insertion is the last step, after all
 * other initialization in this function. */
static void
dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
                        unsigned core_id, int numa_id)
{
    pmd->dp = dp;
    pmd->core_id = core_id;
    pmd->numa_id = numa_id;
    pmd->need_reload = false;
    pmd->n_output_batches = 0;

    ovs_refcount_init(&pmd->ref_cnt);
    latch_init(&pmd->exit_latch);
    /* 'last_reload_seq' starts out equal to the freshly created sequence. */
    pmd->reload_seq = seq_create();
    pmd->last_reload_seq = seq_read(pmd->reload_seq);
    atomic_init(&pmd->reload, false);
    xpthread_cond_init(&pmd->cond, NULL);
    ovs_mutex_init(&pmd->cond_mutex);
    ovs_mutex_init(&pmd->flow_mutex);
    ovs_mutex_init(&pmd->port_mutex);
    cmap_init(&pmd->flow_table);
    cmap_init(&pmd->classifiers);
    pmd->ctx.last_rxq = NULL;
    /* Refresh 'pmd->ctx.now' before deriving the deadlines below from it. */
    pmd_thread_ctx_time_update(pmd);
    pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
    pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
    hmap_init(&pmd->poll_list);
    hmap_init(&pmd->tx_ports);
    hmap_init(&pmd->tnl_port_cache);
    hmap_init(&pmd->send_port_cache);
    /* Initialize the 'flow_cache' here, since no actual thread is created
     * for NON_PMD_CORE_ID. */
    if (core_id == NON_PMD_CORE_ID) {
        dfc_cache_init(&pmd->flow_cache);
        pmd_alloc_static_tx_qid(pmd);
    }
    pmd_perf_stats_init(&pmd->perf_stats);
    /* Register the thread, keyed by the hash of its core id. */
    cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
                hash_int(core_id, 0));
}
5440
/* Tears down and frees 'pmd'.
 *
 * Flushes the thread's flows, destroys its port maps, classifiers, flow
 * table, mutexes, exit latch, reload sequence and condition variable, then
 * frees the structure itself.  Each classifier is destroyed immediately but
 * its memory is freed through an RCU-postponed free().
 *
 * This function is what dp_netdev_pmd_unref() postpones through RCU when
 * the last reference is dropped. */
static void
dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
{
    struct dpcls *cls;

    dp_netdev_pmd_flow_flush(pmd);
    hmap_destroy(&pmd->send_port_cache);
    hmap_destroy(&pmd->tnl_port_cache);
    hmap_destroy(&pmd->tx_ports);
    hmap_destroy(&pmd->poll_list);
    /* All flows (including their dpcls_rules) have been deleted already. */
    CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
        dpcls_destroy(cls);
        ovsrcu_postpone(free, cls);
    }
    cmap_destroy(&pmd->classifiers);
    cmap_destroy(&pmd->flow_table);
    ovs_mutex_destroy(&pmd->flow_mutex);
    latch_destroy(&pmd->exit_latch);
    seq_destroy(pmd->reload_seq);
    xpthread_cond_destroy(&pmd->cond);
    ovs_mutex_destroy(&pmd->cond_mutex);
    ovs_mutex_destroy(&pmd->port_mutex);
    free(pmd);
}
5466
5467 /* Stops the pmd thread, removes it from the 'dp->poll_threads',
5468  * and unrefs the struct. */
5469 static void
5470 dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
5471 {
5472     /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
5473      * but extra cleanup is necessary */
5474     if (pmd->core_id == NON_PMD_CORE_ID) {
5475         ovs_mutex_lock(&dp->non_pmd_mutex);
5476         dfc_cache_uninit(&pmd->flow_cache);
5477         pmd_free_cached_ports(pmd);
5478         pmd_free_static_tx_qid(pmd);
5479         ovs_mutex_unlock(&dp->non_pmd_mutex);
5480     } else {
5481         latch_set(&pmd->exit_latch);
5482         dp_netdev_reload_pmd__(pmd);
5483         xpthread_join(pmd->thread, NULL);
5484     }
5485
5486     dp_netdev_pmd_clear_ports(pmd);
5487
5488     /* Purges the 'pmd''s flows after stopping the thread, but before
5489      * destroying the flows, so that the flow stats can be collected. */
5490     if (dp->dp_purge_cb) {
5491         dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
5492     }
5493     cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
5494     dp_netdev_pmd_unref(pmd);
5495 }
5496
5497 /* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
5498  * thread. */
5499 static void
5500 dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
5501 {
5502     struct dp_netdev_pmd_thread *pmd;
5503     struct dp_netdev_pmd_thread **pmd_list;
5504     size_t k = 0, n_pmds;
5505
5506     n_pmds = cmap_count(&dp->poll_threads);
5507     pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
5508
5509     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5510         if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
5511             continue;
5512         }
5513         /* We cannot call dp_netdev_del_pmd(), since it alters
5514          * 'dp->poll_threads' (while we're iterating it) and it
5515          * might quiesce. */
5516         ovs_assert(k < n_pmds);
5517         pmd_list[k++] = pmd;
5518     }
5519
5520     for (size_t i = 0; i < k; i++) {
5521         dp_netdev_del_pmd(dp, pmd_list[i]);
5522     }
5523     free(pmd_list);
5524 }
5525
5526 /* Deletes all rx queues from pmd->poll_list and all the ports from
5527  * pmd->tx_ports. */
5528 static void
5529 dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
5530 {
5531     struct rxq_poll *poll;
5532     struct tx_port *port;
5533
5534     ovs_mutex_lock(&pmd->port_mutex);
5535     HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
5536         free(poll);
5537     }
5538     HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
5539         free(port);
5540     }
5541     ovs_mutex_unlock(&pmd->port_mutex);
5542 }
5543
5544 /* Adds rx queue to poll_list of PMD thread, if it's not there already. */
5545 static void
5546 dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
5547                          struct dp_netdev_rxq *rxq)
5548     OVS_REQUIRES(pmd->port_mutex)
5549 {
5550     int qid = netdev_rxq_get_queue_id(rxq->rx);
5551     uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
5552     struct rxq_poll *poll;
5553
5554     HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
5555         if (poll->rxq == rxq) {
5556             /* 'rxq' is already polled by this thread. Do nothing. */
5557             return;
5558         }
5559     }
5560
5561     poll = xmalloc(sizeof *poll);
5562     poll->rxq = rxq;
5563     hmap_insert(&pmd->poll_list, &poll->node, hash);
5564
5565     pmd->need_reload = true;
5566 }
5567
5568 /* Delete 'poll' from poll_list of PMD thread. */
5569 static void
5570 dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
5571                            struct rxq_poll *poll)
5572     OVS_REQUIRES(pmd->port_mutex)
5573 {
5574     hmap_remove(&pmd->poll_list, &poll->node);
5575     free(poll);
5576
5577     pmd->need_reload = true;
5578 }
5579
5580 /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
5581  * changes to take effect. */
5582 static void
5583 dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
5584                              struct dp_netdev_port *port)
5585     OVS_REQUIRES(pmd->port_mutex)
5586 {
5587     struct tx_port *tx;
5588
5589     tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
5590     if (tx) {
5591         /* 'port' is already on this thread tx cache. Do nothing. */
5592         return;
5593     }
5594
5595     tx = xzalloc(sizeof *tx);
5596
5597     tx->port = port;
5598     tx->qid = -1;
5599     tx->flush_time = 0LL;
5600     dp_packet_batch_init(&tx->output_pkts);
5601
5602     hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
5603     pmd->need_reload = true;
5604 }
5605
5606 /* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
5607  * changes to take effect. */
5608 static void
5609 dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
5610                                struct tx_port *tx)
5611     OVS_REQUIRES(pmd->port_mutex)
5612 {
5613     hmap_remove(&pmd->tx_ports, &tx->node);
5614     free(tx);
5615     pmd->need_reload = true;
5616 }
5617 \f
/* Returns a newly allocated copy of the string "<built-in>" as the datapath
 * version.  The copy is made with xstrdup(), so the caller owns it. */
static char *
dpif_netdev_get_datapath_version(void)
{
    static const char version[] = "<built-in>";

    return xstrdup(version);
}
5623
5624 static void
5625 dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
5626                     uint16_t tcp_flags, long long now)
5627 {
5628     uint16_t flags;
5629
5630     atomic_store_relaxed(&netdev_flow->stats.used, now);
5631     non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
5632     non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
5633     atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
5634     flags |= tcp_flags;
5635     atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
5636 }
5637
5638 static int
5639 dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
5640                  struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
5641                  enum dpif_upcall_type type, const struct nlattr *userdata,
5642                  struct ofpbuf *actions, struct ofpbuf *put_actions)
5643 {
5644     struct dp_netdev *dp = pmd->dp;
5645
5646     if (OVS_UNLIKELY(!dp->upcall_cb)) {
5647         return ENODEV;
5648     }
5649
5650     if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
5651         struct ds ds = DS_EMPTY_INITIALIZER;
5652         char *packet_str;
5653         struct ofpbuf key;
5654         struct odp_flow_key_parms odp_parms = {
5655             .flow = flow,
5656             .mask = wc ? &wc->masks : NULL,
5657             .support = dp_netdev_support,
5658         };
5659
5660         ofpbuf_init(&key, 0);
5661         odp_flow_key_from_flow(&odp_parms, &key);
5662         packet_str = ofp_dp_packet_to_string(packet_);
5663
5664         odp_flow_key_format(key.data, key.size, &ds);
5665
5666         VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
5667                  dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
5668
5669         ofpbuf_uninit(&key);
5670         free(packet_str);
5671
5672         ds_destroy(&ds);
5673     }
5674
5675     return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
5676                          actions, wc, put_actions, dp->upcall_aux);
5677 }
5678
/* Returns the RSS hash of 'packet'.  If the packet does not carry a valid
 * RSS hash, one is computed from the 5-tuple in 'mf', stored in the packet
 * and returned.  The recirculation depth is not taken into account. */
static inline uint32_t
dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet,
                                const struct miniflow *mf)
{
    if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
        return dp_packet_get_rss_hash(packet);
    }

    const uint32_t computed = miniflow_hash_5tuple(mf, 0);

    dp_packet_set_rss_hash(packet, computed);
    return computed;
}
5694
5695 static inline uint32_t
5696 dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
5697                                 const struct miniflow *mf)
5698 {
5699     uint32_t hash, recirc_depth;
5700
5701     if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
5702         hash = dp_packet_get_rss_hash(packet);
5703     } else {
5704         hash = miniflow_hash_5tuple(mf, 0);
5705         dp_packet_set_rss_hash(packet, hash);
5706     }
5707
5708     /* The RSS hash must account for the recirculation depth to avoid
5709      * collisions in the exact match cache */
5710     recirc_depth = *recirc_depth_get_unsafe();
5711     if (OVS_UNLIKELY(recirc_depth)) {
5712         hash = hash_finish(hash, recirc_depth);
5713         dp_packet_set_rss_hash(packet, hash);
5714     }
5715     return hash;
5716 }
5717
/* A group of packets that all matched the same datapath flow, collected so
 * that the flow's statistics and actions can be applied to them together. */
struct packet_batch_per_flow {
    unsigned int byte_count;        /* Sum of the sizes of the packets. */
    uint16_t tcp_flags;             /* OR of the packets' TCP flags. */
    struct dp_netdev_flow *flow;    /* Flow that the packets matched. */

    struct dp_packet_batch array;   /* The packets themselves. */
};
5725
5726 static inline void
5727 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
5728                              struct dp_packet *packet,
5729                              uint16_t tcp_flags)
5730 {
5731     batch->byte_count += dp_packet_size(packet);
5732     batch->tcp_flags |= tcp_flags;
5733     batch->array.packets[batch->array.count++] = packet;
5734 }
5735
5736 static inline void
5737 packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
5738                            struct dp_netdev_flow *flow)
5739 {
5740     flow->batch = batch;
5741
5742     batch->flow = flow;
5743     dp_packet_batch_init(&batch->array);
5744     batch->byte_count = 0;
5745     batch->tcp_flags = 0;
5746 }
5747
5748 static inline void
5749 packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
5750                               struct dp_netdev_pmd_thread *pmd)
5751 {
5752     struct dp_netdev_actions *actions;
5753     struct dp_netdev_flow *flow = batch->flow;
5754
5755     dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
5756                         batch->tcp_flags, pmd->ctx.now / 1000);
5757
5758     actions = dp_netdev_flow_get_actions(flow);
5759
5760     dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
5761                               actions->actions, actions->size);
5762 }
5763
5764 static inline void
5765 dp_netdev_queue_batches(struct dp_packet *pkt,
5766                         struct dp_netdev_flow *flow, uint16_t tcp_flags,
5767                         struct packet_batch_per_flow *batches,
5768                         size_t *n_batches)
5769 {
5770     struct packet_batch_per_flow *batch = flow->batch;
5771
5772     if (OVS_UNLIKELY(!batch)) {
5773         batch = &batches[(*n_batches)++];
5774         packet_batch_per_flow_init(batch, flow);
5775     }
5776
5777     packet_batch_per_flow_update(batch, pkt, tcp_flags);
5778 }
5779
5780 static inline void
5781 packet_enqueue_to_flow_map(struct dp_packet *packet,
5782                            struct dp_netdev_flow *flow,
5783                            uint16_t tcp_flags,
5784                            struct dp_packet_flow_map *flow_map,
5785                            size_t index)
5786 {
5787     struct dp_packet_flow_map *map = &flow_map[index];
5788     map->flow = flow;
5789     map->packet = packet;
5790     map->tcp_flags = tcp_flags;
5791 }
5792
/* SMC lookup function for a batch of packets.
 *
 * Doing the SMC lookup for a whole batch lets us prefetch all the buckets
 * up front to hide memory access latency.
 *
 * 'keys' holds the flow keys of the first 'cnt' packets of 'packets_', in
 * the same order, and 'index_map[i]' gives the position of packet 'i' in
 * the originally received batch.
 *
 * On an SMC hit, the flow is offered to the EMC and the packet is recorded
 * in 'flow_map' at its original position.  On a miss, the packet is kept in
 * 'packets_', its original position is written back to the front of
 * 'index_map', and a pointer to its key is appended to 'missed_keys'.  The
 * number of hits is added to the PMD_STAT_SMC_HIT counter of 'pmd'.
 */
static inline void
smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
            struct netdev_flow_key *keys,
            struct netdev_flow_key **missed_keys,
            struct dp_packet_batch *packets_,
            const int cnt,
            struct dp_packet_flow_map *flow_map,
            uint8_t *index_map)
{
    int i;
    struct dp_packet *packet;
    size_t n_smc_hit = 0, n_missed = 0;
    struct dfc_cache *cache = &pmd->flow_cache;
    struct smc_cache *smc_cache = &cache->smc_cache;
    const struct cmap_node *flow_node;
    int recv_idx;
    uint16_t tcp_flags;

    /* Prefetch the buckets for all packets. */
    for (i = 0; i < cnt; i++) {
        OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
    }

    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
        struct dp_netdev_flow *flow = NULL;
        flow_node = smc_entry_get(pmd, keys[i].hash);
        bool hit = false;
        /* Get the original position of this packet in the received batch. */
        recv_idx = index_map[i];

        if (OVS_LIKELY(flow_node != NULL)) {
            CMAP_NODE_FOR_EACH (flow, node, flow_node) {
                /* Since we don't have a per-port megaflow to check the port
                 * number, we need to verify that the input ports match. */
                if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
                    tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);

                    /* SMC hit after an EMC miss: insert into the EMC.  The
                     * key length is filled in first, just before the
                     * insertion. */
                    keys[i].len =
                        netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
                    emc_probabilistic_insert(pmd, &keys[i], flow);
                    /* Add the packet to the flow map at the position in
                     * which it was received.
                     */
                    packet_enqueue_to_flow_map(packet, flow, tcp_flags,
                                               flow_map, recv_idx);
                    n_smc_hit++;
                    hit = true;
                    break;
                }
            }
            if (hit) {
                continue;
            }
        }

        /* SMC missed.  Group missed packets together at
         * the beginning of the 'packets' array. */
        dp_packet_batch_refill(packets_, packet, i);

        /* Preserve the order of packets for flow batching. */
        index_map[n_missed] = recv_idx;

        /* Put the missed key into the pointer array returned to the
         * caller. */
        missed_keys[n_missed++] = &keys[i];
    }

    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
}
5867
/* Tries to process all the packets in 'packets_' using only the datapath
 * flow cache 'pmd->flow_cache' (plus flow marks carried by the packets).
 *
 * If no flow is found for a packet, its miniflow is left in 'keys', the
 * packet pointer is moved to the beginning of the 'packets_' array, and a
 * pointer to its key is stored in 'missed_keys' for future processing.
 *
 * Packets that hit before any miss has occurred are queued directly into
 * 'batches' (updating '*n_batches').  From the first miss onwards, hits are
 * recorded in 'flow_map' instead so that packet order is preserved;
 * '*n_flows' is set to the number of 'flow_map' entries used, and
 * 'index_map' maps each missed packet to its 'flow_map' slot.
 *
 * The function returns the number of packets that still need to be
 * processed in 'packets_' (they have been moved to the beginning of the
 * array).
 *
 * For performance reasons a caller may choose not to initialize the metadata
 * in 'packets_'.  If 'md_is_valid' is false, the metadata in 'packets_'
 * is not valid and must be initialized by this function using 'port_no'.
 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
 * will be ignored.
 */
static inline size_t
dfc_processing(struct dp_netdev_pmd_thread *pmd,
               struct dp_packet_batch *packets_,
               struct netdev_flow_key *keys,
               struct netdev_flow_key **missed_keys,
               struct packet_batch_per_flow batches[], size_t *n_batches,
               struct dp_packet_flow_map *flow_map,
               size_t *n_flows, uint8_t *index_map,
               bool md_is_valid, odp_port_t port_no)
{
    struct netdev_flow_key *key = &keys[0];
    size_t n_missed = 0, n_emc_hit = 0;
    struct dfc_cache *cache = &pmd->flow_cache;
    struct dp_packet *packet;
    const size_t cnt = dp_packet_batch_size(packets_);
    uint32_t cur_min;
    int i;
    uint16_t tcp_flags;
    bool smc_enable_db;
    size_t map_cnt = 0;
    bool batch_enable = true;

    atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
    atomic_read_relaxed(&pmd->dp->emc_insert_min, &cur_min);
    /* Valid metadata is counted as a recirculation, otherwise as a
     * reception. */
    pmd_perf_update_counter(&pmd->perf_stats,
                            md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
                            cnt);

    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
        struct dp_netdev_flow *flow;
        uint32_t mark;

        /* Drop packets too short to hold an Ethernet header. */
        if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
            dp_packet_delete(packet);
            continue;
        }

        if (i != cnt - 1) {
            struct dp_packet **packets = packets_->packets;
            /* Prefetch next packet data and metadata. */
            OVS_PREFETCH(dp_packet_data(packets[i+1]));
            pkt_metadata_prefetch_init(&packets[i+1]->md);
        }

        if (!md_is_valid) {
            pkt_metadata_init(&packet->md, port_no);
        }

        /* Flow marks are only consulted for packets that are not being
         * recirculated. */
        if ((*recirc_depth_get() == 0) &&
            dp_packet_has_flow_mark(packet, &mark)) {
            flow = mark_to_flow_find(pmd, mark);
            if (OVS_LIKELY(flow)) {
                tcp_flags = parse_tcp_flags(packet);
                if (OVS_LIKELY(batch_enable)) {
                    dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
                                            n_batches);
                } else {
                    /* Flow batching should be performed only after fast-path
                     * processing is also completed for packets with emc miss
                     * or else it will result in reordering of packets with
                     * same datapath flows. */
                    packet_enqueue_to_flow_map(packet, flow, tcp_flags,
                                               flow_map, map_cnt++);
                }
                continue;
            }
        }

        miniflow_extract(packet, &key->mf);
        key->len = 0; /* Not computed yet. */
        /* If both EMC and SMC are disabled, skip the hash computation. */
        if (smc_enable_db == true || cur_min != 0) {
            if (!md_is_valid) {
                key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet,
                        &key->mf);
            } else {
                key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
            }
        }
        /* The EMC is only looked up when 'emc_insert_min' is nonzero. */
        if (cur_min) {
            flow = emc_lookup(&cache->emc_cache, key);
        } else {
            flow = NULL;
        }
        if (OVS_LIKELY(flow)) {
            tcp_flags = miniflow_get_tcp_flags(&key->mf);
            n_emc_hit++;
            if (OVS_LIKELY(batch_enable)) {
                dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
                                        n_batches);
            } else {
                /* Flow batching should be performed only after fast-path
                 * processing is also completed for packets with emc miss
                 * or else it will result in reordering of packets with
                 * same datapath flows. */
                packet_enqueue_to_flow_map(packet, flow, tcp_flags,
                                           flow_map, map_cnt++);
            }
        } else {
            /* Exact match cache missed. Group missed packets together at
             * the beginning of the 'packets' array. */
            dp_packet_batch_refill(packets_, packet, i);

            /* Preserve the order of packet for flow batching: reserve a
             * 'flow_map' slot for this packet and remember which one. */
            index_map[n_missed] = map_cnt;
            flow_map[map_cnt++].flow = NULL;

            /* 'keys[n_missed]' contains the key of the current packet and it
             * will be passed to SMC lookup. The next key should be extracted
             * to 'keys[n_missed + 1]'.
             * We also maintain a pointer array to keys missed both SMC and EMC
             * which will be returned to the caller for future processing. */
            missed_keys[n_missed] = key;
            key = &keys[++n_missed];

            /* Skip batching for subsequent packets to avoid reordering. */
            batch_enable = false;
        }
    }
    /* Count of packets which are not flow batched. */
    *n_flows = map_cnt;

    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);

    if (!smc_enable_db) {
        return dp_packet_batch_size(packets_);
    }

    /* Packets that missed the EMC get a batch lookup in the SMC, if it is
     * enabled. */
    smc_lookup_batch(pmd, keys, missed_keys, packets_,
                     n_missed, flow_map, index_map);

    return dp_packet_batch_size(packets_);
}
6017
/* Handles a flow-table miss for 'packet', whose extracted flow key is 'key',
 * by sending it to the upcall handler.
 *
 * 'actions' and 'put_actions' are caller-provided scratch buffers; both are
 * cleared before the upcall.  On an upcall error other than ENOSPC the
 * packet is deleted and the error returned.  Otherwise the actions produced
 * by the upcall are executed on the packet and, unless the error was ENOSPC,
 * a flow is installed in 'pmd' (if an equivalent one is not already present)
 * and offered to the SMC and EMC.
 *
 * Returns the error reported by the upcall (0 on success). */
static inline int
handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
                     struct dp_packet *packet,
                     const struct netdev_flow_key *key,
                     struct ofpbuf *actions, struct ofpbuf *put_actions)
{
    struct ofpbuf *add_actions;
    struct dp_packet_batch b;
    struct match match;
    ovs_u128 ufid;
    int error;
    /* Cycle counter at entry, used for the upcall statistics below. */
    uint64_t cycles = cycles_counter_update(&pmd->perf_stats);

    match.tun_md.valid = false;
    miniflow_expand(&key->mf, &match.flow);

    ofpbuf_clear(actions);
    ofpbuf_clear(put_actions);

    dpif_flow_hash(pmd->dp->dpif, &match.flow, sizeof match.flow, &ufid);
    error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
                             &ufid, DPIF_UC_MISS, NULL, actions,
                             put_actions);
    if (OVS_UNLIKELY(error && error != ENOSPC)) {
        dp_packet_delete(packet);
        return error;
    }

    /* The Netlink encoding of datapath flow keys cannot express
     * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
     * tag is interpreted as exact match on the fact that there is no
     * VLAN.  Unless we refactor a lot of code that translates between
     * Netlink and struct flow representations, we have to do the same
     * here. */
    if (!match.wc.masks.vlans[0].tci) {
        match.wc.masks.vlans[0].tci = htons(0xffff);
    }

    /* We can't allow the packet batching in the next loop to execute
     * the actions.  Otherwise, if there are any slow path actions,
     * we'll send the packet up twice. */
    dp_packet_batch_init_packet(&b, packet);
    dp_netdev_execute_actions(pmd, &b, true, &match.flow,
                              actions->data, actions->size);

    /* Install 'put_actions' if the upcall filled them in, otherwise the
     * actions that were just executed. */
    add_actions = put_actions->size ? put_actions : actions;
    if (OVS_LIKELY(error != ENOSPC)) {
        struct dp_netdev_flow *netdev_flow;

        /* XXX: There's a race window where a flow covering this packet
         * could have already been installed since we last did the flow
         * lookup before upcall.  This could be solved by moving the
         * mutex lock outside the loop, but that's an awful long time
         * to be locking everyone out of making flow installs.  If we
         * move to a per-core classifier, it would be reasonable. */
        ovs_mutex_lock(&pmd->flow_mutex);
        netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
        if (OVS_LIKELY(!netdev_flow)) {
            netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
                                             add_actions->data,
                                             add_actions->size);
        }
        ovs_mutex_unlock(&pmd->flow_mutex);
        /* Make the flow reachable from both caches for later packets. */
        uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
        smc_insert(pmd, key, hash);
        emc_probabilistic_insert(pmd, key, netdev_flow);
    }
    if (pmd_perf_metrics_enabled(pmd)) {
        /* Update upcall stats. */
        cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
        struct pmd_perf_stats *s = &pmd->perf_stats;
        s->current.upcalls++;
        s->current.upcall_cycles += cycles;
        histogram_add_sample(&s->cycles_per_upcall, cycles);
    }
    return error;
}
6095
6096 static inline void
6097 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
6098                      struct dp_packet_batch *packets_,
6099                      struct netdev_flow_key **keys,
6100                      struct dp_packet_flow_map *flow_map,
6101                      uint8_t *index_map,
6102                      odp_port_t in_port)
6103 {
6104     const size_t cnt = dp_packet_batch_size(packets_);
6105 #if !defined(__CHECKER__) && !defined(_WIN32)
6106     const size_t PKT_ARRAY_SIZE = cnt;
6107 #else
6108     /* Sparse or MSVC doesn't like variable length array. */
6109     enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
6110 #endif
6111     struct dp_packet *packet;
6112     struct dpcls *cls;
6113     struct dpcls_rule *rules[PKT_ARRAY_SIZE];
6114     struct dp_netdev *dp = pmd->dp;
6115     int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
6116     int lookup_cnt = 0, add_lookup_cnt;
6117     bool any_miss;
6118
6119     for (size_t i = 0; i < cnt; i++) {
6120         /* Key length is needed in all the cases, hash computed on demand. */
6121         keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
6122     }
6123     /* Get the classifier for the in_port */
6124     cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
6125     if (OVS_LIKELY(cls)) {
6126         any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
6127                                 rules, cnt, &lookup_cnt);
6128     } else {
6129         any_miss = true;
6130         memset(rules, 0, sizeof(rules));
6131     }
6132     if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
6133         uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
6134         struct ofpbuf actions, put_actions;
6135
6136         ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
6137         ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
6138
6139         DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6140             struct dp_netdev_flow *netdev_flow;
6141
6142             if (OVS_LIKELY(rules[i])) {
6143                 continue;
6144             }
6145
6146             /* It's possible that an earlier slow path execution installed
6147              * a rule covering this flow.  In this case, it's a lot cheaper
6148              * to catch it here than execute a miss. */
6149             netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
6150                                                     &add_lookup_cnt);
6151             if (netdev_flow) {
6152                 lookup_cnt += add_lookup_cnt;
6153                 rules[i] = &netdev_flow->cr;
6154                 continue;
6155             }
6156
6157             int error = handle_packet_upcall(pmd, packet, keys[i],
6158                                              &actions, &put_actions);
6159
6160             if (OVS_UNLIKELY(error)) {
6161                 upcall_fail_cnt++;
6162             } else {
6163                 upcall_ok_cnt++;
6164             }
6165         }
6166
6167         ofpbuf_uninit(&actions);
6168         ofpbuf_uninit(&put_actions);
6169         fat_rwlock_unlock(&dp->upcall_rwlock);
6170     } else if (OVS_UNLIKELY(any_miss)) {
6171         DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6172             if (OVS_UNLIKELY(!rules[i])) {
6173                 dp_packet_delete(packet);
6174                 upcall_fail_cnt++;
6175             }
6176         }
6177     }
6178
6179     DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6180         struct dp_netdev_flow *flow;
6181         /* Get the original order of this packet in received batch. */
6182         int recv_idx = index_map[i];
6183         uint16_t tcp_flags;
6184
6185         if (OVS_UNLIKELY(!rules[i])) {
6186             continue;
6187         }
6188
6189         flow = dp_netdev_flow_cast(rules[i]);
6190         uint32_t hash =  dp_netdev_flow_hash(&flow->ufid);
6191         smc_insert(pmd, keys[i], hash);
6192
6193         emc_probabilistic_insert(pmd, keys[i], flow);
6194         /* Add these packets into the flow map in the same order
6195          * as received.
6196          */
6197         tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
6198         packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6199                                    flow_map, recv_idx);
6200     }
6201
6202     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
6203                             cnt - upcall_ok_cnt - upcall_fail_cnt);
6204     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
6205                             lookup_cnt);
6206     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
6207                             upcall_ok_cnt);
6208     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
6209                             upcall_fail_cnt);
6210 }
6211
6212 /* Packets enter the datapath from a port (or from recirculation) here.
6213  *
6214  * When 'md_is_valid' is true the metadata in 'packets' are already valid.
6215  * When false the metadata in 'packets' need to be initialized. */
6216 static void
6217 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
6218                   struct dp_packet_batch *packets,
6219                   bool md_is_valid, odp_port_t port_no)
6220 {
6221 #if !defined(__CHECKER__) && !defined(_WIN32)
6222     const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
6223 #else
6224     /* Sparse or MSVC doesn't like variable length array. */
6225     enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
6226 #endif
6227     OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
6228         struct netdev_flow_key keys[PKT_ARRAY_SIZE];
6229     struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
6230     struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
6231     size_t n_batches;
6232     struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
6233     uint8_t index_map[PKT_ARRAY_SIZE];
6234     size_t n_flows, i;
6235
6236     odp_port_t in_port;
6237
6238     n_batches = 0;
6239     dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
6240                    flow_map, &n_flows, index_map, md_is_valid, port_no);
6241
6242     if (!dp_packet_batch_is_empty(packets)) {
6243         /* Get ingress port from first packet's metadata. */
6244         in_port = packets->packets[0]->md.in_port.odp_port;
6245         fast_path_processing(pmd, packets, missed_keys,
6246                              flow_map, index_map, in_port);
6247     }
6248
6249     /* Batch rest of packets which are in flow map. */
6250     for (i = 0; i < n_flows; i++) {
6251         struct dp_packet_flow_map *map = &flow_map[i];
6252
6253         if (OVS_UNLIKELY(!map->flow)) {
6254             continue;
6255         }
6256         dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
6257                                 batches, &n_batches);
6258      }
6259
6260     /* All the flow batches need to be reset before any call to
6261      * packet_batch_per_flow_execute() as it could potentially trigger
6262      * recirculation. When a packet matching flow ‘j’ happens to be
6263      * recirculated, the nested call to dp_netdev_input__() could potentially
6264      * classify the packet as matching another flow - say 'k'. It could happen
6265      * that in the previous call to dp_netdev_input__() that same flow 'k' had
6266      * already its own batches[k] still waiting to be served.  So if its
6267      * ‘batch’ member is not reset, the recirculated packet would be wrongly
6268      * appended to batches[k] of the 1st call to dp_netdev_input__(). */
6269     for (i = 0; i < n_batches; i++) {
6270         batches[i].flow->batch = NULL;
6271     }
6272
6273     for (i = 0; i < n_batches; i++) {
6274         packet_batch_per_flow_execute(&batches[i], pmd);
6275     }
6276 }
6277
6278 static void
6279 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
6280                 struct dp_packet_batch *packets,
6281                 odp_port_t port_no)
6282 {
6283     dp_netdev_input__(pmd, packets, false, port_no);
6284 }
6285
/* Sends 'packets' through the datapath again.  Their metadata is passed on
 * as already valid, and 0 is given as the port number. */
static void
dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
                      struct dp_packet_batch *packets)
{
    const bool md_is_valid = true;

    dp_netdev_input__(pmd, packets, md_is_valid, 0);
}
6292
/* Context handed to dp_execute_cb() through its opaque 'aux_' argument.
 *
 * The member order matters: dp_netdev_execute_actions() fills the structure
 * with a positional initializer. */
struct dp_netdev_execute_aux {
    struct dp_netdev_pmd_thread *pmd;   /* PMD thread running the actions. */
    const struct flow *flow;            /* Flow given to
                                         * dp_netdev_execute_actions(); the
                                         * conntrack action reads 'dl_type',
                                         * 'tp_src' and 'tp_dst' from it. */
};
6297
6298 static void
6299 dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
6300                                  void *aux)
6301 {
6302     struct dp_netdev *dp = get_dp_netdev(dpif);
6303     dp->dp_purge_aux = aux;
6304     dp->dp_purge_cb = cb;
6305 }
6306
6307 static void
6308 dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
6309                                void *aux)
6310 {
6311     struct dp_netdev *dp = get_dp_netdev(dpif);
6312     dp->upcall_aux = aux;
6313     dp->upcall_cb = cb;
6314 }
6315
6316 static void
6317 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
6318                                bool purge)
6319 {
6320     struct tx_port *tx;
6321     struct dp_netdev_port *port;
6322     long long interval;
6323
6324     HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
6325         if (!tx->port->dynamic_txqs) {
6326             continue;
6327         }
6328         interval = pmd->ctx.now - tx->last_used;
6329         if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
6330             port = tx->port;
6331             ovs_mutex_lock(&port->txq_used_mutex);
6332             port->txq_used[tx->qid]--;
6333             ovs_mutex_unlock(&port->txq_used_mutex);
6334             tx->qid = -1;
6335         }
6336     }
6337 }
6338
6339 static int
6340 dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
6341                            struct tx_port *tx)
6342 {
6343     struct dp_netdev_port *port;
6344     long long interval;
6345     int i, min_cnt, min_qid;
6346
6347     interval = pmd->ctx.now - tx->last_used;
6348     tx->last_used = pmd->ctx.now;
6349
6350     if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
6351         return tx->qid;
6352     }
6353
6354     port = tx->port;
6355
6356     ovs_mutex_lock(&port->txq_used_mutex);
6357     if (tx->qid >= 0) {
6358         port->txq_used[tx->qid]--;
6359         tx->qid = -1;
6360     }
6361
6362     min_cnt = -1;
6363     min_qid = 0;
6364     for (i = 0; i < netdev_n_txq(port->netdev); i++) {
6365         if (port->txq_used[i] < min_cnt || min_cnt == -1) {
6366             min_cnt = port->txq_used[i];
6367             min_qid = i;
6368         }
6369     }
6370
6371     port->txq_used[min_qid]++;
6372     tx->qid = min_qid;
6373
6374     ovs_mutex_unlock(&port->txq_used_mutex);
6375
6376     dpif_netdev_xps_revalidate_pmd(pmd, false);
6377
6378     VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
6379              pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
6380     return min_qid;
6381 }
6382
6383 static struct tx_port *
6384 pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
6385                           odp_port_t port_no)
6386 {
6387     return tx_port_lookup(&pmd->tnl_port_cache, port_no);
6388 }
6389
6390 static struct tx_port *
6391 pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
6392                            odp_port_t port_no)
6393 {
6394     return tx_port_lookup(&pmd->send_port_cache, port_no);
6395 }
6396
6397 static int
6398 push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
6399                 const struct nlattr *attr,
6400                 struct dp_packet_batch *batch)
6401 {
6402     struct tx_port *tun_port;
6403     const struct ovs_action_push_tnl *data;
6404     int err;
6405
6406     data = nl_attr_get(attr);
6407
6408     tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
6409     if (!tun_port) {
6410         err = -EINVAL;
6411         goto error;
6412     }
6413     err = netdev_push_header(tun_port->port->netdev, batch, data);
6414     if (!err) {
6415         return 0;
6416     }
6417 error:
6418     dp_packet_delete_batch(batch, true);
6419     return err;
6420 }
6421
6422 static void
6423 dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
6424                             struct dp_packet *packet, bool should_steal,
6425                             struct flow *flow, ovs_u128 *ufid,
6426                             struct ofpbuf *actions,
6427                             const struct nlattr *userdata)
6428 {
6429     struct dp_packet_batch b;
6430     int error;
6431
6432     ofpbuf_clear(actions);
6433
6434     error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
6435                              DPIF_UC_ACTION, userdata, actions,
6436                              NULL);
6437     if (!error || error == ENOSPC) {
6438         dp_packet_batch_init_packet(&b, packet);
6439         dp_netdev_execute_actions(pmd, &b, should_steal, flow,
6440                                   actions->data, actions->size);
6441     } else if (should_steal) {
6442         dp_packet_delete(packet);
6443     }
6444 }
6445
6446 static void
6447 dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
6448               const struct nlattr *a, bool should_steal)
6449     OVS_NO_THREAD_SAFETY_ANALYSIS
6450 {
6451     struct dp_netdev_execute_aux *aux = aux_;
6452     uint32_t *depth = recirc_depth_get();
6453     struct dp_netdev_pmd_thread *pmd = aux->pmd;
6454     struct dp_netdev *dp = pmd->dp;
6455     int type = nl_attr_type(a);
6456     struct tx_port *p;
6457
6458     switch ((enum ovs_action_attr)type) {
6459     case OVS_ACTION_ATTR_OUTPUT:
6460         p = pmd_send_port_cache_lookup(pmd, nl_attr_get_odp_port(a));
6461         if (OVS_LIKELY(p)) {
6462             struct dp_packet *packet;
6463             struct dp_packet_batch out;
6464
6465             if (!should_steal) {
6466                 dp_packet_batch_clone(&out, packets_);
6467                 dp_packet_batch_reset_cutlen(packets_);
6468                 packets_ = &out;
6469             }
6470             dp_packet_batch_apply_cutlen(packets_);
6471
6472 #ifdef DPDK_NETDEV
6473             if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
6474                              && packets_->packets[0]->source
6475                                 != p->output_pkts.packets[0]->source)) {
6476                 /* XXX: netdev-dpdk assumes that all packets in a single
6477                  *      output batch has the same source. Flush here to
6478                  *      avoid memory access issues. */
6479                 dp_netdev_pmd_flush_output_on_port(pmd, p);
6480             }
6481 #endif
6482             if (dp_packet_batch_size(&p->output_pkts)
6483                 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
6484                 /* Flush here to avoid overflow. */
6485                 dp_netdev_pmd_flush_output_on_port(pmd, p);
6486             }
6487
6488             if (dp_packet_batch_is_empty(&p->output_pkts)) {
6489                 pmd->n_output_batches++;
6490             }
6491
6492             DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6493                 p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
6494                                                              pmd->ctx.last_rxq;
6495                 dp_packet_batch_add(&p->output_pkts, packet);
6496             }
6497             return;
6498         }
6499         break;
6500
6501     case OVS_ACTION_ATTR_TUNNEL_PUSH:
6502         if (should_steal) {
6503             /* We're requested to push tunnel header, but also we need to take
6504              * the ownership of these packets. Thus, we can avoid performing
6505              * the action, because the caller will not use the result anyway.
6506              * Just break to free the batch. */
6507             break;
6508         }
6509         dp_packet_batch_apply_cutlen(packets_);
6510         push_tnl_action(pmd, a, packets_);
6511         return;
6512
6513     case OVS_ACTION_ATTR_TUNNEL_POP:
6514         if (*depth < MAX_RECIRC_DEPTH) {
6515             struct dp_packet_batch *orig_packets_ = packets_;
6516             odp_port_t portno = nl_attr_get_odp_port(a);
6517
6518             p = pmd_tnl_port_cache_lookup(pmd, portno);
6519             if (p) {
6520                 struct dp_packet_batch tnl_pkt;
6521
6522                 if (!should_steal) {
6523                     dp_packet_batch_clone(&tnl_pkt, packets_);
6524                     packets_ = &tnl_pkt;
6525                     dp_packet_batch_reset_cutlen(orig_packets_);
6526                 }
6527
6528                 dp_packet_batch_apply_cutlen(packets_);
6529
6530                 netdev_pop_header(p->port->netdev, packets_);
6531                 if (dp_packet_batch_is_empty(packets_)) {
6532                     return;
6533                 }
6534
6535                 struct dp_packet *packet;
6536                 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6537                     packet->md.in_port.odp_port = portno;
6538                 }
6539
6540                 (*depth)++;
6541                 dp_netdev_recirculate(pmd, packets_);
6542                 (*depth)--;
6543                 return;
6544             }
6545         }
6546         break;
6547
6548     case OVS_ACTION_ATTR_USERSPACE:
6549         if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
6550             struct dp_packet_batch *orig_packets_ = packets_;
6551             const struct nlattr *userdata;
6552             struct dp_packet_batch usr_pkt;
6553             struct ofpbuf actions;
6554             struct flow flow;
6555             ovs_u128 ufid;
6556             bool clone = false;
6557
6558             userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
6559             ofpbuf_init(&actions, 0);
6560
6561             if (packets_->trunc) {
6562                 if (!should_steal) {
6563                     dp_packet_batch_clone(&usr_pkt, packets_);
6564                     packets_ = &usr_pkt;
6565                     clone = true;
6566                     dp_packet_batch_reset_cutlen(orig_packets_);
6567                 }
6568
6569                 dp_packet_batch_apply_cutlen(packets_);
6570             }
6571
6572             struct dp_packet *packet;
6573             DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6574                 flow_extract(packet, &flow);
6575                 dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
6576                 dp_execute_userspace_action(pmd, packet, should_steal, &flow,
6577                                             &ufid, &actions, userdata);
6578             }
6579
6580             if (clone) {
6581                 dp_packet_delete_batch(packets_, true);
6582             }
6583
6584             ofpbuf_uninit(&actions);
6585             fat_rwlock_unlock(&dp->upcall_rwlock);
6586
6587             return;
6588         }
6589         break;
6590
6591     case OVS_ACTION_ATTR_RECIRC:
6592         if (*depth < MAX_RECIRC_DEPTH) {
6593             struct dp_packet_batch recirc_pkts;
6594
6595             if (!should_steal) {
6596                dp_packet_batch_clone(&recirc_pkts, packets_);
6597                packets_ = &recirc_pkts;
6598             }
6599
6600             struct dp_packet *packet;
6601             DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6602                 packet->md.recirc_id = nl_attr_get_u32(a);
6603             }
6604
6605             (*depth)++;
6606             dp_netdev_recirculate(pmd, packets_);
6607             (*depth)--;
6608
6609             return;
6610         }
6611
6612         VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
6613         break;
6614
6615     case OVS_ACTION_ATTR_CT: {
6616         const struct nlattr *b;
6617         bool force = false;
6618         bool commit = false;
6619         unsigned int left;
6620         uint16_t zone = 0;
6621         const char *helper = NULL;
6622         const uint32_t *setmark = NULL;
6623         const struct ovs_key_ct_labels *setlabel = NULL;
6624         struct nat_action_info_t nat_action_info;
6625         struct nat_action_info_t *nat_action_info_ref = NULL;
6626         bool nat_config = false;
6627
6628         NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
6629                                  nl_attr_get_size(a)) {
6630             enum ovs_ct_attr sub_type = nl_attr_type(b);
6631
6632             switch(sub_type) {
6633             case OVS_CT_ATTR_FORCE_COMMIT:
6634                 force = true;
6635                 /* fall through. */
6636             case OVS_CT_ATTR_COMMIT:
6637                 commit = true;
6638                 break;
6639             case OVS_CT_ATTR_ZONE:
6640                 zone = nl_attr_get_u16(b);
6641                 break;
6642             case OVS_CT_ATTR_HELPER:
6643                 helper = nl_attr_get_string(b);
6644                 break;
6645             case OVS_CT_ATTR_MARK:
6646                 setmark = nl_attr_get(b);
6647                 break;
6648             case OVS_CT_ATTR_LABELS:
6649                 setlabel = nl_attr_get(b);
6650                 break;
6651             case OVS_CT_ATTR_EVENTMASK:
6652                 /* Silently ignored, as userspace datapath does not generate
6653                  * netlink events. */
6654                 break;
6655             case OVS_CT_ATTR_NAT: {
6656                 const struct nlattr *b_nest;
6657                 unsigned int left_nest;
6658                 bool ip_min_specified = false;
6659                 bool proto_num_min_specified = false;
6660                 bool ip_max_specified = false;
6661                 bool proto_num_max_specified = false;
6662                 memset(&nat_action_info, 0, sizeof nat_action_info);
6663                 nat_action_info_ref = &nat_action_info;
6664
6665                 NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
6666                     enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
6667
6668                     switch (sub_type_nest) {
6669                     case OVS_NAT_ATTR_SRC:
6670                     case OVS_NAT_ATTR_DST:
6671                         nat_config = true;
6672                         nat_action_info.nat_action |=
6673                             ((sub_type_nest == OVS_NAT_ATTR_SRC)
6674                                 ? NAT_ACTION_SRC : NAT_ACTION_DST);
6675                         break;
6676                     case OVS_NAT_ATTR_IP_MIN:
6677                         memcpy(&nat_action_info.min_addr,
6678                                nl_attr_get(b_nest),
6679                                nl_attr_get_size(b_nest));
6680                         ip_min_specified = true;
6681                         break;
6682                     case OVS_NAT_ATTR_IP_MAX:
6683                         memcpy(&nat_action_info.max_addr,
6684                                nl_attr_get(b_nest),
6685                                nl_attr_get_size(b_nest));
6686                         ip_max_specified = true;
6687                         break;
6688                     case OVS_NAT_ATTR_PROTO_MIN:
6689                         nat_action_info.min_port =
6690                             nl_attr_get_u16(b_nest);
6691                         proto_num_min_specified = true;
6692                         break;
6693                     case OVS_NAT_ATTR_PROTO_MAX:
6694                         nat_action_info.max_port =
6695                             nl_attr_get_u16(b_nest);
6696                         proto_num_max_specified = true;
6697                         break;
6698                     case OVS_NAT_ATTR_PERSISTENT:
6699                     case OVS_NAT_ATTR_PROTO_HASH:
6700                     case OVS_NAT_ATTR_PROTO_RANDOM:
6701                         break;
6702                     case OVS_NAT_ATTR_UNSPEC:
6703                     case __OVS_NAT_ATTR_MAX:
6704                         OVS_NOT_REACHED();
6705                     }
6706                 }
6707
6708                 if (ip_min_specified && !ip_max_specified) {
6709                     nat_action_info.max_addr = nat_action_info.min_addr;
6710                 }
6711                 if (proto_num_min_specified && !proto_num_max_specified) {
6712                     nat_action_info.max_port = nat_action_info.min_port;
6713                 }
6714                 if (proto_num_min_specified || proto_num_max_specified) {
6715                     if (nat_action_info.nat_action & NAT_ACTION_SRC) {
6716                         nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
6717                     } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
6718                         nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
6719                     }
6720                 }
6721                 break;
6722             }
6723             case OVS_CT_ATTR_UNSPEC:
6724             case __OVS_CT_ATTR_MAX:
6725                 OVS_NOT_REACHED();
6726             }
6727         }
6728
6729         /* We won't be able to function properly in this case, hence
6730          * complain loudly. */
6731         if (nat_config && !commit) {
6732             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
6733             VLOG_WARN_RL(&rl, "NAT specified without commit.");
6734         }
6735
6736         conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, force,
6737                           commit, zone, setmark, setlabel, aux->flow->tp_src,
6738                           aux->flow->tp_dst, helper, nat_action_info_ref,
6739                           pmd->ctx.now / 1000);
6740         break;
6741     }
6742
6743     case OVS_ACTION_ATTR_METER:
6744         dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
6745                             pmd->ctx.now);
6746         break;
6747
6748     case OVS_ACTION_ATTR_PUSH_VLAN:
6749     case OVS_ACTION_ATTR_POP_VLAN:
6750     case OVS_ACTION_ATTR_PUSH_MPLS:
6751     case OVS_ACTION_ATTR_POP_MPLS:
6752     case OVS_ACTION_ATTR_SET:
6753     case OVS_ACTION_ATTR_SET_MASKED:
6754     case OVS_ACTION_ATTR_SAMPLE:
6755     case OVS_ACTION_ATTR_HASH:
6756     case OVS_ACTION_ATTR_UNSPEC:
6757     case OVS_ACTION_ATTR_TRUNC:
6758     case OVS_ACTION_ATTR_PUSH_ETH:
6759     case OVS_ACTION_ATTR_POP_ETH:
6760     case OVS_ACTION_ATTR_CLONE:
6761     case OVS_ACTION_ATTR_PUSH_NSH:
6762     case OVS_ACTION_ATTR_POP_NSH:
6763     case OVS_ACTION_ATTR_CT_CLEAR:
6764     case __OVS_ACTION_ATTR_MAX:
6765         OVS_NOT_REACHED();
6766     }
6767
6768     dp_packet_delete_batch(packets_, should_steal);
6769 }
6770
6771 static void
6772 dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
6773                           struct dp_packet_batch *packets,
6774                           bool should_steal, const struct flow *flow,
6775                           const struct nlattr *actions, size_t actions_len)
6776 {
6777     struct dp_netdev_execute_aux aux = { pmd, flow };
6778
6779     odp_execute_actions(&aux, packets, should_steal, actions,
6780                         actions_len, dp_execute_cb);
6781 }
6782
6783 struct dp_netdev_ct_dump {
6784     struct ct_dpif_dump_state up;
6785     struct conntrack_dump dump;
6786     struct conntrack *ct;
6787     struct dp_netdev *dp;
6788 };
6789
6790 static int
6791 dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
6792                           const uint16_t *pzone, int *ptot_bkts)
6793 {
6794     struct dp_netdev *dp = get_dp_netdev(dpif);
6795     struct dp_netdev_ct_dump *dump;
6796
6797     dump = xzalloc(sizeof *dump);
6798     dump->dp = dp;
6799     dump->ct = &dp->conntrack;
6800
6801     conntrack_dump_start(&dp->conntrack, &dump->dump, pzone, ptot_bkts);
6802
6803     *dump_ = &dump->up;
6804
6805     return 0;
6806 }
6807
6808 static int
6809 dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
6810                          struct ct_dpif_dump_state *dump_,
6811                          struct ct_dpif_entry *entry)
6812 {
6813     struct dp_netdev_ct_dump *dump;
6814
6815     INIT_CONTAINER(dump, dump_, up);
6816
6817     return conntrack_dump_next(&dump->dump, entry);
6818 }
6819
6820 static int
6821 dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
6822                          struct ct_dpif_dump_state *dump_)
6823 {
6824     struct dp_netdev_ct_dump *dump;
6825     int err;
6826
6827     INIT_CONTAINER(dump, dump_, up);
6828
6829     err = conntrack_dump_done(&dump->dump);
6830
6831     free(dump);
6832
6833     return err;
6834 }
6835
6836 static int
6837 dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
6838                      const struct ct_dpif_tuple *tuple)
6839 {
6840     struct dp_netdev *dp = get_dp_netdev(dpif);
6841
6842     if (tuple) {
6843         return conntrack_flush_tuple(&dp->conntrack, tuple, zone ? *zone : 0);
6844     }
6845     return conntrack_flush(&dp->conntrack, zone);
6846 }
6847
6848 static int
6849 dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
6850 {
6851     struct dp_netdev *dp = get_dp_netdev(dpif);
6852
6853     return conntrack_set_maxconns(&dp->conntrack, maxconns);
6854 }
6855
6856 static int
6857 dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
6858 {
6859     struct dp_netdev *dp = get_dp_netdev(dpif);
6860
6861     return conntrack_get_maxconns(&dp->conntrack, maxconns);
6862 }
6863
6864 static int
6865 dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
6866 {
6867     struct dp_netdev *dp = get_dp_netdev(dpif);
6868
6869     return conntrack_get_nconns(&dp->conntrack, nconns);
6870 }
6871
6872 const struct dpif_class dpif_netdev_class = {
6873     "netdev",
6874     dpif_netdev_init,
6875     dpif_netdev_enumerate,
6876     dpif_netdev_port_open_type,
6877     dpif_netdev_open,
6878     dpif_netdev_close,
6879     dpif_netdev_destroy,
6880     dpif_netdev_run,
6881     dpif_netdev_wait,
6882     dpif_netdev_get_stats,
6883     dpif_netdev_port_add,
6884     dpif_netdev_port_del,
6885     dpif_netdev_port_set_config,
6886     dpif_netdev_port_query_by_number,
6887     dpif_netdev_port_query_by_name,
6888     NULL,                       /* port_get_pid */
6889     dpif_netdev_port_dump_start,
6890     dpif_netdev_port_dump_next,
6891     dpif_netdev_port_dump_done,
6892     dpif_netdev_port_poll,
6893     dpif_netdev_port_poll_wait,
6894     dpif_netdev_flow_flush,
6895     dpif_netdev_flow_dump_create,
6896     dpif_netdev_flow_dump_destroy,
6897     dpif_netdev_flow_dump_thread_create,
6898     dpif_netdev_flow_dump_thread_destroy,
6899     dpif_netdev_flow_dump_next,
6900     dpif_netdev_operate,
6901     NULL,                       /* recv_set */
6902     NULL,                       /* handlers_set */
6903     dpif_netdev_set_config,
6904     dpif_netdev_queue_to_priority,
6905     NULL,                       /* recv */
6906     NULL,                       /* recv_wait */
6907     NULL,                       /* recv_purge */
6908     dpif_netdev_register_dp_purge_cb,
6909     dpif_netdev_register_upcall_cb,
6910     dpif_netdev_enable_upcall,
6911     dpif_netdev_disable_upcall,
6912     dpif_netdev_get_datapath_version,
6913     dpif_netdev_ct_dump_start,
6914     dpif_netdev_ct_dump_next,
6915     dpif_netdev_ct_dump_done,
6916     dpif_netdev_ct_flush,
6917     dpif_netdev_ct_set_maxconns,
6918     dpif_netdev_ct_get_maxconns,
6919     dpif_netdev_ct_get_nconns,
6920     NULL,                       /* ct_set_limits */
6921     NULL,                       /* ct_get_limits */
6922     NULL,                       /* ct_del_limits */
6923     dpif_netdev_meter_get_features,
6924     dpif_netdev_meter_set,
6925     dpif_netdev_meter_get,
6926     dpif_netdev_meter_del,
6927 };
6928
6929 static void
6930 dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
6931                               const char *argv[], void *aux OVS_UNUSED)
6932 {
6933     struct dp_netdev_port *port;
6934     struct dp_netdev *dp;
6935     odp_port_t port_no;
6936
6937     ovs_mutex_lock(&dp_netdev_mutex);
6938     dp = shash_find_data(&dp_netdevs, argv[1]);
6939     if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
6940         ovs_mutex_unlock(&dp_netdev_mutex);
6941         unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
6942         return;
6943     }
6944     ovs_refcount_ref(&dp->ref_cnt);
6945     ovs_mutex_unlock(&dp_netdev_mutex);
6946
6947     ovs_mutex_lock(&dp->port_mutex);
6948     if (get_port_by_name(dp, argv[2], &port)) {
6949         unixctl_command_reply_error(conn, "unknown port");
6950         goto exit;
6951     }
6952
6953     port_no = u32_to_odp(atoi(argv[3]));
6954     if (!port_no || port_no == ODPP_NONE) {
6955         unixctl_command_reply_error(conn, "bad port number");
6956         goto exit;
6957     }
6958     if (dp_netdev_lookup_port(dp, port_no)) {
6959         unixctl_command_reply_error(conn, "port number already in use");
6960         goto exit;
6961     }
6962
6963     /* Remove port. */
6964     hmap_remove(&dp->ports, &port->node);
6965     reconfigure_datapath(dp);
6966
6967     /* Reinsert with new port number. */
6968     port->port_no = port_no;
6969     hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
6970     reconfigure_datapath(dp);
6971
6972     seq_change(dp->port_seq);
6973     unixctl_command_reply(conn, NULL);
6974
6975 exit:
6976     ovs_mutex_unlock(&dp->port_mutex);
6977     dp_netdev_unref(dp);
6978 }
6979
6980 static void
6981 dpif_dummy_register__(const char *type)
6982 {
6983     struct dpif_class *class;
6984
6985     class = xmalloc(sizeof *class);
6986     *class = dpif_netdev_class;
6987     class->type = xstrdup(type);
6988     dp_register_provider(class);
6989 }
6990
/* Replaces the provider registered for 'type' with a dummy one.
 *
 * The dummy is registered when unregistering the existing provider succeeds
 * and also when it fails with EAFNOSUPPORT.  The latter is tolerated to allow
 * --enable-dummy=system with a userland-only build, which is useful for the
 * testsuite.  Any other error leaves the providers untouched. */
static void
dpif_dummy_override(const char *type)
{
    int error = dp_unregister_provider(type);

    if (error != 0 && error != EAFNOSUPPORT) {
        return;
    }
    dpif_dummy_register__(type);
}
7005
7006 void
7007 dpif_dummy_register(enum dummy_level level)
7008 {
7009     if (level == DUMMY_OVERRIDE_ALL) {
7010         struct sset types;
7011         const char *type;
7012
7013         sset_init(&types);
7014         dp_enumerate_types(&types);
7015         SSET_FOR_EACH (type, &types) {
7016             dpif_dummy_override(type);
7017         }
7018         sset_destroy(&types);
7019     } else if (level == DUMMY_OVERRIDE_SYSTEM) {
7020         dpif_dummy_override("system");
7021     }
7022
7023     dpif_dummy_register__("dummy");
7024
7025     unixctl_command_register("dpif-dummy/change-port-number",
7026                              "dp port new-number",
7027                              3, 3, dpif_dummy_change_port_number, NULL);
7028 }
7029 \f
7030 /* Datapath Classifier. */
7031
7032 /* A set of rules that all have the same fields wildcarded. */
7033 struct dpcls_subtable {
7034     /* The fields are only used by writers. */
7035     struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */
7036
7037     /* These fields are accessed by readers. */
7038     struct cmap rules;           /* Contains "struct dpcls_rule"s. */
7039     uint32_t hit_cnt;            /* Number of match hits in subtable in current
7040                                     optimization interval. */
7041     struct netdev_flow_key mask; /* Wildcards for fields (const). */
7042     /* 'mask' must be the last field, additional space is allocated here. */
7043 };
7044
7045 /* Initializes 'cls' as a classifier that initially contains no classification
7046  * rules. */
7047 static void
7048 dpcls_init(struct dpcls *cls)
7049 {
7050     cmap_init(&cls->subtables_map);
7051     pvector_init(&cls->subtables);
7052 }
7053
7054 static void
7055 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
7056 {
7057     VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
7058     pvector_remove(&cls->subtables, subtable);
7059     cmap_remove(&cls->subtables_map, &subtable->cmap_node,
7060                 subtable->mask.hash);
7061     cmap_destroy(&subtable->rules);
7062     ovsrcu_postpone(free, subtable);
7063 }
7064
7065 /* Destroys 'cls'.  Rules within 'cls', if any, are not freed; this is the
7066  * caller's responsibility.
7067  * May only be called after all the readers have been terminated. */
7068 static void
7069 dpcls_destroy(struct dpcls *cls)
7070 {
7071     if (cls) {
7072         struct dpcls_subtable *subtable;
7073
7074         CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
7075             ovs_assert(cmap_count(&subtable->rules) == 0);
7076             dpcls_destroy_subtable(cls, subtable);
7077         }
7078         cmap_destroy(&cls->subtables_map);
7079         pvector_destroy(&cls->subtables);
7080     }
7081 }
7082
7083 static struct dpcls_subtable *
7084 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
7085 {
7086     struct dpcls_subtable *subtable;
7087
7088     /* Need to add one. */
7089     subtable = xmalloc(sizeof *subtable
7090                        - sizeof subtable->mask.mf + mask->len);
7091     cmap_init(&subtable->rules);
7092     subtable->hit_cnt = 0;
7093     netdev_flow_key_clone(&subtable->mask, mask);
7094     cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
7095     /* Add the new subtable at the end of the pvector (with no hits yet) */
7096     pvector_insert(&cls->subtables, subtable, 0);
7097     VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
7098              cmap_count(&cls->subtables_map), subtable, cls->in_port);
7099     pvector_publish(&cls->subtables);
7100
7101     return subtable;
7102 }
7103
7104 static inline struct dpcls_subtable *
7105 dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
7106 {
7107     struct dpcls_subtable *subtable;
7108
7109     CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
7110                              &cls->subtables_map) {
7111         if (netdev_flow_key_equal(&subtable->mask, mask)) {
7112             return subtable;
7113         }
7114     }
7115     return dpcls_create_subtable(cls, mask);
7116 }
7117
7118
7119 /* Periodically sort the dpcls subtable vectors according to hit counts */
7120 static void
7121 dpcls_sort_subtable_vector(struct dpcls *cls)
7122 {
7123     struct pvector *pvec = &cls->subtables;
7124     struct dpcls_subtable *subtable;
7125
7126     PVECTOR_FOR_EACH (subtable, pvec) {
7127         pvector_change_priority(pvec, subtable, subtable->hit_cnt);
7128         subtable->hit_cnt = 0;
7129     }
7130     pvector_publish(pvec);
7131 }
7132
7133 static inline void
7134 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
7135                            struct polled_queue *poll_list, int poll_cnt)
7136 {
7137     struct dpcls *cls;
7138
7139     if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
7140         uint64_t curr_tsc;
7141         /* Get the cycles that were used to process each queue and store. */
7142         for (unsigned i = 0; i < poll_cnt; i++) {
7143             uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
7144                                                         RXQ_CYCLES_PROC_CURR);
7145             dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
7146             dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
7147                                      0);
7148         }
7149         curr_tsc = cycles_counter_update(&pmd->perf_stats);
7150         if (pmd->intrvl_tsc_prev) {
7151             /* There is a prev timestamp, store a new intrvl cycle count. */
7152             atomic_store_relaxed(&pmd->intrvl_cycles,
7153                                  curr_tsc - pmd->intrvl_tsc_prev);
7154         }
7155         pmd->intrvl_tsc_prev = curr_tsc;
7156         /* Start new measuring interval */
7157         pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
7158     }
7159
7160     if (pmd->ctx.now > pmd->next_optimization) {
7161         /* Try to obtain the flow lock to block out revalidator threads.
7162          * If not possible, just try next time. */
7163         if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
7164             /* Optimize each classifier */
7165             CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
7166                 dpcls_sort_subtable_vector(cls);
7167             }
7168             ovs_mutex_unlock(&pmd->flow_mutex);
7169             /* Start new measuring interval */
7170             pmd->next_optimization = pmd->ctx.now
7171                                      + DPCLS_OPTIMIZATION_INTERVAL;
7172         }
7173     }
7174 }
7175
7176 /* Insert 'rule' into 'cls'. */
7177 static void
7178 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
7179              const struct netdev_flow_key *mask)
7180 {
7181     struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
7182
7183     /* Refer to subtable's mask, also for later removal. */
7184     rule->mask = &subtable->mask;
7185     cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
7186 }
7187
7188 /* Removes 'rule' from 'cls', also destructing the 'rule'. */
7189 static void
7190 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
7191 {
7192     struct dpcls_subtable *subtable;
7193
7194     ovs_assert(rule->mask);
7195
7196     /* Get subtable from reference in rule->mask. */
7197     INIT_CONTAINER(subtable, rule->mask, mask);
7198     if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
7199         == 0) {
7200         /* Delete empty subtable. */
7201         dpcls_destroy_subtable(cls, subtable);
7202         pvector_publish(&cls->subtables);
7203     }
7204 }
7205
7206 /* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
7207  * in 'mask' the values in 'key' and 'target' are the same. */
7208 static bool
7209 dpcls_rule_matches_key(const struct dpcls_rule *rule,
7210                        const struct netdev_flow_key *target)
7211 {
7212     const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
7213     const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
7214     uint64_t value;
7215
7216     NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
7217         if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
7218             return false;
7219         }
7220     }
7221     return true;
7222 }
7223
7224 /* For each miniflow in 'keys' performs a classifier lookup writing the result
7225  * into the corresponding slot in 'rules'.  If a particular entry in 'keys' is
7226  * NULL it is skipped.
7227  *
7228  * This function is optimized for use in the userspace datapath and therefore
7229  * does not implement a lot of features available in the standard
7230  * classifier_lookup() function.  Specifically, it does not implement
7231  * priorities, instead returning any rule which matches the flow.
7232  *
7233  * Returns true if all miniflows found a corresponding rule. */
7234 static bool
7235 dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
7236              struct dpcls_rule **rules, const size_t cnt,
7237              int *num_lookups_p)
7238 {
7239     /* The received 'cnt' miniflows are the search-keys that will be processed
7240      * to find a matching entry into the available subtables.
7241      * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
7242     typedef uint32_t map_type;
7243 #define MAP_BITS (sizeof(map_type) * CHAR_BIT)
7244     BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
7245
7246     struct dpcls_subtable *subtable;
7247
7248     map_type keys_map = TYPE_MAXIMUM(map_type); /* Set all bits. */
7249     map_type found_map;
7250     uint32_t hashes[MAP_BITS];
7251     const struct cmap_node *nodes[MAP_BITS];
7252
7253     if (cnt != MAP_BITS) {
7254         keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
7255     }
7256     memset(rules, 0, cnt * sizeof *rules);
7257
7258     int lookups_match = 0, subtable_pos = 1;
7259
7260     /* The Datapath classifier - aka dpcls - is composed of subtables.
7261      * Subtables are dynamically created as needed when new rules are inserted.
7262      * Each subtable collects rules with matches on a specific subset of packet
7263      * fields as defined by the subtable's mask.  We proceed to process every
7264      * search-key against each subtable, but when a match is found for a
7265      * search-key, the search for that key can stop because the rules are
7266      * non-overlapping. */
7267     PVECTOR_FOR_EACH (subtable, &cls->subtables) {
7268         int i;
7269
7270         /* Compute hashes for the remaining keys.  Each search-key is
7271          * masked with the subtable's mask to avoid hashing the wildcarded
7272          * bits. */
7273         ULLONG_FOR_EACH_1(i, keys_map) {
7274             hashes[i] = netdev_flow_key_hash_in_mask(keys[i],
7275                                                      &subtable->mask);
7276         }
7277         /* Lookup. */
7278         found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
7279         /* Check results.  When the i-th bit of found_map is set, it means
7280          * that a set of nodes with a matching hash value was found for the
7281          * i-th search-key.  Due to possible hash collisions we need to check
7282          * which of the found rules, if any, really matches our masked
7283          * search-key. */
7284         ULLONG_FOR_EACH_1(i, found_map) {
7285             struct dpcls_rule *rule;
7286
7287             CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
7288                 if (OVS_LIKELY(dpcls_rule_matches_key(rule, keys[i]))) {
7289                     rules[i] = rule;
7290                     /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
7291                      * within one second optimization interval. */
7292                     subtable->hit_cnt++;
7293                     lookups_match += subtable_pos;
7294                     goto next;
7295                 }
7296             }
7297             /* None of the found rules was a match.  Reset the i-th bit to
7298              * keep searching this key in the next subtable. */
7299             ULLONG_SET0(found_map, i);  /* Did not match. */
7300         next:
7301             ;                     /* Keep Sparse happy. */
7302         }
7303         keys_map &= ~found_map;             /* Clear the found rules. */
7304         if (!keys_map) {
7305             if (num_lookups_p) {
7306                 *num_lookups_p = lookups_match;
7307             }
7308             return true;              /* All found. */
7309         }
7310         subtable_pos++;
7311     }
7312     if (num_lookups_p) {
7313         *num_lookups_p = lookups_match;
7314     }
7315     return false;                     /* Some misses. */
7316 }