lib/dpif-netdev.c

   1 /*
   2  * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016, 2017 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <config.h>
  18 #include "dpif-netdev.h"
  19
  20 #include <ctype.h>
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <inttypes.h>
  24 #include <net/if.h>
  25 #include <sys/types.h>
  26 #include <netinet/in.h>
  27 #include <stdint.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <sys/ioctl.h>
  31 #include <sys/socket.h>
  32 #include <sys/stat.h>
  33 #include <unistd.h>
  34
  35 #include "bitmap.h"
  36 #include "cmap.h"
  37 #include "conntrack.h"
  38 #include "coverage.h"
  39 #include "ct-dpif.h"
  40 #include "csum.h"
  41 #include "dp-packet.h"
  42 #include "dpif.h"
  43 #include "dpif-netdev-perf.h"
  44 #include "dpif-provider.h"
  45 #include "dummy.h"
  46 #include "fat-rwlock.h"
  47 #include "flow.h"
  48 #include "hmapx.h"
  49 #include "id-pool.h"
  50 #include "latch.h"
  51 #include "netdev.h"
  52 #include "netdev-provider.h"
  53 #include "netdev-vport.h"
  54 #include "netlink.h"
  55 #include "odp-execute.h"
  56 #include "odp-util.h"
  57 #include "openvswitch/dynamic-string.h"
  58 #include "openvswitch/list.h"
  59 #include "openvswitch/match.h"
  60 #include "openvswitch/ofp-parse.h"
  61 #include "openvswitch/ofp-print.h"
  62 #include "openvswitch/ofpbuf.h"
  63 #include "openvswitch/shash.h"
  64 #include "openvswitch/vlog.h"
  65 #include "ovs-numa.h"
  66 #include "ovs-rcu.h"
  67 #include "packets.h"
  68 #include "openvswitch/poll-loop.h"
  69 #include "pvector.h"
  70 #include "random.h"
  71 #include "seq.h"
  72 #include "smap.h"
  73 #include "sset.h"
  74 #include "timeval.h"
  75 #include "tnl-neigh-cache.h"
  76 #include "tnl-ports.h"
  77 #include "unixctl.h"
  78 #include "util.h"
  79 #include "uuid.h"
  80
  81 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
  82
  83 #define FLOW_DUMP_MAX_BATCH 50
  84 /* Use per thread recirc_depth to prevent recirculation loop. */
  85 #define MAX_RECIRC_DEPTH 6
  86 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
  87
  88 /* Use instant packet send by default. */
  89 #define DEFAULT_TX_FLUSH_INTERVAL 0
  90
  91 /* Configuration parameters. */
  92 enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */
  93 enum { MAX_METERS = 65536 };    /* Maximum number of meters. */
  94 enum { MAX_BANDS = 8 };         /* Maximum number of bands / meter. */
  95 enum { N_METER_LOCKS = 64 };    /* Maximum number of meters. */
  96
  97 /* Protects against changes to 'dp_netdevs'. */
  98 static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
  99
 100 /* Contains all 'struct dp_netdev's. */
 101 static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
 102     = SHASH_INITIALIZER(&dp_netdevs);
 103
 104 static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
 105
 106 #define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
 107                                      | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
 108                                      | CS_SRC_NAT | CS_DST_NAT)
 109 #define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
 110
 111 static struct odp_support dp_netdev_support = {
 112     .max_vlan_headers = SIZE_MAX,
 113     .max_mpls_depth = SIZE_MAX,
 114     .recirc = true,
 115     .ct_state = true,
 116     .ct_zone = true,
 117     .ct_mark = true,
 118     .ct_label = true,
 119     .ct_state_nat = true,
 120     .ct_orig_tuple = true,
 121     .ct_orig_tuple6 = true,
 122 };
 123
 124 /* Stores a miniflow with inline values */
 125
 126 struct netdev_flow_key {
 127     uint32_t hash;       /* Hash function differs for different users. */
 128     uint32_t len;        /* Length of the following miniflow (incl. map). */
 129     struct miniflow mf;
 130     uint64_t buf[FLOW_MAX_PACKET_U64S];
 131 };
 132
 133 /* Exact match cache for frequently used flows
 134  *
 135  * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
 136  * search its entries for a miniflow that matches exactly the miniflow of the
 137  * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
 138  *
 139  * A cache entry holds a reference to its 'dp_netdev_flow'.
 140  *
 141  * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
 142  * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
 143  * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
 144  * value is the index of a cache entry where the miniflow could be.
 145  *
 146  *
 147  * Thread-safety
 148  * =============
 149  *
 150  * Each pmd_thread has its own private exact match cache.
 151  * If dp_netdev_input is not called from a pmd thread, a mutex is used.
 152  */
 153
 154 #define EM_FLOW_HASH_SHIFT 13
 155 #define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
 156 #define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
 157 #define EM_FLOW_HASH_SEGS 2
 158
 159 /* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB */
 160 #define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
 161 #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX /                     \
 162                                     DEFAULT_EM_FLOW_INSERT_INV_PROB)
 163
 164 struct emc_entry {
 165     struct dp_netdev_flow *flow;
 166     struct netdev_flow_key key;   /* key.hash used for emc hash value. */
 167 };
 168
 169 struct emc_cache {
 170     struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
 171     int sweep_idx;                /* For emc_cache_slow_sweep(). */
 172 };
 173
 174 /* Iterate in the exact match cache through every entry that might contain a
 175  * miniflow with hash 'HASH'. */
 176 #define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH)                 \
 177     for (uint32_t i__ = 0, srch_hash__ = (HASH);                             \
 178          (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
 179          i__ < EM_FLOW_HASH_SEGS;                                            \
 180          i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
 181 \f
 182 /* Simple non-wildcarding single-priority classifier. */
 183
 184 /* Time in microseconds between successive optimizations of the dpcls
 185  * subtable vector */
 186 #define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
 187
 188 /* Time in microseconds of the interval in which rxq processing cycles used
 189  * in rxq to pmd assignments is measured and stored. */
 190 #define PMD_RXQ_INTERVAL_LEN 10000000LL
 191
 192 /* Number of intervals for which cycles are stored
 193  * and used during rxq to pmd assignment. */
 194 #define PMD_RXQ_INTERVAL_MAX 6
 195
 196 struct dpcls {
 197     struct cmap_node node;      /* Within dp_netdev_pmd_thread.classifiers */
 198     odp_port_t in_port;
 199     struct cmap subtables_map;
 200     struct pvector subtables;
 201 };
 202
 203 /* A rule to be inserted to the classifier. */
 204 struct dpcls_rule {
 205     struct cmap_node cmap_node;   /* Within struct dpcls_subtable 'rules'. */
 206     struct netdev_flow_key *mask; /* Subtable's mask. */
 207     struct netdev_flow_key flow;  /* Matching key. */
 208     /* 'flow' must be the last field, additional space is allocated here. */
 209 };
 210
 211 static void dpcls_init(struct dpcls *);
 212 static void dpcls_destroy(struct dpcls *);
 213 static void dpcls_sort_subtable_vector(struct dpcls *);
 214 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
 215                          const struct netdev_flow_key *mask);
 216 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
 217 static bool dpcls_lookup(struct dpcls *cls,
 218                          const struct netdev_flow_key keys[],
 219                          struct dpcls_rule **rules, size_t cnt,
 220                          int *num_lookups_p);
 221 \f
 222 /* Set of supported meter flags */
 223 #define DP_SUPPORTED_METER_FLAGS_MASK \
 224     (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
 225
 226 /* Set of supported meter band types */
 227 #define DP_SUPPORTED_METER_BAND_TYPES           \
 228     ( 1 << OFPMBT13_DROP )
 229
 230 struct dp_meter_band {
 231     struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
 232     uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
 233     uint64_t packet_count;
 234     uint64_t byte_count;
 235 };
 236
 237 struct dp_meter {
 238     uint16_t flags;
 239     uint16_t n_bands;
 240     uint32_t max_delta_t;
 241     uint64_t used;
 242     uint64_t packet_count;
 243     uint64_t byte_count;
 244     struct dp_meter_band bands[];
 245 };
 246
 247 /* Datapath based on the network device interface from netdev.h.
 248  *
 249  *
 250  * Thread-safety
 251  * =============
 252  *
 253  * Some members, marked 'const', are immutable.  Accessing other members
 254  * requires synchronization, as noted in more detail below.
 255  *
 256  * Acquisition order is, from outermost to innermost:
 257  *
 258  *    dp_netdev_mutex (global)
 259  *    port_mutex
 260  *    non_pmd_mutex
 261  */
 262 struct dp_netdev {
 263     const struct dpif_class *const class;
 264     const char *const name;
 265     struct dpif *dpif;
 266     struct ovs_refcount ref_cnt;
 267     atomic_flag destroyed;
 268
 269     /* Ports.
 270      *
 271      * Any lookup into 'ports' or any access to the dp_netdev_ports found
 272      * through 'ports' requires taking 'port_mutex'. */
 273     struct ovs_mutex port_mutex;
 274     struct hmap ports;
 275     struct seq *port_seq;       /* Incremented whenever a port changes. */
 276
 277     /* The time that a packet can wait in output batch for sending. */
 278     atomic_uint32_t tx_flush_interval;
 279
 280     /* Meters. */
 281     struct ovs_mutex meter_locks[N_METER_LOCKS];
 282     struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
 283
 284     /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
 285     OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
 286     /* Enable collection of PMD performance metrics. */
 287     atomic_bool pmd_perf_metrics;
 288
 289     /* Protects access to ofproto-dpif-upcall interface during revalidator
 290      * thread synchronization. */
 291     struct fat_rwlock upcall_rwlock;
 292     upcall_callback *upcall_cb;  /* Callback function for executing upcalls. */
 293     void *upcall_aux;
 294
 295     /* Callback function for notifying the purging of dp flows (during
 296      * reseting pmd deletion). */
 297     dp_purge_callback *dp_purge_cb;
 298     void *dp_purge_aux;
 299
 300     /* Stores all 'struct dp_netdev_pmd_thread's. */
 301     struct cmap poll_threads;
 302     /* id pool for per thread static_tx_qid. */
 303     struct id_pool *tx_qid_pool;
 304     struct ovs_mutex tx_qid_pool_mutex;
 305
 306     /* Protects the access of the 'struct dp_netdev_pmd_thread'
 307      * instance for non-pmd thread. */
 308     struct ovs_mutex non_pmd_mutex;
 309
 310     /* Each pmd thread will store its pointer to
 311      * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
 312     ovsthread_key_t per_pmd_key;
 313
 314     struct seq *reconfigure_seq;
 315     uint64_t last_reconfigure_seq;
 316
 317     /* Cpu mask for pin of pmd threads. */
 318     char *pmd_cmask;
 319
 320     uint64_t last_tnl_conf_seq;
 321
 322     struct conntrack conntrack;
 323 };
 324
 325 static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
 326     OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
 327 {
 328     ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
 329 }
 330
 331 static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
 332     OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
 333 {
 334     ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
 335 }
 336
 337
 338 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
 339                                                     odp_port_t)
 340     OVS_REQUIRES(dp->port_mutex);
 341
 342 enum rxq_cycles_counter_type {
 343     RXQ_CYCLES_PROC_CURR,       /* Cycles spent successfully polling and
 344                                    processing packets during the current
 345                                    interval. */
 346     RXQ_CYCLES_PROC_HIST,       /* Total cycles of all intervals that are used
 347                                    during rxq to pmd assignment. */
 348     RXQ_N_CYCLES
 349 };
 350
 351 #define XPS_TIMEOUT 500000LL    /* In microseconds. */
 352
 353 /* Contained by struct dp_netdev_port's 'rxqs' member.  */
 354 struct dp_netdev_rxq {
 355     struct dp_netdev_port *port;
 356     struct netdev_rxq *rx;
 357     unsigned core_id;                  /* Core to which this queue should be
 358                                           pinned. OVS_CORE_UNSPEC if the
 359                                           queue doesn't need to be pinned to a
 360                                           particular core. */
 361     unsigned intrvl_idx;               /* Write index for 'cycles_intrvl'. */
 362     struct dp_netdev_pmd_thread *pmd;  /* pmd thread that polls this queue. */
 363     bool is_vhost;                     /* Is rxq of a vhost port. */
 364
 365     /* Counters of cycles spent successfully polling and processing pkts. */
 366     atomic_ullong cycles[RXQ_N_CYCLES];
 367     /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
 368        sum them to yield the cycles used for an rxq. */
 369     atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
 370 };
 371
 372 /* A port in a netdev-based datapath. */
 373 struct dp_netdev_port {
 374     odp_port_t port_no;
 375     bool dynamic_txqs;          /* If true XPS will be used. */
 376     bool need_reconfigure;      /* True if we should reconfigure netdev. */
 377     struct netdev *netdev;
 378     struct hmap_node node;      /* Node in dp_netdev's 'ports'. */
 379     struct netdev_saved_flags *sf;
 380     struct dp_netdev_rxq *rxqs;
 381     unsigned n_rxq;             /* Number of elements in 'rxqs' */
 382     unsigned *txq_used;         /* Number of threads that use each tx queue. */
 383     struct ovs_mutex txq_used_mutex;
 384     char *type;                 /* Port type as requested by user. */
 385     char *rxq_affinity_list;    /* Requested affinity of rx queues. */
 386 };
 387
 388 /* Contained by struct dp_netdev_flow's 'stats' member.  */
 389 struct dp_netdev_flow_stats {
 390     atomic_llong used;             /* Last used time, in monotonic msecs. */
 391     atomic_ullong packet_count;    /* Number of packets matched. */
 392     atomic_ullong byte_count;      /* Number of bytes matched. */
 393     atomic_uint16_t tcp_flags;     /* Bitwise-OR of seen tcp_flags values. */
 394 };
 395
 396 /* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
 397  *
 398  *
 399  * Thread-safety
 400  * =============
 401  *
 402  * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
 403  * its pmd thread's classifier.  The text below calls this classifier 'cls'.
 404  *
 405  * Motivation
 406  * ----------
 407  *
 408  * The thread safety rules described here for "struct dp_netdev_flow" are
 409  * motivated by two goals:
 410  *
 411  *    - Prevent threads that read members of "struct dp_netdev_flow" from
 412  *      reading bad data due to changes by some thread concurrently modifying
 413  *      those members.
 414  *
 415  *    - Prevent two threads making changes to members of a given "struct
 416  *      dp_netdev_flow" from interfering with each other.
 417  *
 418  *
 419  * Rules
 420  * -----
 421  *
 422  * A flow 'flow' may be accessed without a risk of being freed during an RCU
 423  * grace period.  Code that needs to hold onto a flow for a while
 424  * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
 425  *
 426  * 'flow->ref_cnt' protects 'flow' from being freed.  It doesn't protect the
 427  * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
 428  * from modification.
 429  *
 430  * Some members, marked 'const', are immutable.  Accessing other members
 431  * requires synchronization, as noted in more detail below.
 432  */
 433 struct dp_netdev_flow {
 434     const struct flow flow;      /* Unmasked flow that created this entry. */
 435     /* Hash table index by unmasked flow. */
 436     const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
 437                                  /* 'flow_table'. */
 438     const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
 439     const ovs_u128 ufid;         /* Unique flow identifier. */
 440     const ovs_u128 mega_ufid;    /* Unique mega flow identifier. */
 441     const unsigned pmd_id;       /* The 'core_id' of pmd thread owning this */
 442                                  /* flow. */
 443
 444     /* Number of references.
 445      * The classifier owns one reference.
 446      * Any thread trying to keep a rule from being freed should hold its own
 447      * reference. */
 448     struct ovs_refcount ref_cnt;
 449
 450     bool dead;
 451     uint32_t mark;               /* Unique flow mark assigned to a flow */
 452
 453     /* Statistics. */
 454     struct dp_netdev_flow_stats stats;
 455
 456     /* Actions. */
 457     OVSRCU_TYPE(struct dp_netdev_actions *) actions;
 458
 459     /* While processing a group of input packets, the datapath uses the next
 460      * member to store a pointer to the output batch for the flow.  It is
 461      * reset after the batch has been sent out (See dp_netdev_queue_batches(),
 462      * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
 463     struct packet_batch_per_flow *batch;
 464
 465     /* Packet classification. */
 466     struct dpcls_rule cr;        /* In owning dp_netdev's 'cls'. */
 467     /* 'cr' must be the last member. */
 468 };
 469
 470 static void dp_netdev_flow_unref(struct dp_netdev_flow *);
 471 static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
 472 static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
 473                                          struct flow *, bool);
 474
 475 /* A set of datapath actions within a "struct dp_netdev_flow".
 476  *
 477  *
 478  * Thread-safety
 479  * =============
 480  *
 481  * A struct dp_netdev_actions 'actions' is protected with RCU. */
 482 struct dp_netdev_actions {
 483     /* These members are immutable: they do not change during the struct's
 484      * lifetime.  */
 485     unsigned int size;          /* Size of 'actions', in bytes. */
 486     struct nlattr actions[];    /* Sequence of OVS_ACTION_ATTR_* attributes. */
 487 };
 488
 489 struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
 490                                                    size_t);
 491 struct dp_netdev_actions *dp_netdev_flow_get_actions(
 492     const struct dp_netdev_flow *);
 493 static void dp_netdev_actions_free(struct dp_netdev_actions *);
 494
 495 struct polled_queue {
 496     struct dp_netdev_rxq *rxq;
 497     odp_port_t port_no;
 498 };
 499
 500 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
 501 struct rxq_poll {
 502     struct dp_netdev_rxq *rxq;
 503     struct hmap_node node;
 504 };
 505
 506 /* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
 507  * 'tnl_port_cache' or 'tx_ports'. */
 508 struct tx_port {
 509     struct dp_netdev_port *port;
 510     int qid;
 511     long long last_used;
 512     struct hmap_node node;
 513     long long flush_time;
 514     struct dp_packet_batch output_pkts;
 515     struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
 516 };
 517
 518 /* A set of properties for the current processing loop that is not directly
 519  * associated with the pmd thread itself, but with the packets being
 520  * processed or the short-term system configuration (for example, time).
 521  * Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
 522 struct dp_netdev_pmd_thread_ctx {
 523     /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
 524     long long now;
 525     /* RX queue from which last packet was received. */
 526     struct dp_netdev_rxq *last_rxq;
 527 };
 528
 529 /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
 530  * the performance overhead of interrupt processing.  Therefore netdev can
 531  * not implement rx-wait for these devices.  dpif-netdev needs to poll
 532  * these device to check for recv buffer.  pmd-thread does polling for
 533  * devices assigned to itself.
 534  *
 535  * DPDK used PMD for accessing NIC.
 536  *
 537  * Note, instance with cpu core id NON_PMD_CORE_ID will be reserved for
 538  * I/O of all non-pmd threads.  There will be no actual thread created
 539  * for the instance.
 540  *
 541  * Each struct has its own flow cache and classifier per managed ingress port.
 542  * For packets received on ingress port, a look up is done on corresponding PMD
 543  * thread's flow cache and in case of a miss, lookup is performed in the
 544  * corresponding classifier of port.  Packets are executed with the found
 545  * actions in either case.
 546  * */
 547 struct dp_netdev_pmd_thread {
 548     struct dp_netdev *dp;
 549     struct ovs_refcount ref_cnt;    /* Every reference must be refcount'ed. */
 550     struct cmap_node node;          /* In 'dp->poll_threads'. */
 551
 552     pthread_cond_t cond;            /* For synchronizing pmd thread reload. */
 553     struct ovs_mutex cond_mutex;    /* Mutex for condition variable. */
 554
 555     /* Per thread exact-match cache.  Note, the instance for cpu core
 556      * NON_PMD_CORE_ID can be accessed by multiple threads, and thusly
 557      * need to be protected by 'non_pmd_mutex'.  Every other instance
 558      * will only be accessed by its own pmd thread. */
 559     struct emc_cache flow_cache;
 560
 561     /* Flow-Table and classifiers
 562      *
 563      * Writers of 'flow_table' must take the 'flow_mutex'.  Corresponding
 564      * changes to 'classifiers' must be made while still holding the
 565      * 'flow_mutex'.
 566      */
 567     struct ovs_mutex flow_mutex;
 568     struct cmap flow_table OVS_GUARDED; /* Flow table. */
 569
 570     /* One classifier per in_port polled by the pmd */
 571     struct cmap classifiers;
 572     /* Periodically sort subtable vectors according to hit frequencies */
 573     long long int next_optimization;
 574     /* End of the next time interval for which processing cycles
 575        are stored for each polled rxq. */
 576     long long int rxq_next_cycle_store;
 577
 578     /* Last interval timestamp. */
 579     uint64_t intrvl_tsc_prev;
 580     /* Last interval cycles. */
 581     atomic_ullong intrvl_cycles;
 582
 583     /* Current context of the PMD thread. */
 584     struct dp_netdev_pmd_thread_ctx ctx;
 585
 586     struct latch exit_latch;        /* For terminating the pmd thread. */
 587     struct seq *reload_seq;
 588     uint64_t last_reload_seq;
 589     atomic_bool reload;             /* Do we need to reload ports? */
 590     pthread_t thread;
 591     unsigned core_id;               /* CPU core id of this pmd thread. */
 592     int numa_id;                    /* numa node id of this pmd thread. */
 593     bool isolated;
 594
 595     /* Queue id used by this pmd thread to send packets on all netdevs if
 596      * XPS disabled for this netdev. All static_tx_qid's are unique and less
 597      * than 'cmap_count(dp->poll_threads)'. */
 598     uint32_t static_tx_qid;
 599
 600     /* Number of filled output batches. */
 601     int n_output_batches;
 602
 603     struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
 604     /* List of rx queues to poll. */
 605     struct hmap poll_list OVS_GUARDED;
 606     /* Map of 'tx_port's used for transmission.  Written by the main thread,
 607      * read by the pmd thread. */
 608     struct hmap tx_ports OVS_GUARDED;
 609
 610     /* These are thread-local copies of 'tx_ports'.  One contains only tunnel
 611      * ports (that support push_tunnel/pop_tunnel), the other contains ports
 612      * with at least one txq (that support send).  A port can be in both.
 613      *
 614      * There are two separate maps to make sure that we don't try to execute
 615      * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
 616      *
 617      * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
 618      * threads, and thusly need to be protected by 'non_pmd_mutex'.  Every
 619      * other instance will only be accessed by its own pmd thread. */
 620     struct hmap tnl_port_cache;
 621     struct hmap send_port_cache;
 622
 623     /* Keep track of detailed PMD performance statistics. */
 624     struct pmd_perf_stats perf_stats;
 625
 626     /* Set to true if the pmd thread needs to be reloaded. */
 627     bool need_reload;
 628 };
 629
 630 /* Interface to netdev-based datapath. */
 631 struct dpif_netdev {
 632     struct dpif dpif;
 633     struct dp_netdev *dp;
 634     uint64_t last_port_seq;
 635 };
 636
 637 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
 638                               struct dp_netdev_port **portp)
 639     OVS_REQUIRES(dp->port_mutex);
 640 static int get_port_by_name(struct dp_netdev *dp, const char *devname,
 641                             struct dp_netdev_port **portp)
 642     OVS_REQUIRES(dp->port_mutex);
 643 static void dp_netdev_free(struct dp_netdev *)
 644     OVS_REQUIRES(dp_netdev_mutex);
 645 static int do_add_port(struct dp_netdev *dp, const char *devname,
 646                        const char *type, odp_port_t port_no)
 647     OVS_REQUIRES(dp->port_mutex);
 648 static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
 649     OVS_REQUIRES(dp->port_mutex);
 650 static int dpif_netdev_open(const struct dpif_class *, const char *name,
 651                             bool create, struct dpif **);
 652 static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
 653                                       struct dp_packet_batch *,
 654                                       bool should_steal,
 655                                       const struct flow *flow,
 656                                       const struct nlattr *actions,
 657                                       size_t actions_len);
 658 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
 659                             struct dp_packet_batch *, odp_port_t port_no);
 660 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
 661                                   struct dp_packet_batch *);
 662
 663 static void dp_netdev_disable_upcall(struct dp_netdev *);
 664 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
 665 static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
 666                                     struct dp_netdev *dp, unsigned core_id,
 667                                     int numa_id);
 668 static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
 669 static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
 670     OVS_REQUIRES(dp->port_mutex);
 671
 672 static void *pmd_thread_main(void *);
 673 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
 674                                                       unsigned core_id);
 675 static struct dp_netdev_pmd_thread *
 676 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
 677 static void dp_netdev_del_pmd(struct dp_netdev *dp,
 678                               struct dp_netdev_pmd_thread *pmd);
 679 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
 680 static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
 681 static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
 682                                          struct dp_netdev_port *port)
 683     OVS_REQUIRES(pmd->port_mutex);
 684 static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
 685                                            struct tx_port *tx)
 686     OVS_REQUIRES(pmd->port_mutex);
 687 static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
 688                                      struct dp_netdev_rxq *rxq)
 689     OVS_REQUIRES(pmd->port_mutex);
 690 static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
 691                                        struct rxq_poll *poll)
 692     OVS_REQUIRES(pmd->port_mutex);
 693 static int
 694 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
 695                                    bool force);
 696
 697 static void reconfigure_datapath(struct dp_netdev *dp)
 698     OVS_REQUIRES(dp->port_mutex);
 699 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
 700 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
 701 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
 702 static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
 703     OVS_REQUIRES(pmd->port_mutex);
 704 static inline void
 705 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
 706                            struct polled_queue *poll_list, int poll_cnt);
 707 static void
 708 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
 709                          enum rxq_cycles_counter_type type,
 710                          unsigned long long cycles);
 711 static uint64_t
 712 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
 713                          enum rxq_cycles_counter_type type);
 714 static void
 715 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
 716                            unsigned long long cycles);
 717 static uint64_t
 718 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
 719 static void
 720 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
 721                                bool purge);
 722 static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
 723                                       struct tx_port *tx);
 724
 725 static inline bool emc_entry_alive(struct emc_entry *ce);
 726 static void emc_clear_entry(struct emc_entry *ce);
 727
 728 static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
 729 static inline bool
 730 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
 731
 732 static void
 733 emc_cache_init(struct emc_cache *flow_cache)
 734 {
 735     int i;
 736
 737     flow_cache->sweep_idx = 0;
 738     for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
 739         flow_cache->entries[i].flow = NULL;
 740         flow_cache->entries[i].key.hash = 0;
 741         flow_cache->entries[i].key.len = sizeof(struct miniflow);
 742         flowmap_init(&flow_cache->entries[i].key.mf.map);
 743     }
 744 }
 745
 746 static void
 747 emc_cache_uninit(struct emc_cache *flow_cache)
 748 {
 749     int i;
 750
 751     for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
 752         emc_clear_entry(&flow_cache->entries[i]);
 753     }
 754 }
 755
 756 /* Check and clear dead flow references slowly (one entry at each
 757  * invocation).  */
 758 static void
 759 emc_cache_slow_sweep(struct emc_cache *flow_cache)
 760 {
 761     struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
 762
 763     if (!emc_entry_alive(entry)) {
 764         emc_clear_entry(entry);
 765     }
 766     flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
 767 }
 768
 769 /* Updates the time in PMD threads context and should be called in three cases:
 770  *
 771  *     1. PMD structure initialization:
 772  *         - dp_netdev_configure_pmd()
 773  *
 774  *     2. Before processing of the new packet batch:
 775  *         - dpif_netdev_execute()
 776  *         - dp_netdev_process_rxq_port()
 777  *
 778  *     3. At least once per polling iteration in main polling threads if no
 779  *        packets received on current iteration:
 780  *         - dpif_netdev_run()
 781  *         - pmd_thread_main()
 782  *
 783  * 'pmd->ctx.now' should be used without update in all other cases if possible.
 784  */
 785 static inline void
 786 pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
 787 {
 788     pmd->ctx.now = time_usec();
 789 }
 790
 791 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
 792 bool
 793 dpif_is_netdev(const struct dpif *dpif)
 794 {
 795     return dpif->dpif_class->open == dpif_netdev_open;
 796 }
 797
 798 static struct dpif_netdev *
 799 dpif_netdev_cast(const struct dpif *dpif)
 800 {
 801     ovs_assert(dpif_is_netdev(dpif));
 802     return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
 803 }
 804
 805 static struct dp_netdev *
 806 get_dp_netdev(const struct dpif *dpif)
 807 {
 808     return dpif_netdev_cast(dpif)->dp;
 809 }
 810 \f
 811 enum pmd_info_type {
 812     PMD_INFO_SHOW_STATS,  /* Show how cpu cycles are spent. */
 813     PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
 814     PMD_INFO_SHOW_RXQ,    /* Show poll lists of pmd threads. */
 815     PMD_INFO_PERF_SHOW,   /* Show pmd performance details. */
 816 };
 817
 818 static void
 819 format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
 820 {
 821     ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
 822                         ? "main thread" : "pmd thread");
 823     if (pmd->numa_id != OVS_NUMA_UNSPEC) {
 824         ds_put_format(reply, " numa_id %d", pmd->numa_id);
 825     }
 826     if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
 827         ds_put_format(reply, " core_id %u", pmd->core_id);
 828     }
 829     ds_put_cstr(reply, ":\n");
 830 }
 831
 832 static void
 833 pmd_info_show_stats(struct ds *reply,
 834                     struct dp_netdev_pmd_thread *pmd)
 835 {
 836     uint64_t stats[PMD_N_STATS];
 837     uint64_t total_cycles, total_packets;
 838     double passes_per_pkt = 0;
 839     double lookups_per_hit = 0;
 840     double packets_per_batch = 0;
 841
 842     pmd_perf_read_counters(&pmd->perf_stats, stats);
 843     total_cycles = stats[PMD_CYCLES_ITER_IDLE]
 844                          + stats[PMD_CYCLES_ITER_BUSY];
 845     total_packets = stats[PMD_STAT_RECV];
 846
 847     format_pmd_thread(reply, pmd);
 848
 849     if (total_packets > 0) {
 850         passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
 851                             / (double) total_packets;
 852     }
 853     if (stats[PMD_STAT_MASKED_HIT] > 0) {
 854         lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
 855                             / (double) stats[PMD_STAT_MASKED_HIT];
 856     }
 857     if (stats[PMD_STAT_SENT_BATCHES] > 0) {
 858         packets_per_batch = stats[PMD_STAT_SENT_PKTS]
 859                             / (double) stats[PMD_STAT_SENT_BATCHES];
 860     }
 861
 862     ds_put_format(reply,
 863                   "  packets received: %"PRIu64"\n"
 864                   "  packet recirculations: %"PRIu64"\n"
 865                   "  avg. datapath passes per packet: %.02f\n"
 866                   "  emc hits: %"PRIu64"\n"
 867                   "  megaflow hits: %"PRIu64"\n"
 868                   "  avg. subtable lookups per megaflow hit: %.02f\n"
 869                   "  miss with success upcall: %"PRIu64"\n"
 870                   "  miss with failed upcall: %"PRIu64"\n"
 871                   "  avg. packets per output batch: %.02f\n",
 872                   total_packets, stats[PMD_STAT_RECIRC],
 873                   passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
 874                   stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
 875                   stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
 876                   packets_per_batch);
 877
 878     if (total_cycles == 0) {
 879         return;
 880     }
 881
 882     ds_put_format(reply,
 883                   "  idle cycles: %"PRIu64" (%.02f%%)\n"
 884                   "  processing cycles: %"PRIu64" (%.02f%%)\n",
 885                   stats[PMD_CYCLES_ITER_IDLE],
 886                   stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
 887                   stats[PMD_CYCLES_ITER_BUSY],
 888                   stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
 889
 890     if (total_packets == 0) {
 891         return;
 892     }
 893
 894     ds_put_format(reply,
 895                   "  avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
 896                   total_cycles / (double) total_packets,
 897                   total_cycles, total_packets);
 898
 899     ds_put_format(reply,
 900                   "  avg processing cycles per packet: "
 901                   "%.02f (%"PRIu64"/%"PRIu64")\n",
 902                   stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
 903                   stats[PMD_CYCLES_ITER_BUSY], total_packets);
 904 }
 905
 906 static void
 907 pmd_info_show_perf(struct ds *reply,
 908                    struct dp_netdev_pmd_thread *pmd,
 909                    struct pmd_perf_params *par)
 910 {
 911     if (pmd->core_id != NON_PMD_CORE_ID) {
 912         char *time_str =
 913                 xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
 914         long long now = time_msec();
 915         double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
 916
 917         ds_put_cstr(reply, "\n");
 918         ds_put_format(reply, "Time: %s\n", time_str);
 919         ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
 920         ds_put_cstr(reply, "\n");
 921         format_pmd_thread(reply, pmd);
 922         ds_put_cstr(reply, "\n");
 923         pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
 924         if (pmd_perf_metrics_enabled(pmd)) {
 925             /* Prevent parallel clearing of perf metrics. */
 926             ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
 927             if (par->histograms) {
 928                 ds_put_cstr(reply, "\n");
 929                 pmd_perf_format_histograms(reply, &pmd->perf_stats);
 930             }
 931             if (par->iter_hist_len > 0) {
 932                 ds_put_cstr(reply, "\n");
 933                 pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
 934                         par->iter_hist_len);
 935             }
 936             if (par->ms_hist_len > 0) {
 937                 ds_put_cstr(reply, "\n");
 938                 pmd_perf_format_ms_history(reply, &pmd->perf_stats,
 939                         par->ms_hist_len);
 940             }
 941             ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
 942         }
 943         free(time_str);
 944     }
 945 }
 946
 947 static int
 948 compare_poll_list(const void *a_, const void *b_)
 949 {
 950     const struct rxq_poll *a = a_;
 951     const struct rxq_poll *b = b_;
 952
 953     const char *namea = netdev_rxq_get_name(a->rxq->rx);
 954     const char *nameb = netdev_rxq_get_name(b->rxq->rx);
 955
 956     int cmp = strcmp(namea, nameb);
 957     if (!cmp) {
 958         return netdev_rxq_get_queue_id(a->rxq->rx)
 959                - netdev_rxq_get_queue_id(b->rxq->rx);
 960     } else {
 961         return cmp;
 962     }
 963 }
 964
 965 static void
 966 sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
 967                  size_t *n)
 968 {
 969     struct rxq_poll *ret, *poll;
 970     size_t i;
 971
 972     *n = hmap_count(&pmd->poll_list);
 973     if (!*n) {
 974         ret = NULL;
 975     } else {
 976         ret = xcalloc(*n, sizeof *ret);
 977         i = 0;
 978         HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
 979             ret[i] = *poll;
 980             i++;
 981         }
 982         ovs_assert(i == *n);
 983         qsort(ret, *n, sizeof *ret, compare_poll_list);
 984     }
 985
 986     *list = ret;
 987 }
 988
 989 static void
 990 pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
 991 {
 992     if (pmd->core_id != NON_PMD_CORE_ID) {
 993         struct rxq_poll *list;
 994         size_t n_rxq;
 995         uint64_t total_cycles = 0;
 996
 997         ds_put_format(reply,
 998                       "pmd thread numa_id %d core_id %u:\n  isolated : %s\n",
 999                       pmd->numa_id, pmd->core_id, (pmd->isolated)
1000                                                   ? "true" : "false");
1001
1002         ovs_mutex_lock(&pmd->port_mutex);
1003         sorted_poll_list(pmd, &list, &n_rxq);
1004
1005         /* Get the total pmd cycles for an interval. */
1006         atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
1007         /* Estimate the cycles to cover all intervals. */
1008         total_cycles *= PMD_RXQ_INTERVAL_MAX;
1009
1010         for (int i = 0; i < n_rxq; i++) {
1011             struct dp_netdev_rxq *rxq = list[i].rxq;
1012             const char *name = netdev_rxq_get_name(rxq->rx);
1013             uint64_t proc_cycles = 0;
1014
1015             for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
1016                 proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
1017             }
1018             ds_put_format(reply, "  port: %-16s  queue-id: %2d", name,
1019                           netdev_rxq_get_queue_id(list[i].rxq->rx));
1020             ds_put_format(reply, "  pmd usage: ");
1021             if (total_cycles) {
1022                 ds_put_format(reply, "%2"PRIu64"",
1023                               proc_cycles * 100 / total_cycles);
1024                 ds_put_cstr(reply, " %");
1025             } else {
1026                 ds_put_format(reply, "%s", "NOT AVAIL");
1027             }
1028             ds_put_cstr(reply, "\n");
1029         }
1030         ovs_mutex_unlock(&pmd->port_mutex);
1031         free(list);
1032     }
1033 }
1034
1035 static int
1036 compare_poll_thread_list(const void *a_, const void *b_)
1037 {
1038     const struct dp_netdev_pmd_thread *a, *b;
1039
1040     a = *(struct dp_netdev_pmd_thread **)a_;
1041     b = *(struct dp_netdev_pmd_thread **)b_;
1042
1043     if (a->core_id < b->core_id) {
1044         return -1;
1045     }
1046     if (a->core_id > b->core_id) {
1047         return 1;
1048     }
1049     return 0;
1050 }
1051
1052 /* Create a sorted list of pmd's from the dp->poll_threads cmap. We can use
1053  * this list, as long as we do not go to quiescent state. */
1054 static void
1055 sorted_poll_thread_list(struct dp_netdev *dp,
1056                         struct dp_netdev_pmd_thread ***list,
1057                         size_t *n)
1058 {
1059     struct dp_netdev_pmd_thread *pmd;
1060     struct dp_netdev_pmd_thread **pmd_list;
1061     size_t k = 0, n_pmds;
1062
1063     n_pmds = cmap_count(&dp->poll_threads);
1064     pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
1065
1066     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1067         if (k >= n_pmds) {
1068             break;
1069         }
1070         pmd_list[k++] = pmd;
1071     }
1072
1073     qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
1074
1075     *list = pmd_list;
1076     *n = k;
1077 }
1078
1079 static void
1080 dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1081                           const char *argv[], void *aux OVS_UNUSED)
1082 {
1083     struct ds reply = DS_EMPTY_INITIALIZER;
1084     struct dp_netdev *dp = NULL;
1085
1086     ovs_mutex_lock(&dp_netdev_mutex);
1087
1088     if (argc == 2) {
1089         dp = shash_find_data(&dp_netdevs, argv[1]);
1090     } else if (shash_count(&dp_netdevs) == 1) {
1091         /* There's only one datapath */
1092         dp = shash_first(&dp_netdevs)->data;
1093     }
1094
1095     if (!dp) {
1096         ovs_mutex_unlock(&dp_netdev_mutex);
1097         unixctl_command_reply_error(conn,
1098                                     "please specify an existing datapath");
1099         return;
1100     }
1101
1102     dp_netdev_request_reconfigure(dp);
1103     ovs_mutex_unlock(&dp_netdev_mutex);
1104     ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1105     unixctl_command_reply(conn, ds_cstr(&reply));
1106     ds_destroy(&reply);
1107 }
1108
1109 static void
1110 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1111                      void *aux)
1112 {
1113     struct ds reply = DS_EMPTY_INITIALIZER;
1114     struct dp_netdev_pmd_thread **pmd_list;
1115     struct dp_netdev *dp = NULL;
1116     enum pmd_info_type type = *(enum pmd_info_type *) aux;
1117     unsigned int core_id;
1118     bool filter_on_pmd = false;
1119     size_t n;
1120
1121     ovs_mutex_lock(&dp_netdev_mutex);
1122
1123     while (argc > 1) {
1124         if (!strcmp(argv[1], "-pmd") && argc > 2) {
1125             if (str_to_uint(argv[2], 10, &core_id)) {
1126                 filter_on_pmd = true;
1127             }
1128             argc -= 2;
1129             argv += 2;
1130         } else {
1131             dp = shash_find_data(&dp_netdevs, argv[1]);
1132             argc -= 1;
1133             argv += 1;
1134         }
1135     }
1136
1137     if (!dp) {
1138         if (shash_count(&dp_netdevs) == 1) {
1139             /* There's only one datapath */
1140             dp = shash_first(&dp_netdevs)->data;
1141         } else {
1142             ovs_mutex_unlock(&dp_netdev_mutex);
1143             unixctl_command_reply_error(conn,
1144                                         "please specify an existing datapath");
1145             return;
1146         }
1147     }
1148
1149     sorted_poll_thread_list(dp, &pmd_list, &n);
1150     for (size_t i = 0; i < n; i++) {
1151         struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1152         if (!pmd) {
1153             break;
1154         }
1155         if (filter_on_pmd && pmd->core_id != core_id) {
1156             continue;
1157         }
1158         if (type == PMD_INFO_SHOW_RXQ) {
1159             pmd_info_show_rxq(&reply, pmd);
1160         } else if (type == PMD_INFO_CLEAR_STATS) {
1161             pmd_perf_stats_clear(&pmd->perf_stats);
1162         } else if (type == PMD_INFO_SHOW_STATS) {
1163             pmd_info_show_stats(&reply, pmd);
1164         } else if (type == PMD_INFO_PERF_SHOW) {
1165             pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
1166         }
1167     }
1168     free(pmd_list);
1169
1170     ovs_mutex_unlock(&dp_netdev_mutex);
1171
1172     unixctl_command_reply(conn, ds_cstr(&reply));
1173     ds_destroy(&reply);
1174 }
1175
1176 static void
1177 pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1178                           const char *argv[],
1179                           void *aux OVS_UNUSED)
1180 {
1181     struct pmd_perf_params par;
1182     long int it_hist = 0, ms_hist = 0;
1183     par.histograms = true;
1184
1185     while (argc > 1) {
1186         if (!strcmp(argv[1], "-nh")) {
1187             par.histograms = false;
1188             argc -= 1;
1189             argv += 1;
1190         } else if (!strcmp(argv[1], "-it") && argc > 2) {
1191             it_hist = strtol(argv[2], NULL, 10);
1192             if (it_hist < 0) {
1193                 it_hist = 0;
1194             } else if (it_hist > HISTORY_LEN) {
1195                 it_hist = HISTORY_LEN;
1196             }
1197             argc -= 2;
1198             argv += 2;
1199         } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1200             ms_hist = strtol(argv[2], NULL, 10);
1201             if (ms_hist < 0) {
1202                 ms_hist = 0;
1203             } else if (ms_hist > HISTORY_LEN) {
1204                 ms_hist = HISTORY_LEN;
1205             }
1206             argc -= 2;
1207             argv += 2;
1208         } else {
1209             break;
1210         }
1211     }
1212     par.iter_hist_len = it_hist;
1213     par.ms_hist_len = ms_hist;
1214     par.command_type = PMD_INFO_PERF_SHOW;
1215     dpif_netdev_pmd_info(conn, argc, argv, &par);
1216 }
1217 \f
1218 static int
1219 dpif_netdev_init(void)
1220 {
1221     static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
1222                               clear_aux = PMD_INFO_CLEAR_STATS,
1223                               poll_aux = PMD_INFO_SHOW_RXQ;
1224
1225     unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1226                              0, 3, dpif_netdev_pmd_info,
1227                              (void *)&show_aux);
1228     unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1229                              0, 3, dpif_netdev_pmd_info,
1230                              (void *)&clear_aux);
1231     unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
1232                              0, 3, dpif_netdev_pmd_info,
1233                              (void *)&poll_aux);
1234     unixctl_command_register("dpif-netdev/pmd-perf-show",
1235                              "[-nh] [-it iter-history-len]"
1236                              " [-ms ms-history-len]"
1237                              " [-pmd core] [dp]",
1238                              0, 8, pmd_perf_show_cmd,
1239                              NULL);
1240     unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1241                              0, 1, dpif_netdev_pmd_rebalance,
1242                              NULL);
1243     unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1244                              "on|off [-b before] [-a after] [-e|-ne] "
1245                              "[-us usec] [-q qlen]",
1246                              0, 10, pmd_perf_log_set_cmd,
1247                              NULL);
1248     return 0;
1249 }
1250
1251 static int
1252 dpif_netdev_enumerate(struct sset *all_dps,
1253                       const struct dpif_class *dpif_class)
1254 {
1255     struct shash_node *node;
1256
1257     ovs_mutex_lock(&dp_netdev_mutex);
1258     SHASH_FOR_EACH(node, &dp_netdevs) {
1259         struct dp_netdev *dp = node->data;
1260         if (dpif_class != dp->class) {
1261             /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1262              * If the class doesn't match, skip this dpif. */
1263              continue;
1264         }
1265         sset_add(all_dps, node->name);
1266     }
1267     ovs_mutex_unlock(&dp_netdev_mutex);
1268
1269     return 0;
1270 }
1271
1272 static bool
1273 dpif_netdev_class_is_dummy(const struct dpif_class *class)
1274 {
1275     return class != &dpif_netdev_class;
1276 }
1277
1278 static const char *
1279 dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
1280 {
1281     return strcmp(type, "internal") ? type
1282                   : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
1283                   : "tap";
1284 }
1285
1286 static struct dpif *
1287 create_dpif_netdev(struct dp_netdev *dp)
1288 {
1289     uint16_t netflow_id = hash_string(dp->name, 0);
1290     struct dpif_netdev *dpif;
1291
1292     ovs_refcount_ref(&dp->ref_cnt);
1293
1294     dpif = xmalloc(sizeof *dpif);
1295     dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1296     dpif->dp = dp;
1297     dpif->last_port_seq = seq_read(dp->port_seq);
1298
1299     return &dpif->dpif;
1300 }
1301
1302 /* Choose an unused, non-zero port number and return it on success.
1303  * Return ODPP_NONE on failure. */
1304 static odp_port_t
1305 choose_port(struct dp_netdev *dp, const char *name)
1306     OVS_REQUIRES(dp->port_mutex)
1307 {
1308     uint32_t port_no;
1309
1310     if (dp->class != &dpif_netdev_class) {
1311         const char *p;
1312         int start_no = 0;
1313
1314         /* If the port name begins with "br", start the number search at
1315          * 100 to make writing tests easier. */
1316         if (!strncmp(name, "br", 2)) {
1317             start_no = 100;
1318         }
1319
1320         /* If the port name contains a number, try to assign that port number.
1321          * This can make writing unit tests easier because port numbers are
1322          * predictable. */
1323         for (p = name; *p != '\0'; p++) {
1324             if (isdigit((unsigned char) *p)) {
1325                 port_no = start_no + strtol(p, NULL, 10);
1326                 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1327                     && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1328                     return u32_to_odp(port_no);
1329                 }
1330                 break;
1331             }
1332         }
1333     }
1334
1335     for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1336         if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1337             return u32_to_odp(port_no);
1338         }
1339     }
1340
1341     return ODPP_NONE;
1342 }
1343
1344 static int
1345 create_dp_netdev(const char *name, const struct dpif_class *class,
1346                  struct dp_netdev **dpp)
1347     OVS_REQUIRES(dp_netdev_mutex)
1348 {
1349     struct dp_netdev *dp;
1350     int error;
1351
1352     dp = xzalloc(sizeof *dp);
1353     shash_add(&dp_netdevs, name, dp);
1354
1355     *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1356     *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1357     ovs_refcount_init(&dp->ref_cnt);
1358     atomic_flag_clear(&dp->destroyed);
1359
1360     ovs_mutex_init(&dp->port_mutex);
1361     hmap_init(&dp->ports);
1362     dp->port_seq = seq_create();
1363     fat_rwlock_init(&dp->upcall_rwlock);
1364
1365     dp->reconfigure_seq = seq_create();
1366     dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1367
1368     for (int i = 0; i < N_METER_LOCKS; ++i) {
1369         ovs_mutex_init_adaptive(&dp->meter_locks[i]);
1370     }
1371
1372     /* Disable upcalls by default. */
1373     dp_netdev_disable_upcall(dp);
1374     dp->upcall_aux = NULL;
1375     dp->upcall_cb = NULL;
1376
1377     conntrack_init(&dp->conntrack);
1378
1379     atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1380     atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
1381
1382     cmap_init(&dp->poll_threads);
1383
1384     ovs_mutex_init(&dp->tx_qid_pool_mutex);
1385     /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1386     dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1387
1388     ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1389     ovsthread_key_create(&dp->per_pmd_key, NULL);
1390
1391     ovs_mutex_lock(&dp->port_mutex);
1392     /* non-PMD will be created before all other threads and will
1393      * allocate static_tx_qid = 0. */
1394     dp_netdev_set_nonpmd(dp);
1395
1396     error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1397                                                              "internal"),
1398                         ODPP_LOCAL);
1399     ovs_mutex_unlock(&dp->port_mutex);
1400     if (error) {
1401         dp_netdev_free(dp);
1402         return error;
1403     }
1404
1405     dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1406     *dpp = dp;
1407     return 0;
1408 }
1409
1410 static void
1411 dp_netdev_request_reconfigure(struct dp_netdev *dp)
1412 {
1413     seq_change(dp->reconfigure_seq);
1414 }
1415
1416 static bool
1417 dp_netdev_is_reconf_required(struct dp_netdev *dp)
1418 {
1419     return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1420 }
1421
1422 static int
1423 dpif_netdev_open(const struct dpif_class *class, const char *name,
1424                  bool create, struct dpif **dpifp)
1425 {
1426     struct dp_netdev *dp;
1427     int error;
1428
1429     ovs_mutex_lock(&dp_netdev_mutex);
1430     dp = shash_find_data(&dp_netdevs, name);
1431     if (!dp) {
1432         error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1433     } else {
1434         error = (dp->class != class ? EINVAL
1435                  : create ? EEXIST
1436                  : 0);
1437     }
1438     if (!error) {
1439         *dpifp = create_dpif_netdev(dp);
1440         dp->dpif = *dpifp;
1441     }
1442     ovs_mutex_unlock(&dp_netdev_mutex);
1443
1444     return error;
1445 }
1446
1447 static void
1448 dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1449     OVS_NO_THREAD_SAFETY_ANALYSIS
1450 {
1451     /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1452     ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1453
1454     /* Before freeing a lock we should release it */
1455     fat_rwlock_unlock(&dp->upcall_rwlock);
1456     fat_rwlock_destroy(&dp->upcall_rwlock);
1457 }
1458
1459 static void
1460 dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
1461     OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
1462 {
1463     if (dp->meters[meter_id]) {
1464         free(dp->meters[meter_id]);
1465         dp->meters[meter_id] = NULL;
1466     }
1467 }
1468
1469 /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1470  * through the 'dp_netdevs' shash while freeing 'dp'. */
1471 static void
1472 dp_netdev_free(struct dp_netdev *dp)
1473     OVS_REQUIRES(dp_netdev_mutex)
1474 {
1475     struct dp_netdev_port *port, *next;
1476
1477     shash_find_and_delete(&dp_netdevs, dp->name);
1478
1479     ovs_mutex_lock(&dp->port_mutex);
1480     HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
1481         do_del_port(dp, port);
1482     }
1483     ovs_mutex_unlock(&dp->port_mutex);
1484
1485     dp_netdev_destroy_all_pmds(dp, true);
1486     cmap_destroy(&dp->poll_threads);
1487
1488     ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1489     id_pool_destroy(dp->tx_qid_pool);
1490
1491     ovs_mutex_destroy(&dp->non_pmd_mutex);
1492     ovsthread_key_delete(dp->per_pmd_key);
1493
1494     conntrack_destroy(&dp->conntrack);
1495
1496
1497     seq_destroy(dp->reconfigure_seq);
1498
1499     seq_destroy(dp->port_seq);
1500     hmap_destroy(&dp->ports);
1501     ovs_mutex_destroy(&dp->port_mutex);
1502
1503     /* Upcalls must be disabled at this point */
1504     dp_netdev_destroy_upcall_lock(dp);
1505
1506     int i;
1507
1508     for (i = 0; i < MAX_METERS; ++i) {
1509         meter_lock(dp, i);
1510         dp_delete_meter(dp, i);
1511         meter_unlock(dp, i);
1512     }
1513     for (i = 0; i < N_METER_LOCKS; ++i) {
1514         ovs_mutex_destroy(&dp->meter_locks[i]);
1515     }
1516
1517     free(dp->pmd_cmask);
1518     free(CONST_CAST(char *, dp->name));
1519     free(dp);
1520 }
1521
1522 static void
1523 dp_netdev_unref(struct dp_netdev *dp)
1524 {
1525     if (dp) {
1526         /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1527          * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1528         ovs_mutex_lock(&dp_netdev_mutex);
1529         if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1530             dp_netdev_free(dp);
1531         }
1532         ovs_mutex_unlock(&dp_netdev_mutex);
1533     }
1534 }
1535
1536 static void
1537 dpif_netdev_close(struct dpif *dpif)
1538 {
1539     struct dp_netdev *dp = get_dp_netdev(dpif);
1540
1541     dp_netdev_unref(dp);
1542     free(dpif);
1543 }
1544
1545 static int
1546 dpif_netdev_destroy(struct dpif *dpif)
1547 {
1548     struct dp_netdev *dp = get_dp_netdev(dpif);
1549
1550     if (!atomic_flag_test_and_set(&dp->destroyed)) {
1551         if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1552             /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1553             OVS_NOT_REACHED();
1554         }
1555     }
1556
1557     return 0;
1558 }
1559
1560 /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1561  * load/store semantics.  While the increment is not atomic, the load and
1562  * store operations are, making it impossible to read inconsistent values.
1563  *
1564  * This is used to update thread local stats counters. */
1565 static void
1566 non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1567 {
1568     unsigned long long tmp;
1569
1570     atomic_read_relaxed(var, &tmp);
1571     tmp += n;
1572     atomic_store_relaxed(var, tmp);
1573 }
1574
1575 static int
1576 dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
1577 {
1578     struct dp_netdev *dp = get_dp_netdev(dpif);
1579     struct dp_netdev_pmd_thread *pmd;
1580     uint64_t pmd_stats[PMD_N_STATS];
1581
1582     stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1583     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1584         stats->n_flows += cmap_count(&pmd->flow_table);
1585         pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
1586         stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
1587         stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
1588         stats->n_missed += pmd_stats[PMD_STAT_MISS];
1589         stats->n_lost += pmd_stats[PMD_STAT_LOST];
1590     }
1591     stats->n_masks = UINT32_MAX;
1592     stats->n_mask_hit = UINT64_MAX;
1593
1594     return 0;
1595 }
1596
1597 static void
1598 dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
1599 {
1600     if (pmd->core_id == NON_PMD_CORE_ID) {
1601         ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1602         ovs_mutex_lock(&pmd->port_mutex);
1603         pmd_load_cached_ports(pmd);
1604         ovs_mutex_unlock(&pmd->port_mutex);
1605         ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
1606         return;
1607     }
1608
1609     ovs_mutex_lock(&pmd->cond_mutex);
1610     seq_change(pmd->reload_seq);
1611     atomic_store_relaxed(&pmd->reload, true);
1612     ovs_mutex_cond_wait(&pmd->cond, &pmd->cond_mutex);
1613     ovs_mutex_unlock(&pmd->cond_mutex);
1614 }
1615
1616 static uint32_t
1617 hash_port_no(odp_port_t port_no)
1618 {
1619     return hash_int(odp_to_u32(port_no), 0);
1620 }
1621
1622 static int
1623 port_create(const char *devname, const char *type,
1624             odp_port_t port_no, struct dp_netdev_port **portp)
1625 {
1626     struct netdev_saved_flags *sf;
1627     struct dp_netdev_port *port;
1628     enum netdev_flags flags;
1629     struct netdev *netdev;
1630     int error;
1631
1632     *portp = NULL;
1633
1634     /* Open and validate network device. */
1635     error = netdev_open(devname, type, &netdev);
1636     if (error) {
1637         return error;
1638     }
1639     /* XXX reject non-Ethernet devices */
1640
1641     netdev_get_flags(netdev, &flags);
1642     if (flags & NETDEV_LOOPBACK) {
1643         VLOG_ERR("%s: cannot add a loopback device", devname);
1644         error = EINVAL;
1645         goto out;
1646     }
1647
1648     error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
1649     if (error) {
1650         VLOG_ERR("%s: cannot set promisc flag", devname);
1651         goto out;
1652     }
1653
1654     port = xzalloc(sizeof *port);
1655     port->port_no = port_no;
1656     port->netdev = netdev;
1657     port->type = xstrdup(type);
1658     port->sf = sf;
1659     port->need_reconfigure = true;
1660     ovs_mutex_init(&port->txq_used_mutex);
1661
1662     *portp = port;
1663
1664     return 0;
1665
1666 out:
1667     netdev_close(netdev);
1668     return error;
1669 }
1670
1671 static int
1672 do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1673             odp_port_t port_no)
1674     OVS_REQUIRES(dp->port_mutex)
1675 {
1676     struct dp_netdev_port *port;
1677     int error;
1678
1679     /* Reject devices already in 'dp'. */
1680     if (!get_port_by_name(dp, devname, &port)) {
1681         return EEXIST;
1682     }
1683
1684     error = port_create(devname, type, port_no, &port);
1685     if (error) {
1686         return error;
1687     }
1688
1689     hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1690     seq_change(dp->port_seq);
1691
1692     reconfigure_datapath(dp);
1693
1694     return 0;
1695 }
1696
1697 static int
1698 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
1699                      odp_port_t *port_nop)
1700 {
1701     struct dp_netdev *dp = get_dp_netdev(dpif);
1702     char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1703     const char *dpif_port;
1704     odp_port_t port_no;
1705     int error;
1706
1707     ovs_mutex_lock(&dp->port_mutex);
1708     dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
1709     if (*port_nop != ODPP_NONE) {
1710         port_no = *port_nop;
1711         error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
1712     } else {
1713         port_no = choose_port(dp, dpif_port);
1714         error = port_no == ODPP_NONE ? EFBIG : 0;
1715     }
1716     if (!error) {
1717         *port_nop = port_no;
1718         error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
1719     }
1720     ovs_mutex_unlock(&dp->port_mutex);
1721
1722     return error;
1723 }
1724
1725 static int
1726 dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
1727 {
1728     struct dp_netdev *dp = get_dp_netdev(dpif);
1729     int error;
1730
1731     ovs_mutex_lock(&dp->port_mutex);
1732     if (port_no == ODPP_LOCAL) {
1733         error = EINVAL;
1734     } else {
1735         struct dp_netdev_port *port;
1736
1737         error = get_port_by_number(dp, port_no, &port);
1738         if (!error) {
1739             do_del_port(dp, port);
1740         }
1741     }
1742     ovs_mutex_unlock(&dp->port_mutex);
1743
1744     return error;
1745 }
1746
1747 static bool
1748 is_valid_port_number(odp_port_t port_no)
1749 {
1750     return port_no != ODPP_NONE;
1751 }
1752
1753 static struct dp_netdev_port *
1754 dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
1755     OVS_REQUIRES(dp->port_mutex)
1756 {
1757     struct dp_netdev_port *port;
1758
1759     HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
1760         if (port->port_no == port_no) {
1761             return port;
1762         }
1763     }
1764     return NULL;
1765 }
1766
1767 static int
1768 get_port_by_number(struct dp_netdev *dp,
1769                    odp_port_t port_no, struct dp_netdev_port **portp)
1770     OVS_REQUIRES(dp->port_mutex)
1771 {
1772     if (!is_valid_port_number(port_no)) {
1773         *portp = NULL;
1774         return EINVAL;
1775     } else {
1776         *portp = dp_netdev_lookup_port(dp, port_no);
1777         return *portp ? 0 : ENODEV;
1778     }
1779 }
1780
1781 static void
1782 port_destroy(struct dp_netdev_port *port)
1783 {
1784     if (!port) {
1785         return;
1786     }
1787
1788     netdev_close(port->netdev);
1789     netdev_restore_flags(port->sf);
1790
1791     for (unsigned i = 0; i < port->n_rxq; i++) {
1792         netdev_rxq_close(port->rxqs[i].rx);
1793     }
1794     ovs_mutex_destroy(&port->txq_used_mutex);
1795     free(port->rxq_affinity_list);
1796     free(port->txq_used);
1797     free(port->rxqs);
1798     free(port->type);
1799     free(port);
1800 }
1801
1802 static int
1803 get_port_by_name(struct dp_netdev *dp,
1804                  const char *devname, struct dp_netdev_port **portp)
1805     OVS_REQUIRES(dp->port_mutex)
1806 {
1807     struct dp_netdev_port *port;
1808
1809     HMAP_FOR_EACH (port, node, &dp->ports) {
1810         if (!strcmp(netdev_get_name(port->netdev), devname)) {
1811             *portp = port;
1812             return 0;
1813         }
1814     }
1815
1816     /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a non
1817      * existing port. */
1818     return ENODEV;
1819 }
1820
1821 /* Returns 'true' if there is a port with pmd netdev. */
1822 static bool
1823 has_pmd_port(struct dp_netdev *dp)
1824     OVS_REQUIRES(dp->port_mutex)
1825 {
1826     struct dp_netdev_port *port;
1827
1828     HMAP_FOR_EACH (port, node, &dp->ports) {
1829         if (netdev_is_pmd(port->netdev)) {
1830             return true;
1831         }
1832     }
1833
1834     return false;
1835 }
1836
1837 static void
1838 do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
1839     OVS_REQUIRES(dp->port_mutex)
1840 {
1841     hmap_remove(&dp->ports, &port->node);
1842     seq_change(dp->port_seq);
1843
1844     reconfigure_datapath(dp);
1845
1846     port_destroy(port);
1847 }
1848
1849 static void
1850 answer_port_query(const struct dp_netdev_port *port,
1851                   struct dpif_port *dpif_port)
1852 {
1853     dpif_port->name = xstrdup(netdev_get_name(port->netdev));
1854     dpif_port->type = xstrdup(port->type);
1855     dpif_port->port_no = port->port_no;
1856 }
1857
1858 static int
1859 dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
1860                                  struct dpif_port *dpif_port)
1861 {
1862     struct dp_netdev *dp = get_dp_netdev(dpif);
1863     struct dp_netdev_port *port;
1864     int error;
1865
1866     ovs_mutex_lock(&dp->port_mutex);
1867     error = get_port_by_number(dp, port_no, &port);
1868     if (!error && dpif_port) {
1869         answer_port_query(port, dpif_port);
1870     }
1871     ovs_mutex_unlock(&dp->port_mutex);
1872
1873     return error;
1874 }
1875
1876 static int
1877 dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
1878                                struct dpif_port *dpif_port)
1879 {
1880     struct dp_netdev *dp = get_dp_netdev(dpif);
1881     struct dp_netdev_port *port;
1882     int error;
1883
1884     ovs_mutex_lock(&dp->port_mutex);
1885     error = get_port_by_name(dp, devname, &port);
1886     if (!error && dpif_port) {
1887         answer_port_query(port, dpif_port);
1888     }
1889     ovs_mutex_unlock(&dp->port_mutex);
1890
1891     return error;
1892 }
1893
1894 static void
1895 dp_netdev_flow_free(struct dp_netdev_flow *flow)
1896 {
1897     dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
1898     free(flow);
1899 }
1900
1901 static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
1902 {
1903     if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
1904         ovsrcu_postpone(dp_netdev_flow_free, flow);
1905     }
1906 }
1907
1908 static uint32_t
1909 dp_netdev_flow_hash(const ovs_u128 *ufid)
1910 {
1911     return ufid->u32[0];
1912 }
1913
1914 static inline struct dpcls *
1915 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
1916                            odp_port_t in_port)
1917 {
1918     struct dpcls *cls;
1919     uint32_t hash = hash_port_no(in_port);
1920     CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
1921         if (cls->in_port == in_port) {
1922             /* Port classifier exists already */
1923             return cls;
1924         }
1925     }
1926     return NULL;
1927 }
1928
1929 static inline struct dpcls *
1930 dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
1931                          odp_port_t in_port)
1932     OVS_REQUIRES(pmd->flow_mutex)
1933 {
1934     struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
1935     uint32_t hash = hash_port_no(in_port);
1936
1937     if (!cls) {
1938         /* Create new classifier for in_port */
1939         cls = xmalloc(sizeof(*cls));
1940         dpcls_init(cls);
1941         cls->in_port = in_port;
1942         cmap_insert(&pmd->classifiers, &cls->node, hash);
1943         VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
1944     }
1945     return cls;
1946 }
1947
1948 #define MAX_FLOW_MARK       (UINT32_MAX - 1)
1949 #define INVALID_FLOW_MARK   (UINT32_MAX)
1950
1951 struct megaflow_to_mark_data {
1952     const struct cmap_node node;
1953     ovs_u128 mega_ufid;
1954     uint32_t mark;
1955 };
1956
1957 struct flow_mark {
1958     struct cmap megaflow_to_mark;
1959     struct cmap mark_to_flow;
1960     struct id_pool *pool;
1961     struct ovs_mutex mutex;
1962 };
1963
1964 static struct flow_mark flow_mark = {
1965     .megaflow_to_mark = CMAP_INITIALIZER,
1966     .mark_to_flow = CMAP_INITIALIZER,
1967     .mutex = OVS_MUTEX_INITIALIZER,
1968 };
1969
1970 static uint32_t
1971 flow_mark_alloc(void)
1972 {
1973     uint32_t mark;
1974
1975     if (!flow_mark.pool) {
1976         /* Haven't initiated yet, do it here */
1977         flow_mark.pool = id_pool_create(0, MAX_FLOW_MARK);
1978     }
1979
1980     if (id_pool_alloc_id(flow_mark.pool, &mark)) {
1981         return mark;
1982     }
1983
1984     return INVALID_FLOW_MARK;
1985 }
1986
1987 static void
1988 flow_mark_free(uint32_t mark)
1989 {
1990     id_pool_free_id(flow_mark.pool, mark);
1991 }
1992
1993 /* associate megaflow with a mark, which is a 1:1 mapping */
1994 static void
1995 megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
1996 {
1997     size_t hash = dp_netdev_flow_hash(mega_ufid);
1998     struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
1999
2000     data->mega_ufid = *mega_ufid;
2001     data->mark = mark;
2002
2003     cmap_insert(&flow_mark.megaflow_to_mark,
2004                 CONST_CAST(struct cmap_node *, &data->node), hash);
2005 }
2006
2007 /* disassociate meagaflow with a mark */
2008 static void
2009 megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
2010 {
2011     size_t hash = dp_netdev_flow_hash(mega_ufid);
2012     struct megaflow_to_mark_data *data;
2013
2014     CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2015         if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2016             cmap_remove(&flow_mark.megaflow_to_mark,
2017                         CONST_CAST(struct cmap_node *, &data->node), hash);
2018             free(data);
2019             return;
2020         }
2021     }
2022
2023     VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
2024               UUID_ARGS((struct uuid *)mega_ufid));
2025 }
2026
2027 static inline uint32_t
2028 megaflow_to_mark_find(const ovs_u128 *mega_ufid)
2029 {
2030     size_t hash = dp_netdev_flow_hash(mega_ufid);
2031     struct megaflow_to_mark_data *data;
2032
2033     CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2034         if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2035             return data->mark;
2036         }
2037     }
2038
2039     VLOG_WARN("Mark id for ufid "UUID_FMT" was not found\n",
2040               UUID_ARGS((struct uuid *)mega_ufid));
2041     return INVALID_FLOW_MARK;
2042 }
2043
2044 /* associate mark with a flow, which is 1:N mapping */
2045 static void
2046 mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
2047 {
2048     dp_netdev_flow_ref(flow);
2049
2050     cmap_insert(&flow_mark.mark_to_flow,
2051                 CONST_CAST(struct cmap_node *, &flow->mark_node),
2052                 hash_int(mark, 0));
2053     flow->mark = mark;
2054
2055     VLOG_DBG("Associated dp_netdev flow %p with mark %u\n", flow, mark);
2056 }
2057
2058 static bool
2059 flow_mark_has_no_ref(uint32_t mark)
2060 {
2061     struct dp_netdev_flow *flow;
2062
2063     CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2064                              &flow_mark.mark_to_flow) {
2065         if (flow->mark == mark) {
2066             return false;
2067         }
2068     }
2069
2070     return true;
2071 }
2072
2073 static int
2074 mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd,
2075                           struct dp_netdev_flow *flow)
2076 {
2077     int ret = 0;
2078     uint32_t mark = flow->mark;
2079     struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
2080                                              &flow->mark_node);
2081
2082     cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0));
2083     flow->mark = INVALID_FLOW_MARK;
2084
2085     /*
2086      * no flow is referencing the mark any more? If so, let's
2087      * remove the flow from hardware and free the mark.
2088      */
2089     if (flow_mark_has_no_ref(mark)) {
2090         struct dp_netdev_port *port;
2091         odp_port_t in_port = flow->flow.in_port.odp_port;
2092
2093         ovs_mutex_lock(&pmd->dp->port_mutex);
2094         port = dp_netdev_lookup_port(pmd->dp, in_port);
2095         if (port) {
2096             ret = netdev_flow_del(port->netdev, &flow->mega_ufid, NULL);
2097         }
2098         ovs_mutex_unlock(&pmd->dp->port_mutex);
2099
2100         flow_mark_free(mark);
2101         VLOG_DBG("Freed flow mark %u\n", mark);
2102
2103         megaflow_to_mark_disassociate(&flow->mega_ufid);
2104     }
2105     dp_netdev_flow_unref(flow);
2106
2107     return ret;
2108 }
2109
2110 static void
2111 flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
2112 {
2113     struct dp_netdev_flow *flow;
2114
2115     CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) {
2116         if (flow->pmd_id == pmd->core_id) {
2117             mark_to_flow_disassociate(pmd, flow);
2118         }
2119     }
2120 }
2121
2122 static void
2123 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
2124                           struct dp_netdev_flow *flow)
2125     OVS_REQUIRES(pmd->flow_mutex)
2126 {
2127     struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2128     struct dpcls *cls;
2129     odp_port_t in_port = flow->flow.in_port.odp_port;
2130
2131     cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2132     ovs_assert(cls != NULL);
2133     dpcls_remove(cls, &flow->cr);
2134     cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
2135     if (flow->mark != INVALID_FLOW_MARK) {
2136         mark_to_flow_disassociate(pmd, flow);
2137     }
2138     flow->dead = true;
2139
2140     dp_netdev_flow_unref(flow);
2141 }
2142
2143 static void
2144 dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
2145 {
2146     struct dp_netdev_flow *netdev_flow;
2147
2148     ovs_mutex_lock(&pmd->flow_mutex);
2149     CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2150         dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2151     }
2152     ovs_mutex_unlock(&pmd->flow_mutex);
2153 }
2154
2155 static int
2156 dpif_netdev_flow_flush(struct dpif *dpif)
2157 {
2158     struct dp_netdev *dp = get_dp_netdev(dpif);
2159     struct dp_netdev_pmd_thread *pmd;
2160
2161     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2162         dp_netdev_pmd_flow_flush(pmd);
2163     }
2164
2165     return 0;
2166 }
2167
2168 struct dp_netdev_port_state {
2169     struct hmap_position position;
2170     char *name;
2171 };
2172
2173 static int
2174 dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2175 {
2176     *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2177     return 0;
2178 }
2179
2180 static int
2181 dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
2182                            struct dpif_port *dpif_port)
2183 {
2184     struct dp_netdev_port_state *state = state_;
2185     struct dp_netdev *dp = get_dp_netdev(dpif);
2186     struct hmap_node *node;
2187     int retval;
2188
2189     ovs_mutex_lock(&dp->port_mutex);
2190     node = hmap_at_position(&dp->ports, &state->position);
2191     if (node) {
2192         struct dp_netdev_port *port;
2193
2194         port = CONTAINER_OF(node, struct dp_netdev_port, node);
2195
2196         free(state->name);
2197         state->name = xstrdup(netdev_get_name(port->netdev));
2198         dpif_port->name = state->name;
2199         dpif_port->type = port->type;
2200         dpif_port->port_no = port->port_no;
2201
2202         retval = 0;
2203     } else {
2204         retval = EOF;
2205     }
2206     ovs_mutex_unlock(&dp->port_mutex);
2207
2208     return retval;
2209 }
2210
2211 static int
2212 dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
2213 {
2214     struct dp_netdev_port_state *state = state_;
2215     free(state->name);
2216     free(state);
2217     return 0;
2218 }
2219
2220 static int
2221 dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
2222 {
2223     struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2224     uint64_t new_port_seq;
2225     int error;
2226
2227     new_port_seq = seq_read(dpif->dp->port_seq);
2228     if (dpif->last_port_seq != new_port_seq) {
2229         dpif->last_port_seq = new_port_seq;
2230         error = ENOBUFS;
2231     } else {
2232         error = EAGAIN;
2233     }
2234
2235     return error;
2236 }
2237
2238 static void
2239 dpif_netdev_port_poll_wait(const struct dpif *dpif_)
2240 {
2241     struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2242
2243     seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
2244 }
2245
2246 static struct dp_netdev_flow *
2247 dp_netdev_flow_cast(const struct dpcls_rule *cr)
2248 {
2249     return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
2250 }
2251
2252 static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
2253 {
2254     return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
2255 }
2256
2257 /* netdev_flow_key utilities.
2258  *
2259  * netdev_flow_key is basically a miniflow.  We use these functions
2260  * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2261  * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2262  *
2263  * - Since we are dealing exclusively with miniflows created by
2264  *   miniflow_extract(), if the map is different the miniflow is different.
2265  *   Therefore we can be faster by comparing the map and the miniflow in a
2266  *   single memcmp().
2267  * - These functions can be inlined by the compiler. */
2268
2269 /* Given the number of bits set in miniflow's maps, returns the size of the
2270  * 'netdev_flow_key.mf' */
2271 static inline size_t
2272 netdev_flow_key_size(size_t flow_u64s)
2273 {
2274     return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
2275 }
2276
2277 static inline bool
2278 netdev_flow_key_equal(const struct netdev_flow_key *a,
2279                       const struct netdev_flow_key *b)
2280 {
2281     /* 'b->len' may be not set yet. */
2282     return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
2283 }
2284
2285 /* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
2286  * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
2287  * generated by miniflow_extract. */
2288 static inline bool
2289 netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
2290                          const struct miniflow *mf)
2291 {
2292     return !memcmp(&key->mf, mf, key->len);
2293 }
2294
2295 static inline void
2296 netdev_flow_key_clone(struct netdev_flow_key *dst,
2297                       const struct netdev_flow_key *src)
2298 {
2299     memcpy(dst, src,
2300            offsetof(struct netdev_flow_key, mf) + src->len);
2301 }
2302
2303 /* Initialize a netdev_flow_key 'mask' from 'match'. */
2304 static inline void
2305 netdev_flow_mask_init(struct netdev_flow_key *mask,
2306                       const struct match *match)
2307 {
2308     uint64_t *dst = miniflow_values(&mask->mf);
2309     struct flowmap fmap;
2310     uint32_t hash = 0;
2311     size_t idx;
2312
2313     /* Only check masks that make sense for the flow. */
2314     flow_wc_map(&match->flow, &fmap);
2315     flowmap_init(&mask->mf.map);
2316
2317     FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
2318         uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
2319
2320         if (mask_u64) {
2321             flowmap_set(&mask->mf.map, idx, 1);
2322             *dst++ = mask_u64;
2323             hash = hash_add64(hash, mask_u64);
2324         }
2325     }
2326
2327     map_t map;
2328
2329     FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
2330         hash = hash_add64(hash, map);
2331     }
2332
2333     size_t n = dst - miniflow_get_values(&mask->mf);
2334
2335     mask->hash = hash_finish(hash, n * 8);
2336     mask->len = netdev_flow_key_size(n);
2337 }
2338
2339 /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
2340 static inline void
2341 netdev_flow_key_init_masked(struct netdev_flow_key *dst,
2342                             const struct flow *flow,
2343                             const struct netdev_flow_key *mask)
2344 {
2345     uint64_t *dst_u64 = miniflow_values(&dst->mf);
2346     const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
2347     uint32_t hash = 0;
2348     uint64_t value;
2349
2350     dst->len = mask->len;
2351     dst->mf = mask->mf;   /* Copy maps. */
2352
2353     FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
2354         *dst_u64 = value & *mask_u64++;
2355         hash = hash_add64(hash, *dst_u64++);
2356     }
2357     dst->hash = hash_finish(hash,
2358                             (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
2359 }
2360
2361 /* Iterate through netdev_flow_key TNL u64 values specified by 'FLOWMAP'. */
2362 #define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP)   \
2363     MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)
2364
2365 /* Returns a hash value for the bits of 'key' where there are 1-bits in
2366  * 'mask'. */
2367 static inline uint32_t
2368 netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
2369                              const struct netdev_flow_key *mask)
2370 {
2371     const uint64_t *p = miniflow_get_values(&mask->mf);
2372     uint32_t hash = 0;
2373     uint64_t value;
2374
2375     NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) {
2376         hash = hash_add64(hash, value & *p++);
2377     }
2378
2379     return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
2380 }
2381
2382 static inline bool
2383 emc_entry_alive(struct emc_entry *ce)
2384 {
2385     return ce->flow && !ce->flow->dead;
2386 }
2387
2388 static void
2389 emc_clear_entry(struct emc_entry *ce)
2390 {
2391     if (ce->flow) {
2392         dp_netdev_flow_unref(ce->flow);
2393         ce->flow = NULL;
2394     }
2395 }
2396
2397 static inline void
2398 emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
2399                  const struct netdev_flow_key *key)
2400 {
2401     if (ce->flow != flow) {
2402         if (ce->flow) {
2403             dp_netdev_flow_unref(ce->flow);
2404         }
2405
2406         if (dp_netdev_flow_ref(flow)) {
2407             ce->flow = flow;
2408         } else {
2409             ce->flow = NULL;
2410         }
2411     }
2412     if (key) {
2413         netdev_flow_key_clone(&ce->key, key);
2414     }
2415 }
2416
2417 static inline void
2418 emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
2419            struct dp_netdev_flow *flow)
2420 {
2421     struct emc_entry *to_be_replaced = NULL;
2422     struct emc_entry *current_entry;
2423
2424     EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2425         if (netdev_flow_key_equal(&current_entry->key, key)) {
2426             /* We found the entry with the 'mf' miniflow */
2427             emc_change_entry(current_entry, flow, NULL);
2428             return;
2429         }
2430
2431         /* Replacement policy: put the flow in an empty (not alive) entry, or
2432          * in the first entry where it can be */
2433         if (!to_be_replaced
2434             || (emc_entry_alive(to_be_replaced)
2435                 && !emc_entry_alive(current_entry))
2436             || current_entry->key.hash < to_be_replaced->key.hash) {
2437             to_be_replaced = current_entry;
2438         }
2439     }
2440     /* We didn't find the miniflow in the cache.
2441      * The 'to_be_replaced' entry is where the new flow will be stored */
2442
2443     emc_change_entry(to_be_replaced, flow, key);
2444 }
2445
2446 static inline void
2447 emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2448                          const struct netdev_flow_key *key,
2449                          struct dp_netdev_flow *flow)
2450 {
2451     /* Insert an entry into the EMC based on probability value 'min'. By
2452      * default the value is UINT32_MAX / 100 which yields an insertion
2453      * probability of 1/100 ie. 1% */
2454
2455     uint32_t min;
2456     atomic_read_relaxed(&pmd->dp->emc_insert_min, &min);
2457
2458     if (min && random_uint32() <= min) {
2459         emc_insert(&pmd->flow_cache, key, flow);
2460     }
2461 }
2462
2463 static inline struct dp_netdev_flow *
2464 emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
2465 {
2466     struct emc_entry *current_entry;
2467
2468     EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2469         if (current_entry->key.hash == key->hash
2470             && emc_entry_alive(current_entry)
2471             && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
2472
2473             /* We found the entry with the 'key->mf' miniflow */
2474             return current_entry->flow;
2475         }
2476     }
2477
2478     return NULL;
2479 }
2480
2481 static struct dp_netdev_flow *
2482 dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
2483                           const struct netdev_flow_key *key,
2484                           int *lookup_num_p)
2485 {
2486     struct dpcls *cls;
2487     struct dpcls_rule *rule;
2488     odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
2489                                                      in_port.odp_port));
2490     struct dp_netdev_flow *netdev_flow = NULL;
2491
2492     cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2493     if (OVS_LIKELY(cls)) {
2494         dpcls_lookup(cls, key, &rule, 1, lookup_num_p);
2495         netdev_flow = dp_netdev_flow_cast(rule);
2496     }
2497     return netdev_flow;
2498 }
2499
2500 static struct dp_netdev_flow *
2501 dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
2502                         const ovs_u128 *ufidp, const struct nlattr *key,
2503                         size_t key_len)
2504 {
2505     struct dp_netdev_flow *netdev_flow;
2506     struct flow flow;
2507     ovs_u128 ufid;
2508
2509     /* If a UFID is not provided, determine one based on the key. */
2510     if (!ufidp && key && key_len
2511         && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
2512         dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
2513         ufidp = &ufid;
2514     }
2515
2516     if (ufidp) {
2517         CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
2518                                  &pmd->flow_table) {
2519             if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
2520                 return netdev_flow;
2521             }
2522         }
2523     }
2524
2525     return NULL;
2526 }
2527
2528 static void
2529 get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
2530                     struct dpif_flow_stats *stats)
2531 {
2532     struct dp_netdev_flow *netdev_flow;
2533     unsigned long long n;
2534     long long used;
2535     uint16_t flags;
2536
2537     netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
2538
2539     atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
2540     stats->n_packets = n;
2541     atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
2542     stats->n_bytes = n;
2543     atomic_read_relaxed(&netdev_flow->stats.used, &used);
2544     stats->used = used;
2545     atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
2546     stats->tcp_flags = flags;
2547 }
2548
2549 /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
2550  * storing the netlink-formatted key/mask. 'key_buf' may be the same as
2551  * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
2552  * protect them. */
2553 static void
2554 dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
2555                             struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
2556                             struct dpif_flow *flow, bool terse)
2557 {
2558     if (terse) {
2559         memset(flow, 0, sizeof *flow);
2560     } else {
2561         struct flow_wildcards wc;
2562         struct dp_netdev_actions *actions;
2563         size_t offset;
2564         struct odp_flow_key_parms odp_parms = {
2565             .flow = &netdev_flow->flow,
2566             .mask = &wc.masks,
2567             .support = dp_netdev_support,
2568         };
2569
2570         miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
2571         /* in_port is exact matched, but we have left it out from the mask for
2572          * optimnization reasons. Add in_port back to the mask. */
2573         wc.masks.in_port.odp_port = ODPP_NONE;
2574
2575         /* Key */
2576         offset = key_buf->size;
2577         flow->key = ofpbuf_tail(key_buf);
2578         odp_flow_key_from_flow(&odp_parms, key_buf);
2579         flow->key_len = key_buf->size - offset;
2580
2581         /* Mask */
2582         offset = mask_buf->size;
2583         flow->mask = ofpbuf_tail(mask_buf);
2584         odp_parms.key_buf = key_buf;
2585         odp_flow_key_from_mask(&odp_parms, mask_buf);
2586         flow->mask_len = mask_buf->size - offset;
2587
2588         /* Actions */
2589         actions = dp_netdev_flow_get_actions(netdev_flow);
2590         flow->actions = actions->actions;
2591         flow->actions_len = actions->size;
2592     }
2593
2594     flow->ufid = netdev_flow->ufid;
2595     flow->ufid_present = true;
2596     flow->pmd_id = netdev_flow->pmd_id;
2597     get_dpif_flow_stats(netdev_flow, &flow->stats);
2598 }
2599
2600 static int
2601 dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2602                               const struct nlattr *mask_key,
2603                               uint32_t mask_key_len, const struct flow *flow,
2604                               struct flow_wildcards *wc, bool probe)
2605 {
2606     enum odp_key_fitness fitness;
2607
2608     fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow);
2609     if (fitness) {
2610         if (!probe) {
2611             /* This should not happen: it indicates that
2612              * odp_flow_key_from_mask() and odp_flow_key_to_mask()
2613              * disagree on the acceptable form of a mask.  Log the problem
2614              * as an error, with enough details to enable debugging. */
2615             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2616
2617             if (!VLOG_DROP_ERR(&rl)) {
2618                 struct ds s;
2619
2620                 ds_init(&s);
2621                 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
2622                                 true);
2623                 VLOG_ERR("internal error parsing flow mask %s (%s)",
2624                 ds_cstr(&s), odp_key_fitness_to_string(fitness));
2625                 ds_destroy(&s);
2626             }
2627         }
2628
2629         return EINVAL;
2630     }
2631
2632     return 0;
2633 }
2634
2635 static int
2636 dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2637                               struct flow *flow, bool probe)
2638 {
2639     if (odp_flow_key_to_flow(key, key_len, flow)) {
2640         if (!probe) {
2641             /* This should not happen: it indicates that
2642              * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
2643              * the acceptable form of a flow.  Log the problem as an error,
2644              * with enough details to enable debugging. */
2645             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2646
2647             if (!VLOG_DROP_ERR(&rl)) {
2648                 struct ds s;
2649
2650                 ds_init(&s);
2651                 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
2652                 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
2653                 ds_destroy(&s);
2654             }
2655         }
2656
2657         return EINVAL;
2658     }
2659
2660     if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
2661         return EINVAL;
2662     }
2663
2664     return 0;
2665 }
2666
2667 static int
2668 dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
2669 {
2670     struct dp_netdev *dp = get_dp_netdev(dpif);
2671     struct dp_netdev_flow *netdev_flow;
2672     struct dp_netdev_pmd_thread *pmd;
2673     struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
2674     struct hmapx_node *node;
2675     int error = EINVAL;
2676
2677     if (get->pmd_id == PMD_ID_NULL) {
2678         CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2679             if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
2680                 dp_netdev_pmd_unref(pmd);
2681             }
2682         }
2683     } else {
2684         pmd = dp_netdev_get_pmd(dp, get->pmd_id);
2685         if (!pmd) {
2686             goto out;
2687         }
2688         hmapx_add(&to_find, pmd);
2689     }
2690
2691     if (!hmapx_count(&to_find)) {
2692         goto out;
2693     }
2694
2695     HMAPX_FOR_EACH (node, &to_find) {
2696         pmd = (struct dp_netdev_pmd_thread *) node->data;
2697         netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
2698                                               get->key_len);
2699         if (netdev_flow) {
2700             dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
2701                                         get->flow, false);
2702             error = 0;
2703             break;
2704         } else {
2705             error = ENOENT;
2706         }
2707     }
2708
2709     HMAPX_FOR_EACH (node, &to_find) {
2710         pmd = (struct dp_netdev_pmd_thread *) node->data;
2711         dp_netdev_pmd_unref(pmd);
2712     }
2713 out:
2714     hmapx_destroy(&to_find);
2715     return error;
2716 }
2717
2718 /*
2719  * There are two flow offload operations here: addition and modification.
2720  *
2721  * For flow addition, this function does:
2722  * - allocate a new flow mark id
2723  * - perform hardware flow offload
2724  * - associate the flow mark with flow and mega flow
2725  *
2726  * For flow modification, both flow mark and the associations are still
2727  * valid, thus only item 2 needed.
2728  */
2729 static void
2730 try_netdev_flow_put(struct dp_netdev_pmd_thread *pmd, odp_port_t in_port,
2731                     struct dp_netdev_flow *flow, struct match *match,
2732                     const struct nlattr *actions, size_t actions_len)
2733 {
2734     struct offload_info info;
2735     struct dp_netdev_port *port;
2736     bool modification = flow->mark != INVALID_FLOW_MARK;
2737     const char *op = modification ? "modify" : "add";
2738     uint32_t mark;
2739     int ret;
2740
2741     ovs_mutex_lock(&flow_mark.mutex);
2742
2743     if (modification) {
2744         mark = flow->mark;
2745     } else {
2746         if (!netdev_is_flow_api_enabled()) {
2747             goto out;
2748         }
2749
2750         /*
2751          * If a mega flow has already been offloaded (from other PMD
2752          * instances), do not offload it again.
2753          */
2754         mark = megaflow_to_mark_find(&flow->mega_ufid);
2755         if (mark != INVALID_FLOW_MARK) {
2756             VLOG_DBG("Flow has already been offloaded with mark %u\n", mark);
2757             mark_to_flow_associate(mark, flow);
2758             goto out;
2759         }
2760
2761         mark = flow_mark_alloc();
2762         if (mark == INVALID_FLOW_MARK) {
2763             VLOG_ERR("Failed to allocate flow mark!\n");
2764             goto out;
2765         }
2766     }
2767     info.flow_mark = mark;
2768
2769     ovs_mutex_lock(&pmd->dp->port_mutex);
2770     port = dp_netdev_lookup_port(pmd->dp, in_port);
2771     if (!port) {
2772         ovs_mutex_unlock(&pmd->dp->port_mutex);
2773         goto out;
2774     }
2775     ret = netdev_flow_put(port->netdev, match,
2776                           CONST_CAST(struct nlattr *, actions),
2777                           actions_len, &flow->mega_ufid, &info, NULL);
2778     ovs_mutex_unlock(&pmd->dp->port_mutex);
2779
2780     if (ret) {
2781         VLOG_ERR("Failed to %s netdev flow with mark %u\n", op, mark);
2782         if (!modification) {
2783             flow_mark_free(mark);
2784         } else {
2785             mark_to_flow_disassociate(pmd, flow);
2786         }
2787         goto out;
2788     }
2789
2790     if (!modification) {
2791         megaflow_to_mark_associate(&flow->mega_ufid, mark);
2792         mark_to_flow_associate(mark, flow);
2793     }
2794     VLOG_DBG("Succeed to %s netdev flow with mark %u\n", op, mark);
2795
2796 out:
2797     ovs_mutex_unlock(&flow_mark.mutex);
2798 }
2799
2800 static void
2801 dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
2802 {
2803     struct flow masked_flow;
2804     size_t i;
2805
2806     for (i = 0; i < sizeof(struct flow); i++) {
2807         ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
2808                                        ((uint8_t *)&match->wc)[i];
2809     }
2810     dpif_flow_hash(NULL, &masked_flow, sizeof(struct flow), mega_ufid);
2811 }
2812
2813 static struct dp_netdev_flow *
2814 dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
2815                    struct match *match, const ovs_u128 *ufid,
2816                    const struct nlattr *actions, size_t actions_len)
2817     OVS_REQUIRES(pmd->flow_mutex)
2818 {
2819     struct dp_netdev_flow *flow;
2820     struct netdev_flow_key mask;
2821     struct dpcls *cls;
2822
2823     /* Make sure in_port is exact matched before we read it. */
2824     ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
2825     odp_port_t in_port = match->flow.in_port.odp_port;
2826
2827     /* As we select the dpcls based on the port number, each netdev flow
2828      * belonging to the same dpcls will have the same odp_port value.
2829      * For performance reasons we wildcard odp_port here in the mask.  In the
2830      * typical case dp_hash is also wildcarded, and the resulting 8-byte
2831      * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
2832      * will not be part of the subtable mask.
2833      * This will speed up the hash computation during dpcls_lookup() because
2834      * there is one less call to hash_add64() in this case. */
2835     match->wc.masks.in_port.odp_port = 0;
2836     netdev_flow_mask_init(&mask, match);
2837     match->wc.masks.in_port.odp_port = ODPP_NONE;
2838
2839     /* Make sure wc does not have metadata. */
2840     ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
2841                && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
2842
2843     /* Do not allocate extra space. */
2844     flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
2845     memset(&flow->stats, 0, sizeof flow->stats);
2846     flow->dead = false;
2847     flow->batch = NULL;
2848     flow->mark = INVALID_FLOW_MARK;
2849     *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
2850     *CONST_CAST(struct flow *, &flow->flow) = match->flow;
2851     *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
2852     ovs_refcount_init(&flow->ref_cnt);
2853     ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
2854
2855     dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
2856     netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
2857
2858     /* Select dpcls for in_port. Relies on in_port to be exact match. */
2859     cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
2860     dpcls_insert(cls, &flow->cr, &mask);
2861
2862     cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
2863                 dp_netdev_flow_hash(&flow->ufid));
2864
2865     try_netdev_flow_put(pmd, in_port, flow, match, actions, actions_len);
2866
2867     if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
2868         struct ds ds = DS_EMPTY_INITIALIZER;
2869         struct ofpbuf key_buf, mask_buf;
2870         struct odp_flow_key_parms odp_parms = {
2871             .flow = &match->flow,
2872             .mask = &match->wc.masks,
2873             .support = dp_netdev_support,
2874         };
2875
2876         ofpbuf_init(&key_buf, 0);
2877         ofpbuf_init(&mask_buf, 0);
2878
2879         odp_flow_key_from_flow(&odp_parms, &key_buf);
2880         odp_parms.key_buf = &key_buf;
2881         odp_flow_key_from_mask(&odp_parms, &mask_buf);
2882
2883         ds_put_cstr(&ds, "flow_add: ");
2884         odp_format_ufid(ufid, &ds);
2885         ds_put_cstr(&ds, " ");
2886         odp_flow_format(key_buf.data, key_buf.size,
2887                         mask_buf.data, mask_buf.size,
2888                         NULL, &ds, false);
2889         ds_put_cstr(&ds, ", actions:");
2890         format_odp_actions(&ds, actions, actions_len, NULL);
2891
2892         VLOG_DBG("%s", ds_cstr(&ds));
2893
2894         ofpbuf_uninit(&key_buf);
2895         ofpbuf_uninit(&mask_buf);
2896
2897         /* Add a printout of the actual match installed. */
2898         struct match m;
2899         ds_clear(&ds);
2900         ds_put_cstr(&ds, "flow match: ");
2901         miniflow_expand(&flow->cr.flow.mf, &m.flow);
2902         miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
2903         memset(&m.tun_md, 0, sizeof m.tun_md);
2904         match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
2905
2906         VLOG_DBG("%s", ds_cstr(&ds));
2907
2908         ds_destroy(&ds);
2909     }
2910
2911     return flow;
2912 }
2913
2914 static int
2915 flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
2916                 struct netdev_flow_key *key,
2917                 struct match *match,
2918                 ovs_u128 *ufid,
2919                 const struct dpif_flow_put *put,
2920                 struct dpif_flow_stats *stats)
2921 {
2922     struct dp_netdev_flow *netdev_flow;
2923     int error = 0;
2924
2925     if (stats) {
2926         memset(stats, 0, sizeof *stats);
2927     }
2928
2929     ovs_mutex_lock(&pmd->flow_mutex);
2930     netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
2931     if (!netdev_flow) {
2932         if (put->flags & DPIF_FP_CREATE) {
2933             if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
2934                 dp_netdev_flow_add(pmd, match, ufid, put->actions,
2935                                    put->actions_len);
2936                 error = 0;
2937             } else {
2938                 error = EFBIG;
2939             }
2940         } else {
2941             error = ENOENT;
2942         }
2943     } else {
2944         if (put->flags & DPIF_FP_MODIFY) {
2945             struct dp_netdev_actions *new_actions;
2946             struct dp_netdev_actions *old_actions;
2947             odp_port_t in_port = netdev_flow->flow.in_port.odp_port;
2948
2949             new_actions = dp_netdev_actions_create(put->actions,
2950                                                    put->actions_len);
2951
2952             old_actions = dp_netdev_flow_get_actions(netdev_flow);
2953             ovsrcu_set(&netdev_flow->actions, new_actions);
2954
2955             try_netdev_flow_put(pmd, in_port, netdev_flow, match,
2956                                 put->actions, put->actions_len);
2957
2958             if (stats) {
2959                 get_dpif_flow_stats(netdev_flow, stats);
2960             }
2961             if (put->flags & DPIF_FP_ZERO_STATS) {
2962                 /* XXX: The userspace datapath uses thread local statistics
2963                  * (for flows), which should be updated only by the owning
2964                  * thread.  Since we cannot write on stats memory here,
2965                  * we choose not to support this flag.  Please note:
2966                  * - This feature is currently used only by dpctl commands with
2967                  *   option --clear.
2968                  * - Should the need arise, this operation can be implemented
2969                  *   by keeping a base value (to be update here) for each
2970                  *   counter, and subtracting it before outputting the stats */
2971                 error = EOPNOTSUPP;
2972             }
2973
2974             ovsrcu_postpone(dp_netdev_actions_free, old_actions);
2975         } else if (put->flags & DPIF_FP_CREATE) {
2976             error = EEXIST;
2977         } else {
2978             /* Overlapping flow. */
2979             error = EINVAL;
2980         }
2981     }
2982     ovs_mutex_unlock(&pmd->flow_mutex);
2983     return error;
2984 }
2985
2986 static int
2987 dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
2988 {
2989     struct dp_netdev *dp = get_dp_netdev(dpif);
2990     struct netdev_flow_key key, mask;
2991     struct dp_netdev_pmd_thread *pmd;
2992     struct match match;
2993     ovs_u128 ufid;
2994     int error;
2995     bool probe = put->flags & DPIF_FP_PROBE;
2996
2997     if (put->stats) {
2998         memset(put->stats, 0, sizeof *put->stats);
2999     }
3000     error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
3001                                           probe);
3002     if (error) {
3003         return error;
3004     }
3005     error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
3006                                           put->mask, put->mask_len,
3007                                           &match.flow, &match.wc, probe);
3008     if (error) {
3009         return error;
3010     }
3011
3012     if (put->ufid) {
3013         ufid = *put->ufid;
3014     } else {
3015         dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
3016     }
3017
3018     /* Must produce a netdev_flow_key for lookup.
3019      * Use the same method as employed to create the key when adding
3020      * the flow to the dplcs to make sure they match. */
3021     netdev_flow_mask_init(&mask, &match);
3022     netdev_flow_key_init_masked(&key, &match.flow, &mask);
3023
3024     if (put->pmd_id == PMD_ID_NULL) {
3025         if (cmap_count(&dp->poll_threads) == 0) {
3026             return EINVAL;
3027         }
3028         CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3029             struct dpif_flow_stats pmd_stats;
3030             int pmd_error;
3031
3032             pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
3033                                         &pmd_stats);
3034             if (pmd_error) {
3035                 error = pmd_error;
3036             } else if (put->stats) {
3037                 put->stats->n_packets += pmd_stats.n_packets;
3038                 put->stats->n_bytes += pmd_stats.n_bytes;
3039                 put->stats->used = MAX(put->stats->used, pmd_stats.used);
3040                 put->stats->tcp_flags |= pmd_stats.tcp_flags;
3041             }
3042         }
3043     } else {
3044         pmd = dp_netdev_get_pmd(dp, put->pmd_id);
3045         if (!pmd) {
3046             return EINVAL;
3047         }
3048         error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
3049         dp_netdev_pmd_unref(pmd);
3050     }
3051
3052     return error;
3053 }
3054
3055 static int
3056 flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3057                 struct dpif_flow_stats *stats,
3058                 const struct dpif_flow_del *del)
3059 {
3060     struct dp_netdev_flow *netdev_flow;
3061     int error = 0;
3062
3063     ovs_mutex_lock(&pmd->flow_mutex);
3064     netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3065                                           del->key_len);
3066     if (netdev_flow) {
3067         if (stats) {
3068             get_dpif_flow_stats(netdev_flow, stats);
3069         }
3070         dp_netdev_pmd_remove_flow(pmd, netdev_flow);
3071     } else {
3072         error = ENOENT;
3073     }
3074     ovs_mutex_unlock(&pmd->flow_mutex);
3075
3076     return error;
3077 }
3078
3079 static int
3080 dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
3081 {
3082     struct dp_netdev *dp = get_dp_netdev(dpif);
3083     struct dp_netdev_pmd_thread *pmd;
3084     int error = 0;
3085
3086     if (del->stats) {
3087         memset(del->stats, 0, sizeof *del->stats);
3088     }
3089
3090     if (del->pmd_id == PMD_ID_NULL) {
3091         if (cmap_count(&dp->poll_threads) == 0) {
3092             return EINVAL;
3093         }
3094         CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3095             struct dpif_flow_stats pmd_stats;
3096             int pmd_error;
3097
3098             pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
3099             if (pmd_error) {
3100                 error = pmd_error;
3101             } else if (del->stats) {
3102                 del->stats->n_packets += pmd_stats.n_packets;
3103                 del->stats->n_bytes += pmd_stats.n_bytes;
3104                 del->stats->used = MAX(del->stats->used, pmd_stats.used);
3105                 del->stats->tcp_flags |= pmd_stats.tcp_flags;
3106             }
3107         }
3108     } else {
3109         pmd = dp_netdev_get_pmd(dp, del->pmd_id);
3110         if (!pmd) {
3111             return EINVAL;
3112         }
3113         error = flow_del_on_pmd(pmd, del->stats, del);
3114         dp_netdev_pmd_unref(pmd);
3115     }
3116
3117
3118     return error;
3119 }
3120
3121 struct dpif_netdev_flow_dump {
3122     struct dpif_flow_dump up;
3123     struct cmap_position poll_thread_pos;
3124     struct cmap_position flow_pos;
3125     struct dp_netdev_pmd_thread *cur_pmd;
3126     int status;
3127     struct ovs_mutex mutex;
3128 };
3129
3130 static struct dpif_netdev_flow_dump *
3131 dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
3132 {
3133     return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
3134 }
3135
3136 static struct dpif_flow_dump *
3137 dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
3138                              char *type OVS_UNUSED)
3139 {
3140     struct dpif_netdev_flow_dump *dump;
3141
3142     dump = xzalloc(sizeof *dump);
3143     dpif_flow_dump_init(&dump->up, dpif_);
3144     dump->up.terse = terse;
3145     ovs_mutex_init(&dump->mutex);
3146
3147     return &dump->up;
3148 }
3149
3150 static int
3151 dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
3152 {
3153     struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3154
3155     ovs_mutex_destroy(&dump->mutex);
3156     free(dump);
3157     return 0;
3158 }
3159
3160 struct dpif_netdev_flow_dump_thread {
3161     struct dpif_flow_dump_thread up;
3162     struct dpif_netdev_flow_dump *dump;
3163     struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
3164     struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
3165 };
3166
3167 static struct dpif_netdev_flow_dump_thread *
3168 dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
3169 {
3170     return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
3171 }
3172
3173 static struct dpif_flow_dump_thread *
3174 dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
3175 {
3176     struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3177     struct dpif_netdev_flow_dump_thread *thread;
3178
3179     thread = xmalloc(sizeof *thread);
3180     dpif_flow_dump_thread_init(&thread->up, &dump->up);
3181     thread->dump = dump;
3182     return &thread->up;
3183 }
3184
3185 static void
3186 dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
3187 {
3188     struct dpif_netdev_flow_dump_thread *thread
3189         = dpif_netdev_flow_dump_thread_cast(thread_);
3190
3191     free(thread);
3192 }
3193
3194 static int
3195 dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
3196                            struct dpif_flow *flows, int max_flows)
3197 {
3198     struct dpif_netdev_flow_dump_thread *thread
3199         = dpif_netdev_flow_dump_thread_cast(thread_);
3200     struct dpif_netdev_flow_dump *dump = thread->dump;
3201     struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
3202     int n_flows = 0;
3203     int i;
3204
3205     ovs_mutex_lock(&dump->mutex);
3206     if (!dump->status) {
3207         struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
3208         struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
3209         struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
3210         int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
3211
3212         /* First call to dump_next(), extracts the first pmd thread.
3213          * If there is no pmd thread, returns immediately. */
3214         if (!pmd) {
3215             pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3216             if (!pmd) {
3217                 ovs_mutex_unlock(&dump->mutex);
3218                 return n_flows;
3219
3220             }
3221         }
3222
3223         do {
3224             for (n_flows = 0; n_flows < flow_limit; n_flows++) {
3225                 struct cmap_node *node;
3226
3227                 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
3228                 if (!node) {
3229                     break;
3230                 }
3231                 netdev_flows[n_flows] = CONTAINER_OF(node,
3232                                                      struct dp_netdev_flow,
3233                                                      node);
3234             }
3235             /* When finishing dumping the current pmd thread, moves to
3236              * the next. */
3237             if (n_flows < flow_limit) {
3238                 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
3239                 dp_netdev_pmd_unref(pmd);
3240                 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3241                 if (!pmd) {
3242                     dump->status = EOF;
3243                     break;
3244                 }
3245             }
3246             /* Keeps the reference to next caller. */
3247             dump->cur_pmd = pmd;
3248
3249             /* If the current dump is empty, do not exit the loop, since the
3250              * remaining pmds could have flows to be dumped.  Just dumps again
3251              * on the new 'pmd'. */
3252         } while (!n_flows);
3253     }
3254     ovs_mutex_unlock(&dump->mutex);
3255
3256     for (i = 0; i < n_flows; i++) {
3257         struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
3258         struct odputil_keybuf *keybuf = &thread->keybuf[i];
3259         struct dp_netdev_flow *netdev_flow = netdev_flows[i];
3260         struct dpif_flow *f = &flows[i];
3261         struct ofpbuf key, mask;
3262
3263         ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
3264         ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
3265         dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
3266                                     dump->up.terse);
3267     }
3268
3269     return n_flows;
3270 }
3271
3272 static int
3273 dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
3274     OVS_NO_THREAD_SAFETY_ANALYSIS
3275 {
3276     struct dp_netdev *dp = get_dp_netdev(dpif);
3277     struct dp_netdev_pmd_thread *pmd;
3278     struct dp_packet_batch pp;
3279
3280     if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
3281         dp_packet_size(execute->packet) > UINT16_MAX) {
3282         return EINVAL;
3283     }
3284
3285     /* Tries finding the 'pmd'.  If NULL is returned, that means
3286      * the current thread is a non-pmd thread and should use
3287      * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
3288     pmd = ovsthread_getspecific(dp->per_pmd_key);
3289     if (!pmd) {
3290         pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
3291         if (!pmd) {
3292             return EBUSY;
3293         }
3294     }
3295
3296     if (execute->probe) {
3297         /* If this is part of a probe, Drop the packet, since executing
3298          * the action may actually cause spurious packets be sent into
3299          * the network. */
3300         if (pmd->core_id == NON_PMD_CORE_ID) {
3301             dp_netdev_pmd_unref(pmd);
3302         }
3303         return 0;
3304     }
3305
3306     /* If the current thread is non-pmd thread, acquires
3307      * the 'non_pmd_mutex'. */
3308     if (pmd->core_id == NON_PMD_CORE_ID) {
3309         ovs_mutex_lock(&dp->non_pmd_mutex);
3310     }
3311
3312     /* Update current time in PMD context. */
3313     pmd_thread_ctx_time_update(pmd);
3314
3315     /* The action processing expects the RSS hash to be valid, because
3316      * it's always initialized at the beginning of datapath processing.
3317      * In this case, though, 'execute->packet' may not have gone through
3318      * the datapath at all, it may have been generated by the upper layer
3319      * (OpenFlow packet-out, BFD frame, ...). */
3320     if (!dp_packet_rss_valid(execute->packet)) {
3321         dp_packet_set_rss_hash(execute->packet,
3322                                flow_hash_5tuple(execute->flow, 0));
3323     }
3324
3325     dp_packet_batch_init_packet(&pp, execute->packet);
3326     dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
3327                               execute->actions, execute->actions_len);
3328     dp_netdev_pmd_flush_output_packets(pmd, true);
3329
3330     if (pmd->core_id == NON_PMD_CORE_ID) {
3331         ovs_mutex_unlock(&dp->non_pmd_mutex);
3332         dp_netdev_pmd_unref(pmd);
3333     }
3334
3335     return 0;
3336 }
3337
3338 static void
3339 dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
3340 {
3341     size_t i;
3342
3343     for (i = 0; i < n_ops; i++) {
3344         struct dpif_op *op = ops[i];
3345
3346         switch (op->type) {
3347         case DPIF_OP_FLOW_PUT:
3348             op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
3349             break;
3350
3351         case DPIF_OP_FLOW_DEL:
3352             op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
3353             break;
3354
3355         case DPIF_OP_EXECUTE:
3356             op->error = dpif_netdev_execute(dpif, &op->execute);
3357             break;
3358
3359         case DPIF_OP_FLOW_GET:
3360             op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
3361             break;
3362         }
3363     }
3364 }
3365
3366 /* Applies datapath configuration from the database. Some of the changes are
3367  * actually applied in dpif_netdev_run(). */
3368 static int
3369 dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
3370 {
3371     struct dp_netdev *dp = get_dp_netdev(dpif);
3372     const char *cmask = smap_get(other_config, "pmd-cpu-mask");
3373     unsigned long long insert_prob =
3374         smap_get_ullong(other_config, "emc-insert-inv-prob",
3375                         DEFAULT_EM_FLOW_INSERT_INV_PROB);
3376     uint32_t insert_min, cur_min;
3377     uint32_t tx_flush_interval, cur_tx_flush_interval;
3378
3379     tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
3380                                      DEFAULT_TX_FLUSH_INTERVAL);
3381     atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
3382     if (tx_flush_interval != cur_tx_flush_interval) {
3383         atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
3384         VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
3385                   tx_flush_interval);
3386     }
3387
3388     if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
3389         free(dp->pmd_cmask);
3390         dp->pmd_cmask = nullable_xstrdup(cmask);
3391         dp_netdev_request_reconfigure(dp);
3392     }
3393
3394     atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
3395     if (insert_prob <= UINT32_MAX) {
3396         insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
3397     } else {
3398         insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
3399         insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
3400     }
3401
3402     if (insert_min != cur_min) {
3403         atomic_store_relaxed(&dp->emc_insert_min, insert_min);
3404         if (insert_min == 0) {
3405             VLOG_INFO("EMC has been disabled");
3406         } else {
3407             VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
3408                       insert_prob, (100 / (float)insert_prob));
3409         }
3410     }
3411
3412     bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
3413     bool cur_perf_enabled;
3414     atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
3415     if (perf_enabled != cur_perf_enabled) {
3416         atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
3417         if (perf_enabled) {
3418             VLOG_INFO("PMD performance metrics collection enabled");
3419         } else {
3420             VLOG_INFO("PMD performance metrics collection disabled");
3421         }
3422     }
3423
3424     return 0;
3425 }
3426
3427 /* Parses affinity list and returns result in 'core_ids'. */
3428 static int
3429 parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
3430 {
3431     unsigned i;
3432     char *list, *copy, *key, *value;
3433     int error = 0;
3434
3435     for (i = 0; i < n_rxq; i++) {
3436         core_ids[i] = OVS_CORE_UNSPEC;
3437     }
3438
3439     if (!affinity_list) {
3440         return 0;
3441     }
3442
3443     list = copy = xstrdup(affinity_list);
3444
3445     while (ofputil_parse_key_value(&list, &key, &value)) {
3446         int rxq_id, core_id;
3447
3448         if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
3449             || !str_to_int(value, 0, &core_id) || core_id < 0) {
3450             error = EINVAL;
3451             break;
3452         }
3453
3454         if (rxq_id < n_rxq) {
3455             core_ids[rxq_id] = core_id;
3456         }
3457     }
3458
3459     free(copy);
3460     return error;
3461 }
3462
3463 /* Parses 'affinity_list' and applies configuration if it is valid. */
3464 static int
3465 dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
3466                                   const char *affinity_list)
3467 {
3468     unsigned *core_ids, i;
3469     int error = 0;
3470
3471     core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
3472     if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
3473         error = EINVAL;
3474         goto exit;
3475     }
3476
3477     for (i = 0; i < port->n_rxq; i++) {
3478         port->rxqs[i].core_id = core_ids[i];
3479     }
3480
3481 exit:
3482     free(core_ids);
3483     return error;
3484 }
3485
3486 /* Changes the affinity of port's rx queues.  The changes are actually applied
3487  * in dpif_netdev_run(). */
3488 static int
3489 dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
3490                             const struct smap *cfg)
3491 {
3492     struct dp_netdev *dp = get_dp_netdev(dpif);
3493     struct dp_netdev_port *port;
3494     int error = 0;
3495     const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
3496
3497     ovs_mutex_lock(&dp->port_mutex);
3498     error = get_port_by_number(dp, port_no, &port);
3499     if (error || !netdev_is_pmd(port->netdev)
3500         || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
3501         goto unlock;
3502     }
3503
3504     error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
3505     if (error) {
3506         goto unlock;
3507     }
3508     free(port->rxq_affinity_list);
3509     port->rxq_affinity_list = nullable_xstrdup(affinity_list);
3510
3511     dp_netdev_request_reconfigure(dp);
3512 unlock:
3513     ovs_mutex_unlock(&dp->port_mutex);
3514     return error;
3515 }
3516
3517 static int
3518 dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
3519                               uint32_t queue_id, uint32_t *priority)
3520 {
3521     *priority = queue_id;
3522     return 0;
3523 }
3524
3525 \f
3526 /* Creates and returns a new 'struct dp_netdev_actions', whose actions are
3527  * a copy of the 'size' bytes of 'actions' input parameters. */
3528 struct dp_netdev_actions *
3529 dp_netdev_actions_create(const struct nlattr *actions, size_t size)
3530 {
3531     struct dp_netdev_actions *netdev_actions;
3532
3533     netdev_actions = xmalloc(sizeof *netdev_actions + size);
3534     memcpy(netdev_actions->actions, actions, size);
3535     netdev_actions->size = size;
3536
3537     return netdev_actions;
3538 }
3539
3540 struct dp_netdev_actions *
3541 dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
3542 {
3543     return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
3544 }
3545
3546 static void
3547 dp_netdev_actions_free(struct dp_netdev_actions *actions)
3548 {
3549     free(actions);
3550 }
3551 \f
3552 static void
3553 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
3554                          enum rxq_cycles_counter_type type,
3555                          unsigned long long cycles)
3556 {
3557    atomic_store_relaxed(&rx->cycles[type], cycles);
3558 }
3559
3560 static void
3561 dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
3562                          enum rxq_cycles_counter_type type,
3563                          unsigned long long cycles)
3564 {
3565     non_atomic_ullong_add(&rx->cycles[type], cycles);
3566 }
3567
3568 static uint64_t
3569 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
3570                          enum rxq_cycles_counter_type type)
3571 {
3572     unsigned long long processing_cycles;
3573     atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
3574     return processing_cycles;
3575 }
3576
3577 static void
3578 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
3579                                 unsigned long long cycles)
3580 {
3581     unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
3582     atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
3583 }
3584
3585 static uint64_t
3586 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
3587 {
3588     unsigned long long processing_cycles;
3589     atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
3590     return processing_cycles;
3591 }
3592
3593 #if ATOMIC_ALWAYS_LOCK_FREE_8B
3594 static inline bool
3595 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
3596 {
3597     bool pmd_perf_enabled;
3598     atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
3599     return pmd_perf_enabled;
3600 }
3601 #else
3602 /* If stores and reads of 64-bit integers are not atomic, the full PMD
3603  * performance metrics are not available as locked access to 64 bit
3604  * integers would be prohibitively expensive. */
3605 static inline bool
3606 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
3607 {
3608     return false;
3609 }
3610 #endif
3611
3612 static int
3613 dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
3614                                    struct tx_port *p)
3615 {
3616     int i;
3617     int tx_qid;
3618     int output_cnt;
3619     bool dynamic_txqs;
3620     struct cycle_timer timer;
3621     uint64_t cycles;
3622     uint32_t tx_flush_interval;
3623
3624     cycle_timer_start(&pmd->perf_stats, &timer);
3625
3626     dynamic_txqs = p->port->dynamic_txqs;
3627     if (dynamic_txqs) {
3628         tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
3629     } else {
3630         tx_qid = pmd->static_tx_qid;
3631     }
3632
3633     output_cnt = dp_packet_batch_size(&p->output_pkts);
3634     ovs_assert(output_cnt > 0);
3635
3636     netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
3637     dp_packet_batch_init(&p->output_pkts);
3638
3639     /* Update time of the next flush. */
3640     atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
3641     p->flush_time = pmd->ctx.now + tx_flush_interval;
3642
3643     ovs_assert(pmd->n_output_batches > 0);
3644     pmd->n_output_batches--;
3645
3646     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
3647     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
3648
3649     /* Distribute send cycles evenly among transmitted packets and assign to
3650      * their respective rx queues. */
3651     cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
3652     for (i = 0; i < output_cnt; i++) {
3653         if (p->output_pkts_rxqs[i]) {
3654             dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
3655                                      RXQ_CYCLES_PROC_CURR, cycles);
3656         }
3657     }
3658
3659     return output_cnt;
3660 }
3661
3662 static int
3663 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
3664                                    bool force)
3665 {
3666     struct tx_port *p;
3667     int output_cnt = 0;
3668
3669     if (!pmd->n_output_batches) {
3670         return 0;
3671     }
3672
3673     HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
3674         if (!dp_packet_batch_is_empty(&p->output_pkts)
3675             && (force || pmd->ctx.now >= p->flush_time)) {
3676             output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
3677         }
3678     }
3679     return output_cnt;
3680 }
3681
3682 static int
3683 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
3684                            struct dp_netdev_rxq *rxq,
3685                            odp_port_t port_no)
3686 {
3687     struct pmd_perf_stats *s = &pmd->perf_stats;
3688     struct dp_packet_batch batch;
3689     struct cycle_timer timer;
3690     int error;
3691     int batch_cnt = 0;
3692     int rem_qlen = 0, *qlen_p = NULL;
3693     uint64_t cycles;
3694
3695     /* Measure duration for polling and processing rx burst. */
3696     cycle_timer_start(&pmd->perf_stats, &timer);
3697
3698     pmd->ctx.last_rxq = rxq;
3699     dp_packet_batch_init(&batch);
3700
3701     /* Fetch the rx queue length only for vhostuser ports. */
3702     if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
3703         qlen_p = &rem_qlen;
3704     }
3705
3706     error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
3707     if (!error) {
3708         /* At least one packet received. */
3709         *recirc_depth_get() = 0;
3710         pmd_thread_ctx_time_update(pmd);
3711         batch_cnt = batch.count;
3712         if (pmd_perf_metrics_enabled(pmd)) {
3713             /* Update batch histogram. */
3714             s->current.batches++;
3715             histogram_add_sample(&s->pkts_per_batch, batch_cnt);
3716             /* Update the maximum vhost rx queue fill level. */
3717             if (rxq->is_vhost && rem_qlen >= 0) {
3718                 uint32_t qfill = batch_cnt + rem_qlen;
3719                 if (qfill > s->current.max_vhost_qfill) {
3720                     s->current.max_vhost_qfill = qfill;
3721                 }
3722             }
3723         }
3724         /* Process packet batch. */
3725         dp_netdev_input(pmd, &batch, port_no);
3726
3727         /* Assign processing cycles to rx queue. */
3728         cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
3729         dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
3730
3731         dp_netdev_pmd_flush_output_packets(pmd, false);
3732     } else {
3733         /* Discard cycles. */
3734         cycle_timer_stop(&pmd->perf_stats, &timer);
3735         if (error != EAGAIN && error != EOPNOTSUPP) {
3736             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3737
3738             VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
3739                     netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
3740         }
3741     }
3742
3743     pmd->ctx.last_rxq = NULL;
3744
3745     return batch_cnt;
3746 }
3747
3748 static struct tx_port *
3749 tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
3750 {
3751     struct tx_port *tx;
3752
3753     HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
3754         if (tx->port->port_no == port_no) {
3755             return tx;
3756         }
3757     }
3758
3759     return NULL;
3760 }
3761
3762 static int
3763 port_reconfigure(struct dp_netdev_port *port)
3764 {
3765     struct netdev *netdev = port->netdev;
3766     int i, err;
3767
3768     /* Closes the existing 'rxq's. */
3769     for (i = 0; i < port->n_rxq; i++) {
3770         netdev_rxq_close(port->rxqs[i].rx);
3771         port->rxqs[i].rx = NULL;
3772     }
3773     unsigned last_nrxq = port->n_rxq;
3774     port->n_rxq = 0;
3775
3776     /* Allows 'netdev' to apply the pending configuration changes. */
3777     if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
3778         err = netdev_reconfigure(netdev);
3779         if (err && (err != EOPNOTSUPP)) {
3780             VLOG_ERR("Failed to set interface %s new configuration",
3781                      netdev_get_name(netdev));
3782             return err;
3783         }
3784     }
3785     /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
3786     port->rxqs = xrealloc(port->rxqs,
3787                           sizeof *port->rxqs * netdev_n_rxq(netdev));
3788     /* Realloc 'used' counters for tx queues. */
3789     free(port->txq_used);
3790     port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
3791
3792     for (i = 0; i < netdev_n_rxq(netdev); i++) {
3793         bool new_queue = i >= last_nrxq;
3794         if (new_queue) {
3795             memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
3796         }
3797
3798         port->rxqs[i].port = port;
3799         port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
3800
3801         err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
3802         if (err) {
3803             return err;
3804         }
3805         port->n_rxq++;
3806     }
3807
3808     /* Parse affinity list to apply configuration for new queues. */
3809     dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
3810
3811     /* If reconfiguration was successful mark it as such, so we can use it */
3812     port->need_reconfigure = false;
3813
3814     return 0;
3815 }
3816
3817 struct rr_numa_list {
3818     struct hmap numas;  /* Contains 'struct rr_numa' */
3819 };
3820
3821 struct rr_numa {
3822     struct hmap_node node;
3823
3824     int numa_id;
3825
3826     /* Non isolated pmds on numa node 'numa_id' */
3827     struct dp_netdev_pmd_thread **pmds;
3828     int n_pmds;
3829
3830     int cur_index;
3831     bool idx_inc;
3832 };
3833
3834 static struct rr_numa *
3835 rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
3836 {
3837     struct rr_numa *numa;
3838
3839     HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
3840         if (numa->numa_id == numa_id) {
3841             return numa;
3842         }
3843     }
3844
3845     return NULL;
3846 }
3847
3848 /* Returns the next node in numa list following 'numa' in round-robin fashion.
3849  * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
3850  * Returns NULL if 'rr' numa list is empty. */
3851 static struct rr_numa *
3852 rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
3853 {
3854     struct hmap_node *node = NULL;
3855
3856     if (numa) {
3857         node = hmap_next(&rr->numas, &numa->node);
3858     }
3859     if (!node) {
3860         node = hmap_first(&rr->numas);
3861     }
3862
3863     return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
3864 }
3865
3866 static void
3867 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
3868 {
3869     struct dp_netdev_pmd_thread *pmd;
3870     struct rr_numa *numa;
3871
3872     hmap_init(&rr->numas);
3873
3874     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3875         if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
3876             continue;
3877         }
3878
3879         numa = rr_numa_list_lookup(rr, pmd->numa_id);
3880         if (!numa) {
3881             numa = xzalloc(sizeof *numa);
3882             numa->numa_id = pmd->numa_id;
3883             hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
3884         }
3885         numa->n_pmds++;
3886         numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
3887         numa->pmds[numa->n_pmds - 1] = pmd;
3888         /* At least one pmd so initialise curr_idx and idx_inc. */
3889         numa->cur_index = 0;
3890         numa->idx_inc = true;
3891     }
3892 }
3893
3894 /* Returns the next pmd from the numa node in
3895  * incrementing or decrementing order. */
3896 static struct dp_netdev_pmd_thread *
3897 rr_numa_get_pmd(struct rr_numa *numa)
3898 {
3899     int numa_idx = numa->cur_index;
3900
3901     if (numa->idx_inc == true) {
3902         /* Incrementing through list of pmds. */
3903         if (numa->cur_index == numa->n_pmds-1) {
3904             /* Reached the last pmd. */
3905             numa->idx_inc = false;
3906         } else {
3907             numa->cur_index++;
3908         }
3909     } else {
3910         /* Decrementing through list of pmds. */
3911         if (numa->cur_index == 0) {
3912             /* Reached the first pmd. */
3913             numa->idx_inc = true;
3914         } else {
3915             numa->cur_index--;
3916         }
3917     }
3918     return numa->pmds[numa_idx];
3919 }
3920
3921 static void
3922 rr_numa_list_destroy(struct rr_numa_list *rr)
3923 {
3924     struct rr_numa *numa;
3925
3926     HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
3927         free(numa->pmds);
3928         free(numa);
3929     }
3930     hmap_destroy(&rr->numas);
3931 }
3932
3933 /* Sort Rx Queues by the processing cycles they are consuming. */
3934 static int
3935 compare_rxq_cycles(const void *a, const void *b)
3936 {
3937     struct dp_netdev_rxq *qa;
3938     struct dp_netdev_rxq *qb;
3939     uint64_t cycles_qa, cycles_qb;
3940
3941     qa = *(struct dp_netdev_rxq **) a;
3942     qb = *(struct dp_netdev_rxq **) b;
3943
3944     cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
3945     cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
3946
3947     if (cycles_qa != cycles_qb) {
3948         return (cycles_qa < cycles_qb) ? 1 : -1;
3949     } else {
3950         /* Cycles are the same so tiebreak on port/queue id.
3951          * Tiebreaking (as opposed to return 0) ensures consistent
3952          * sort results across multiple OS's. */
3953         uint32_t port_qa = odp_to_u32(qa->port->port_no);
3954         uint32_t port_qb = odp_to_u32(qb->port->port_no);
3955         if (port_qa != port_qb) {
3956             return port_qa > port_qb ? 1 : -1;
3957         } else {
3958             return netdev_rxq_get_queue_id(qa->rx)
3959                     - netdev_rxq_get_queue_id(qb->rx);
3960         }
3961     }
3962 }
3963
3964 /* Assign pmds to queues.  If 'pinned' is true, assign pmds to pinned
3965  * queues and marks the pmds as isolated.  Otherwise, assign non isolated
3966  * pmds to unpinned queues.
3967  *
3968  * If 'pinned' is false queues will be sorted by processing cycles they are
3969  * consuming and then assigned to pmds in round robin order.
3970  *
3971  * The function doesn't touch the pmd threads, it just stores the assignment
3972  * in the 'pmd' member of each rxq. */
3973 static void
3974 rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
3975 {
3976     struct dp_netdev_port *port;
3977     struct rr_numa_list rr;
3978     struct rr_numa *non_local_numa = NULL;
3979     struct dp_netdev_rxq ** rxqs = NULL;
3980     int n_rxqs = 0;
3981     struct rr_numa *numa = NULL;
3982     int numa_id;
3983
3984     HMAP_FOR_EACH (port, node, &dp->ports) {
3985         if (!netdev_is_pmd(port->netdev)) {
3986             continue;
3987         }
3988
3989         for (int qid = 0; qid < port->n_rxq; qid++) {
3990             struct dp_netdev_rxq *q = &port->rxqs[qid];
3991
3992             if (pinned && q->core_id != OVS_CORE_UNSPEC) {
3993                 struct dp_netdev_pmd_thread *pmd;
3994
3995                 pmd = dp_netdev_get_pmd(dp, q->core_id);
3996                 if (!pmd) {
3997                     VLOG_WARN("There is no PMD thread on core %d. Queue "
3998                               "%d on port \'%s\' will not be polled.",
3999                               q->core_id, qid, netdev_get_name(port->netdev));
4000                 } else {
4001                     q->pmd = pmd;
4002                     pmd->isolated = true;
4003                     dp_netdev_pmd_unref(pmd);
4004                 }
4005             } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
4006                 uint64_t cycle_hist = 0;
4007
4008                 if (n_rxqs == 0) {
4009                     rxqs = xmalloc(sizeof *rxqs);
4010                 } else {
4011                     rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
4012                 }
4013                 /* Sum the queue intervals and store the cycle history. */
4014                 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
4015                     cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
4016                 }
4017                 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST, cycle_hist);
4018
4019                 /* Store the queue. */
4020                 rxqs[n_rxqs++] = q;
4021             }
4022         }
4023     }
4024
4025     if (n_rxqs > 1) {
4026         /* Sort the queues in order of the processing cycles
4027          * they consumed during their last pmd interval. */
4028         qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
4029     }
4030
4031     rr_numa_list_populate(dp, &rr);
4032     /* Assign the sorted queues to pmds in round robin. */
4033     for (int i = 0; i < n_rxqs; i++) {
4034         numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
4035         numa = rr_numa_list_lookup(&rr, numa_id);
4036         if (!numa) {
4037             /* There are no pmds on the queue's local NUMA node.
4038                Round robin on the NUMA nodes that do have pmds. */
4039             non_local_numa = rr_numa_list_next(&rr, non_local_numa);
4040             if (!non_local_numa) {
4041                 VLOG_ERR("There is no available (non-isolated) pmd "
4042                          "thread for port \'%s\' queue %d. This queue "
4043                          "will not be polled. Is pmd-cpu-mask set to "
4044                          "zero? Or are all PMDs isolated to other "
4045                          "queues?", netdev_rxq_get_name(rxqs[i]->rx),
4046                          netdev_rxq_get_queue_id(rxqs[i]->rx));
4047                 continue;
4048             }
4049             rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa);
4050             VLOG_WARN("There's no available (non-isolated) pmd thread "
4051                       "on numa node %d. Queue %d on port \'%s\' will "
4052                       "be assigned to the pmd on core %d "
4053                       "(numa node %d). Expect reduced performance.",
4054                       numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
4055                       netdev_rxq_get_name(rxqs[i]->rx),
4056                       rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
4057         } else {
4058         rxqs[i]->pmd = rr_numa_get_pmd(numa);
4059         VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4060                   "rx queue %d (measured processing cycles %"PRIu64").",
4061                   rxqs[i]->pmd->core_id, numa_id,
4062                   netdev_rxq_get_name(rxqs[i]->rx),
4063                   netdev_rxq_get_queue_id(rxqs[i]->rx),
4064                   dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
4065         }
4066     }
4067
4068     rr_numa_list_destroy(&rr);
4069     free(rxqs);
4070 }
4071
4072 static void
4073 reload_affected_pmds(struct dp_netdev *dp)
4074 {
4075     struct dp_netdev_pmd_thread *pmd;
4076
4077     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4078         if (pmd->need_reload) {
4079             flow_mark_flush(pmd);
4080             dp_netdev_reload_pmd__(pmd);
4081             pmd->need_reload = false;
4082         }
4083     }
4084 }
4085
4086 static void
4087 reconfigure_pmd_threads(struct dp_netdev *dp)
4088     OVS_REQUIRES(dp->port_mutex)
4089 {
4090     struct dp_netdev_pmd_thread *pmd;
4091     struct ovs_numa_dump *pmd_cores;
4092     struct ovs_numa_info_core *core;
4093     struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
4094     struct hmapx_node *node;
4095     bool changed = false;
4096     bool need_to_adjust_static_tx_qids = false;
4097
4098     /* The pmd threads should be started only if there's a pmd port in the
4099      * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
4100      * NR_PMD_THREADS per numa node. */
4101     if (!has_pmd_port(dp)) {
4102         pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
4103     } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
4104         pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
4105     } else {
4106         pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
4107     }
4108
4109     /* We need to adjust 'static_tx_qid's only if we're reducing number of
4110      * PMD threads. Otherwise, new threads will allocate all the freed ids. */
4111     if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
4112         /* Adjustment is required to keep 'static_tx_qid's sequential and
4113          * avoid possible issues, for example, imbalanced tx queue usage
4114          * and unnecessary locking caused by remapping on netdev level. */
4115         need_to_adjust_static_tx_qids = true;
4116     }
4117
4118     /* Check for unwanted pmd threads */
4119     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4120         if (pmd->core_id == NON_PMD_CORE_ID) {
4121             continue;
4122         }
4123         if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
4124                                                     pmd->core_id)) {
4125             hmapx_add(&to_delete, pmd);
4126         } else if (need_to_adjust_static_tx_qids) {
4127             pmd->need_reload = true;
4128         }
4129     }
4130
4131     HMAPX_FOR_EACH (node, &to_delete) {
4132         pmd = (struct dp_netdev_pmd_thread *) node->data;
4133         VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
4134                   pmd->numa_id, pmd->core_id);
4135         dp_netdev_del_pmd(dp, pmd);
4136     }
4137     changed = !hmapx_is_empty(&to_delete);
4138     hmapx_destroy(&to_delete);
4139
4140     if (need_to_adjust_static_tx_qids) {
4141         /* 'static_tx_qid's are not sequential now.
4142          * Reload remaining threads to fix this. */
4143         reload_affected_pmds(dp);
4144     }
4145
4146     /* Check for required new pmd threads */
4147     FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
4148         pmd = dp_netdev_get_pmd(dp, core->core_id);
4149         if (!pmd) {
4150             pmd = xzalloc(sizeof *pmd);
4151             dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
4152             pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
4153             VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
4154                       pmd->numa_id, pmd->core_id);
4155             changed = true;
4156         } else {
4157             dp_netdev_pmd_unref(pmd);
4158         }
4159     }
4160
4161     if (changed) {
4162         struct ovs_numa_info_numa *numa;
4163
4164         /* Log the number of pmd threads per numa node. */
4165         FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
4166             VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
4167                       numa->n_cores, numa->numa_id);
4168         }
4169     }
4170
4171     ovs_numa_dump_destroy(pmd_cores);
4172 }
4173
4174 static void
4175 pmd_remove_stale_ports(struct dp_netdev *dp,
4176                        struct dp_netdev_pmd_thread *pmd)
4177     OVS_EXCLUDED(pmd->port_mutex)
4178     OVS_REQUIRES(dp->port_mutex)
4179 {
4180     struct rxq_poll *poll, *poll_next;
4181     struct tx_port *tx, *tx_next;
4182
4183     ovs_mutex_lock(&pmd->port_mutex);
4184     HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
4185         struct dp_netdev_port *port = poll->rxq->port;
4186
4187         if (port->need_reconfigure
4188             || !hmap_contains(&dp->ports, &port->node)) {
4189             dp_netdev_del_rxq_from_pmd(pmd, poll);
4190         }
4191     }
4192     HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
4193         struct dp_netdev_port *port = tx->port;
4194
4195         if (port->need_reconfigure
4196             || !hmap_contains(&dp->ports, &port->node)) {
4197             dp_netdev_del_port_tx_from_pmd(pmd, tx);
4198         }
4199     }
4200     ovs_mutex_unlock(&pmd->port_mutex);
4201 }
4202
4203 /* Must be called each time a port is added/removed or the cmask changes.
4204  * This creates and destroys pmd threads, reconfigures ports, opens their
4205  * rxqs and assigns all rxqs/txqs to pmd threads. */
4206 static void
4207 reconfigure_datapath(struct dp_netdev *dp)
4208     OVS_REQUIRES(dp->port_mutex)
4209 {
4210     struct dp_netdev_pmd_thread *pmd;
4211     struct dp_netdev_port *port;
4212     int wanted_txqs;
4213
4214     dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
4215
4216     /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
4217      * on the system and the user configuration. */
4218     reconfigure_pmd_threads(dp);
4219
4220     wanted_txqs = cmap_count(&dp->poll_threads);
4221
4222     /* The number of pmd threads might have changed, or a port can be new:
4223      * adjust the txqs. */
4224     HMAP_FOR_EACH (port, node, &dp->ports) {
4225         netdev_set_tx_multiq(port->netdev, wanted_txqs);
4226     }
4227
4228     /* Step 2: Remove from the pmd threads ports that have been removed or
4229      * need reconfiguration. */
4230
4231     /* Check for all the ports that need reconfiguration.  We cache this in
4232      * 'port->need_reconfigure', because netdev_is_reconf_required() can
4233      * change at any time. */
4234     HMAP_FOR_EACH (port, node, &dp->ports) {
4235         if (netdev_is_reconf_required(port->netdev)) {
4236             port->need_reconfigure = true;
4237         }
4238     }
4239
4240     /* Remove from the pmd threads all the ports that have been deleted or
4241      * need reconfiguration. */
4242     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4243         pmd_remove_stale_ports(dp, pmd);
4244     }
4245
4246     /* Reload affected pmd threads.  We must wait for the pmd threads before
4247      * reconfiguring the ports, because a port cannot be reconfigured while
4248      * it's being used. */
4249     reload_affected_pmds(dp);
4250
4251     /* Step 3: Reconfigure ports. */
4252
4253     /* We only reconfigure the ports that we determined above, because they're
4254      * not being used by any pmd thread at the moment.  If a port fails to
4255      * reconfigure we remove it from the datapath. */
4256     struct dp_netdev_port *next_port;
4257     HMAP_FOR_EACH_SAFE (port, next_port, node, &dp->ports) {
4258         int err;
4259
4260         if (!port->need_reconfigure) {
4261             continue;
4262         }
4263
4264         err = port_reconfigure(port);
4265         if (err) {
4266             hmap_remove(&dp->ports, &port->node);
4267             seq_change(dp->port_seq);
4268             port_destroy(port);
4269         } else {
4270             port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
4271         }
4272     }
4273
4274     /* Step 4: Compute new rxq scheduling.  We don't touch the pmd threads
4275      * for now, we just update the 'pmd' pointer in each rxq to point to the
4276      * wanted thread according to the scheduling policy. */
4277
4278     /* Reset all the pmd threads to non isolated. */
4279     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4280         pmd->isolated = false;
4281     }
4282
4283     /* Reset all the queues to unassigned */
4284     HMAP_FOR_EACH (port, node, &dp->ports) {
4285         for (int i = 0; i < port->n_rxq; i++) {
4286             port->rxqs[i].pmd = NULL;
4287         }
4288     }
4289
4290     /* Add pinned queues and mark pmd threads isolated. */
4291     rxq_scheduling(dp, true);
4292
4293     /* Add non-pinned queues. */
4294     rxq_scheduling(dp, false);
4295
4296     /* Step 5: Remove queues not compliant with new scheduling. */
4297     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4298         struct rxq_poll *poll, *poll_next;
4299
4300         ovs_mutex_lock(&pmd->port_mutex);
4301         HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
4302             if (poll->rxq->pmd != pmd) {
4303                 dp_netdev_del_rxq_from_pmd(pmd, poll);
4304             }
4305         }
4306         ovs_mutex_unlock(&pmd->port_mutex);
4307     }
4308
4309     /* Reload affected pmd threads.  We must wait for the pmd threads to remove
4310      * the old queues before readding them, otherwise a queue can be polled by
4311      * two threads at the same time. */
4312     reload_affected_pmds(dp);
4313
4314     /* Step 6: Add queues from scheduling, if they're not there already. */
4315     HMAP_FOR_EACH (port, node, &dp->ports) {
4316         if (!netdev_is_pmd(port->netdev)) {
4317             continue;
4318         }
4319
4320         for (int qid = 0; qid < port->n_rxq; qid++) {
4321             struct dp_netdev_rxq *q = &port->rxqs[qid];
4322
4323             if (q->pmd) {
4324                 ovs_mutex_lock(&q->pmd->port_mutex);
4325                 dp_netdev_add_rxq_to_pmd(q->pmd, q);
4326                 ovs_mutex_unlock(&q->pmd->port_mutex);
4327             }
4328         }
4329     }
4330
4331     /* Add every port to the tx cache of every pmd thread, if it's not
4332      * there already and if this pmd has at least one rxq to poll. */
4333     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4334         ovs_mutex_lock(&pmd->port_mutex);
4335         if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
4336             HMAP_FOR_EACH (port, node, &dp->ports) {
4337                 dp_netdev_add_port_tx_to_pmd(pmd, port);
4338             }
4339         }
4340         ovs_mutex_unlock(&pmd->port_mutex);
4341     }
4342
4343     /* Reload affected pmd threads. */
4344     reload_affected_pmds(dp);
4345 }
4346
4347 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
4348 static bool
4349 ports_require_restart(const struct dp_netdev *dp)
4350     OVS_REQUIRES(dp->port_mutex)
4351 {
4352     struct dp_netdev_port *port;
4353
4354     HMAP_FOR_EACH (port, node, &dp->ports) {
4355         if (netdev_is_reconf_required(port->netdev)) {
4356             return true;
4357         }
4358     }
4359
4360     return false;
4361 }
4362
4363 /* Return true if needs to revalidate datapath flows. */
4364 static bool
4365 dpif_netdev_run(struct dpif *dpif)
4366 {
4367     struct dp_netdev_port *port;
4368     struct dp_netdev *dp = get_dp_netdev(dpif);
4369     struct dp_netdev_pmd_thread *non_pmd;
4370     uint64_t new_tnl_seq;
4371     bool need_to_flush = true;
4372
4373     ovs_mutex_lock(&dp->port_mutex);
4374     non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
4375     if (non_pmd) {
4376         ovs_mutex_lock(&dp->non_pmd_mutex);
4377         HMAP_FOR_EACH (port, node, &dp->ports) {
4378             if (!netdev_is_pmd(port->netdev)) {
4379                 int i;
4380
4381                 for (i = 0; i < port->n_rxq; i++) {
4382                     if (dp_netdev_process_rxq_port(non_pmd,
4383                                                    &port->rxqs[i],
4384                                                    port->port_no)) {
4385                         need_to_flush = false;
4386                     }
4387                 }
4388             }
4389         }
4390         if (need_to_flush) {
4391             /* We didn't receive anything in the process loop.
4392              * Check if we need to send something.
4393              * There was no time updates on current iteration. */
4394             pmd_thread_ctx_time_update(non_pmd);
4395             dp_netdev_pmd_flush_output_packets(non_pmd, false);
4396         }
4397
4398         dpif_netdev_xps_revalidate_pmd(non_pmd, false);
4399         ovs_mutex_unlock(&dp->non_pmd_mutex);
4400
4401         dp_netdev_pmd_unref(non_pmd);
4402     }
4403
4404     if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
4405         reconfigure_datapath(dp);
4406     }
4407     ovs_mutex_unlock(&dp->port_mutex);
4408
4409     tnl_neigh_cache_run();
4410     tnl_port_map_run();
4411     new_tnl_seq = seq_read(tnl_conf_seq);
4412
4413     if (dp->last_tnl_conf_seq != new_tnl_seq) {
4414         dp->last_tnl_conf_seq = new_tnl_seq;
4415         return true;
4416     }
4417     return false;
4418 }
4419
4420 static void
4421 dpif_netdev_wait(struct dpif *dpif)
4422 {
4423     struct dp_netdev_port *port;
4424     struct dp_netdev *dp = get_dp_netdev(dpif);
4425
4426     ovs_mutex_lock(&dp_netdev_mutex);
4427     ovs_mutex_lock(&dp->port_mutex);
4428     HMAP_FOR_EACH (port, node, &dp->ports) {
4429         netdev_wait_reconf_required(port->netdev);
4430         if (!netdev_is_pmd(port->netdev)) {
4431             int i;
4432
4433             for (i = 0; i < port->n_rxq; i++) {
4434                 netdev_rxq_wait(port->rxqs[i].rx);
4435             }
4436         }
4437     }
4438     ovs_mutex_unlock(&dp->port_mutex);
4439     ovs_mutex_unlock(&dp_netdev_mutex);
4440     seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
4441 }
4442
4443 static void
4444 pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
4445 {
4446     struct tx_port *tx_port_cached;
4447
4448     /* Flush all the queued packets. */
4449     dp_netdev_pmd_flush_output_packets(pmd, true);
4450     /* Free all used tx queue ids. */
4451     dpif_netdev_xps_revalidate_pmd(pmd, true);
4452
4453     HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
4454         free(tx_port_cached);
4455     }
4456     HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
4457         free(tx_port_cached);
4458     }
4459 }
4460
4461 /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
4462  * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
4463  * device, otherwise to 'pmd->send_port_cache' if the port has at least
4464  * one txq. */
4465 static void
4466 pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
4467     OVS_REQUIRES(pmd->port_mutex)
4468 {
4469     struct tx_port *tx_port, *tx_port_cached;
4470
4471     pmd_free_cached_ports(pmd);
4472     hmap_shrink(&pmd->send_port_cache);
4473     hmap_shrink(&pmd->tnl_port_cache);
4474
4475     HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
4476         if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
4477             tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
4478             hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
4479                         hash_port_no(tx_port_cached->port->port_no));
4480         }
4481
4482         if (netdev_n_txq(tx_port->port->netdev)) {
4483             tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
4484             hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
4485                         hash_port_no(tx_port_cached->port->port_no));
4486         }
4487     }
4488 }
4489
4490 static void
4491 pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
4492 {
4493     ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
4494     if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
4495         VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
4496                    ", numa_id %d.", pmd->core_id, pmd->numa_id);
4497     }
4498     ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
4499
4500     VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
4501              ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
4502 }
4503
4504 static void
4505 pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
4506 {
4507     ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
4508     id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
4509     ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
4510 }
4511
4512 static int
4513 pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
4514                           struct polled_queue **ppoll_list)
4515 {
4516     struct polled_queue *poll_list = *ppoll_list;
4517     struct rxq_poll *poll;
4518     int i;
4519
4520     ovs_mutex_lock(&pmd->port_mutex);
4521     poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
4522                                     * sizeof *poll_list);
4523
4524     i = 0;
4525     HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4526         poll_list[i].rxq = poll->rxq;
4527         poll_list[i].port_no = poll->rxq->port->port_no;
4528         i++;
4529     }
4530
4531     pmd_load_cached_ports(pmd);
4532
4533     ovs_mutex_unlock(&pmd->port_mutex);
4534
4535     *ppoll_list = poll_list;
4536     return i;
4537 }
4538
4539 static void *
4540 pmd_thread_main(void *f_)
4541 {
4542     struct dp_netdev_pmd_thread *pmd = f_;
4543     struct pmd_perf_stats *s = &pmd->perf_stats;
4544     unsigned int lc = 0;
4545     struct polled_queue *poll_list;
4546     bool exiting;
4547     int poll_cnt;
4548     int i;
4549     int process_packets = 0;
4550
4551     poll_list = NULL;
4552
4553     /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
4554     ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
4555     ovs_numa_thread_setaffinity_core(pmd->core_id);
4556     dpdk_set_lcore_id(pmd->core_id);
4557     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
4558     emc_cache_init(&pmd->flow_cache);
4559 reload:
4560     pmd_alloc_static_tx_qid(pmd);
4561
4562     /* List port/core affinity */
4563     for (i = 0; i < poll_cnt; i++) {
4564        VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
4565                 pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
4566                 netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
4567        /* Reset the rxq current cycles counter. */
4568        dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
4569     }
4570
4571     if (!poll_cnt) {
4572         while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
4573             seq_wait(pmd->reload_seq, pmd->last_reload_seq);
4574             poll_block();
4575         }
4576         lc = UINT_MAX;
4577     }
4578
4579     pmd->intrvl_tsc_prev = 0;
4580     atomic_store_relaxed(&pmd->intrvl_cycles, 0);
4581     cycles_counter_update(s);
4582     /* Protect pmd stats from external clearing while polling. */
4583     ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
4584     for (;;) {
4585         uint64_t rx_packets = 0, tx_packets = 0;
4586
4587         pmd_perf_start_iteration(s);
4588
4589         for (i = 0; i < poll_cnt; i++) {
4590             process_packets =
4591                 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
4592                                            poll_list[i].port_no);
4593             rx_packets += process_packets;
4594         }
4595
4596         if (!rx_packets) {
4597             /* We didn't receive anything in the process loop.
4598              * Check if we need to send something.
4599              * There was no time updates on current iteration. */
4600             pmd_thread_ctx_time_update(pmd);
4601             tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
4602         }
4603
4604         if (lc++ > 1024) {
4605             bool reload;
4606
4607             lc = 0;
4608
4609             coverage_try_clear();
4610             dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
4611             if (!ovsrcu_try_quiesce()) {
4612                 emc_cache_slow_sweep(&pmd->flow_cache);
4613             }
4614
4615             atomic_read_relaxed(&pmd->reload, &reload);
4616             if (reload) {
4617                 break;
4618             }
4619         }
4620         pmd_perf_end_iteration(s, rx_packets, tx_packets,
4621                                pmd_perf_metrics_enabled(pmd));
4622     }
4623     ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
4624
4625     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
4626     exiting = latch_is_set(&pmd->exit_latch);
4627     /* Signal here to make sure the pmd finishes
4628      * reloading the updated configuration. */
4629     dp_netdev_pmd_reload_done(pmd);
4630
4631     pmd_free_static_tx_qid(pmd);
4632
4633     if (!exiting) {
4634         goto reload;
4635     }
4636
4637     emc_cache_uninit(&pmd->flow_cache);
4638     free(poll_list);
4639     pmd_free_cached_ports(pmd);
4640     return NULL;
4641 }
4642
4643 static void
4644 dp_netdev_disable_upcall(struct dp_netdev *dp)
4645     OVS_ACQUIRES(dp->upcall_rwlock)
4646 {
4647     fat_rwlock_wrlock(&dp->upcall_rwlock);
4648 }
4649
4650 \f
4651 /* Meters */
4652 static void
4653 dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
4654                                struct ofputil_meter_features *features)
4655 {
4656     features->max_meters = MAX_METERS;
4657     features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
4658     features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
4659     features->max_bands = MAX_BANDS;
4660     features->max_color = 0;
4661 }
4662
4663 /* Returns false when packet needs to be dropped. */
4664 static void
4665 dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
4666                     uint32_t meter_id, long long int now)
4667 {
4668     struct dp_meter *meter;
4669     struct dp_meter_band *band;
4670     struct dp_packet *packet;
4671     long long int long_delta_t; /* msec */
4672     uint32_t delta_t; /* msec */
4673     const size_t cnt = dp_packet_batch_size(packets_);
4674     uint32_t bytes, volume;
4675     int exceeded_band[NETDEV_MAX_BURST];
4676     uint32_t exceeded_rate[NETDEV_MAX_BURST];
4677     int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
4678
4679     if (meter_id >= MAX_METERS) {
4680         return;
4681     }
4682
4683     meter_lock(dp, meter_id);
4684     meter = dp->meters[meter_id];
4685     if (!meter) {
4686         goto out;
4687     }
4688
4689     /* Initialize as negative values. */
4690     memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
4691     /* Initialize as zeroes. */
4692     memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
4693
4694     /* All packets will hit the meter at the same time. */
4695     long_delta_t = (now - meter->used) / 1000; /* msec */
4696
4697     /* Make sure delta_t will not be too large, so that bucket will not
4698      * wrap around below. */
4699     delta_t = (long_delta_t > (long long int)meter->max_delta_t)
4700         ? meter->max_delta_t : (uint32_t)long_delta_t;
4701
4702     /* Update meter stats. */
4703     meter->used = now;
4704     meter->packet_count += cnt;
4705     bytes = 0;
4706     DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
4707         bytes += dp_packet_size(packet);
4708     }
4709     meter->byte_count += bytes;
4710
4711     /* Meters can operate in terms of packets per second or kilobits per
4712      * second. */
4713     if (meter->flags & OFPMF13_PKTPS) {
4714         /* Rate in packets/second, bucket 1/1000 packets. */
4715         /* msec * packets/sec = 1/1000 packets. */
4716         volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
4717     } else {
4718         /* Rate in kbps, bucket in bits. */
4719         /* msec * kbps = bits */
4720         volume = bytes * 8;
4721     }
4722
4723     /* Update all bands and find the one hit with the highest rate for each
4724      * packet (if any). */
4725     for (int m = 0; m < meter->n_bands; ++m) {
4726         band = &meter->bands[m];
4727
4728         /* Update band's bucket. */
4729         band->bucket += delta_t * band->up.rate;
4730         if (band->bucket > band->up.burst_size) {
4731             band->bucket = band->up.burst_size;
4732         }
4733
4734         /* Drain the bucket for all the packets, if possible. */
4735         if (band->bucket >= volume) {
4736             band->bucket -= volume;
4737         } else {
4738             int band_exceeded_pkt;
4739
4740             /* Band limit hit, must process packet-by-packet. */
4741             if (meter->flags & OFPMF13_PKTPS) {
4742                 band_exceeded_pkt = band->bucket / 1000;
4743                 band->bucket %= 1000; /* Remainder stays in bucket. */
4744
4745                 /* Update the exceeding band for each exceeding packet.
4746                  * (Only one band will be fired by a packet, and that
4747                  * can be different for each packet.) */
4748                 for (int i = band_exceeded_pkt; i < cnt; i++) {
4749                     if (band->up.rate > exceeded_rate[i]) {
4750                         exceeded_rate[i] = band->up.rate;
4751                         exceeded_band[i] = m;
4752                     }
4753                 }
4754             } else {
4755                 /* Packet sizes differ, must process one-by-one. */
4756                 band_exceeded_pkt = cnt;
4757                 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
4758                     uint32_t bits = dp_packet_size(packet) * 8;
4759
4760                     if (band->bucket >= bits) {
4761                         band->bucket -= bits;
4762                     } else {
4763                         if (i < band_exceeded_pkt) {
4764                             band_exceeded_pkt = i;
4765                         }
4766                         /* Update the exceeding band for the exceeding packet.
4767                          * (Only one band will be fired by a packet, and that
4768                          * can be different for each packet.) */
4769                         if (band->up.rate > exceeded_rate[i]) {
4770                             exceeded_rate[i] = band->up.rate;
4771                             exceeded_band[i] = m;
4772                         }
4773                     }
4774                 }
4775             }
4776             /* Remember the first exceeding packet. */
4777             if (exceeded_pkt > band_exceeded_pkt) {
4778                 exceeded_pkt = band_exceeded_pkt;
4779             }
4780         }
4781     }
4782
4783     /* Fire the highest rate band exceeded by each packet.
4784      * Drop packets if needed, by swapping packet to the end that will be
4785      * ignored. */
4786     size_t j;
4787     DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
4788         if (exceeded_band[j] >= 0) {
4789             /* Meter drop packet. */
4790             band = &meter->bands[exceeded_band[j]];
4791             band->packet_count += 1;
4792             band->byte_count += dp_packet_size(packet);
4793
4794             dp_packet_delete(packet);
4795         } else {
4796             /* Meter accepts packet. */
4797             dp_packet_batch_refill(packets_, packet, j);
4798         }
4799     }
4800  out:
4801     meter_unlock(dp, meter_id);
4802 }
4803
4804 /* Meter set/get/del processing is still single-threaded. */
4805 static int
4806 dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id *meter_id,
4807                       struct ofputil_meter_config *config)
4808 {
4809     struct dp_netdev *dp = get_dp_netdev(dpif);
4810     uint32_t mid = meter_id->uint32;
4811     struct dp_meter *meter;
4812     int i;
4813
4814     if (mid >= MAX_METERS) {
4815         return EFBIG; /* Meter_id out of range. */
4816     }
4817
4818     if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK ||
4819         !(config->flags & (OFPMF13_KBPS | OFPMF13_PKTPS))) {
4820         return EBADF; /* Unsupported flags set */
4821     }
4822
4823     /* Validate bands */
4824     if (config->n_bands == 0 || config->n_bands > MAX_BANDS) {
4825         return EINVAL; /* Too many bands */
4826     }
4827
4828     /* Validate rates */
4829     for (i = 0; i < config->n_bands; i++) {
4830         if (config->bands[i].rate == 0) {
4831             return EDOM; /* rate must be non-zero */
4832         }
4833     }
4834
4835     for (i = 0; i < config->n_bands; ++i) {
4836         switch (config->bands[i].type) {
4837         case OFPMBT13_DROP:
4838             break;
4839         default:
4840             return ENODEV; /* Unsupported band type */
4841         }
4842     }
4843
4844     /* Allocate meter */
4845     meter = xzalloc(sizeof *meter
4846                     + config->n_bands * sizeof(struct dp_meter_band));
4847     if (meter) {
4848         meter->flags = config->flags;
4849         meter->n_bands = config->n_bands;
4850         meter->max_delta_t = 0;
4851         meter->used = time_usec();
4852
4853         /* set up bands */
4854         for (i = 0; i < config->n_bands; ++i) {
4855             uint32_t band_max_delta_t;
4856
4857             /* Set burst size to a workable value if none specified. */
4858             if (config->bands[i].burst_size == 0) {
4859                 config->bands[i].burst_size = config->bands[i].rate;
4860             }
4861
4862             meter->bands[i].up = config->bands[i];
4863             /* Convert burst size to the bucket units: */
4864             /* pkts => 1/1000 packets, kilobits => bits. */
4865             meter->bands[i].up.burst_size *= 1000;
4866             /* Initialize bucket to empty. */
4867             meter->bands[i].bucket = 0;
4868
4869             /* Figure out max delta_t that is enough to fill any bucket. */
4870             band_max_delta_t
4871                 = meter->bands[i].up.burst_size / meter->bands[i].up.rate;
4872             if (band_max_delta_t > meter->max_delta_t) {
4873                 meter->max_delta_t = band_max_delta_t;
4874             }
4875         }
4876
4877         meter_lock(dp, mid);
4878         dp_delete_meter(dp, mid); /* Free existing meter, if any */
4879         dp->meters[mid] = meter;
4880         meter_unlock(dp, mid);
4881
4882         return 0;
4883     }
4884     return ENOMEM;
4885 }
4886
4887 static int
4888 dpif_netdev_meter_get(const struct dpif *dpif,
4889                       ofproto_meter_id meter_id_,
4890                       struct ofputil_meter_stats *stats, uint16_t n_bands)
4891 {
4892     const struct dp_netdev *dp = get_dp_netdev(dpif);
4893     const struct dp_meter *meter;
4894     uint32_t meter_id = meter_id_.uint32;
4895
4896     if (meter_id >= MAX_METERS) {
4897         return EFBIG;
4898     }
4899     meter = dp->meters[meter_id];
4900     if (!meter) {
4901         return ENOENT;
4902     }
4903     if (stats) {
4904         int i = 0;
4905
4906         meter_lock(dp, meter_id);
4907         stats->packet_in_count = meter->packet_count;
4908         stats->byte_in_count = meter->byte_count;
4909
4910         for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
4911             stats->bands[i].packet_count = meter->bands[i].packet_count;
4912             stats->bands[i].byte_count = meter->bands[i].byte_count;
4913         }
4914         meter_unlock(dp, meter_id);
4915
4916         stats->n_bands = i;
4917     }
4918     return 0;
4919 }
4920
4921 static int
4922 dpif_netdev_meter_del(struct dpif *dpif,
4923                       ofproto_meter_id meter_id_,
4924                       struct ofputil_meter_stats *stats, uint16_t n_bands)
4925 {
4926     struct dp_netdev *dp = get_dp_netdev(dpif);
4927     int error;
4928
4929     error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
4930     if (!error) {
4931         uint32_t meter_id = meter_id_.uint32;
4932
4933         meter_lock(dp, meter_id);
4934         dp_delete_meter(dp, meter_id);
4935         meter_unlock(dp, meter_id);
4936     }
4937     return error;
4938 }
4939
4940 \f
4941 static void
4942 dpif_netdev_disable_upcall(struct dpif *dpif)
4943     OVS_NO_THREAD_SAFETY_ANALYSIS
4944 {
4945     struct dp_netdev *dp = get_dp_netdev(dpif);
4946     dp_netdev_disable_upcall(dp);
4947 }
4948
4949 static void
4950 dp_netdev_enable_upcall(struct dp_netdev *dp)
4951     OVS_RELEASES(dp->upcall_rwlock)
4952 {
4953     fat_rwlock_unlock(&dp->upcall_rwlock);
4954 }
4955
4956 static void
4957 dpif_netdev_enable_upcall(struct dpif *dpif)
4958     OVS_NO_THREAD_SAFETY_ANALYSIS
4959 {
4960     struct dp_netdev *dp = get_dp_netdev(dpif);
4961     dp_netdev_enable_upcall(dp);
4962 }
4963
4964 static void
4965 dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
4966 {
4967     ovs_mutex_lock(&pmd->cond_mutex);
4968     atomic_store_relaxed(&pmd->reload, false);
4969     pmd->last_reload_seq = seq_read(pmd->reload_seq);
4970     xpthread_cond_signal(&pmd->cond);
4971     ovs_mutex_unlock(&pmd->cond_mutex);
4972 }
4973
4974 /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'.  Returns
4975  * the pointer if succeeds, otherwise, NULL (it can return NULL even if
4976  * 'core_id' is NON_PMD_CORE_ID).
4977  *
4978  * Caller must unrefs the returned reference.  */
4979 static struct dp_netdev_pmd_thread *
4980 dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
4981 {
4982     struct dp_netdev_pmd_thread *pmd;
4983     const struct cmap_node *pnode;
4984
4985     pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
4986     if (!pnode) {
4987         return NULL;
4988     }
4989     pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
4990
4991     return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
4992 }
4993
4994 /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
4995 static void
4996 dp_netdev_set_nonpmd(struct dp_netdev *dp)
4997     OVS_REQUIRES(dp->port_mutex)
4998 {
4999     struct dp_netdev_pmd_thread *non_pmd;
5000
5001     non_pmd = xzalloc(sizeof *non_pmd);
5002     dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
5003 }
5004
5005 /* Caller must have valid pointer to 'pmd'. */
5006 static bool
5007 dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
5008 {
5009     return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
5010 }
5011
5012 static void
5013 dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
5014 {
5015     if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
5016         ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
5017     }
5018 }
5019
5020 /* Given cmap position 'pos', tries to ref the next node.  If try_ref()
5021  * fails, keeps checking for next node until reaching the end of cmap.
5022  *
5023  * Caller must unrefs the returned reference. */
5024 static struct dp_netdev_pmd_thread *
5025 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
5026 {
5027     struct dp_netdev_pmd_thread *next;
5028
5029     do {
5030         struct cmap_node *node;
5031
5032         node = cmap_next_position(&dp->poll_threads, pos);
5033         next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
5034             : NULL;
5035     } while (next && !dp_netdev_pmd_try_ref(next));
5036
5037     return next;
5038 }
5039
5040 /* Configures the 'pmd' based on the input argument. */
5041 static void
5042 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
5043                         unsigned core_id, int numa_id)
5044 {
5045     pmd->dp = dp;
5046     pmd->core_id = core_id;
5047     pmd->numa_id = numa_id;
5048     pmd->need_reload = false;
5049     pmd->n_output_batches = 0;
5050
5051     ovs_refcount_init(&pmd->ref_cnt);
5052     latch_init(&pmd->exit_latch);
5053     pmd->reload_seq = seq_create();
5054     pmd->last_reload_seq = seq_read(pmd->reload_seq);
5055     atomic_init(&pmd->reload, false);
5056     xpthread_cond_init(&pmd->cond, NULL);
5057     ovs_mutex_init(&pmd->cond_mutex);
5058     ovs_mutex_init(&pmd->flow_mutex);
5059     ovs_mutex_init(&pmd->port_mutex);
5060     cmap_init(&pmd->flow_table);
5061     cmap_init(&pmd->classifiers);
5062     pmd->ctx.last_rxq = NULL;
5063     pmd_thread_ctx_time_update(pmd);
5064     pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
5065     pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
5066     hmap_init(&pmd->poll_list);
5067     hmap_init(&pmd->tx_ports);
5068     hmap_init(&pmd->tnl_port_cache);
5069     hmap_init(&pmd->send_port_cache);
5070     /* init the 'flow_cache' since there is no
5071      * actual thread created for NON_PMD_CORE_ID. */
5072     if (core_id == NON_PMD_CORE_ID) {
5073         emc_cache_init(&pmd->flow_cache);
5074         pmd_alloc_static_tx_qid(pmd);
5075     }
5076     pmd_perf_stats_init(&pmd->perf_stats);
5077     cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
5078                 hash_int(core_id, 0));
5079 }
5080
5081 static void
5082 dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
5083 {
5084     struct dpcls *cls;
5085
5086     dp_netdev_pmd_flow_flush(pmd);
5087     hmap_destroy(&pmd->send_port_cache);
5088     hmap_destroy(&pmd->tnl_port_cache);
5089     hmap_destroy(&pmd->tx_ports);
5090     hmap_destroy(&pmd->poll_list);
5091     /* All flows (including their dpcls_rules) have been deleted already */
5092     CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
5093         dpcls_destroy(cls);
5094         ovsrcu_postpone(free, cls);
5095     }
5096     cmap_destroy(&pmd->classifiers);
5097     cmap_destroy(&pmd->flow_table);
5098     ovs_mutex_destroy(&pmd->flow_mutex);
5099     latch_destroy(&pmd->exit_latch);
5100     seq_destroy(pmd->reload_seq);
5101     xpthread_cond_destroy(&pmd->cond);
5102     ovs_mutex_destroy(&pmd->cond_mutex);
5103     ovs_mutex_destroy(&pmd->port_mutex);
5104     free(pmd);
5105 }
5106
5107 /* Stops the pmd thread, removes it from the 'dp->poll_threads',
5108  * and unrefs the struct. */
5109 static void
5110 dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
5111 {
5112     /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
5113      * but extra cleanup is necessary */
5114     if (pmd->core_id == NON_PMD_CORE_ID) {
5115         ovs_mutex_lock(&dp->non_pmd_mutex);
5116         emc_cache_uninit(&pmd->flow_cache);
5117         pmd_free_cached_ports(pmd);
5118         pmd_free_static_tx_qid(pmd);
5119         ovs_mutex_unlock(&dp->non_pmd_mutex);
5120     } else {
5121         latch_set(&pmd->exit_latch);
5122         dp_netdev_reload_pmd__(pmd);
5123         xpthread_join(pmd->thread, NULL);
5124     }
5125
5126     dp_netdev_pmd_clear_ports(pmd);
5127
5128     /* Purges the 'pmd''s flows after stopping the thread, but before
5129      * destroying the flows, so that the flow stats can be collected. */
5130     if (dp->dp_purge_cb) {
5131         dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
5132     }
5133     cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
5134     dp_netdev_pmd_unref(pmd);
5135 }
5136
5137 /* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
5138  * thread. */
5139 static void
5140 dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
5141 {
5142     struct dp_netdev_pmd_thread *pmd;
5143     struct dp_netdev_pmd_thread **pmd_list;
5144     size_t k = 0, n_pmds;
5145
5146     n_pmds = cmap_count(&dp->poll_threads);
5147     pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
5148
5149     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5150         if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
5151             continue;
5152         }
5153         /* We cannot call dp_netdev_del_pmd(), since it alters
5154          * 'dp->poll_threads' (while we're iterating it) and it
5155          * might quiesce. */
5156         ovs_assert(k < n_pmds);
5157         pmd_list[k++] = pmd;
5158     }
5159
5160     for (size_t i = 0; i < k; i++) {
5161         dp_netdev_del_pmd(dp, pmd_list[i]);
5162     }
5163     free(pmd_list);
5164 }
5165
5166 /* Deletes all rx queues from pmd->poll_list and all the ports from
5167  * pmd->tx_ports. */
5168 static void
5169 dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
5170 {
5171     struct rxq_poll *poll;
5172     struct tx_port *port;
5173
5174     ovs_mutex_lock(&pmd->port_mutex);
5175     HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
5176         free(poll);
5177     }
5178     HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
5179         free(port);
5180     }
5181     ovs_mutex_unlock(&pmd->port_mutex);
5182 }
5183
5184 /* Adds rx queue to poll_list of PMD thread, if it's not there already. */
5185 static void
5186 dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
5187                          struct dp_netdev_rxq *rxq)
5188     OVS_REQUIRES(pmd->port_mutex)
5189 {
5190     int qid = netdev_rxq_get_queue_id(rxq->rx);
5191     uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
5192     struct rxq_poll *poll;
5193
5194     HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
5195         if (poll->rxq == rxq) {
5196             /* 'rxq' is already polled by this thread. Do nothing. */
5197             return;
5198         }
5199     }
5200
5201     poll = xmalloc(sizeof *poll);
5202     poll->rxq = rxq;
5203     hmap_insert(&pmd->poll_list, &poll->node, hash);
5204
5205     pmd->need_reload = true;
5206 }
5207
5208 /* Delete 'poll' from poll_list of PMD thread. */
5209 static void
5210 dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
5211                            struct rxq_poll *poll)
5212     OVS_REQUIRES(pmd->port_mutex)
5213 {
5214     hmap_remove(&pmd->poll_list, &poll->node);
5215     free(poll);
5216
5217     pmd->need_reload = true;
5218 }
5219
5220 /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
5221  * changes to take effect. */
5222 static void
5223 dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
5224                              struct dp_netdev_port *port)
5225     OVS_REQUIRES(pmd->port_mutex)
5226 {
5227     struct tx_port *tx;
5228
5229     tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
5230     if (tx) {
5231         /* 'port' is already on this thread tx cache. Do nothing. */
5232         return;
5233     }
5234
5235     tx = xzalloc(sizeof *tx);
5236
5237     tx->port = port;
5238     tx->qid = -1;
5239     tx->flush_time = 0LL;
5240     dp_packet_batch_init(&tx->output_pkts);
5241
5242     hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
5243     pmd->need_reload = true;
5244 }
5245
5246 /* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
5247  * changes to take effect. */
5248 static void
5249 dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
5250                                struct tx_port *tx)
5251     OVS_REQUIRES(pmd->port_mutex)
5252 {
5253     hmap_remove(&pmd->tx_ports, &tx->node);
5254     free(tx);
5255     pmd->need_reload = true;
5256 }
5257 \f
5258 static char *
5259 dpif_netdev_get_datapath_version(void)
5260 {
5261      return xstrdup("<built-in>");
5262 }
5263
5264 static void
5265 dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
5266                     uint16_t tcp_flags, long long now)
5267 {
5268     uint16_t flags;
5269
5270     atomic_store_relaxed(&netdev_flow->stats.used, now);
5271     non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
5272     non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
5273     atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
5274     flags |= tcp_flags;
5275     atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
5276 }
5277
5278 static int
5279 dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
5280                  struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
5281                  enum dpif_upcall_type type, const struct nlattr *userdata,
5282                  struct ofpbuf *actions, struct ofpbuf *put_actions)
5283 {
5284     struct dp_netdev *dp = pmd->dp;
5285
5286     if (OVS_UNLIKELY(!dp->upcall_cb)) {
5287         return ENODEV;
5288     }
5289
5290     if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
5291         struct ds ds = DS_EMPTY_INITIALIZER;
5292         char *packet_str;
5293         struct ofpbuf key;
5294         struct odp_flow_key_parms odp_parms = {
5295             .flow = flow,
5296             .mask = wc ? &wc->masks : NULL,
5297             .support = dp_netdev_support,
5298         };
5299
5300         ofpbuf_init(&key, 0);
5301         odp_flow_key_from_flow(&odp_parms, &key);
5302         packet_str = ofp_dp_packet_to_string(packet_);
5303
5304         odp_flow_key_format(key.data, key.size, &ds);
5305
5306         VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
5307                  dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
5308
5309         ofpbuf_uninit(&key);
5310         free(packet_str);
5311
5312         ds_destroy(&ds);
5313     }
5314
5315     return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
5316                          actions, wc, put_actions, dp->upcall_aux);
5317 }
5318
5319 static inline uint32_t
5320 dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet,
5321                                 const struct miniflow *mf)
5322 {
5323     uint32_t hash;
5324
5325     if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
5326         hash = dp_packet_get_rss_hash(packet);
5327     } else {
5328         hash = miniflow_hash_5tuple(mf, 0);
5329         dp_packet_set_rss_hash(packet, hash);
5330     }
5331
5332     return hash;
5333 }
5334
5335 static inline uint32_t
5336 dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
5337                                 const struct miniflow *mf)
5338 {
5339     uint32_t hash, recirc_depth;
5340
5341     if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
5342         hash = dp_packet_get_rss_hash(packet);
5343     } else {
5344         hash = miniflow_hash_5tuple(mf, 0);
5345         dp_packet_set_rss_hash(packet, hash);
5346     }
5347
5348     /* The RSS hash must account for the recirculation depth to avoid
5349      * collisions in the exact match cache */
5350     recirc_depth = *recirc_depth_get_unsafe();
5351     if (OVS_UNLIKELY(recirc_depth)) {
5352         hash = hash_finish(hash, recirc_depth);
5353         dp_packet_set_rss_hash(packet, hash);
5354     }
5355     return hash;
5356 }
5357
5358 struct packet_batch_per_flow {
5359     unsigned int byte_count;
5360     uint16_t tcp_flags;
5361     struct dp_netdev_flow *flow;
5362
5363     struct dp_packet_batch array;
5364 };
5365
5366 static inline void
5367 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
5368                              struct dp_packet *packet,
5369                              const struct miniflow *mf)
5370 {
5371     batch->byte_count += dp_packet_size(packet);
5372     batch->tcp_flags |= miniflow_get_tcp_flags(mf);
5373     batch->array.packets[batch->array.count++] = packet;
5374 }
5375
5376 static inline void
5377 packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
5378                            struct dp_netdev_flow *flow)
5379 {
5380     flow->batch = batch;
5381
5382     batch->flow = flow;
5383     dp_packet_batch_init(&batch->array);
5384     batch->byte_count = 0;
5385     batch->tcp_flags = 0;
5386 }
5387
5388 static inline void
5389 packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
5390                               struct dp_netdev_pmd_thread *pmd)
5391 {
5392     struct dp_netdev_actions *actions;
5393     struct dp_netdev_flow *flow = batch->flow;
5394
5395     dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
5396                         batch->tcp_flags, pmd->ctx.now / 1000);
5397
5398     actions = dp_netdev_flow_get_actions(flow);
5399
5400     dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
5401                               actions->actions, actions->size);
5402 }
5403
5404 static inline void
5405 dp_netdev_queue_batches(struct dp_packet *pkt,
5406                         struct dp_netdev_flow *flow, const struct miniflow *mf,
5407                         struct packet_batch_per_flow *batches,
5408                         size_t *n_batches)
5409 {
5410     struct packet_batch_per_flow *batch = flow->batch;
5411
5412     if (OVS_UNLIKELY(!batch)) {
5413         batch = &batches[(*n_batches)++];
5414         packet_batch_per_flow_init(batch, flow);
5415     }
5416
5417     packet_batch_per_flow_update(batch, pkt, mf);
5418 }
5419
5420 /* Try to process all ('cnt') the 'packets' using only the exact match cache
5421  * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
5422  * miniflow is copied into 'keys' and the packet pointer is moved at the
5423  * beginning of the 'packets' array.
5424  *
5425  * The function returns the number of packets that needs to be processed in the
5426  * 'packets' array (they have been moved to the beginning of the vector).
5427  *
5428  * For performance reasons a caller may choose not to initialize the metadata
5429  * in 'packets_'.  If 'md_is_valid' is false, the metadata in 'packets'
5430  * is not valid and must be initialized by this function using 'port_no'.
5431  * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
5432  * will be ignored.
5433  */
5434 static inline size_t
5435 emc_processing(struct dp_netdev_pmd_thread *pmd,
5436                struct dp_packet_batch *packets_,
5437                struct netdev_flow_key *keys,
5438                struct packet_batch_per_flow batches[], size_t *n_batches,
5439                bool md_is_valid, odp_port_t port_no)
5440 {
5441     struct emc_cache *flow_cache = &pmd->flow_cache;
5442     struct netdev_flow_key *key = &keys[0];
5443     size_t n_missed = 0, n_dropped = 0;
5444     struct dp_packet *packet;
5445     const size_t cnt = dp_packet_batch_size(packets_);
5446     uint32_t cur_min;
5447     int i;
5448
5449     atomic_read_relaxed(&pmd->dp->emc_insert_min, &cur_min);
5450     pmd_perf_update_counter(&pmd->perf_stats,
5451                             md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
5452                             cnt);
5453
5454     DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
5455         struct dp_netdev_flow *flow;
5456
5457         if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
5458             dp_packet_delete(packet);
5459             n_dropped++;
5460             continue;
5461         }
5462
5463         if (i != cnt - 1) {
5464             struct dp_packet **packets = packets_->packets;
5465             /* Prefetch next packet data and metadata. */
5466             OVS_PREFETCH(dp_packet_data(packets[i+1]));
5467             pkt_metadata_prefetch_init(&packets[i+1]->md);
5468         }
5469
5470         if (!md_is_valid) {
5471             pkt_metadata_init(&packet->md, port_no);
5472         }
5473         miniflow_extract(packet, &key->mf);
5474         key->len = 0; /* Not computed yet. */
5475         /* If EMC is disabled skip hash computation and emc_lookup */
5476         if (cur_min) {
5477             if (!md_is_valid) {
5478                 key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet,
5479                         &key->mf);
5480             } else {
5481                 key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
5482             }
5483             flow = emc_lookup(flow_cache, key);
5484         } else {
5485             flow = NULL;
5486         }
5487         if (OVS_LIKELY(flow)) {
5488             dp_netdev_queue_batches(packet, flow, &key->mf, batches,
5489                                     n_batches);
5490         } else {
5491             /* Exact match cache missed. Group missed packets together at
5492              * the beginning of the 'packets' array. */
5493             dp_packet_batch_refill(packets_, packet, i);
5494             /* 'key[n_missed]' contains the key of the current packet and it
5495              * must be returned to the caller. The next key should be extracted
5496              * to 'keys[n_missed + 1]'. */
5497             key = &keys[++n_missed];
5498         }
5499     }
5500
5501     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT,
5502                             cnt - n_dropped - n_missed);
5503
5504     return dp_packet_batch_size(packets_);
5505 }
5506
5507 static inline int
5508 handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
5509                      struct dp_packet *packet,
5510                      const struct netdev_flow_key *key,
5511                      struct ofpbuf *actions, struct ofpbuf *put_actions)
5512 {
5513     struct ofpbuf *add_actions;
5514     struct dp_packet_batch b;
5515     struct match match;
5516     ovs_u128 ufid;
5517     int error;
5518     uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
5519
5520     match.tun_md.valid = false;
5521     miniflow_expand(&key->mf, &match.flow);
5522
5523     ofpbuf_clear(actions);
5524     ofpbuf_clear(put_actions);
5525
5526     dpif_flow_hash(pmd->dp->dpif, &match.flow, sizeof match.flow, &ufid);
5527     error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
5528                              &ufid, DPIF_UC_MISS, NULL, actions,
5529                              put_actions);
5530     if (OVS_UNLIKELY(error && error != ENOSPC)) {
5531         dp_packet_delete(packet);
5532         return error;
5533     }
5534
5535     /* The Netlink encoding of datapath flow keys cannot express
5536      * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
5537      * tag is interpreted as exact match on the fact that there is no
5538      * VLAN.  Unless we refactor a lot of code that translates between
5539      * Netlink and struct flow representations, we have to do the same
5540      * here. */
5541     if (!match.wc.masks.vlans[0].tci) {
5542         match.wc.masks.vlans[0].tci = htons(0xffff);
5543     }
5544
5545     /* We can't allow the packet batching in the next loop to execute
5546      * the actions.  Otherwise, if there are any slow path actions,
5547      * we'll send the packet up twice. */
5548     dp_packet_batch_init_packet(&b, packet);
5549     dp_netdev_execute_actions(pmd, &b, true, &match.flow,
5550                               actions->data, actions->size);
5551
5552     add_actions = put_actions->size ? put_actions : actions;
5553     if (OVS_LIKELY(error != ENOSPC)) {
5554         struct dp_netdev_flow *netdev_flow;
5555
5556         /* XXX: There's a race window where a flow covering this packet
5557          * could have already been installed since we last did the flow
5558          * lookup before upcall.  This could be solved by moving the
5559          * mutex lock outside the loop, but that's an awful long time
5560          * to be locking everyone out of making flow installs.  If we
5561          * move to a per-core classifier, it would be reasonable. */
5562         ovs_mutex_lock(&pmd->flow_mutex);
5563         netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
5564         if (OVS_LIKELY(!netdev_flow)) {
5565             netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
5566                                              add_actions->data,
5567                                              add_actions->size);
5568         }
5569         ovs_mutex_unlock(&pmd->flow_mutex);
5570         emc_probabilistic_insert(pmd, key, netdev_flow);
5571     }
5572     if (pmd_perf_metrics_enabled(pmd)) {
5573         /* Update upcall stats. */
5574         cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
5575         struct pmd_perf_stats *s = &pmd->perf_stats;
5576         s->current.upcalls++;
5577         s->current.upcall_cycles += cycles;
5578         histogram_add_sample(&s->cycles_per_upcall, cycles);
5579     }
5580     return error;
5581 }
5582
5583 static inline void
5584 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
5585                      struct dp_packet_batch *packets_,
5586                      struct netdev_flow_key *keys,
5587                      struct packet_batch_per_flow batches[],
5588                      size_t *n_batches,
5589                      odp_port_t in_port)
5590 {
5591     const size_t cnt = dp_packet_batch_size(packets_);
5592 #if !defined(__CHECKER__) && !defined(_WIN32)
5593     const size_t PKT_ARRAY_SIZE = cnt;
5594 #else
5595     /* Sparse or MSVC doesn't like variable length array. */
5596     enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
5597 #endif
5598     struct dp_packet *packet;
5599     struct dpcls *cls;
5600     struct dpcls_rule *rules[PKT_ARRAY_SIZE];
5601     struct dp_netdev *dp = pmd->dp;
5602     int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
5603     int lookup_cnt = 0, add_lookup_cnt;
5604     bool any_miss;
5605
5606     for (size_t i = 0; i < cnt; i++) {
5607         /* Key length is needed in all the cases, hash computed on demand. */
5608         keys[i].len = netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
5609     }
5610     /* Get the classifier for the in_port */
5611     cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
5612     if (OVS_LIKELY(cls)) {
5613         any_miss = !dpcls_lookup(cls, keys, rules, cnt, &lookup_cnt);
5614     } else {
5615         any_miss = true;
5616         memset(rules, 0, sizeof(rules));
5617     }
5618     if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
5619         uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
5620         struct ofpbuf actions, put_actions;
5621
5622         ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
5623         ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
5624
5625         DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5626             struct dp_netdev_flow *netdev_flow;
5627
5628             if (OVS_LIKELY(rules[i])) {
5629                 continue;
5630             }
5631
5632             /* It's possible that an earlier slow path execution installed
5633              * a rule covering this flow.  In this case, it's a lot cheaper
5634              * to catch it here than execute a miss. */
5635             netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &keys[i],
5636                                                     &add_lookup_cnt);
5637             if (netdev_flow) {
5638                 lookup_cnt += add_lookup_cnt;
5639                 rules[i] = &netdev_flow->cr;
5640                 continue;
5641             }
5642
5643             int error = handle_packet_upcall(pmd, packet, &keys[i],
5644                                              &actions, &put_actions);
5645
5646             if (OVS_UNLIKELY(error)) {
5647                 upcall_fail_cnt++;
5648             } else {
5649                 upcall_ok_cnt++;
5650             }
5651         }
5652
5653         ofpbuf_uninit(&actions);
5654         ofpbuf_uninit(&put_actions);
5655         fat_rwlock_unlock(&dp->upcall_rwlock);
5656     } else if (OVS_UNLIKELY(any_miss)) {
5657         DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5658             if (OVS_UNLIKELY(!rules[i])) {
5659                 dp_packet_delete(packet);
5660                 upcall_fail_cnt++;
5661             }
5662         }
5663     }
5664
5665     DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5666         struct dp_netdev_flow *flow;
5667
5668         if (OVS_UNLIKELY(!rules[i])) {
5669             continue;
5670         }
5671
5672         flow = dp_netdev_flow_cast(rules[i]);
5673
5674         emc_probabilistic_insert(pmd, &keys[i], flow);
5675         dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
5676     }
5677
5678     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
5679                             cnt - upcall_ok_cnt - upcall_fail_cnt);
5680     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
5681                             lookup_cnt);
5682     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
5683                             upcall_ok_cnt);
5684     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
5685                             upcall_fail_cnt);
5686 }
5687
5688 /* Packets enter the datapath from a port (or from recirculation) here.
5689  *
5690  * When 'md_is_valid' is true the metadata in 'packets' are already valid.
5691  * When false the metadata in 'packets' need to be initialized. */
5692 static void
5693 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
5694                   struct dp_packet_batch *packets,
5695                   bool md_is_valid, odp_port_t port_no)
5696 {
5697 #if !defined(__CHECKER__) && !defined(_WIN32)
5698     const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
5699 #else
5700     /* Sparse or MSVC doesn't like variable length array. */
5701     enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
5702 #endif
5703     OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
5704         struct netdev_flow_key keys[PKT_ARRAY_SIZE];
5705     struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
5706     size_t n_batches;
5707     odp_port_t in_port;
5708
5709     n_batches = 0;
5710     emc_processing(pmd, packets, keys, batches, &n_batches,
5711                             md_is_valid, port_no);
5712     if (!dp_packet_batch_is_empty(packets)) {
5713         /* Get ingress port from first packet's metadata. */
5714         in_port = packets->packets[0]->md.in_port.odp_port;
5715         fast_path_processing(pmd, packets, keys,
5716                              batches, &n_batches, in_port);
5717     }
5718
5719     /* All the flow batches need to be reset before any call to
5720      * packet_batch_per_flow_execute() as it could potentially trigger
5721      * recirculation. When a packet matching flow ‘j’ happens to be
5722      * recirculated, the nested call to dp_netdev_input__() could potentially
5723      * classify the packet as matching another flow - say 'k'. It could happen
5724      * that in the previous call to dp_netdev_input__() that same flow 'k' had
5725      * already its own batches[k] still waiting to be served.  So if its
5726      * ‘batch’ member is not reset, the recirculated packet would be wrongly
5727      * appended to batches[k] of the 1st call to dp_netdev_input__(). */
5728     size_t i;
5729     for (i = 0; i < n_batches; i++) {
5730         batches[i].flow->batch = NULL;
5731     }
5732
5733     for (i = 0; i < n_batches; i++) {
5734         packet_batch_per_flow_execute(&batches[i], pmd);
5735     }
5736 }
5737
5738 static void
5739 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
5740                 struct dp_packet_batch *packets,
5741                 odp_port_t port_no)
5742 {
5743     dp_netdev_input__(pmd, packets, false, port_no);
5744 }
5745
5746 static void
5747 dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
5748                       struct dp_packet_batch *packets)
5749 {
5750     dp_netdev_input__(pmd, packets, true, 0);
5751 }
5752
5753 struct dp_netdev_execute_aux {
5754     struct dp_netdev_pmd_thread *pmd;
5755     const struct flow *flow;
5756 };
5757
5758 static void
5759 dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
5760                                  void *aux)
5761 {
5762     struct dp_netdev *dp = get_dp_netdev(dpif);
5763     dp->dp_purge_aux = aux;
5764     dp->dp_purge_cb = cb;
5765 }
5766
5767 static void
5768 dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
5769                                void *aux)
5770 {
5771     struct dp_netdev *dp = get_dp_netdev(dpif);
5772     dp->upcall_aux = aux;
5773     dp->upcall_cb = cb;
5774 }
5775
5776 static void
5777 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
5778                                bool purge)
5779 {
5780     struct tx_port *tx;
5781     struct dp_netdev_port *port;
5782     long long interval;
5783
5784     HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
5785         if (!tx->port->dynamic_txqs) {
5786             continue;
5787         }
5788         interval = pmd->ctx.now - tx->last_used;
5789         if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
5790             port = tx->port;
5791             ovs_mutex_lock(&port->txq_used_mutex);
5792             port->txq_used[tx->qid]--;
5793             ovs_mutex_unlock(&port->txq_used_mutex);
5794             tx->qid = -1;
5795         }
5796     }
5797 }
5798
5799 static int
5800 dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
5801                            struct tx_port *tx)
5802 {
5803     struct dp_netdev_port *port;
5804     long long interval;
5805     int i, min_cnt, min_qid;
5806
5807     interval = pmd->ctx.now - tx->last_used;
5808     tx->last_used = pmd->ctx.now;
5809
5810     if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
5811         return tx->qid;
5812     }
5813
5814     port = tx->port;
5815
5816     ovs_mutex_lock(&port->txq_used_mutex);
5817     if (tx->qid >= 0) {
5818         port->txq_used[tx->qid]--;
5819         tx->qid = -1;
5820     }
5821
5822     min_cnt = -1;
5823     min_qid = 0;
5824     for (i = 0; i < netdev_n_txq(port->netdev); i++) {
5825         if (port->txq_used[i] < min_cnt || min_cnt == -1) {
5826             min_cnt = port->txq_used[i];
5827             min_qid = i;
5828         }
5829     }
5830
5831     port->txq_used[min_qid]++;
5832     tx->qid = min_qid;
5833
5834     ovs_mutex_unlock(&port->txq_used_mutex);
5835
5836     dpif_netdev_xps_revalidate_pmd(pmd, false);
5837
5838     VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
5839              pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
5840     return min_qid;
5841 }
5842
5843 static struct tx_port *
5844 pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
5845                           odp_port_t port_no)
5846 {
5847     return tx_port_lookup(&pmd->tnl_port_cache, port_no);
5848 }
5849
5850 static struct tx_port *
5851 pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
5852                            odp_port_t port_no)
5853 {
5854     return tx_port_lookup(&pmd->send_port_cache, port_no);
5855 }
5856
5857 static int
5858 push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
5859                 const struct nlattr *attr,
5860                 struct dp_packet_batch *batch)
5861 {
5862     struct tx_port *tun_port;
5863     const struct ovs_action_push_tnl *data;
5864     int err;
5865
5866     data = nl_attr_get(attr);
5867
5868     tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
5869     if (!tun_port) {
5870         err = -EINVAL;
5871         goto error;
5872     }
5873     err = netdev_push_header(tun_port->port->netdev, batch, data);
5874     if (!err) {
5875         return 0;
5876     }
5877 error:
5878     dp_packet_delete_batch(batch, true);
5879     return err;
5880 }
5881
5882 static void
5883 dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
5884                             struct dp_packet *packet, bool should_steal,
5885                             struct flow *flow, ovs_u128 *ufid,
5886                             struct ofpbuf *actions,
5887                             const struct nlattr *userdata)
5888 {
5889     struct dp_packet_batch b;
5890     int error;
5891
5892     ofpbuf_clear(actions);
5893
5894     error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
5895                              DPIF_UC_ACTION, userdata, actions,
5896                              NULL);
5897     if (!error || error == ENOSPC) {
5898         dp_packet_batch_init_packet(&b, packet);
5899         dp_netdev_execute_actions(pmd, &b, should_steal, flow,
5900                                   actions->data, actions->size);
5901     } else if (should_steal) {
5902         dp_packet_delete(packet);
5903     }
5904 }
5905
5906 static void
5907 dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
5908               const struct nlattr *a, bool should_steal)
5909     OVS_NO_THREAD_SAFETY_ANALYSIS
5910 {
5911     struct dp_netdev_execute_aux *aux = aux_;
5912     uint32_t *depth = recirc_depth_get();
5913     struct dp_netdev_pmd_thread *pmd = aux->pmd;
5914     struct dp_netdev *dp = pmd->dp;
5915     int type = nl_attr_type(a);
5916     struct tx_port *p;
5917
5918     switch ((enum ovs_action_attr)type) {
5919     case OVS_ACTION_ATTR_OUTPUT:
5920         p = pmd_send_port_cache_lookup(pmd, nl_attr_get_odp_port(a));
5921         if (OVS_LIKELY(p)) {
5922             struct dp_packet *packet;
5923             struct dp_packet_batch out;
5924
5925             if (!should_steal) {
5926                 dp_packet_batch_clone(&out, packets_);
5927                 dp_packet_batch_reset_cutlen(packets_);
5928                 packets_ = &out;
5929             }
5930             dp_packet_batch_apply_cutlen(packets_);
5931
5932 #ifdef DPDK_NETDEV
5933             if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
5934                              && packets_->packets[0]->source
5935                                 != p->output_pkts.packets[0]->source)) {
5936                 /* XXX: netdev-dpdk assumes that all packets in a single
5937                  *      output batch has the same source. Flush here to
5938                  *      avoid memory access issues. */
5939                 dp_netdev_pmd_flush_output_on_port(pmd, p);
5940             }
5941 #endif
5942             if (dp_packet_batch_size(&p->output_pkts)
5943                 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
5944                 /* Flush here to avoid overflow. */
5945                 dp_netdev_pmd_flush_output_on_port(pmd, p);
5946             }
5947
5948             if (dp_packet_batch_is_empty(&p->output_pkts)) {
5949                 pmd->n_output_batches++;
5950             }
5951
5952             DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5953                 p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
5954                                                              pmd->ctx.last_rxq;
5955                 dp_packet_batch_add(&p->output_pkts, packet);
5956             }
5957             return;
5958         }
5959         break;
5960
5961     case OVS_ACTION_ATTR_TUNNEL_PUSH:
5962         if (should_steal) {
5963             /* We're requested to push tunnel header, but also we need to take
5964              * the ownership of these packets. Thus, we can avoid performing
5965              * the action, because the caller will not use the result anyway.
5966              * Just break to free the batch. */
5967             break;
5968         }
5969         dp_packet_batch_apply_cutlen(packets_);
5970         push_tnl_action(pmd, a, packets_);
5971         return;
5972
5973     case OVS_ACTION_ATTR_TUNNEL_POP:
5974         if (*depth < MAX_RECIRC_DEPTH) {
5975             struct dp_packet_batch *orig_packets_ = packets_;
5976             odp_port_t portno = nl_attr_get_odp_port(a);
5977
5978             p = pmd_tnl_port_cache_lookup(pmd, portno);
5979             if (p) {
5980                 struct dp_packet_batch tnl_pkt;
5981
5982                 if (!should_steal) {
5983                     dp_packet_batch_clone(&tnl_pkt, packets_);
5984                     packets_ = &tnl_pkt;
5985                     dp_packet_batch_reset_cutlen(orig_packets_);
5986                 }
5987
5988                 dp_packet_batch_apply_cutlen(packets_);
5989
5990                 netdev_pop_header(p->port->netdev, packets_);
5991                 if (dp_packet_batch_is_empty(packets_)) {
5992                     return;
5993                 }
5994
5995                 struct dp_packet *packet;
5996                 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5997                     packet->md.in_port.odp_port = portno;
5998                 }
5999
6000                 (*depth)++;
6001                 dp_netdev_recirculate(pmd, packets_);
6002                 (*depth)--;
6003                 return;
6004             }
6005         }
6006         break;
6007
6008     case OVS_ACTION_ATTR_USERSPACE:
6009         if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
6010             struct dp_packet_batch *orig_packets_ = packets_;
6011             const struct nlattr *userdata;
6012             struct dp_packet_batch usr_pkt;
6013             struct ofpbuf actions;
6014             struct flow flow;
6015             ovs_u128 ufid;
6016             bool clone = false;
6017
6018             userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
6019             ofpbuf_init(&actions, 0);
6020
6021             if (packets_->trunc) {
6022                 if (!should_steal) {
6023                     dp_packet_batch_clone(&usr_pkt, packets_);
6024                     packets_ = &usr_pkt;
6025                     clone = true;
6026                     dp_packet_batch_reset_cutlen(orig_packets_);
6027                 }
6028
6029                 dp_packet_batch_apply_cutlen(packets_);
6030             }
6031
6032             struct dp_packet *packet;
6033             DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6034                 flow_extract(packet, &flow);
6035                 dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
6036                 dp_execute_userspace_action(pmd, packet, should_steal, &flow,
6037                                             &ufid, &actions, userdata);
6038             }
6039
6040             if (clone) {
6041                 dp_packet_delete_batch(packets_, true);
6042             }
6043
6044             ofpbuf_uninit(&actions);
6045             fat_rwlock_unlock(&dp->upcall_rwlock);
6046
6047             return;
6048         }
6049         break;
6050
6051     case OVS_ACTION_ATTR_RECIRC:
6052         if (*depth < MAX_RECIRC_DEPTH) {
6053             struct dp_packet_batch recirc_pkts;
6054
6055             if (!should_steal) {
6056                dp_packet_batch_clone(&recirc_pkts, packets_);
6057                packets_ = &recirc_pkts;
6058             }
6059
6060             struct dp_packet *packet;
6061             DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6062                 packet->md.recirc_id = nl_attr_get_u32(a);
6063             }
6064
6065             (*depth)++;
6066             dp_netdev_recirculate(pmd, packets_);
6067             (*depth)--;
6068
6069             return;
6070         }
6071
6072         VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
6073         break;
6074
6075     case OVS_ACTION_ATTR_CT: {
6076         const struct nlattr *b;
6077         bool force = false;
6078         bool commit = false;
6079         unsigned int left;
6080         uint16_t zone = 0;
6081         const char *helper = NULL;
6082         const uint32_t *setmark = NULL;
6083         const struct ovs_key_ct_labels *setlabel = NULL;
6084         struct nat_action_info_t nat_action_info;
6085         struct nat_action_info_t *nat_action_info_ref = NULL;
6086         bool nat_config = false;
6087
6088         NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
6089                                  nl_attr_get_size(a)) {
6090             enum ovs_ct_attr sub_type = nl_attr_type(b);
6091
6092             switch(sub_type) {
6093             case OVS_CT_ATTR_FORCE_COMMIT:
6094                 force = true;
6095                 /* fall through. */
6096             case OVS_CT_ATTR_COMMIT:
6097                 commit = true;
6098                 break;
6099             case OVS_CT_ATTR_ZONE:
6100                 zone = nl_attr_get_u16(b);
6101                 break;
6102             case OVS_CT_ATTR_HELPER:
6103                 helper = nl_attr_get_string(b);
6104                 break;
6105             case OVS_CT_ATTR_MARK:
6106                 setmark = nl_attr_get(b);
6107                 break;
6108             case OVS_CT_ATTR_LABELS:
6109                 setlabel = nl_attr_get(b);
6110                 break;
6111             case OVS_CT_ATTR_EVENTMASK:
6112                 /* Silently ignored, as userspace datapath does not generate
6113                  * netlink events. */
6114                 break;
6115             case OVS_CT_ATTR_NAT: {
6116                 const struct nlattr *b_nest;
6117                 unsigned int left_nest;
6118                 bool ip_min_specified = false;
6119                 bool proto_num_min_specified = false;
6120                 bool ip_max_specified = false;
6121                 bool proto_num_max_specified = false;
6122                 memset(&nat_action_info, 0, sizeof nat_action_info);
6123                 nat_action_info_ref = &nat_action_info;
6124
6125                 NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
6126                     enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
6127
6128                     switch (sub_type_nest) {
6129                     case OVS_NAT_ATTR_SRC:
6130                     case OVS_NAT_ATTR_DST:
6131                         nat_config = true;
6132                         nat_action_info.nat_action |=
6133                             ((sub_type_nest == OVS_NAT_ATTR_SRC)
6134                                 ? NAT_ACTION_SRC : NAT_ACTION_DST);
6135                         break;
6136                     case OVS_NAT_ATTR_IP_MIN:
6137                         memcpy(&nat_action_info.min_addr,
6138                                nl_attr_get(b_nest),
6139                                nl_attr_get_size(b_nest));
6140                         ip_min_specified = true;
6141                         break;
6142                     case OVS_NAT_ATTR_IP_MAX:
6143                         memcpy(&nat_action_info.max_addr,
6144                                nl_attr_get(b_nest),
6145                                nl_attr_get_size(b_nest));
6146                         ip_max_specified = true;
6147                         break;
6148                     case OVS_NAT_ATTR_PROTO_MIN:
6149                         nat_action_info.min_port =
6150                             nl_attr_get_u16(b_nest);
6151                         proto_num_min_specified = true;
6152                         break;
6153                     case OVS_NAT_ATTR_PROTO_MAX:
6154                         nat_action_info.max_port =
6155                             nl_attr_get_u16(b_nest);
6156                         proto_num_max_specified = true;
6157                         break;
6158                     case OVS_NAT_ATTR_PERSISTENT:
6159                     case OVS_NAT_ATTR_PROTO_HASH:
6160                     case OVS_NAT_ATTR_PROTO_RANDOM:
6161                         break;
6162                     case OVS_NAT_ATTR_UNSPEC:
6163                     case __OVS_NAT_ATTR_MAX:
6164                         OVS_NOT_REACHED();
6165                     }
6166                 }
6167
6168                 if (ip_min_specified && !ip_max_specified) {
6169                     nat_action_info.max_addr = nat_action_info.min_addr;
6170                 }
6171                 if (proto_num_min_specified && !proto_num_max_specified) {
6172                     nat_action_info.max_port = nat_action_info.min_port;
6173                 }
6174                 if (proto_num_min_specified || proto_num_max_specified) {
6175                     if (nat_action_info.nat_action & NAT_ACTION_SRC) {
6176                         nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
6177                     } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
6178                         nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
6179                     }
6180                 }
6181                 break;
6182             }
6183             case OVS_CT_ATTR_UNSPEC:
6184             case __OVS_CT_ATTR_MAX:
6185                 OVS_NOT_REACHED();
6186             }
6187         }
6188
6189         /* We won't be able to function properly in this case, hence
6190          * complain loudly. */
6191         if (nat_config && !commit) {
6192             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
6193             VLOG_WARN_RL(&rl, "NAT specified without commit.");
6194         }
6195
6196         conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, force,
6197                           commit, zone, setmark, setlabel, aux->flow->tp_src,
6198                           aux->flow->tp_dst, helper, nat_action_info_ref,
6199                           pmd->ctx.now / 1000);
6200         break;
6201     }
6202
6203     case OVS_ACTION_ATTR_METER:
6204         dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
6205                             pmd->ctx.now);
6206         break;
6207
6208     case OVS_ACTION_ATTR_PUSH_VLAN:
6209     case OVS_ACTION_ATTR_POP_VLAN:
6210     case OVS_ACTION_ATTR_PUSH_MPLS:
6211     case OVS_ACTION_ATTR_POP_MPLS:
6212     case OVS_ACTION_ATTR_SET:
6213     case OVS_ACTION_ATTR_SET_MASKED:
6214     case OVS_ACTION_ATTR_SAMPLE:
6215     case OVS_ACTION_ATTR_HASH:
6216     case OVS_ACTION_ATTR_UNSPEC:
6217     case OVS_ACTION_ATTR_TRUNC:
6218     case OVS_ACTION_ATTR_PUSH_ETH:
6219     case OVS_ACTION_ATTR_POP_ETH:
6220     case OVS_ACTION_ATTR_CLONE:
6221     case OVS_ACTION_ATTR_PUSH_NSH:
6222     case OVS_ACTION_ATTR_POP_NSH:
6223     case OVS_ACTION_ATTR_CT_CLEAR:
6224     case __OVS_ACTION_ATTR_MAX:
6225         OVS_NOT_REACHED();
6226     }
6227
6228     dp_packet_delete_batch(packets_, should_steal);
6229 }
6230
6231 static void
6232 dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
6233                           struct dp_packet_batch *packets,
6234                           bool should_steal, const struct flow *flow,
6235                           const struct nlattr *actions, size_t actions_len)
6236 {
6237     struct dp_netdev_execute_aux aux = { pmd, flow };
6238
6239     odp_execute_actions(&aux, packets, should_steal, actions,
6240                         actions_len, dp_execute_cb);
6241 }
6242
6243 struct dp_netdev_ct_dump {
6244     struct ct_dpif_dump_state up;
6245     struct conntrack_dump dump;
6246     struct conntrack *ct;
6247     struct dp_netdev *dp;
6248 };
6249
6250 static int
6251 dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
6252                           const uint16_t *pzone, int *ptot_bkts)
6253 {
6254     struct dp_netdev *dp = get_dp_netdev(dpif);
6255     struct dp_netdev_ct_dump *dump;
6256
6257     dump = xzalloc(sizeof *dump);
6258     dump->dp = dp;
6259     dump->ct = &dp->conntrack;
6260
6261     conntrack_dump_start(&dp->conntrack, &dump->dump, pzone, ptot_bkts);
6262
6263     *dump_ = &dump->up;
6264
6265     return 0;
6266 }
6267
6268 static int
6269 dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
6270                          struct ct_dpif_dump_state *dump_,
6271                          struct ct_dpif_entry *entry)
6272 {
6273     struct dp_netdev_ct_dump *dump;
6274
6275     INIT_CONTAINER(dump, dump_, up);
6276
6277     return conntrack_dump_next(&dump->dump, entry);
6278 }
6279
6280 static int
6281 dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
6282                          struct ct_dpif_dump_state *dump_)
6283 {
6284     struct dp_netdev_ct_dump *dump;
6285     int err;
6286
6287     INIT_CONTAINER(dump, dump_, up);
6288
6289     err = conntrack_dump_done(&dump->dump);
6290
6291     free(dump);
6292
6293     return err;
6294 }
6295
6296 static int
6297 dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
6298                      const struct ct_dpif_tuple *tuple)
6299 {
6300     struct dp_netdev *dp = get_dp_netdev(dpif);
6301
6302     if (tuple) {
6303         return conntrack_flush_tuple(&dp->conntrack, tuple, zone ? *zone : 0);
6304     }
6305     return conntrack_flush(&dp->conntrack, zone);
6306 }
6307
6308 static int
6309 dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
6310 {
6311     struct dp_netdev *dp = get_dp_netdev(dpif);
6312
6313     return conntrack_set_maxconns(&dp->conntrack, maxconns);
6314 }
6315
6316 static int
6317 dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
6318 {
6319     struct dp_netdev *dp = get_dp_netdev(dpif);
6320
6321     return conntrack_get_maxconns(&dp->conntrack, maxconns);
6322 }
6323
6324 static int
6325 dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
6326 {
6327     struct dp_netdev *dp = get_dp_netdev(dpif);
6328
6329     return conntrack_get_nconns(&dp->conntrack, nconns);
6330 }
6331
6332 const struct dpif_class dpif_netdev_class = {
6333     "netdev",
6334     dpif_netdev_init,
6335     dpif_netdev_enumerate,
6336     dpif_netdev_port_open_type,
6337     dpif_netdev_open,
6338     dpif_netdev_close,
6339     dpif_netdev_destroy,
6340     dpif_netdev_run,
6341     dpif_netdev_wait,
6342     dpif_netdev_get_stats,
6343     dpif_netdev_port_add,
6344     dpif_netdev_port_del,
6345     dpif_netdev_port_set_config,
6346     dpif_netdev_port_query_by_number,
6347     dpif_netdev_port_query_by_name,
6348     NULL,                       /* port_get_pid */
6349     dpif_netdev_port_dump_start,
6350     dpif_netdev_port_dump_next,
6351     dpif_netdev_port_dump_done,
6352     dpif_netdev_port_poll,
6353     dpif_netdev_port_poll_wait,
6354     dpif_netdev_flow_flush,
6355     dpif_netdev_flow_dump_create,
6356     dpif_netdev_flow_dump_destroy,
6357     dpif_netdev_flow_dump_thread_create,
6358     dpif_netdev_flow_dump_thread_destroy,
6359     dpif_netdev_flow_dump_next,
6360     dpif_netdev_operate,
6361     NULL,                       /* recv_set */
6362     NULL,                       /* handlers_set */
6363     dpif_netdev_set_config,
6364     dpif_netdev_queue_to_priority,
6365     NULL,                       /* recv */
6366     NULL,                       /* recv_wait */
6367     NULL,                       /* recv_purge */
6368     dpif_netdev_register_dp_purge_cb,
6369     dpif_netdev_register_upcall_cb,
6370     dpif_netdev_enable_upcall,
6371     dpif_netdev_disable_upcall,
6372     dpif_netdev_get_datapath_version,
6373     dpif_netdev_ct_dump_start,
6374     dpif_netdev_ct_dump_next,
6375     dpif_netdev_ct_dump_done,
6376     dpif_netdev_ct_flush,
6377     dpif_netdev_ct_set_maxconns,
6378     dpif_netdev_ct_get_maxconns,
6379     dpif_netdev_ct_get_nconns,
6380     dpif_netdev_meter_get_features,
6381     dpif_netdev_meter_set,
6382     dpif_netdev_meter_get,
6383     dpif_netdev_meter_del,
6384 };
6385
6386 static void
6387 dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
6388                               const char *argv[], void *aux OVS_UNUSED)
6389 {
6390     struct dp_netdev_port *port;
6391     struct dp_netdev *dp;
6392     odp_port_t port_no;
6393
6394     ovs_mutex_lock(&dp_netdev_mutex);
6395     dp = shash_find_data(&dp_netdevs, argv[1]);
6396     if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
6397         ovs_mutex_unlock(&dp_netdev_mutex);
6398         unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
6399         return;
6400     }
6401     ovs_refcount_ref(&dp->ref_cnt);
6402     ovs_mutex_unlock(&dp_netdev_mutex);
6403
6404     ovs_mutex_lock(&dp->port_mutex);
6405     if (get_port_by_name(dp, argv[2], &port)) {
6406         unixctl_command_reply_error(conn, "unknown port");
6407         goto exit;
6408     }
6409
6410     port_no = u32_to_odp(atoi(argv[3]));
6411     if (!port_no || port_no == ODPP_NONE) {
6412         unixctl_command_reply_error(conn, "bad port number");
6413         goto exit;
6414     }
6415     if (dp_netdev_lookup_port(dp, port_no)) {
6416         unixctl_command_reply_error(conn, "port number already in use");
6417         goto exit;
6418     }
6419
6420     /* Remove port. */
6421     hmap_remove(&dp->ports, &port->node);
6422     reconfigure_datapath(dp);
6423
6424     /* Reinsert with new port number. */
6425     port->port_no = port_no;
6426     hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
6427     reconfigure_datapath(dp);
6428
6429     seq_change(dp->port_seq);
6430     unixctl_command_reply(conn, NULL);
6431
6432 exit:
6433     ovs_mutex_unlock(&dp->port_mutex);
6434     dp_netdev_unref(dp);
6435 }
6436
6437 static void
6438 dpif_dummy_register__(const char *type)
6439 {
6440     struct dpif_class *class;
6441
6442     class = xmalloc(sizeof *class);
6443     *class = dpif_netdev_class;
6444     class->type = xstrdup(type);
6445     dp_register_provider(class);
6446 }
6447
6448 static void
6449 dpif_dummy_override(const char *type)
6450 {
6451     int error;
6452
6453     /*
6454      * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
6455      * a userland-only build.  It's useful for testsuite.
6456      */
6457     error = dp_unregister_provider(type);
6458     if (error == 0 || error == EAFNOSUPPORT) {
6459         dpif_dummy_register__(type);
6460     }
6461 }
6462
6463 void
6464 dpif_dummy_register(enum dummy_level level)
6465 {
6466     if (level == DUMMY_OVERRIDE_ALL) {
6467         struct sset types;
6468         const char *type;
6469
6470         sset_init(&types);
6471         dp_enumerate_types(&types);
6472         SSET_FOR_EACH (type, &types) {
6473             dpif_dummy_override(type);
6474         }
6475         sset_destroy(&types);
6476     } else if (level == DUMMY_OVERRIDE_SYSTEM) {
6477         dpif_dummy_override("system");
6478     }
6479
6480     dpif_dummy_register__("dummy");
6481
6482     unixctl_command_register("dpif-dummy/change-port-number",
6483                              "dp port new-number",
6484                              3, 3, dpif_dummy_change_port_number, NULL);
6485 }
6486 \f
6487 /* Datapath Classifier. */
6488
6489 /* A set of rules that all have the same fields wildcarded. */
6490 struct dpcls_subtable {
6491     /* The fields are only used by writers. */
6492     struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */
6493
6494     /* These fields are accessed by readers. */
6495     struct cmap rules;           /* Contains "struct dpcls_rule"s. */
6496     uint32_t hit_cnt;            /* Number of match hits in subtable in current
6497                                     optimization interval. */
6498     struct netdev_flow_key mask; /* Wildcards for fields (const). */
6499     /* 'mask' must be the last field, additional space is allocated here. */
6500 };
6501
6502 /* Initializes 'cls' as a classifier that initially contains no classification
6503  * rules. */
6504 static void
6505 dpcls_init(struct dpcls *cls)
6506 {
6507     cmap_init(&cls->subtables_map);
6508     pvector_init(&cls->subtables);
6509 }
6510
6511 static void
6512 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
6513 {
6514     VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
6515     pvector_remove(&cls->subtables, subtable);
6516     cmap_remove(&cls->subtables_map, &subtable->cmap_node,
6517                 subtable->mask.hash);
6518     cmap_destroy(&subtable->rules);
6519     ovsrcu_postpone(free, subtable);
6520 }
6521
6522 /* Destroys 'cls'.  Rules within 'cls', if any, are not freed; this is the
6523  * caller's responsibility.
6524  * May only be called after all the readers have been terminated. */
6525 static void
6526 dpcls_destroy(struct dpcls *cls)
6527 {
6528     if (cls) {
6529         struct dpcls_subtable *subtable;
6530
6531         CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
6532             ovs_assert(cmap_count(&subtable->rules) == 0);
6533             dpcls_destroy_subtable(cls, subtable);
6534         }
6535         cmap_destroy(&cls->subtables_map);
6536         pvector_destroy(&cls->subtables);
6537     }
6538 }
6539
6540 static struct dpcls_subtable *
6541 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
6542 {
6543     struct dpcls_subtable *subtable;
6544
6545     /* Need to add one. */
6546     subtable = xmalloc(sizeof *subtable
6547                        - sizeof subtable->mask.mf + mask->len);
6548     cmap_init(&subtable->rules);
6549     subtable->hit_cnt = 0;
6550     netdev_flow_key_clone(&subtable->mask, mask);
6551     cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
6552     /* Add the new subtable at the end of the pvector (with no hits yet) */
6553     pvector_insert(&cls->subtables, subtable, 0);
6554     VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
6555              cmap_count(&cls->subtables_map), subtable, cls->in_port);
6556     pvector_publish(&cls->subtables);
6557
6558     return subtable;
6559 }
6560
6561 static inline struct dpcls_subtable *
6562 dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
6563 {
6564     struct dpcls_subtable *subtable;
6565
6566     CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
6567                              &cls->subtables_map) {
6568         if (netdev_flow_key_equal(&subtable->mask, mask)) {
6569             return subtable;
6570         }
6571     }
6572     return dpcls_create_subtable(cls, mask);
6573 }
6574
6575
6576 /* Periodically sort the dpcls subtable vectors according to hit counts */
6577 static void
6578 dpcls_sort_subtable_vector(struct dpcls *cls)
6579 {
6580     struct pvector *pvec = &cls->subtables;
6581     struct dpcls_subtable *subtable;
6582
6583     PVECTOR_FOR_EACH (subtable, pvec) {
6584         pvector_change_priority(pvec, subtable, subtable->hit_cnt);
6585         subtable->hit_cnt = 0;
6586     }
6587     pvector_publish(pvec);
6588 }
6589
6590 static inline void
6591 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
6592                            struct polled_queue *poll_list, int poll_cnt)
6593 {
6594     struct dpcls *cls;
6595
6596     if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
6597         uint64_t curr_tsc;
6598         /* Get the cycles that were used to process each queue and store. */
6599         for (unsigned i = 0; i < poll_cnt; i++) {
6600             uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
6601                                                         RXQ_CYCLES_PROC_CURR);
6602             dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
6603             dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
6604                                      0);
6605         }
6606         curr_tsc = cycles_counter_update(&pmd->perf_stats);
6607         if (pmd->intrvl_tsc_prev) {
6608             /* There is a prev timestamp, store a new intrvl cycle count. */
6609             atomic_store_relaxed(&pmd->intrvl_cycles,
6610                                  curr_tsc - pmd->intrvl_tsc_prev);
6611         }
6612         pmd->intrvl_tsc_prev = curr_tsc;
6613         /* Start new measuring interval */
6614         pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
6615     }
6616
6617     if (pmd->ctx.now > pmd->next_optimization) {
6618         /* Try to obtain the flow lock to block out revalidator threads.
6619          * If not possible, just try next time. */
6620         if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
6621             /* Optimize each classifier */
6622             CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
6623                 dpcls_sort_subtable_vector(cls);
6624             }
6625             ovs_mutex_unlock(&pmd->flow_mutex);
6626             /* Start new measuring interval */
6627             pmd->next_optimization = pmd->ctx.now
6628                                      + DPCLS_OPTIMIZATION_INTERVAL;
6629         }
6630     }
6631 }
6632
6633 /* Insert 'rule' into 'cls'. */
6634 static void
6635 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
6636              const struct netdev_flow_key *mask)
6637 {
6638     struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
6639
6640     /* Refer to subtable's mask, also for later removal. */
6641     rule->mask = &subtable->mask;
6642     cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
6643 }
6644
6645 /* Removes 'rule' from 'cls', also destructing the 'rule'. */
6646 static void
6647 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
6648 {
6649     struct dpcls_subtable *subtable;
6650
6651     ovs_assert(rule->mask);
6652
6653     /* Get subtable from reference in rule->mask. */
6654     INIT_CONTAINER(subtable, rule->mask, mask);
6655     if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
6656         == 0) {
6657         /* Delete empty subtable. */
6658         dpcls_destroy_subtable(cls, subtable);
6659         pvector_publish(&cls->subtables);
6660     }
6661 }
6662
6663 /* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
6664  * in 'mask' the values in 'key' and 'target' are the same. */
6665 static inline bool
6666 dpcls_rule_matches_key(const struct dpcls_rule *rule,
6667                        const struct netdev_flow_key *target)
6668 {
6669     const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
6670     const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
6671     uint64_t value;
6672
6673     NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
6674         if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
6675             return false;
6676         }
6677     }
6678     return true;
6679 }
6680
6681 /* For each miniflow in 'keys' performs a classifier lookup writing the result
6682  * into the corresponding slot in 'rules'.  If a particular entry in 'keys' is
6683  * NULL it is skipped.
6684  *
6685  * This function is optimized for use in the userspace datapath and therefore
6686  * does not implement a lot of features available in the standard
6687  * classifier_lookup() function.  Specifically, it does not implement
6688  * priorities, instead returning any rule which matches the flow.
6689  *
6690  * Returns true if all miniflows found a corresponding rule. */
6691 static bool
6692 dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key keys[],
6693              struct dpcls_rule **rules, const size_t cnt,
6694              int *num_lookups_p)
6695 {
6696     /* The received 'cnt' miniflows are the search-keys that will be processed
6697      * to find a matching entry into the available subtables.
6698      * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
6699     typedef uint32_t map_type;
6700 #define MAP_BITS (sizeof(map_type) * CHAR_BIT)
6701     BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
6702
6703     struct dpcls_subtable *subtable;
6704
6705     map_type keys_map = TYPE_MAXIMUM(map_type); /* Set all bits. */
6706     map_type found_map;
6707     uint32_t hashes[MAP_BITS];
6708     const struct cmap_node *nodes[MAP_BITS];
6709
6710     if (cnt != MAP_BITS) {
6711         keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
6712     }
6713     memset(rules, 0, cnt * sizeof *rules);
6714
6715     int lookups_match = 0, subtable_pos = 1;
6716
6717     /* The Datapath classifier - aka dpcls - is composed of subtables.
6718      * Subtables are dynamically created as needed when new rules are inserted.
6719      * Each subtable collects rules with matches on a specific subset of packet
6720      * fields as defined by the subtable's mask.  We proceed to process every
6721      * search-key against each subtable, but when a match is found for a
6722      * search-key, the search for that key can stop because the rules are
6723      * non-overlapping. */
6724     PVECTOR_FOR_EACH (subtable, &cls->subtables) {
6725         int i;
6726
6727         /* Compute hashes for the remaining keys.  Each search-key is
6728          * masked with the subtable's mask to avoid hashing the wildcarded
6729          * bits. */
6730         ULLONG_FOR_EACH_1(i, keys_map) {
6731             hashes[i] = netdev_flow_key_hash_in_mask(&keys[i],
6732                                                      &subtable->mask);
6733         }
6734         /* Lookup. */
6735         found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
6736         /* Check results.  When the i-th bit of found_map is set, it means
6737          * that a set of nodes with a matching hash value was found for the
6738          * i-th search-key.  Due to possible hash collisions we need to check
6739          * which of the found rules, if any, really matches our masked
6740          * search-key. */
6741         ULLONG_FOR_EACH_1(i, found_map) {
6742             struct dpcls_rule *rule;
6743
6744             CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
6745                 if (OVS_LIKELY(dpcls_rule_matches_key(rule, &keys[i]))) {
6746                     rules[i] = rule;
6747                     /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
6748                      * within one second optimization interval. */
6749                     subtable->hit_cnt++;
6750                     lookups_match += subtable_pos;
6751                     goto next;
6752                 }
6753             }
6754             /* None of the found rules was a match.  Reset the i-th bit to
6755              * keep searching this key in the next subtable. */
6756             ULLONG_SET0(found_map, i);  /* Did not match. */
6757         next:
6758             ;                     /* Keep Sparse happy. */
6759         }
6760         keys_map &= ~found_map;             /* Clear the found rules. */
6761         if (!keys_map) {
6762             if (num_lookups_p) {
6763                 *num_lookups_p = lookups_match;
6764             }
6765             return true;              /* All found. */
6766         }
6767         subtable_pos++;
6768     }
6769     if (num_lookups_p) {
6770         *num_lookups_p = lookups_match;
6771     }
6772     return false;                     /* Some misses. */
6773 }