]> git.proxmox.com Git - mirror_ovs.git/blobdiff - lib/dpif-netdev-perf.c
ovsdb-idl: Fix memleak when reinserting tracked orphan rows.
[mirror_ovs.git] / lib / dpif-netdev-perf.c
index 55b4067fa510521584c5c1cd47152d375153d742..9560e7c3c3d36534f799b942b67acbc4080776ff 100644 (file)
 #include <config.h>
 #include <stdint.h>
 
+#include "dpdk.h"
 #include "dpif-netdev-perf.h"
 #include "openvswitch/dynamic-string.h"
 #include "openvswitch/vlog.h"
+#include "ovs-numa.h"
 #include "ovs-thread.h"
 #include "timeval.h"
 
 VLOG_DEFINE_THIS_MODULE(pmd_perf);
 
-#ifdef DPDK_NETDEV
-static uint64_t
-get_tsc_hz(void)
-{
-    return rte_get_tsc_hz();
-}
-#else
-/* This function is only invoked from PMD threads which depend on DPDK.
- * A dummy function is sufficient when building without DPDK_NETDEV. */
-static uint64_t
-get_tsc_hz(void)
+#define ITER_US_THRESHOLD 250   /* Warning threshold for iteration duration
+                                   in microseconds. */
+#define VHOST_QUEUE_FULL 128    /* Size of the virtio TX queue. */
+#define LOG_IT_BEFORE 5         /* Number of iterations to log before
+                                   suspicious iteration. */
+#define LOG_IT_AFTER 5          /* Number of iterations to log after
+                                   suspicious iteration. */
+
+bool log_enabled = false;
+bool log_extend = false;
+static uint32_t log_it_before = LOG_IT_BEFORE;
+static uint32_t log_it_after = LOG_IT_AFTER;
+static uint32_t log_us_thr = ITER_US_THRESHOLD;
+uint32_t log_q_thr = VHOST_QUEUE_FULL;
+uint64_t iter_cycle_threshold;
+
+static struct vlog_rate_limit latency_rl = VLOG_RATE_LIMIT_INIT(600, 600);
+
+static uint64_t tsc_hz = 1;
+
+void
+pmd_perf_estimate_tsc_frequency(void)
 {
-    return 1;
-}
+#ifdef DPDK_NETDEV
+    if (dpdk_available()) {
+        tsc_hz = rte_get_tsc_hz();
+    }
+    if (tsc_hz > 1) {
+        VLOG_INFO("DPDK provided TSC frequency: %"PRIu64" KHz", tsc_hz / 1000);
+        return;
+    }
 #endif
+    struct ovs_numa_dump *affinity;
+    struct pmd_perf_stats s;
+    uint64_t start, stop;
+
+    /* DPDK is not available or returned unreliable value.
+     * Trying to estimate. */
+    affinity = ovs_numa_thread_getaffinity_dump();
+    if (affinity) {
+        const struct ovs_numa_info_core *core;
+
+        FOR_EACH_CORE_ON_DUMP (core, affinity) {
+            /* Setting affinity to a single core from the affinity mask to
+             * avoid re-scheduling to another core while sleeping. */
+            ovs_numa_thread_setaffinity_core(core->core_id);
+            break;
+        }
+    }
+
+    memset(&s, 0, sizeof s);
+    start = cycles_counter_update(&s);
+    /* Using xnanosleep as it's interrupt resistant.
+     * Sleeping only 100 ms to avoid holding the main thread for too long. */
+    xnanosleep(1E8);
+    stop = cycles_counter_update(&s);
+
+    if (affinity) {
+        /* Restoring previous affinity. */
+        ovs_numa_thread_setaffinity_dump(affinity);
+        ovs_numa_dump_destroy(affinity);
+    }
+
+    if (stop <= start) {
+        VLOG_WARN("TSC source is unreliable.");
+        tsc_hz = 1;
+    } else {
+        tsc_hz = (stop - start) * 10;
+    }
+
+    VLOG_INFO("Estimated TSC frequency: %"PRIu64" KHz", tsc_hz / 1000);
+}
 
 /* Histogram functions. */
 
@@ -141,6 +200,10 @@ pmd_perf_stats_init(struct pmd_perf_stats *s)
     histogram_walls_set_log(&s->max_vhost_qfill, 0, 512);
     s->iteration_cnt = 0;
     s->start_ms = time_msec();
+    s->log_susp_it = UINT32_MAX;
+    s->log_begin_it = UINT32_MAX;
+    s->log_end_it = UINT32_MAX;
+    s->log_reason = NULL;
 }
 
 void
@@ -148,7 +211,7 @@ pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s,
                               double duration)
 {
     uint64_t stats[PMD_N_STATS];
-    double us_per_cycle = 1000000.0 / get_tsc_hz();
+    double us_per_cycle = 1000000.0 / tsc_hz;
 
     if (duration == 0) {
         return;
@@ -169,30 +232,33 @@ pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s,
     uint64_t busy_iter = tot_iter >= idle_iter ? tot_iter - idle_iter : 0;
 
     ds_put_format(str,
-            "  Cycles:          %12"PRIu64"  (%.2f GHz)\n"
-            "  Iterations:      %12"PRIu64"  (%.2f us/it)\n"
-            "  - idle:          %12"PRIu64"  (%4.1f %% cycles)\n"
-            "  - busy:          %12"PRIu64"  (%4.1f %% cycles)\n",
-            tot_cycles, (tot_cycles / duration) / 1E9,
+            "  Iterations:        %12"PRIu64"  (%.2f us/it)\n"
+            "  - Used TSC cycles: %12"PRIu64"  (%5.1f %% of total cycles)\n"
+            "  - idle iterations: %12"PRIu64"  (%5.1f %% of used cycles)\n"
+            "  - busy iterations: %12"PRIu64"  (%5.1f %% of used cycles)\n",
             tot_iter, tot_cycles * us_per_cycle / tot_iter,
+            tot_cycles, 100.0 * (tot_cycles / duration) / tsc_hz,
             idle_iter,
             100.0 * stats[PMD_CYCLES_ITER_IDLE] / tot_cycles,
             busy_iter,
             100.0 * stats[PMD_CYCLES_ITER_BUSY] / tot_cycles);
     if (rx_packets > 0) {
         ds_put_format(str,
-            "  Rx packets:      %12"PRIu64"  (%.0f Kpps, %.0f cycles/pkt)\n"
-            "  Datapath passes: %12"PRIu64"  (%.2f passes/pkt)\n"
-            "  - EMC hits:      %12"PRIu64"  (%4.1f %%)\n"
-            "  - Megaflow hits: %12"PRIu64"  (%4.1f %%, %.2f subtbl lookups/"
-                                                                     "hit)\n"
-            "  - Upcalls:       %12"PRIu64"  (%4.1f %%, %.1f us/upcall)\n"
-            "  - Lost upcalls:  %12"PRIu64"  (%4.1f %%)\n",
+            "  Rx packets:        %12"PRIu64"  (%.0f Kpps, %.0f cycles/pkt)\n"
+            "  Datapath passes:   %12"PRIu64"  (%.2f passes/pkt)\n"
+            "  - EMC hits:        %12"PRIu64"  (%5.1f %%)\n"
+            "  - SMC hits:        %12"PRIu64"  (%5.1f %%)\n"
+            "  - Megaflow hits:   %12"PRIu64"  (%5.1f %%, %.2f "
+                                                "subtbl lookups/hit)\n"
+            "  - Upcalls:         %12"PRIu64"  (%5.1f %%, %.1f us/upcall)\n"
+            "  - Lost upcalls:    %12"PRIu64"  (%5.1f %%)\n",
             rx_packets, (rx_packets / duration) / 1000,
             1.0 * stats[PMD_CYCLES_ITER_BUSY] / rx_packets,
             passes, rx_packets ? 1.0 * passes / rx_packets : 0,
             stats[PMD_STAT_EXACT_HIT],
             100.0 * stats[PMD_STAT_EXACT_HIT] / passes,
+            stats[PMD_STAT_SMC_HIT],
+            100.0 * stats[PMD_STAT_SMC_HIT] / passes,
             stats[PMD_STAT_MASKED_HIT],
             100.0 * stats[PMD_STAT_MASKED_HIT] / passes,
             stats[PMD_STAT_MASKED_HIT]
@@ -203,17 +269,16 @@ pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s,
             stats[PMD_STAT_LOST],
             100.0 * stats[PMD_STAT_LOST] / passes);
     } else {
-        ds_put_format(str, "  Rx packets:      %12d\n", 0);
+        ds_put_format(str, "  Rx packets:        %12d\n", 0);
     }
     if (tx_packets > 0) {
         ds_put_format(str,
-            "  Tx packets:      %12"PRIu64"  (%.0f Kpps)\n"
-            "  Tx batches:      %12"PRIu64"  (%.2f pkts/batch)"
-            "\n",
+            "  Tx packets:        %12"PRIu64"  (%.0f Kpps)\n"
+            "  Tx batches:        %12"PRIu64"  (%.2f pkts/batch)\n",
             tx_packets, (tx_packets / duration) / 1000,
             tx_batches, 1.0 * tx_packets / tx_batches);
     } else {
-        ds_put_format(str, "  Tx packets:      %12d\n\n", 0);
+        ds_put_format(str, "  Tx packets:        %12d\n\n", 0);
     }
 }
 
@@ -391,6 +456,10 @@ pmd_perf_stats_clear_lock(struct pmd_perf_stats *s)
     history_init(&s->milliseconds);
     s->start_ms = time_msec();
     s->milliseconds.sample[0].timestamp = s->start_ms;
+    s->log_susp_it = UINT32_MAX;
+    s->log_begin_it = UINT32_MAX;
+    s->log_end_it = UINT32_MAX;
+    s->log_reason = NULL;
     /* Clearing finished. */
     s->clear = false;
     ovs_mutex_unlock(&s->clear_mutex);
@@ -414,7 +483,7 @@ pmd_perf_stats_clear(struct pmd_perf_stats *s)
 
 /* Functions recording PMD metrics per iteration. */
 
-inline void
+void
 pmd_perf_start_iteration(struct pmd_perf_stats *s)
 OVS_REQUIRES(s->stats_mutex)
 {
@@ -436,13 +505,14 @@ OVS_REQUIRES(s->stats_mutex)
     }
 }
 
-inline void
+void
 pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets,
                        int tx_packets, bool full_metrics)
 {
     uint64_t now_tsc = cycles_counter_update(s);
     struct iter_stats *cum_ms;
     uint64_t cycles, cycles_per_pkt = 0;
+    char *reason = NULL;
 
     cycles = now_tsc - s->start_tsc;
     s->current.timestamp = s->iteration_cnt;
@@ -468,15 +538,7 @@ pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets,
         cycles_per_pkt = cycles / rx_packets;
         histogram_add_sample(&s->cycles_per_pkt, cycles_per_pkt);
     }
-    if (s->current.batches > 0) {
-        histogram_add_sample(&s->pkts_per_batch,
-                             rx_packets / s->current.batches);
-    }
     histogram_add_sample(&s->upcalls, s->current.upcalls);
-    if (s->current.upcalls > 0) {
-        histogram_add_sample(&s->cycles_per_upcall,
-                             s->current.upcall_cycles / s->current.upcalls);
-    }
     histogram_add_sample(&s->max_vhost_qfill, s->current.max_vhost_qfill);
 
     /* Add iteration samples to millisecond stats. */
@@ -492,9 +554,30 @@ pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets,
     cum_ms->batches += s->current.batches;
     cum_ms->max_vhost_qfill += s->current.max_vhost_qfill;
 
+    if (log_enabled) {
+        /* Log suspicious iterations. */
+        if (cycles > iter_cycle_threshold) {
+            reason = "Excessive total cycles";
+        } else if (s->current.max_vhost_qfill >= log_q_thr) {
+            reason = "Vhost RX queue full";
+        }
+        if (OVS_UNLIKELY(reason)) {
+            pmd_perf_set_log_susp_iteration(s, reason);
+            cycles_counter_update(s);
+        }
+
+        /* Log iteration interval around suspicious iteration when reaching
+         * the end of the range to be logged. */
+        if (OVS_UNLIKELY(s->log_end_it == s->iterations.idx)) {
+            pmd_perf_log_susp_iteration_neighborhood(s);
+            cycles_counter_update(s);
+        }
+    }
+
     /* Store in iteration history. This advances the iteration idx and
      * clears the next slot in the iteration history. */
     history_store(&s->iterations, &s->current);
+
     if (now_tsc > s->next_check_tsc) {
         /* Check if ms is completed and store in milliseconds history. */
         uint64_t now = time_msec();
@@ -511,7 +594,182 @@ pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets,
             cum_ms = history_next(&s->milliseconds);
             cum_ms->timestamp = now;
         }
-        /* Do the next check after 10K cycles (4 us at 2.5 GHz TSC clock). */
-        s->next_check_tsc = cycles_counter_update(s) + 10000;
+        /* Do the next check after 4 us (10K cycles at 2.5 GHz TSC clock). */
+        s->next_check_tsc = cycles_counter_update(s) + tsc_hz / 250000;
+    }
+}
+
+/* Delay logging of the suspicious iteration and the range of iterations
+ * around it until after the last iteration in the range to be logged.
+ * This avoids any distortion of the measurements through the cost of
+ * logging itself. */
+
+void
+pmd_perf_set_log_susp_iteration(struct pmd_perf_stats *s,
+                                char *reason)
+{
+    if (s->log_susp_it == UINT32_MAX) {
+        /* No logging scheduled yet. */
+        s->log_susp_it = s->iterations.idx;
+        s->log_reason = reason;
+        s->log_begin_it = history_sub(s->iterations.idx, log_it_before);
+        s->log_end_it = history_add(s->iterations.idx, log_it_after + 1);
+    } else if (log_extend) {
+        /* Logging was initiated earlier, we log the previous suspicious
+         * iteration now and extend the logging interval, if possible. */
+        struct iter_stats *susp = &s->iterations.sample[s->log_susp_it];
+        uint32_t new_end_it, old_range, new_range;
+
+        VLOG_WARN_RL(&latency_rl,
+                "Suspicious iteration (%s): iter=%"PRIu64
+                " duration=%"PRIu64" us\n",
+                s->log_reason,
+                susp->timestamp,
+                (1000000L * susp->cycles) / tsc_hz);
+
+        new_end_it = history_add(s->iterations.idx, log_it_after + 1);
+        new_range = history_sub(new_end_it, s->log_begin_it);
+        old_range = history_sub(s->log_end_it, s->log_begin_it);
+        if (new_range < old_range) {
+            /* Extended range exceeds history length. */
+            new_end_it = s->log_begin_it;
+        }
+        s->log_susp_it = s->iterations.idx;
+        s->log_reason = reason;
+        s->log_end_it = new_end_it;
+    }
+}
+
+void
+pmd_perf_log_susp_iteration_neighborhood(struct pmd_perf_stats *s)
+{
+    ovs_assert(s->log_reason != NULL);
+    ovs_assert(s->log_susp_it != UINT32_MAX);
+
+    struct ds log = DS_EMPTY_INITIALIZER;
+    struct iter_stats *susp = &s->iterations.sample[s->log_susp_it];
+    uint32_t range = history_sub(s->log_end_it, s->log_begin_it);
+
+    VLOG_WARN_RL(&latency_rl,
+                 "Suspicious iteration (%s): iter=%"PRIu64
+                 " duration=%"PRIu64" us\n",
+                 s->log_reason,
+                 susp->timestamp,
+                 (1000000L * susp->cycles) / tsc_hz);
+
+    pmd_perf_format_iteration_history(&log, s, range);
+    VLOG_WARN_RL(&latency_rl,
+                 "Neighborhood of suspicious iteration:\n"
+                 "%s", ds_cstr(&log));
+    ds_destroy(&log);
+    s->log_susp_it = s->log_end_it = s->log_begin_it = UINT32_MAX;
+    s->log_reason = NULL;
+
+    if (range > 100) {
+        /* Reset to safe default values to avoid disturbance. */
+        log_it_before = LOG_IT_BEFORE;
+        log_it_after = LOG_IT_AFTER;
+        log_extend = false;
+    }
+}
+
+void
+pmd_perf_log_set_cmd(struct unixctl_conn *conn,
+                 int argc, const char *argv[],
+                 void *aux OVS_UNUSED)
+{
+    unsigned int it_before, it_after, us_thr, q_thr;
+    bool on, extend;
+    bool usage = false;
+
+    on = log_enabled;
+    extend = log_extend;
+    it_before = log_it_before;
+    it_after = log_it_after;
+    q_thr = log_q_thr;
+    us_thr = log_us_thr;
+
+    while (argc > 1) {
+        if (!strcmp(argv[1], "on")) {
+            on = true;
+            argc--;
+            argv++;
+        } else if (!strcmp(argv[1], "off")) {
+            on = false;
+            argc--;
+            argv++;
+        } else if (!strcmp(argv[1], "-e")) {
+            extend = true;
+            argc--;
+            argv++;
+        } else if (!strcmp(argv[1], "-ne")) {
+            extend = false;
+            argc--;
+            argv++;
+        } else if (!strcmp(argv[1], "-a") && argc > 2) {
+            if (str_to_uint(argv[2], 10, &it_after)) {
+                if (it_after > HISTORY_LEN - 2) {
+                    it_after = HISTORY_LEN - 2;
+                }
+            } else {
+                usage = true;
+                break;
+            }
+            argc -= 2;
+            argv += 2;
+        } else if (!strcmp(argv[1], "-b") && argc > 2) {
+            if (str_to_uint(argv[2], 10, &it_before)) {
+                if (it_before > HISTORY_LEN - 2) {
+                    it_before = HISTORY_LEN - 2;
+                }
+            } else {
+                usage = true;
+                break;
+            }
+            argc -= 2;
+            argv += 2;
+        } else if (!strcmp(argv[1], "-q") && argc > 2) {
+            if (!str_to_uint(argv[2], 10, &q_thr)) {
+                usage = true;
+                break;
+            }
+            argc -= 2;
+            argv += 2;
+        } else if (!strcmp(argv[1], "-us") && argc > 2) {
+            if (!str_to_uint(argv[2], 10, &us_thr)) {
+                usage = true;
+                break;
+            }
+            argc -= 2;
+            argv += 2;
+        } else {
+            usage = true;
+            break;
+        }
     }
+    if (it_before + it_after > HISTORY_LEN - 2) {
+        it_after = HISTORY_LEN - 2 - it_before;
+    }
+
+    if (usage) {
+        unixctl_command_reply_error(conn,
+                "Usage: ovs-appctl dpif-netdev/pmd-perf-log-set "
+                "[on|off] [-b before] [-a after] [-e|-ne] "
+                "[-us usec] [-q qlen]");
+        return;
+    }
+
+    VLOG_INFO("pmd-perf-log-set: %s, before=%d, after=%d, extend=%s, "
+              "us_thr=%d, q_thr=%d\n",
+              on ? "on" : "off", it_before, it_after,
+              extend ? "true" : "false", us_thr, q_thr);
+    log_enabled = on;
+    log_extend = extend;
+    log_it_before = it_before;
+    log_it_after = it_after;
+    log_q_thr = q_thr;
+    log_us_thr = us_thr;
+    iter_cycle_threshold = (log_us_thr * tsc_hz) / 1000000L;
+
+    unixctl_command_reply(conn, "");
 }