/*
 * Copyright (c) 2017 Ericsson AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 #ifndef DPIF_NETDEV_PERF_H
18 #define DPIF_NETDEV_PERF_H 1
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <time.h>

#ifdef DPDK_NETDEV
#include <rte_config.h>
#include <rte_cycles.h>
#endif

#include "openvswitch/vlog.h"
#include "ovs-atomic.h"
/* This module encapsulates data structures and functions to maintain basic PMD
 * performance metrics such as packet counters, execution cycles as well as
 * histograms and time series recording for more detailed PMD metrics.
 *
 * It provides a clean API for dpif-netdev to initialize, update and read and
 * reset these metrics.
 *
 * The basic set of PMD counters is implemented as atomic_uint64_t variables
 * to guarantee correct read also in 32-bit systems.
 *
 * The detailed PMD performance metrics are only supported on 64-bit systems
 * with atomic 64-bit read and store semantics for plain uint64_t counters. */
/* Set of counter types maintained in pmd_perf_stats. */

enum pmd_stat_type {
    PMD_STAT_EXACT_HIT,     /* Packets that had an exact match (emc). */
    PMD_STAT_SMC_HIT,       /* Packets that had a sig match hit (SMC). */
    PMD_STAT_MASKED_HIT,    /* Packets that matched in the flow table. */
    PMD_STAT_MISS,          /* Packets that did not match and upcall was ok. */
    PMD_STAT_LOST,          /* Packets that did not match and upcall failed. */
                            /* The above statistics account for the total
                             * number of packet passes through the datapath
                             * pipeline and should not be overlapping with each
                             * other. */
    PMD_STAT_MASKED_LOOKUP, /* Number of subtable lookups for flow table
                               hits. Each MASKED_HIT hit will have >= 1
                               MASKED_LOOKUP(s). */
    PMD_STAT_RECV,          /* Packets entering the datapath pipeline from an
                             * interface. */
    PMD_STAT_RECIRC,        /* Packets reentering the datapath pipeline due to
                             * recirculation. */
    PMD_STAT_SENT_PKTS,     /* Packets that have been sent. */
    PMD_STAT_SENT_BATCHES,  /* Number of batches sent. */
    PMD_CYCLES_ITER_IDLE,   /* Cycles spent in idle iterations. */
    PMD_CYCLES_ITER_BUSY,   /* Cycles spent in busy iterations. */
    PMD_CYCLES_UPCALL,      /* Cycles spent processing upcalls. */
    PMD_N_STATS             /* Number of counters, must stay last. */
};
83 /* Array of PMD counters indexed by enum pmd_stat_type.
84 * The n[] array contains the actual counter values since initialization
85 * of the PMD. Counters are atomically updated from the PMD but are
86 * read and cleared also from other processes. To clear the counters at
87 * PMD run-time, the current counter values are copied over to the zero[]
88 * array. To read counters we subtract zero[] value from n[]. */
91 atomic_uint64_t n
[PMD_N_STATS
]; /* Value since _init(). */
92 uint64_t zero
[PMD_N_STATS
]; /* Value at last _clear(). */
/* Data structure to collect statistical distribution of an integer measurement
 * type in form of a histogram. The wall[] array contains the inclusive
 * upper boundaries of the bins, while the bin[] array contains the actual
 * counters per bin. The histogram walls are typically set automatically
 * using the functions provided below. */

#define NUM_BINS 32             /* Number of histogram bins. */

struct histogram {
    uint32_t wall[NUM_BINS];    /* Inclusive upper bound of each bin. */
    uint64_t bin[NUM_BINS];     /* Sample count per bin. */
};
/* Data structure to record details PMD execution metrics per iteration for
 * a history period of up to HISTORY_LEN iterations in circular buffer.
 * Also used to record up to HISTORY_LEN millisecond averages/totals of these
 * metrics. */

struct iter_stats {
    uint64_t timestamp;         /* Iteration no. or millisecond. */
    uint64_t cycles;            /* Number of TSC cycles spent in it. or ms. */
    uint64_t busy_cycles;       /* Cycles spent in busy iterations or ms. */
    uint32_t iterations;        /* Iterations in ms. */
    uint32_t pkts;              /* Packets processed in iteration or ms. */
    uint32_t upcalls;           /* Number of upcalls in iteration or ms. */
    uint32_t upcall_cycles;     /* Cycles spent in upcalls in it. or ms. */
    uint32_t batches;           /* Number of rx batches in iteration or ms. */
    uint32_t max_vhost_qfill;   /* Maximum fill level in iteration or ms. */
};

#define HISTORY_LEN 1000        /* Length of recorded history
                                   (iterations and ms). */
#define DEF_HIST_SHOW 20        /* Default number of history samples to
                                   display. */

/* Circular buffer of per-iteration (or per-millisecond) statistics. */
struct history {
    size_t idx;                 /* Slot to which next call to history_store()
                                   will write. */
    struct iter_stats sample[HISTORY_LEN];
};
136 /* Container for all performance metrics of a PMD within the struct
137 * dp_netdev_pmd_thread. The metrics must be updated from within the PMD
138 * thread but can be read from any thread. The basic PMD counters in
139 * struct pmd_counters can be read without protection against concurrent
140 * clearing. The other metrics may only be safely read with the clear_mutex
141 * held to protect against concurrent clearing. */
143 struct pmd_perf_stats
{
144 /* Prevents interference between PMD polling and stats clearing. */
145 struct ovs_mutex stats_mutex
;
146 /* Set by CLI thread to order clearing of PMD stats. */
148 /* Prevents stats retrieval while clearing is in progress. */
149 struct ovs_mutex clear_mutex
;
150 /* Start of the current performance measurement period. */
152 /* Counter for PMD iterations. */
153 uint64_t iteration_cnt
;
154 /* Start of the current iteration. */
156 /* Latest TSC time stamp taken in PMD. */
158 /* Used to space certain checks in time. */
159 uint64_t next_check_tsc
;
160 /* If non-NULL, outermost cycle timer currently running in PMD. */
161 struct cycle_timer
*cur_timer
;
162 /* Set of PMD counters with their zero offsets. */
163 struct pmd_counters counters
;
164 /* Statistics of the current iteration. */
165 struct iter_stats current
;
166 /* Totals for the current millisecond. */
167 struct iter_stats totals
;
168 /* Histograms for the PMD metrics. */
169 struct histogram cycles
;
170 struct histogram pkts
;
171 struct histogram cycles_per_pkt
;
172 struct histogram upcalls
;
173 struct histogram cycles_per_upcall
;
174 struct histogram pkts_per_batch
;
175 struct histogram max_vhost_qfill
;
176 /* Iteration history buffer. */
177 struct history iterations
;
178 /* Millisecond history buffer. */
179 struct history milliseconds
;
180 /* Suspicious iteration log. */
181 uint32_t log_susp_it
;
182 /* Start of iteration range to log. */
183 uint32_t log_begin_it
;
184 /* End of iteration range to log. */
186 /* Reason for logging suspicious iteration. */
191 static inline uint64_t
192 rdtsc_syscall(struct pmd_perf_stats
*s
)
197 if (clock_gettime(CLOCK_MONOTONIC_RAW
, &val
) != 0) {
201 v
= val
.tv_sec
* UINT64_C(1000000000) + val
.tv_nsec
;
202 return s
->last_tsc
= v
;
206 /* Support for accurate timing of PMD execution on TSC clock cycle level.
207 * These functions are intended to be invoked in the context of pmd threads. */
209 /* Read the TSC cycle register and cache it. Any function not requiring clock
210 * cycle accuracy should read the cached value using cycles_counter_get() to
211 * avoid the overhead of reading the TSC register. */
213 static inline uint64_t
214 cycles_counter_update(struct pmd_perf_stats
*s
)
217 return s
->last_tsc
= rte_get_tsc_cycles();
218 #elif !defined(_MSC_VER) && defined(__x86_64__)
220 asm volatile("rdtsc" : "=a" (l
), "=d" (h
));
222 return s
->last_tsc
= ((uint64_t) h
<< 32) | l
;
223 #elif !defined(_MSC_VER) && defined(__aarch64__)
224 asm volatile("mrs %0, cntvct_el0" : "=r" (s
->last_tsc
));
227 #elif defined(__linux__)
228 return rdtsc_syscall(s
);
230 return s
->last_tsc
= 0;
234 static inline uint64_t
235 cycles_counter_get(struct pmd_perf_stats
*s
)
240 void pmd_perf_estimate_tsc_frequency(void);
/* A nestable timer for measuring execution time in TSC cycles.
 *
 * Usage:
 *   struct cycle_timer timer;
 *
 *   cycle_timer_start(pmd, &timer);
 *   <timed execution>
 *   uint64_t cycles = cycle_timer_stop(pmd, &timer);
 *
 * The caller must guarantee that a call to cycle_timer_start() is always
 * paired with a call to cycle_timer_stop().
 *
 * It is possible to have nested cycle timers within the timed code. The
 * execution time measured by the nested timers is excluded from the time
 * measured by the embracing timer. */

struct cycle_timer {
    uint64_t start;                     /* TSC when the timer was started. */
    uint64_t suspended;                 /* TSC when suspended by a nested
                                           timer; 0 while running. */
    struct cycle_timer *interrupted;    /* Timer that this one suspended,
                                           or NULL. */
};
266 cycle_timer_start(struct pmd_perf_stats
*s
,
267 struct cycle_timer
*timer
)
269 struct cycle_timer
*cur_timer
= s
->cur_timer
;
270 uint64_t now
= cycles_counter_update(s
);
273 cur_timer
->suspended
= now
;
275 timer
->interrupted
= cur_timer
;
277 timer
->suspended
= 0;
278 s
->cur_timer
= timer
;
281 static inline uint64_t
282 cycle_timer_stop(struct pmd_perf_stats
*s
,
283 struct cycle_timer
*timer
)
285 /* Assert that this is the current cycle timer. */
286 ovs_assert(s
->cur_timer
== timer
);
287 uint64_t now
= cycles_counter_update(s
);
288 struct cycle_timer
*intr_timer
= timer
->interrupted
;
291 /* Adjust the start offset by the suspended cycles. */
292 intr_timer
->start
+= now
- intr_timer
->suspended
;
294 /* Restore suspended timer, if any. */
295 s
->cur_timer
= intr_timer
;
296 return now
- timer
->start
;
/* Functions to initialize and reset the PMD performance metrics. */

/* Initialize all metrics in *s; must be called before any other use. */
void pmd_perf_stats_init(struct pmd_perf_stats *s);
/* Reset all metrics in *s (takes the necessary locks itself). */
void pmd_perf_stats_clear(struct pmd_perf_stats *s);
/* Reset variant for callers that already hold the required locks. */
void pmd_perf_stats_clear_lock(struct pmd_perf_stats *s);
305 /* Functions to read and update PMD counters. */
307 void pmd_perf_read_counters(struct pmd_perf_stats
*s
,
308 uint64_t stats
[PMD_N_STATS
]);
310 /* PMD performance counters are updated lock-less. For real PMDs
311 * they are only updated from the PMD thread itself. In the case of the
312 * NON-PMD they might be updated from multiple threads, but we can live
313 * with losing a rare update as 100% accuracy is not required.
314 * However, as counters are read for display from outside the PMD thread
315 * with e.g. pmd-stats-show, we make sure that the 64-bit read and store
316 * operations are atomic also on 32-bit systems so that readers cannot
317 * not read garbage. On 64-bit systems this incurs no overhead. */
320 pmd_perf_update_counter(struct pmd_perf_stats
*s
,
321 enum pmd_stat_type counter
, int delta
)
324 atomic_read_relaxed(&s
->counters
.n
[counter
], &tmp
);
326 atomic_store_relaxed(&s
->counters
.n
[counter
], tmp
);
329 /* Functions to manipulate a sample history. */
332 histogram_add_sample(struct histogram
*hist
, uint32_t val
)
334 /* TODO: Can do better with binary search? */
335 for (int i
= 0; i
< NUM_BINS
-1; i
++) {
336 if (val
<= hist
->wall
[i
]) {
341 hist
->bin
[NUM_BINS
-1]++;
/* Return the total number of samples recorded across all histogram bins. */
uint64_t histogram_samples(const struct histogram *hist);
346 /* This function is used to advance the given history index by positive
347 * offset in the circular history buffer. */
348 static inline uint32_t
349 history_add(uint32_t idx
, uint32_t offset
)
351 return (idx
+ offset
) % HISTORY_LEN
;
354 /* This function computes the difference between two indices into the
355 * circular history buffer. The result is always positive in the range
356 * 0 .. HISTORY_LEN-1 and specifies the number of steps to reach idx1
357 * starting from idx2. It can also be used to retreat the history index
358 * idx1 by idx2 steps. */
359 static inline uint32_t
360 history_sub(uint32_t idx1
, uint32_t idx2
)
362 return (idx1
+ HISTORY_LEN
- idx2
) % HISTORY_LEN
;
365 static inline struct iter_stats
*
366 history_current(struct history
*h
)
368 return &h
->sample
[h
->idx
];
371 static inline struct iter_stats
*
372 history_next(struct history
*h
)
374 size_t next_idx
= history_add(h
->idx
, 1);
375 struct iter_stats
*next
= &h
->sample
[next_idx
];
377 memset(next
, 0, sizeof(*next
));
382 static inline struct iter_stats
*
383 history_store(struct history
*h
, struct iter_stats
*is
)
386 h
->sample
[h
->idx
] = *is
;
388 /* Advance the history pointer */
389 return history_next(h
);
/* Data and functions related to logging of suspicious iterations. */

extern bool log_enabled;                /* Suspicious-iteration logging on. */
extern bool log_extend;                 /* Also log neighboring iterations. */
extern uint32_t log_q_thr;              /* Queue fill threshold for logging. */
extern uint64_t iter_cycle_threshold;   /* Cycle threshold for logging. */

/* Mark the current iteration as suspicious, recording the reason. */
void pmd_perf_set_log_susp_iteration(struct pmd_perf_stats *s, char *reason);
/* Log the iterations surrounding a previously flagged suspicious one. */
void pmd_perf_log_susp_iteration_neighborhood(struct pmd_perf_stats *s);
/* Functions recording PMD metrics per iteration. */

/* Begin measurement of a new PMD iteration.
 * NOTE(review): return-type lines were lost in extraction; void inferred
 * since no caller-visible result is suggested by the declarations. */
void
pmd_perf_start_iteration(struct pmd_perf_stats *s);
void
pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets,
                       int tx_packets, bool full_metrics);
410 /* Formatting the output of commands. */
412 struct pmd_perf_params
{
415 size_t iter_hist_len
;
419 void pmd_perf_format_overall_stats(struct ds
*str
, struct pmd_perf_stats
*s
,
421 void pmd_perf_format_histograms(struct ds
*str
, struct pmd_perf_stats
*s
);
422 void pmd_perf_format_iteration_history(struct ds
*str
,
423 struct pmd_perf_stats
*s
,
425 void pmd_perf_format_ms_history(struct ds
*str
, struct pmd_perf_stats
*s
,
427 void pmd_perf_log_set_cmd(struct unixctl_conn
*conn
,
428 int argc
, const char *argv
[],
429 void *aux OVS_UNUSED
);
435 #endif /* DPIF_NETDEV_PERF_H */