ceph/src/seastar/dpdk/test/test/test_ring_perf.c

/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdio.h>
#include <inttypes.h>
#include <rte_ring.h>
#include <rte_cycles.h>
#include <rte_launch.h>

#include "test.h"

/*
 * Ring
 * ====
 *
 * Measures performance of various operations using rdtsc
 *  * Empty ring dequeue
 *  * Enqueue/dequeue of bursts in one thread
 *  * Enqueue/dequeue of bursts in two threads
 */

#define RING_NAME "RING_PERF"
#define RING_SIZE 4096
#define MAX_BURST 32

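/*
 * Note: the count passed to rte_ring_create() below must be a power of two,
 * and a ring of size N holds at most N-1 entries by default, so RING_SIZE of
 * 4096 leaves ample headroom for MAX_BURST-sized operations.
 */
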
/*
 * the sizes to enqueue and dequeue in testing
 * (marked volatile so they won't be seen as compile-time constants)
 */
static const volatile unsigned bulk_sizes[] = { 8, 32 };

/* The ring structure used for tests */
static struct rte_ring *r;

struct lcore_pair {
	unsigned c1, c2;
};

/* counter used by paired threads to rendezvous before their timed loops */
static volatile unsigned lcore_count = 0;

/**** Functions to analyse our core mask to get cores for different tests ***/

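/* Find two lcore ids that are hyperthread siblings on one physical core */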
static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
	unsigned id1, id2;
	unsigned c1, c2, s1, s2;
	RTE_LCORE_FOREACH(id1) {
		/* inner loop just re-reads all ids. We could skip the first few
		 * elements, but since the number of cores is small there is
		 * little point in doing so.
		 */
		RTE_LCORE_FOREACH(id2) {
			if (id1 == id2)
				continue;
			c1 = lcore_config[id1].core_id;
			c2 = lcore_config[id2].core_id;
			s1 = lcore_config[id1].socket_id;
			s2 = lcore_config[id2].socket_id;
			if ((c1 == c2) && (s1 == s2)) {
				lcp->c1 = id1;
				lcp->c2 = id2;
				return 0;
			}
		}
	}
	return 1;
}

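/* Find two lcore ids on different physical cores of the same socket */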
static int
get_two_cores(struct lcore_pair *lcp)
{
	unsigned id1, id2;
	unsigned c1, c2, s1, s2;
	RTE_LCORE_FOREACH(id1) {
		RTE_LCORE_FOREACH(id2) {
			if (id1 == id2)
				continue;
			c1 = lcore_config[id1].core_id;
			c2 = lcore_config[id2].core_id;
			s1 = lcore_config[id1].socket_id;
			s2 = lcore_config[id2].socket_id;
			if ((c1 != c2) && (s1 == s2)) {
				lcp->c1 = id1;
				lcp->c2 = id2;
				return 0;
			}
		}
	}
	return 1;
}

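/* Find two lcore ids on different sockets (NUMA nodes) */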
static int
get_two_sockets(struct lcore_pair *lcp)
{
	unsigned id1, id2;
	unsigned s1, s2;
	RTE_LCORE_FOREACH(id1) {
		RTE_LCORE_FOREACH(id2) {
			if (id1 == id2)
				continue;
			s1 = lcore_config[id1].socket_id;
			s2 = lcore_config[id2].socket_id;
			if (s1 != s2) {
				lcp->c1 = id1;
				lcp->c2 = id2;
				return 0;
			}
		}
	}
	return 1;
}

/* Get cycle counts for dequeuing from an empty ring. Should be 2 or 3 cycles */
static void
test_empty_dequeue(void)
{
	const unsigned iter_shift = 26;
	const unsigned iterations = 1<<iter_shift;
	unsigned i = 0;
	void *burst[MAX_BURST];

	const uint64_t sc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		rte_ring_sc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
	const uint64_t sc_end = rte_rdtsc();

	const uint64_t mc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		rte_ring_mc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
	const uint64_t mc_end = rte_rdtsc();

	printf("SC empty dequeue: %.2F\n",
			(double)(sc_end-sc_start) / iterations);
	printf("MC empty dequeue: %.2F\n",
			(double)(mc_end-mc_start) / iterations);
}

/*
 * The separate enqueue and dequeue threads each take in one parameter and
 * return two. Input = burst size, output = cycle averages for sp/sc & mp/mc.
 */
struct thread_params {
	unsigned size;     /* input value, the burst size */
	double spsc, mpmc; /* output values, the single or multi timings */
};

/*
 * Function that uses rdtsc to measure timing for ring enqueue. Needs a paired
 * thread running the dequeue_bulk function.
 */
static int
enqueue_bulk(void *p)
{
	const unsigned iter_shift = 23;
	const unsigned iterations = 1<<iter_shift;
	struct thread_params *params = p;
	const unsigned size = params->size;
	unsigned i;
	void *burst[MAX_BURST] = {0};

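	/*
	 * Rendezvous with the paired thread: the first arrival sees a count
	 * of 1 and spins until the second thread has incremented lcore_count
	 * to 2, so both timed loops start at approximately the same moment.
	 */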
	if (__sync_add_and_fetch(&lcore_count, 1) != 2)
		while (lcore_count != 2)
			rte_pause();

	const uint64_t sp_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		while (rte_ring_sp_enqueue_bulk(r, burst, size, NULL) == 0)
			rte_pause();
	const uint64_t sp_end = rte_rdtsc();

	const uint64_t mp_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		while (rte_ring_mp_enqueue_bulk(r, burst, size, NULL) == 0)
			rte_pause();
	const uint64_t mp_end = rte_rdtsc();

	params->spsc = ((double)(sp_end - sp_start))/(iterations*size);
	params->mpmc = ((double)(mp_end - mp_start))/(iterations*size);
	return 0;
}

/*
 * Function that uses rdtsc to measure timing for ring dequeue. Needs a paired
 * thread running the enqueue_bulk function.
 */
static int
dequeue_bulk(void *p)
{
	const unsigned iter_shift = 23;
	const unsigned iterations = 1<<iter_shift;
	struct thread_params *params = p;
	const unsigned size = params->size;
	unsigned i;
	void *burst[MAX_BURST] = {0};

	/* same rendezvous with the paired thread as in enqueue_bulk */
	if (__sync_add_and_fetch(&lcore_count, 1) != 2)
		while (lcore_count != 2)
			rte_pause();

	const uint64_t sc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		while (rte_ring_sc_dequeue_bulk(r, burst, size, NULL) == 0)
			rte_pause();
	const uint64_t sc_end = rte_rdtsc();

	const uint64_t mc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		while (rte_ring_mc_dequeue_bulk(r, burst, size, NULL) == 0)
			rte_pause();
	const uint64_t mc_end = rte_rdtsc();

	params->spsc = ((double)(sc_end - sc_start))/(iterations*size);
	params->mpmc = ((double)(mc_end - mc_start))/(iterations*size);
	return 0;
}

/*
 * Function that calls the enqueue and dequeue bulk functions on pairs of cores.
 * Used to measure ring perf between hyperthreads, cores and sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores,
		lcore_function_t f1, lcore_function_t f2)
{
	struct thread_params param1 = {0}, param2 = {0};
	unsigned i;
	for (i = 0; i < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); i++) {
		lcore_count = 0;
		param1.size = param2.size = bulk_sizes[i];
		if (cores->c1 == rte_get_master_lcore()) {
			/* cannot remote-launch on the master lcore, so run f1 locally */
			rte_eal_remote_launch(f2, &param2, cores->c2);
			f1(&param1);
			rte_eal_wait_lcore(cores->c2);
		} else {
			rte_eal_remote_launch(f1, &param1, cores->c1);
			rte_eal_remote_launch(f2, &param2, cores->c2);
			rte_eal_wait_lcore(cores->c1);
			rte_eal_wait_lcore(cores->c2);
		}
		/* report the sum of both threads' per-element cycle averages */
		printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
				param1.spsc + param2.spsc);
		printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
				param1.mpmc + param2.mpmc);
	}
}

/*
 * Test function that determines how long an enqueue + dequeue of a single item
 * takes on a single lcore. Result is for comparison with the bulk enq+deq.
 */
static void
test_single_enqueue_dequeue(void)
{
	const unsigned iter_shift = 24;
	const unsigned iterations = 1<<iter_shift;
	unsigned i = 0;
	void *burst = NULL;

	const uint64_t sc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++) {
		rte_ring_sp_enqueue(r, burst);
		rte_ring_sc_dequeue(r, &burst);
	}
	const uint64_t sc_end = rte_rdtsc();

	const uint64_t mc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++) {
		rte_ring_mp_enqueue(r, burst);
		rte_ring_mc_dequeue(r, &burst);
	}
	const uint64_t mc_end = rte_rdtsc();

	/* iterations is a power of two, so >> iter_shift divides by it */
	printf("SP/SC single enq/dequeue: %"PRIu64"\n",
			(sc_end-sc_start) >> iter_shift);
	printf("MP/MC single enq/dequeue: %"PRIu64"\n",
			(mc_end-mc_start) >> iter_shift);
}

/*
 * Test that does both enqueue and dequeue on a core using the burst() API calls
 * instead of the bulk() calls used in other tests. Results should be the same
 * as for the bulk function called on a single lcore.
 */
static void
test_burst_enqueue_dequeue(void)
{
	const unsigned iter_shift = 23;
	const unsigned iterations = 1<<iter_shift;
	unsigned sz, i = 0;
	void *burst[MAX_BURST] = {0};

	for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
		const uint64_t sc_start = rte_rdtsc();
		for (i = 0; i < iterations; i++) {
			rte_ring_sp_enqueue_burst(r, burst,
					bulk_sizes[sz], NULL);
			rte_ring_sc_dequeue_burst(r, burst,
					bulk_sizes[sz], NULL);
		}
		const uint64_t sc_end = rte_rdtsc();

		const uint64_t mc_start = rte_rdtsc();
		for (i = 0; i < iterations; i++) {
			rte_ring_mp_enqueue_burst(r, burst,
					bulk_sizes[sz], NULL);
			rte_ring_mc_dequeue_burst(r, burst,
					bulk_sizes[sz], NULL);
		}
		const uint64_t mc_end = rte_rdtsc();

		uint64_t mc_avg = ((mc_end-mc_start) >> iter_shift) / bulk_sizes[sz];
		uint64_t sc_avg = ((sc_end-sc_start) >> iter_shift) / bulk_sizes[sz];

		printf("SP/SC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
				sc_avg);
		printf("MP/MC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
				mc_avg);
	}
}

/* Times enqueue and dequeue on a single lcore */
static void
test_bulk_enqueue_dequeue(void)
{
	const unsigned iter_shift = 23;
	const unsigned iterations = 1<<iter_shift;
	unsigned sz, i = 0;
	void *burst[MAX_BURST] = {0};

	for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
		const uint64_t sc_start = rte_rdtsc();
		for (i = 0; i < iterations; i++) {
			rte_ring_sp_enqueue_bulk(r, burst,
					bulk_sizes[sz], NULL);
			rte_ring_sc_dequeue_bulk(r, burst,
					bulk_sizes[sz], NULL);
		}
		const uint64_t sc_end = rte_rdtsc();

		const uint64_t mc_start = rte_rdtsc();
		for (i = 0; i < iterations; i++) {
			rte_ring_mp_enqueue_bulk(r, burst,
					bulk_sizes[sz], NULL);
			rte_ring_mc_dequeue_bulk(r, burst,
					bulk_sizes[sz], NULL);
		}
		const uint64_t mc_end = rte_rdtsc();

		double sc_avg = ((double)(sc_end-sc_start) /
				(iterations * bulk_sizes[sz]));
		double mc_avg = ((double)(mc_end-mc_start) /
				(iterations * bulk_sizes[sz]));

		printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
				sc_avg);
		printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
				mc_avg);
	}
}

static int
test_ring_perf(void)
{
	struct lcore_pair cores;
	r = rte_ring_create(RING_NAME, RING_SIZE, rte_socket_id(), 0);
	if (r == NULL && (r = rte_ring_lookup(RING_NAME)) == NULL)
		return -1;

	printf("### Testing single element and burst enq/deq ###\n");
	test_single_enqueue_dequeue();
	test_burst_enqueue_dequeue();

	printf("\n### Testing empty dequeue ###\n");
	test_empty_dequeue();

	printf("\n### Testing using a single lcore ###\n");
	test_bulk_enqueue_dequeue();

	if (get_two_hyperthreads(&cores) == 0) {
		printf("\n### Testing using two hyperthreads ###\n");
		run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk);
	}
	if (get_two_cores(&cores) == 0) {
		printf("\n### Testing using two physical cores ###\n");
		run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk);
	}
	if (get_two_sockets(&cores) == 0) {
		printf("\n### Testing using two NUMA nodes ###\n");
		run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk);
	}
	return 0;
}

REGISTER_TEST_COMMAND(ring_perf_autotest, test_ring_perf);
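
/*
 * A typical way to run this test (an assumed example invocation; the exact
 * binary path and EAL flags depend on the build and machine):
 *
 *   ./test -c 0xf -n 4
 *   RTE>>ring_perf_autotest
 */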