]>
git.proxmox.com Git - ceph.git/blob - ceph/src/spdk/dpdk/test/test/test_ring_perf.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2014 Intel Corporation
9 #include <rte_cycles.h>
10 #include <rte_launch.h>
11 #include <rte_pause.h>
19 * Measures performance of various operations using rdtsc
20 * * Empty ring dequeue
21 * * Enqueue/dequeue of bursts in 1 thread
22 * * Enqueue/dequeue of bursts in 2 threads
// Name and size passed to rte_ring_create() for the ring shared by all tests in this file.
25 #define RING_NAME "RING_PERF"
26 #define RING_SIZE 4096
30 * the sizes to enqueue and dequeue in testing
31 * (marked volatile so they won't be seen as compile-time constants)
// Every per-size loop in this file iterates over this table using
// sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); test_empty_dequeue uses only entry 0.
33 static const volatile unsigned bulk_sizes
[] = { 8, 32 };
// Start barrier for the paired workers: enqueue_bulk and dequeue_bulk each
// increment this with __sync_add_and_fetch on entry and spin until it reads 2.
39 static volatile unsigned lcore_count
= 0;
41 /**** Functions to analyse our core mask to get cores for different tests ***/
/*
 * Scan every (id1, id2) combination of lcores with two nested
 * RTE_LCORE_FOREACH loops, looking for a pair whose lcore_config entries
 * report the same core_id AND the same socket_id. The caller labels such a
 * pair "two hyperthreads" and treats a return value of 0 as "pair found".
 *
 * NOTE(review): this excerpt has source lines dropped (see the embedded
 * original line numbers). The return type, the id1/id2 declarations, any
 * id1 == id2 filtering, the code that stores the pair through lcp and the
 * return statements are not visible here - confirm against the full file.
 */
44 get_two_hyperthreads(struct lcore_pair
*lcp
)
47 unsigned c1
, c2
, s1
, s2
;
48 RTE_LCORE_FOREACH(id1
) {
49 /* inner loop just re-reads all id's. We could skip the first few
50 * elements, but since number of cores is small there is little point
52 RTE_LCORE_FOREACH(id2
) {
55 c1
= lcore_config
[id1
].core_id
;
56 c2
= lcore_config
[id2
].core_id
;
57 s1
= lcore_config
[id1
].socket_id
;
58 s2
= lcore_config
[id2
].socket_id
;
59 if ((c1
== c2
) && (s1
== s2
)){
// Scan every (id1, id2) combination of lcores looking for a pair whose
// lcore_config entries report DIFFERENT core_id values but the SAME socket_id.
// The caller labels such a pair "two physical cores" and treats a return
// value of 0 as "pair found".
//
// NOTE(review): lines are dropped from this excerpt; the return type, the
// id1/id2 declarations, the code that stores the pair through lcp and the
// return statements are not visible here - confirm against the full file.
70 get_two_cores(struct lcore_pair
*lcp
)
73 unsigned c1
, c2
, s1
, s2
;
74 RTE_LCORE_FOREACH(id1
) {
75 RTE_LCORE_FOREACH(id2
) {
78 c1
= lcore_config
[id1
].core_id
;
79 c2
= lcore_config
[id2
].core_id
;
80 s1
= lcore_config
[id1
].socket_id
;
81 s2
= lcore_config
[id2
].socket_id
;
// Match condition: distinct core ids that share one socket.
82 if ((c1
!= c2
) && (s1
== s2
)){
// Scan every (id1, id2) combination of lcores and read the socket_id of each
// from lcore_config. The caller labels the resulting pair "two NUMA nodes"
// and treats a return value of 0 as "pair found".
//
// NOTE(review): the comparison of s1 and s2 is on a line missing from this
// excerpt - presumably it selects lcores on different sockets; TODO confirm.
// The declarations, the stores through lcp and the return statements are
// likewise not visible here.
93 get_two_sockets(struct lcore_pair
*lcp
)
97 RTE_LCORE_FOREACH(id1
) {
98 RTE_LCORE_FOREACH(id2
) {
101 s1
= lcore_config
[id1
].socket_id
;
102 s2
= lcore_config
[id2
].socket_id
;
113 /* Get cycle counts for dequeuing from an empty ring. Should be 2 or 3 cycles */
/*
 * Time 1 << 26 single-consumer bulk dequeues followed by 1 << 26
 * multi-consumer bulk dequeues of bulk_sizes[0] objects from ring r, reading
 * the timestamp counter with rte_rdtsc() around each loop, and print the
 * average cycle count per call for each variant.
 *
 * The dequeue return values are ignored and this function does not itself
 * verify that the ring is empty; the caller runs it under the heading
 * "Testing empty dequeue".
 *
 * NOTE(review): lines are dropped from this excerpt (return type, braces,
 * the declaration of i) - confirm against the full file.
 */
115 test_empty_dequeue(struct rte_ring
*r
)
117 const unsigned iter_shift
= 26;
118 const unsigned iterations
= 1<<iter_shift
;
120 void *burst
[MAX_BURST
];
/* Single-consumer pass. */
122 const uint64_t sc_start
= rte_rdtsc();
123 for (i
= 0; i
< iterations
; i
++)
124 rte_ring_sc_dequeue_bulk(r
, burst
, bulk_sizes
[0], NULL
);
125 const uint64_t sc_end
= rte_rdtsc();
/* Multi-consumer pass. */
127 const uint64_t mc_start
= rte_rdtsc();
128 for (i
= 0; i
< iterations
; i
++)
129 rte_ring_mc_dequeue_bulk(r
, burst
, bulk_sizes
[0], NULL
);
130 const uint64_t mc_end
= rte_rdtsc();
/* Averages are per dequeue call (elapsed cycles / iterations), as doubles. */
132 printf("SC empty dequeue: %.2F\n",
133 (double)(sc_end
-sc_start
) / iterations
);
134 printf("MC empty dequeue: %.2F\n",
135 (double)(mc_end
-mc_start
) / iterations
);
139 * for the separate enqueue and dequeue threads they take in one param
140 * and return two. Input = burst size, output = cycle average for sp/sc & mp/mc
/*
 * Argument/result block handed to enqueue_bulk() and dequeue_bulk() through
 * their void * parameter. The workers read size (and a ring pointer r) and
 * write their timing results into spsc and mpmc.
 *
 * NOTE(review): the declaration of the r member that the workers access as
 * params->r is on a line missing from this excerpt.
 */
142 struct thread_params
{
144 unsigned size
; /* input value, the burst size */
145 double spsc
, mpmc
; /* output value, the single or multi timings */
149 * Function that uses rdtsc to measure timing for ring enqueue. Needs pair
150 * thread running dequeue_bulk function
/*
 * Producer-side worker. p points to a struct thread_params; the ring and the
 * bulk size are taken from it.
 *
 * After a two-party start barrier on lcore_count, the worker performs
 * 1 << 23 single-producer bulk enqueues and then 1 << 23 multi-producer bulk
 * enqueues of size objects each, retrying an enqueue for as long as the call
 * returns 0. The elapsed rte_rdtsc() cycles divided by
 * (iterations * size), i.e. cycles per object, are stored in params->spsc and
 * params->mpmc respectively.
 *
 * NOTE(review): lines are dropped from this excerpt - the return type, the
 * declaration of i, the bodies of the spin/retry while loops and the return
 * statement are not visible here.
 */
153 enqueue_bulk(void *p
)
155 const unsigned iter_shift
= 23;
156 const unsigned iterations
= 1<<iter_shift
;
157 struct thread_params
*params
= p
;
158 struct rte_ring
*r
= params
->r
;
159 const unsigned size
= params
->size
;
161 void *burst
[MAX_BURST
] = {0};
/* Start barrier: the second arrival sees 2 and proceeds; the first one spins until then. */
163 if ( __sync_add_and_fetch(&lcore_count
, 1) != 2 )
164 while(lcore_count
!= 2)
/* Single-producer pass. */
167 const uint64_t sp_start
= rte_rdtsc();
168 for (i
= 0; i
< iterations
; i
++)
169 while (rte_ring_sp_enqueue_bulk(r
, burst
, size
, NULL
) == 0)
171 const uint64_t sp_end
= rte_rdtsc();
/* Multi-producer pass. */
173 const uint64_t mp_start
= rte_rdtsc();
174 for (i
= 0; i
< iterations
; i
++)
175 while (rte_ring_mp_enqueue_bulk(r
, burst
, size
, NULL
) == 0)
177 const uint64_t mp_end
= rte_rdtsc();
/* Report cycles per enqueued object back through the parameter block. */
179 params
->spsc
= ((double)(sp_end
- sp_start
))/(iterations
*size
);
180 params
->mpmc
= ((double)(mp_end
- mp_start
))/(iterations
*size
);
185 * Function that uses rdtsc to measure timing for ring dequeue. Needs pair
186 * thread running enqueue_bulk function
/*
 * Consumer-side worker, the counterpart of enqueue_bulk(). p points to a
 * struct thread_params; the ring and the bulk size are taken from it.
 *
 * After a two-party start barrier on lcore_count, the worker performs
 * 1 << 23 single-consumer bulk dequeues and then 1 << 23 multi-consumer bulk
 * dequeues of size objects each, retrying a dequeue for as long as the call
 * returns 0. The elapsed rte_rdtsc() cycles divided by
 * (iterations * size), i.e. cycles per object, are stored in params->spsc and
 * params->mpmc respectively.
 *
 * NOTE(review): lines are dropped from this excerpt - the return type, the
 * declaration of i, the bodies of the spin/retry while loops and the return
 * statement are not visible here.
 */
189 dequeue_bulk(void *p
)
191 const unsigned iter_shift
= 23;
192 const unsigned iterations
= 1<<iter_shift
;
193 struct thread_params
*params
= p
;
194 struct rte_ring
*r
= params
->r
;
195 const unsigned size
= params
->size
;
197 void *burst
[MAX_BURST
] = {0};
/* Start barrier: the second arrival sees 2 and proceeds; the first one spins until then. */
199 if ( __sync_add_and_fetch(&lcore_count
, 1) != 2 )
200 while(lcore_count
!= 2)
/* Single-consumer pass. */
203 const uint64_t sc_start
= rte_rdtsc();
204 for (i
= 0; i
< iterations
; i
++)
205 while (rte_ring_sc_dequeue_bulk(r
, burst
, size
, NULL
) == 0)
207 const uint64_t sc_end
= rte_rdtsc();
/* Multi-consumer pass. */
209 const uint64_t mc_start
= rte_rdtsc();
210 for (i
= 0; i
< iterations
; i
++)
211 while (rte_ring_mc_dequeue_bulk(r
, burst
, size
, NULL
) == 0)
213 const uint64_t mc_end
= rte_rdtsc();
/* Report cycles per dequeued object back through the parameter block. */
215 params
->spsc
= ((double)(sc_end
- sc_start
))/(iterations
*size
);
216 params
->mpmc
= ((double)(mc_end
- mc_start
))/(iterations
*size
);
221 * Function that calls the enqueue and dequeue bulk functions on pairs of cores.
222 * used to measure ring perf between hyperthreads, cores and sockets.
/*
 * Run worker f1 on cores->c1 and worker f2 on cores->c2 once per entry of
 * bulk_sizes and print the combined timings.
 *
 * For each bulk size both parameter blocks get the same size and the same
 * ring r. If cores->c1 is the master lcore, f2 is launched remotely on
 * cores->c2 and only cores->c2 is waited for; otherwise both workers are
 * launched remotely and both lcores are waited for. The printed figures are
 * the sums of the two workers' spsc and mpmc results.
 *
 * The address-of expressions passed to rte_eal_remote_launch() had been
 * mangled by an HTML entity decode ("&para" turned into a pilcrow sign);
 * they are restored to &param1 / &param2 here.
 *
 * NOTE(review): lines are dropped from this excerpt - the return type, the
 * declaration of i, any reset of lcore_count, the direct invocation of f1 in
 * the master-lcore branch (presumably present, since the master cannot be a
 * remote-launch target - TODO confirm), the else separating the two branches
 * and the closing braces are not visible here.
 */
225 run_on_core_pair(struct lcore_pair
*cores
, struct rte_ring
*r
,
226 lcore_function_t f1
, lcore_function_t f2
)
228 struct thread_params param1
= {0}, param2
= {0};
230 for (i
= 0; i
< sizeof(bulk_sizes
)/sizeof(bulk_sizes
[0]); i
++) {
232 param1
.size
= param2
.size
= bulk_sizes
[i
];
233 param1
.r
= param2
.r
= r
;
/* Master-lcore case: only the second worker is launched remotely. */
234 if (cores
->c1
== rte_get_master_lcore()) {
235 rte_eal_remote_launch(f2
, &param2
, cores
->c2
);
237 rte_eal_wait_lcore(cores
->c2
);
/* General case: both workers run on remote lcores. */
239 rte_eal_remote_launch(f1
, &param1
, cores
->c1
);
240 rte_eal_remote_launch(f2
, &param2
, cores
->c2
);
241 rte_eal_wait_lcore(cores
->c1
);
242 rte_eal_wait_lcore(cores
->c2
);
/* Enqueue-side and dequeue-side per-object costs are added together. */
244 printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes
[i
],
245 param1
.spsc
+ param2
.spsc
);
246 printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes
[i
],
247 param1
.mpmc
+ param2
.mpmc
);
252 * Test function that determines how long an enqueue + dequeue of a single item
253 * takes on a single lcore. Result is for comparison with the bulk enq+deq.
/*
 * Time 1 << 24 iterations of "enqueue one object, dequeue one object" on
 * ring r, first with the single-producer/single-consumer calls and then with
 * the multi-producer/multi-consumer calls, and print the cycles per iteration
 * for each variant.
 *
 * The printed value is an integer: the elapsed rte_rdtsc() delta is shifted
 * right by iter_shift, so any fractional part is discarded.
 *
 * NOTE(review): lines are dropped from this excerpt - the return type, the
 * declarations of i and burst, and the closing braces are not visible here.
 */
256 test_single_enqueue_dequeue(struct rte_ring
*r
)
258 const unsigned iter_shift
= 24;
259 const unsigned iterations
= 1<<iter_shift
;
/* SP/SC pass. */
263 const uint64_t sc_start
= rte_rdtsc();
264 for (i
= 0; i
< iterations
; i
++) {
265 rte_ring_sp_enqueue(r
, burst
);
266 rte_ring_sc_dequeue(r
, &burst
);
268 const uint64_t sc_end
= rte_rdtsc();
/* MP/MC pass. */
270 const uint64_t mc_start
= rte_rdtsc();
271 for (i
= 0; i
< iterations
; i
++) {
272 rte_ring_mp_enqueue(r
, burst
);
273 rte_ring_mc_dequeue(r
, &burst
);
275 const uint64_t mc_end
= rte_rdtsc();
/* Shift by iter_shift == divide by iterations, truncating. */
277 printf("SP/SC single enq/dequeue: %"PRIu64
"\n",
278 (sc_end
-sc_start
) >> iter_shift
);
279 printf("MP/MC single enq/dequeue: %"PRIu64
"\n",
280 (mc_end
-mc_start
) >> iter_shift
);
284 * Test that does both enqueue and dequeue on a core using the burst() API calls
285 * instead of the bulk() calls used in other tests. Results should be the same
286 * as for the bulk function called on a single lcore.
/*
 * For every entry of bulk_sizes, time 1 << 23 iterations of a burst enqueue
 * immediately followed by a burst dequeue of that many objects on ring r,
 * first with the SP/SC calls and then with the MP/MC calls, and print one
 * figure per variant and size.
 *
 * The averages are computed in integer arithmetic: the rte_rdtsc() delta is
 * shifted right by iter_shift and then divided by the burst size, so the
 * result is truncated twice (unlike the double averages computed in
 * test_bulk_enqueue_dequeue).
 *
 * NOTE(review): lines are dropped from this excerpt - the return type, the
 * declarations of sz and i, the trailing printf arguments and the closing
 * braces are not visible here.
 */
289 test_burst_enqueue_dequeue(struct rte_ring
*r
)
291 const unsigned iter_shift
= 23;
292 const unsigned iterations
= 1<<iter_shift
;
294 void *burst
[MAX_BURST
] = {0};
296 for (sz
= 0; sz
< sizeof(bulk_sizes
)/sizeof(bulk_sizes
[0]); sz
++) {
/* SP/SC pass. */
297 const uint64_t sc_start
= rte_rdtsc();
298 for (i
= 0; i
< iterations
; i
++) {
299 rte_ring_sp_enqueue_burst(r
, burst
,
300 bulk_sizes
[sz
], NULL
);
301 rte_ring_sc_dequeue_burst(r
, burst
,
302 bulk_sizes
[sz
], NULL
);
304 const uint64_t sc_end
= rte_rdtsc();
/* MP/MC pass. */
306 const uint64_t mc_start
= rte_rdtsc();
307 for (i
= 0; i
< iterations
; i
++) {
308 rte_ring_mp_enqueue_burst(r
, burst
,
309 bulk_sizes
[sz
], NULL
);
310 rte_ring_mc_dequeue_burst(r
, burst
,
311 bulk_sizes
[sz
], NULL
);
313 const uint64_t mc_end
= rte_rdtsc();
/* Integer cycles per object: (delta / iterations) / burst size, truncating. */
315 uint64_t mc_avg
= ((mc_end
-mc_start
) >> iter_shift
) / bulk_sizes
[sz
];
316 uint64_t sc_avg
= ((sc_end
-sc_start
) >> iter_shift
) / bulk_sizes
[sz
];
318 printf("SP/SC burst enq/dequeue (size: %u): %"PRIu64
"\n", bulk_sizes
[sz
],
320 printf("MP/MC burst enq/dequeue (size: %u): %"PRIu64
"\n", bulk_sizes
[sz
],
325 /* Times enqueue and dequeue on a single lcore */
/*
 * For every entry of bulk_sizes, time 1 << 23 iterations of a bulk enqueue
 * immediately followed by a bulk dequeue of that many objects on ring r,
 * first with the SP/SC calls and then with the MP/MC calls, and print one
 * figure per variant and size.
 *
 * The averages are doubles: the rte_rdtsc() delta divided by
 * (iterations * bulk size), i.e. cycles per object.
 *
 * NOTE(review): lines are dropped from this excerpt - the return type, the
 * declarations of sz and i, the trailing printf arguments and the closing
 * braces are not visible here.
 */
327 test_bulk_enqueue_dequeue(struct rte_ring
*r
)
329 const unsigned iter_shift
= 23;
330 const unsigned iterations
= 1<<iter_shift
;
332 void *burst
[MAX_BURST
] = {0};
334 for (sz
= 0; sz
< sizeof(bulk_sizes
)/sizeof(bulk_sizes
[0]); sz
++) {
/* SP/SC pass. */
335 const uint64_t sc_start
= rte_rdtsc();
336 for (i
= 0; i
< iterations
; i
++) {
337 rte_ring_sp_enqueue_bulk(r
, burst
,
338 bulk_sizes
[sz
], NULL
);
339 rte_ring_sc_dequeue_bulk(r
, burst
,
340 bulk_sizes
[sz
], NULL
);
342 const uint64_t sc_end
= rte_rdtsc();
/* MP/MC pass. */
344 const uint64_t mc_start
= rte_rdtsc();
345 for (i
= 0; i
< iterations
; i
++) {
346 rte_ring_mp_enqueue_bulk(r
, burst
,
347 bulk_sizes
[sz
], NULL
);
348 rte_ring_mc_dequeue_bulk(r
, burst
,
349 bulk_sizes
[sz
], NULL
);
351 const uint64_t mc_end
= rte_rdtsc();
/* Floating-point cycles per object. */
353 double sc_avg
= ((double)(sc_end
-sc_start
) /
354 (iterations
* bulk_sizes
[sz
]));
355 double mc_avg
= ((double)(mc_end
-mc_start
) /
356 (iterations
* bulk_sizes
[sz
]));
358 printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes
[sz
],
360 printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes
[sz
],
/*
 * Body of the ring performance test entry point (registered below as
 * test_ring_perf). It creates a ring named RING_NAME with RING_SIZE entries
 * on the socket returned by rte_socket_id(), flags 0, and then runs, in
 * order: the single-element and burst tests, the empty-dequeue test, the
 * single-lcore bulk test, and the two-lcore enqueue_bulk/dequeue_bulk pair
 * test for each topology (hyperthreads, physical cores, NUMA nodes) for
 * which the matching get_two_*() helper returns 0.
 *
 * NOTE(review): lines are dropped from this excerpt - the function header,
 * any check of the rte_ring_create() result, any release of the ring, the
 * closing braces and the return statement are not visible here.
 */
368 struct lcore_pair cores
;
369 struct rte_ring
*r
= NULL
;
371 r
= rte_ring_create(RING_NAME
, RING_SIZE
, rte_socket_id(), 0);
375 printf("### Testing single element and burst enq/deq ###\n");
376 test_single_enqueue_dequeue(r
);
377 test_burst_enqueue_dequeue(r
);
379 printf("\n### Testing empty dequeue ###\n");
380 test_empty_dequeue(r
);
382 printf("\n### Testing using a single lcore ###\n");
383 test_bulk_enqueue_dequeue(r
);
/* Each pair test is skipped when no suitable lcore pair is reported. */
385 if (get_two_hyperthreads(&cores
) == 0) {
386 printf("\n### Testing using two hyperthreads ###\n");
387 run_on_core_pair(&cores
, r
, enqueue_bulk
, dequeue_bulk
);
389 if (get_two_cores(&cores
) == 0) {
390 printf("\n### Testing using two physical cores ###\n");
391 run_on_core_pair(&cores
, r
, enqueue_bulk
, dequeue_bulk
);
393 if (get_two_sockets(&cores
) == 0) {
394 printf("\n### Testing using two NUMA nodes ###\n");
395 run_on_core_pair(&cores
, r
, enqueue_bulk
, dequeue_bulk
);
/*
 * Ties the command name ring_perf_autotest to the test_ring_perf function.
 * NOTE(review): REGISTER_TEST_COMMAND is defined outside this excerpt;
 * presumably it adds the test to the test application's command list.
 */
401 REGISTER_TEST_COMMAND(ring_perf_autotest
, test_ring_perf
);