]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /*- |
2 | * BSD LICENSE | |
3 | * | |
4 | * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. | |
5 | * All rights reserved. | |
6 | * | |
7 | * Redistribution and use in source and binary forms, with or without | |
8 | * modification, are permitted provided that the following conditions | |
9 | * are met: | |
10 | * | |
11 | * * Redistributions of source code must retain the above copyright | |
12 | * notice, this list of conditions and the following disclaimer. | |
13 | * * Redistributions in binary form must reproduce the above copyright | |
14 | * notice, this list of conditions and the following disclaimer in | |
15 | * the documentation and/or other materials provided with the | |
16 | * distribution. | |
17 | * * Neither the name of Intel Corporation nor the names of its | |
18 | * contributors may be used to endorse or promote products derived | |
19 | * from this software without specific prior written permission. | |
20 | * | |
21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
22 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
24 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
25 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
26 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
27 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
28 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
29 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
30 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
31 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
32 | */ | |
33 | ||
34 | ||
35 | #include <stdio.h> | |
36 | #include <inttypes.h> | |
37 | #include <rte_ring.h> | |
38 | #include <rte_cycles.h> | |
39 | #include <rte_launch.h> | |
40 | ||
41 | #include "test.h" | |
42 | ||
43 | /* | |
44 | * Ring | |
45 | * ==== | |
46 | * | |
47 | * Measures performance of various operations using rdtsc | |
48 | * * Empty ring dequeue | |
49 | * * Enqueue/dequeue of bursts in 1 threads | |
50 | * * Enqueue/dequeue of bursts in 2 threads | |
51 | */ | |
52 | ||
/* Name, size and maximum burst size of the test ring */
#define RING_NAME "RING_PERF"
#define RING_SIZE 4096
#define MAX_BURST 32

/*
 * the sizes to enqueue and dequeue in testing
 * (marked volatile so they won't be seen as compile-time constants)
 */
static const volatile unsigned bulk_sizes[] = { 8, 32 };

/* The ring structure used for tests */
static struct rte_ring *r;

/* A pair of lcore ids used by the two-thread producer/consumer tests */
struct lcore_pair {
	unsigned c1, c2;
};

/* rendezvous counter: each worker bumps it and spins until it reaches 2 */
static volatile unsigned lcore_count = 0;
72 | /**** Functions to analyse our core mask to get cores for different tests ***/ | |
73 | ||
74 | static int | |
75 | get_two_hyperthreads(struct lcore_pair *lcp) | |
76 | { | |
77 | unsigned id1, id2; | |
78 | unsigned c1, c2, s1, s2; | |
79 | RTE_LCORE_FOREACH(id1) { | |
80 | /* inner loop just re-reads all id's. We could skip the first few | |
81 | * elements, but since number of cores is small there is little point | |
82 | */ | |
83 | RTE_LCORE_FOREACH(id2) { | |
84 | if (id1 == id2) | |
85 | continue; | |
86 | c1 = lcore_config[id1].core_id; | |
87 | c2 = lcore_config[id2].core_id; | |
88 | s1 = lcore_config[id1].socket_id; | |
89 | s2 = lcore_config[id2].socket_id; | |
90 | if ((c1 == c2) && (s1 == s2)){ | |
91 | lcp->c1 = id1; | |
92 | lcp->c2 = id2; | |
93 | return 0; | |
94 | } | |
95 | } | |
96 | } | |
97 | return 1; | |
98 | } | |
99 | ||
100 | static int | |
101 | get_two_cores(struct lcore_pair *lcp) | |
102 | { | |
103 | unsigned id1, id2; | |
104 | unsigned c1, c2, s1, s2; | |
105 | RTE_LCORE_FOREACH(id1) { | |
106 | RTE_LCORE_FOREACH(id2) { | |
107 | if (id1 == id2) | |
108 | continue; | |
109 | c1 = lcore_config[id1].core_id; | |
110 | c2 = lcore_config[id2].core_id; | |
111 | s1 = lcore_config[id1].socket_id; | |
112 | s2 = lcore_config[id2].socket_id; | |
113 | if ((c1 != c2) && (s1 == s2)){ | |
114 | lcp->c1 = id1; | |
115 | lcp->c2 = id2; | |
116 | return 0; | |
117 | } | |
118 | } | |
119 | } | |
120 | return 1; | |
121 | } | |
122 | ||
123 | static int | |
124 | get_two_sockets(struct lcore_pair *lcp) | |
125 | { | |
126 | unsigned id1, id2; | |
127 | unsigned s1, s2; | |
128 | RTE_LCORE_FOREACH(id1) { | |
129 | RTE_LCORE_FOREACH(id2) { | |
130 | if (id1 == id2) | |
131 | continue; | |
132 | s1 = lcore_config[id1].socket_id; | |
133 | s2 = lcore_config[id2].socket_id; | |
134 | if (s1 != s2){ | |
135 | lcp->c1 = id1; | |
136 | lcp->c2 = id2; | |
137 | return 0; | |
138 | } | |
139 | } | |
140 | } | |
141 | return 1; | |
142 | } | |
143 | ||
144 | /* Get cycle counts for dequeuing from an empty ring. Should be 2 or 3 cycles */ | |
145 | static void | |
146 | test_empty_dequeue(void) | |
147 | { | |
148 | const unsigned iter_shift = 26; | |
149 | const unsigned iterations = 1<<iter_shift; | |
150 | unsigned i = 0; | |
151 | void *burst[MAX_BURST]; | |
152 | ||
153 | const uint64_t sc_start = rte_rdtsc(); | |
154 | for (i = 0; i < iterations; i++) | |
11fdf7f2 | 155 | rte_ring_sc_dequeue_bulk(r, burst, bulk_sizes[0], NULL); |
7c673cae FG |
156 | const uint64_t sc_end = rte_rdtsc(); |
157 | ||
158 | const uint64_t mc_start = rte_rdtsc(); | |
159 | for (i = 0; i < iterations; i++) | |
11fdf7f2 | 160 | rte_ring_mc_dequeue_bulk(r, burst, bulk_sizes[0], NULL); |
7c673cae FG |
161 | const uint64_t mc_end = rte_rdtsc(); |
162 | ||
163 | printf("SC empty dequeue: %.2F\n", | |
164 | (double)(sc_end-sc_start) / iterations); | |
165 | printf("MC empty dequeue: %.2F\n", | |
166 | (double)(mc_end-mc_start) / iterations); | |
167 | } | |
168 | ||
/*
 * for the separate enqueue and dequeue threads they take in one param
 * and return two. Input = burst size, output = cycle average for sp/sc & mp/mc
 */
struct thread_params {
	unsigned size; /* input value, the burst size */
	double spsc, mpmc; /* output value, the single or multi timings */
};
177 | ||
178 | /* | |
179 | * Function that uses rdtsc to measure timing for ring enqueue. Needs pair | |
180 | * thread running dequeue_bulk function | |
181 | */ | |
182 | static int | |
183 | enqueue_bulk(void *p) | |
184 | { | |
185 | const unsigned iter_shift = 23; | |
186 | const unsigned iterations = 1<<iter_shift; | |
187 | struct thread_params *params = p; | |
188 | const unsigned size = params->size; | |
189 | unsigned i; | |
190 | void *burst[MAX_BURST] = {0}; | |
191 | ||
192 | if ( __sync_add_and_fetch(&lcore_count, 1) != 2 ) | |
193 | while(lcore_count != 2) | |
194 | rte_pause(); | |
195 | ||
196 | const uint64_t sp_start = rte_rdtsc(); | |
197 | for (i = 0; i < iterations; i++) | |
11fdf7f2 | 198 | while (rte_ring_sp_enqueue_bulk(r, burst, size, NULL) == 0) |
7c673cae FG |
199 | rte_pause(); |
200 | const uint64_t sp_end = rte_rdtsc(); | |
201 | ||
202 | const uint64_t mp_start = rte_rdtsc(); | |
203 | for (i = 0; i < iterations; i++) | |
11fdf7f2 | 204 | while (rte_ring_mp_enqueue_bulk(r, burst, size, NULL) == 0) |
7c673cae FG |
205 | rte_pause(); |
206 | const uint64_t mp_end = rte_rdtsc(); | |
207 | ||
208 | params->spsc = ((double)(sp_end - sp_start))/(iterations*size); | |
209 | params->mpmc = ((double)(mp_end - mp_start))/(iterations*size); | |
210 | return 0; | |
211 | } | |
212 | ||
213 | /* | |
214 | * Function that uses rdtsc to measure timing for ring dequeue. Needs pair | |
215 | * thread running enqueue_bulk function | |
216 | */ | |
217 | static int | |
218 | dequeue_bulk(void *p) | |
219 | { | |
220 | const unsigned iter_shift = 23; | |
221 | const unsigned iterations = 1<<iter_shift; | |
222 | struct thread_params *params = p; | |
223 | const unsigned size = params->size; | |
224 | unsigned i; | |
225 | void *burst[MAX_BURST] = {0}; | |
226 | ||
227 | if ( __sync_add_and_fetch(&lcore_count, 1) != 2 ) | |
228 | while(lcore_count != 2) | |
229 | rte_pause(); | |
230 | ||
231 | const uint64_t sc_start = rte_rdtsc(); | |
232 | for (i = 0; i < iterations; i++) | |
11fdf7f2 | 233 | while (rte_ring_sc_dequeue_bulk(r, burst, size, NULL) == 0) |
7c673cae FG |
234 | rte_pause(); |
235 | const uint64_t sc_end = rte_rdtsc(); | |
236 | ||
237 | const uint64_t mc_start = rte_rdtsc(); | |
238 | for (i = 0; i < iterations; i++) | |
11fdf7f2 | 239 | while (rte_ring_mc_dequeue_bulk(r, burst, size, NULL) == 0) |
7c673cae FG |
240 | rte_pause(); |
241 | const uint64_t mc_end = rte_rdtsc(); | |
242 | ||
243 | params->spsc = ((double)(sc_end - sc_start))/(iterations*size); | |
244 | params->mpmc = ((double)(mc_end - mc_start))/(iterations*size); | |
245 | return 0; | |
246 | } | |
247 | ||
248 | /* | |
249 | * Function that calls the enqueue and dequeue bulk functions on pairs of cores. | |
250 | * used to measure ring perf between hyperthreads, cores and sockets. | |
251 | */ | |
252 | static void | |
253 | run_on_core_pair(struct lcore_pair *cores, | |
254 | lcore_function_t f1, lcore_function_t f2) | |
255 | { | |
256 | struct thread_params param1 = {0}, param2 = {0}; | |
257 | unsigned i; | |
258 | for (i = 0; i < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); i++) { | |
259 | lcore_count = 0; | |
260 | param1.size = param2.size = bulk_sizes[i]; | |
261 | if (cores->c1 == rte_get_master_lcore()) { | |
262 | rte_eal_remote_launch(f2, ¶m2, cores->c2); | |
263 | f1(¶m1); | |
264 | rte_eal_wait_lcore(cores->c2); | |
265 | } else { | |
266 | rte_eal_remote_launch(f1, ¶m1, cores->c1); | |
267 | rte_eal_remote_launch(f2, ¶m2, cores->c2); | |
268 | rte_eal_wait_lcore(cores->c1); | |
269 | rte_eal_wait_lcore(cores->c2); | |
270 | } | |
271 | printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i], | |
272 | param1.spsc + param2.spsc); | |
273 | printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i], | |
274 | param1.mpmc + param2.mpmc); | |
275 | } | |
276 | } | |
277 | ||
278 | /* | |
279 | * Test function that determines how long an enqueue + dequeue of a single item | |
280 | * takes on a single lcore. Result is for comparison with the bulk enq+deq. | |
281 | */ | |
282 | static void | |
283 | test_single_enqueue_dequeue(void) | |
284 | { | |
285 | const unsigned iter_shift = 24; | |
286 | const unsigned iterations = 1<<iter_shift; | |
287 | unsigned i = 0; | |
288 | void *burst = NULL; | |
289 | ||
290 | const uint64_t sc_start = rte_rdtsc(); | |
291 | for (i = 0; i < iterations; i++) { | |
292 | rte_ring_sp_enqueue(r, burst); | |
293 | rte_ring_sc_dequeue(r, &burst); | |
294 | } | |
295 | const uint64_t sc_end = rte_rdtsc(); | |
296 | ||
297 | const uint64_t mc_start = rte_rdtsc(); | |
298 | for (i = 0; i < iterations; i++) { | |
299 | rte_ring_mp_enqueue(r, burst); | |
300 | rte_ring_mc_dequeue(r, &burst); | |
301 | } | |
302 | const uint64_t mc_end = rte_rdtsc(); | |
303 | ||
304 | printf("SP/SC single enq/dequeue: %"PRIu64"\n", | |
305 | (sc_end-sc_start) >> iter_shift); | |
306 | printf("MP/MC single enq/dequeue: %"PRIu64"\n", | |
307 | (mc_end-mc_start) >> iter_shift); | |
308 | } | |
309 | ||
310 | /* | |
311 | * Test that does both enqueue and dequeue on a core using the burst() API calls | |
312 | * instead of the bulk() calls used in other tests. Results should be the same | |
313 | * as for the bulk function called on a single lcore. | |
314 | */ | |
315 | static void | |
316 | test_burst_enqueue_dequeue(void) | |
317 | { | |
318 | const unsigned iter_shift = 23; | |
319 | const unsigned iterations = 1<<iter_shift; | |
320 | unsigned sz, i = 0; | |
321 | void *burst[MAX_BURST] = {0}; | |
322 | ||
323 | for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) { | |
324 | const uint64_t sc_start = rte_rdtsc(); | |
325 | for (i = 0; i < iterations; i++) { | |
11fdf7f2 TL |
326 | rte_ring_sp_enqueue_burst(r, burst, |
327 | bulk_sizes[sz], NULL); | |
328 | rte_ring_sc_dequeue_burst(r, burst, | |
329 | bulk_sizes[sz], NULL); | |
7c673cae FG |
330 | } |
331 | const uint64_t sc_end = rte_rdtsc(); | |
332 | ||
333 | const uint64_t mc_start = rte_rdtsc(); | |
334 | for (i = 0; i < iterations; i++) { | |
11fdf7f2 TL |
335 | rte_ring_mp_enqueue_burst(r, burst, |
336 | bulk_sizes[sz], NULL); | |
337 | rte_ring_mc_dequeue_burst(r, burst, | |
338 | bulk_sizes[sz], NULL); | |
7c673cae FG |
339 | } |
340 | const uint64_t mc_end = rte_rdtsc(); | |
341 | ||
342 | uint64_t mc_avg = ((mc_end-mc_start) >> iter_shift) / bulk_sizes[sz]; | |
343 | uint64_t sc_avg = ((sc_end-sc_start) >> iter_shift) / bulk_sizes[sz]; | |
344 | ||
345 | printf("SP/SC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz], | |
346 | sc_avg); | |
347 | printf("MP/MC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz], | |
348 | mc_avg); | |
349 | } | |
350 | } | |
351 | ||
352 | /* Times enqueue and dequeue on a single lcore */ | |
353 | static void | |
354 | test_bulk_enqueue_dequeue(void) | |
355 | { | |
356 | const unsigned iter_shift = 23; | |
357 | const unsigned iterations = 1<<iter_shift; | |
358 | unsigned sz, i = 0; | |
359 | void *burst[MAX_BURST] = {0}; | |
360 | ||
361 | for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) { | |
362 | const uint64_t sc_start = rte_rdtsc(); | |
363 | for (i = 0; i < iterations; i++) { | |
11fdf7f2 TL |
364 | rte_ring_sp_enqueue_bulk(r, burst, |
365 | bulk_sizes[sz], NULL); | |
366 | rte_ring_sc_dequeue_bulk(r, burst, | |
367 | bulk_sizes[sz], NULL); | |
7c673cae FG |
368 | } |
369 | const uint64_t sc_end = rte_rdtsc(); | |
370 | ||
371 | const uint64_t mc_start = rte_rdtsc(); | |
372 | for (i = 0; i < iterations; i++) { | |
11fdf7f2 TL |
373 | rte_ring_mp_enqueue_bulk(r, burst, |
374 | bulk_sizes[sz], NULL); | |
375 | rte_ring_mc_dequeue_bulk(r, burst, | |
376 | bulk_sizes[sz], NULL); | |
7c673cae FG |
377 | } |
378 | const uint64_t mc_end = rte_rdtsc(); | |
379 | ||
380 | double sc_avg = ((double)(sc_end-sc_start) / | |
381 | (iterations * bulk_sizes[sz])); | |
382 | double mc_avg = ((double)(mc_end-mc_start) / | |
383 | (iterations * bulk_sizes[sz])); | |
384 | ||
385 | printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz], | |
386 | sc_avg); | |
387 | printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz], | |
388 | mc_avg); | |
389 | } | |
390 | } | |
391 | ||
392 | static int | |
393 | test_ring_perf(void) | |
394 | { | |
395 | struct lcore_pair cores; | |
396 | r = rte_ring_create(RING_NAME, RING_SIZE, rte_socket_id(), 0); | |
397 | if (r == NULL && (r = rte_ring_lookup(RING_NAME)) == NULL) | |
398 | return -1; | |
399 | ||
400 | printf("### Testing single element and burst enq/deq ###\n"); | |
401 | test_single_enqueue_dequeue(); | |
402 | test_burst_enqueue_dequeue(); | |
403 | ||
404 | printf("\n### Testing empty dequeue ###\n"); | |
405 | test_empty_dequeue(); | |
406 | ||
407 | printf("\n### Testing using a single lcore ###\n"); | |
408 | test_bulk_enqueue_dequeue(); | |
409 | ||
410 | if (get_two_hyperthreads(&cores) == 0) { | |
411 | printf("\n### Testing using two hyperthreads ###\n"); | |
412 | run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk); | |
413 | } | |
414 | if (get_two_cores(&cores) == 0) { | |
415 | printf("\n### Testing using two physical cores ###\n"); | |
416 | run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk); | |
417 | } | |
418 | if (get_two_sockets(&cores) == 0) { | |
419 | printf("\n### Testing using two NUMA nodes ###\n"); | |
420 | run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk); | |
421 | } | |
422 | return 0; | |
423 | } | |
424 | ||
/* Expose this test to the harness as the "ring_perf_autotest" command */
REGISTER_TEST_COMMAND(ring_perf_autotest, test_ring_perf);