1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2019 Intel Corporation
9 #include <rte_atomic.h>
10 #include <rte_cycles.h>
11 #include <rte_launch.h>
12 #include <rte_pause.h>
13 #include <rte_stack.h>
17 #define STACK_NAME "STACK_PERF"
19 #define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)
21 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
/*
 * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
 * constants (forces a real load each use, so the benchmark loop is not
 * specialized by the compiler).  Sizes tested: 8 and MAX_BURST.
 */
static volatile unsigned int bulk_sizes
[] = {8, MAX_BURST
};
/*
 * Barrier counter used to release worker lcores at (approximately) the same
 * instant: each worker decrements it, then spins until it reaches zero
 * (see bulk_push_pop()).
 */
29 static rte_atomic32_t lcore_barrier
;
/*
 * Find two enabled lcores that are hyperthread siblings: the match condition
 * below requires the SAME physical core id AND the SAME socket id.
 *
 * NOTE(review): this extract is fragmentary -- the return type, braces, the
 * id[]/core[] declarations and the body of the match case are not visible.
 * Presumably the matching pair is written into *lcp and 0 is returned on
 * success (non-zero otherwise) -- TODO confirm against the complete source.
 */
37 get_two_hyperthreads(struct lcore_pair
*lcp
)
39 unsigned int socket
[2];
/* Examine every ordered pair of enabled lcores. */
43 RTE_LCORE_FOREACH(id
[0]) {
44 RTE_LCORE_FOREACH(id
[1]) {
/* Look up the physical core and socket of each candidate lcore. */
47 core
[0] = lcore_config
[id
[0]].core_id
;
48 core
[1] = lcore_config
[id
[1]].core_id
;
49 socket
[0] = lcore_config
[id
[0]].socket_id
;
50 socket
[1] = lcore_config
[id
[1]].socket_id
;
/* Hyperthread siblings share both the physical core and the socket. */
51 if ((core
[0] == core
[1]) && (socket
[0] == socket
[1])) {
/*
 * Find two enabled lcores on DISTINCT physical cores of the SAME socket
 * (core ids differ, socket ids equal -- see the condition below).
 *
 * NOTE(review): fragmentary extract -- return type, braces, id[]/core[]
 * declarations and the success-path body are missing.  Presumably *lcp is
 * filled and 0 returned on success -- TODO confirm against the full source.
 */
63 get_two_cores(struct lcore_pair
*lcp
)
65 unsigned int socket
[2];
/* Examine every ordered pair of enabled lcores. */
69 RTE_LCORE_FOREACH(id
[0]) {
70 RTE_LCORE_FOREACH(id
[1]) {
/* Look up the physical core and socket of each candidate lcore. */
73 core
[0] = lcore_config
[id
[0]].core_id
;
74 core
[1] = lcore_config
[id
[1]].core_id
;
75 socket
[0] = lcore_config
[id
[0]].socket_id
;
76 socket
[1] = lcore_config
[id
[1]].socket_id
;
/* Different physical cores, same socket. */
77 if ((core
[0] != core
[1]) && (socket
[0] == socket
[1])) {
/*
 * Find two enabled lcores located on DIFFERENT sockets (NUMA nodes); only
 * the socket ids are compared here.
 *
 * NOTE(review): fragmentary extract -- return type, braces, the id[]
 * declaration and the success-path body are missing.  Presumably *lcp is
 * filled and 0 returned on success -- TODO confirm against the full source.
 */
89 get_two_sockets(struct lcore_pair
*lcp
)
91 unsigned int socket
[2];
/* Examine every ordered pair of enabled lcores. */
94 RTE_LCORE_FOREACH(id
[0]) {
95 RTE_LCORE_FOREACH(id
[1]) {
98 socket
[0] = lcore_config
[id
[0]].socket_id
;
99 socket
[1] = lcore_config
[id
[1]].socket_id
;
/* Any pair spanning two sockets qualifies. */
100 if (socket
[0] != socket
[1]) {
111 /* Measure the cycle cost of popping an empty stack. */
/*
 * Times `iterations` pops of bulk_sizes[0] (8) objects from stack `s`,
 * which is expected to be empty, and prints the average TSC cycles per
 * rte_stack_pop() call.
 *
 * NOTE(review): fragmentary extract -- return type, braces and the
 * declaration of loop counter `i` are not visible.
 */
113 test_empty_pop(struct rte_stack
*s
)
115 unsigned int iterations
= 100000000;
116 void *objs
[MAX_BURST
];
/* Timestamp the whole loop with the TSC. */
119 uint64_t start
= rte_rdtsc();
121 for (i
= 0; i
< iterations
; i
++)
122 rte_stack_pop(s
, objs
, bulk_sizes
[0]);
124 uint64_t end
= rte_rdtsc();
/* Average cycles per (empty) pop call. */
126 printf("Stack empty pop: %.2F\n",
127 (double)(end
- start
) / iterations
);
136 /* Measure the average per-pointer cycle cost of stack push and pop */
/*
 * Worker body launched on each participating lcore (lcore_function_t
 * signature: takes the argument as void*).  `p` points to a struct
 * thread_args carrying the stack, the burst size, and an `avg` output slot.
 *
 * NOTE(review): fragmentary extract -- return type, braces, the assignments
 * of `s` and `size` from *args, the rte_pause() in the spin loop, and the
 * return statement are not visible here.
 */
138 bulk_push_pop(void *p
)
140 unsigned int iterations
= 1000000;
141 struct thread_args
*args
= p
;
142 void *objs
[MAX_BURST
] = {0};
143 unsigned int size
, i
;
/*
 * Start barrier: decrement the shared counter, then spin until every
 * participant has arrived, so all workers begin timing together.
 */
149 rte_atomic32_sub(&lcore_barrier
, 1);
150 while (rte_atomic32_read(&lcore_barrier
) != 0)
153 uint64_t start
= rte_rdtsc();
/* Timed loop: one push of `size` objects followed by one pop of `size`. */
155 for (i
= 0; i
< iterations
; i
++) {
156 rte_stack_push(s
, objs
, size
);
157 rte_stack_pop(s
, objs
, size
);
160 uint64_t end
= rte_rdtsc();
/* Report average cycles per object (iterations * size objects moved). */
162 args
->avg
= ((double)(end
- start
))/(iterations
* size
);
/*
 * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack
 * perf when between hyperthread siblings, cores on the same socket, and cores
 * on different sockets.
 *
 * For each entry of bulk_sizes[]: arm the 2-way start barrier, fill a
 * thread_args per core, launch `fn` on the pair, wait for completion, and
 * print the mean of the two per-object averages.
 *
 * NOTE(review): fragmentary extract -- the return type, the trailing
 * `lcore_function_t fn` parameter text, braces, the declaration of `i`, the
 * master-lcore branch that runs fn(&args[0]) locally, and the `else` are not
 * visible here.
 */
173 run_on_core_pair(struct lcore_pair
*cores
, struct rte_stack
*s
,
176 struct thread_args args
[2];
179 for (i
= 0; i
< ARRAY_SIZE(bulk_sizes
); i
++) {
/* Both workers must check in before timing starts. */
180 rte_atomic32_set(&lcore_barrier
, 2);
182 args
[0].sz
= args
[1].sz
= bulk_sizes
[i
];
183 args
[0].s
= args
[1].s
= s
;
/*
 * If c1 is the master lcore it cannot be remote-launched:
 * presumably fn is called directly on the master while c2 runs
 * remotely -- the local call is in a missing line.
 */
185 if (cores
->c1
== rte_get_master_lcore()) {
186 rte_eal_remote_launch(fn
, &args
[1], cores
->c2
);
188 rte_eal_wait_lcore(cores
->c2
);
/* Otherwise launch both members of the pair remotely. */
190 rte_eal_remote_launch(fn
, &args
[0], cores
->c1
);
191 rte_eal_remote_launch(fn
, &args
[1], cores
->c2
);
192 rte_eal_wait_lcore(cores
->c1
);
193 rte_eal_wait_lcore(cores
->c2
);
/* Mean of the two workers' per-object cycle averages. */
196 printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
197 bulk_sizes
[i
], (args
[0].avg
+ args
[1].avg
) / 2);
201 /* Run bulk_push_pop() simultaneously on 1+ cores. */
/*
 * For each entry of bulk_sizes[]: arm an n-way start barrier, launch `fn` on
 * every slave lcore, run it on the current (master) lcore too, wait for all,
 * then print the per-object average across the n participants.
 *
 * NOTE(review): fragmentary extract -- the return type, braces, the
 * declarations of `i`/`avg`, the cnt/limit logic around the slave launch
 * loop, the local fn(&args[lcore_id]) call on the master, and the trailing
 * rte_panic arguments are not visible here.
 */
203 run_on_n_cores(struct rte_stack
*s
, lcore_function_t fn
, int n
)
205 struct thread_args args
[RTE_MAX_LCORE
];
208 for (i
= 0; i
< ARRAY_SIZE(bulk_sizes
); i
++) {
209 unsigned int lcore_id
;
/* All n workers must check in before timing starts. */
213 rte_atomic32_set(&lcore_barrier
, n
);
/* Launch the worker on each slave lcore with its own args slot. */
215 RTE_LCORE_FOREACH_SLAVE(lcore_id
) {
219 args
[lcore_id
].s
= s
;
220 args
[lcore_id
].sz
= bulk_sizes
[i
];
/* A failed remote launch is fatal for the benchmark. */
222 if (rte_eal_remote_launch(fn
, &args
[lcore_id
],
224 rte_panic("Failed to launch lcore %d\n",
/* The current (master) lcore participates as well. */
228 lcore_id
= rte_lcore_id();
230 args
[lcore_id
].s
= s
;
231 args
[lcore_id
].sz
= bulk_sizes
[i
];
235 rte_eal_mp_wait_lcore();
/* Accumulate: start with the master's result, add each slave's. */
237 avg
= args
[rte_lcore_id()].avg
;
240 RTE_LCORE_FOREACH_SLAVE(lcore_id
) {
243 avg
+= args
[lcore_id
].avg
;
/* Mean per-object cycle count over the n participants. */
246 printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
247 bulk_sizes
[i
], avg
/ n
);
/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore: `iterations` round trips of a one-object push + pop on stack `s`,
 * timed with the TSC, reported as average cycles per iteration.
 *
 * NOTE(review): fragmentary extract -- return type, braces and the
 * declarations of `i` and `obj` are not visible here.
 */
256 test_single_push_pop(struct rte_stack
*s
)
258 unsigned int iterations
= 16000000;
262 uint64_t start
= rte_rdtsc();
264 for (i
= 0; i
< iterations
; i
++) {
265 rte_stack_push(s
, &obj
, 1);
266 rte_stack_pop(s
, &obj
, 1);
269 uint64_t end
= rte_rdtsc();
271 printf("Average cycles per single object push/pop: %.2F\n",
272 ((double)(end
- start
)) / iterations
);
275 /* Measure the cycle cost of bulk pushing and popping on a single lcore. */
/*
 * For each burst size in bulk_sizes[], time `iterations` push+pop round
 * trips of that many objects on stack `s` and print the average TSC cycles
 * per object (iterations * bulk_sizes[sz] objects per timing window).
 *
 * NOTE(review): fragmentary extract -- return type, braces and the
 * declarations of `sz` and `i` are not visible here.
 */
277 test_bulk_push_pop(struct rte_stack
*s
)
279 unsigned int iterations
= 8000000;
280 void *objs
[MAX_BURST
];
/* One timed run per configured burst size. */
283 for (sz
= 0; sz
< ARRAY_SIZE(bulk_sizes
); sz
++) {
284 uint64_t start
= rte_rdtsc();
286 for (i
= 0; i
< iterations
; i
++) {
287 rte_stack_push(s
, objs
, bulk_sizes
[sz
]);
288 rte_stack_pop(s
, objs
, bulk_sizes
[sz
]);
291 uint64_t end
= rte_rdtsc();
/* Normalize to cycles per object, not per call. */
293 double avg
= ((double)(end
- start
) /
294 (iterations
* bulk_sizes
[sz
]));
296 printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
297 bulk_sizes
[sz
], avg
);
/*
 * Common driver for the perf test: creates a stack with the given `flags`
 * (0 = standard, RTE_STACK_F_LF = lock-free) and runs the full suite --
 * single push/pop, empty pop, single-lcore bulk, hyperthread pair, physical
 * core pair, cross-socket pair, and all lcores.
 *
 * NOTE(review): fragmentary extract -- the return type, braces, the
 * declaration of `s`, the NULL check and error return after
 * rte_stack_create(), the test_empty_pop(s) call, the closing braces of the
 * pair-test `if` blocks, rte_stack_free() and the final return are not
 * visible here.
 */
302 __test_stack_perf(uint32_t flags
)
304 struct lcore_pair cores
;
/* Reset the start barrier before any workers are launched. */
307 rte_atomic32_init(&lcore_barrier
);
309 s
= rte_stack_create(STACK_NAME
, STACK_SIZE
, rte_socket_id(), flags
);
/* Creation failure path (condition line missing from this extract). */
311 printf("[%s():%u] failed to create a stack\n",
316 printf("### Testing single element push/pop ###\n");
317 test_single_push_pop(s
);
319 printf("\n### Testing empty pop ###\n");
322 printf("\n### Testing using a single lcore ###\n");
323 test_bulk_push_pop(s
);
/* Pair tests run only when a suitable lcore pair exists on this machine. */
325 if (get_two_hyperthreads(&cores
) == 0) {
326 printf("\n### Testing using two hyperthreads ###\n");
327 run_on_core_pair(&cores
, s
, bulk_push_pop
);
329 if (get_two_cores(&cores
) == 0) {
330 printf("\n### Testing using two physical cores ###\n");
331 run_on_core_pair(&cores
, s
, bulk_push_pop
);
333 if (get_two_sockets(&cores
) == 0) {
334 printf("\n### Testing using two NUMA nodes ###\n");
335 run_on_core_pair(&cores
, s
, bulk_push_pop
);
/* Finally, stress the stack from every available lcore at once. */
338 printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
339 run_on_n_cores(s
, bulk_push_pop
, rte_lcore_count());
/* Entry point for the standard (lock-based) stack perf autotest. */
346 test_stack_perf(void)
348 return __test_stack_perf(0);
/* Entry point for the lock-free stack perf autotest (RTE_STACK_F_LF). */
352 test_lf_stack_perf(void)
354 return __test_stack_perf(RTE_STACK_F_LF
);
/* Register both variants with the DPDK test harness command table. */
357 REGISTER_TEST_COMMAND(stack_perf_autotest
, test_stack_perf
);
358 REGISTER_TEST_COMMAND(stack_lf_perf_autotest
, test_lf_stack_perf
);