]>
Commit | Line | Data |
---|---|---|
1c13f3c9 IM |
1 | /* |
2 | * numa.c | |
3 | * | |
4 | * numa: Simulate NUMA-sensitive workload and measure their NUMA performance | |
5 | */ | |
6 | ||
8a158589 ACM |
7 | /* For the CLR_() macros */ |
8 | #include <pthread.h> | |
9 | ||
1c13f3c9 IM |
10 | #include "../perf.h" |
11 | #include "../builtin.h" | |
12 | #include "../util/util.h" | |
4b6ab94e | 13 | #include <subcmd/parse-options.h> |
2d8e405a | 14 | #include "../util/cloexec.h" |
1c13f3c9 IM |
15 | |
16 | #include "bench.h" | |
17 | ||
18 | #include <errno.h> | |
19 | #include <sched.h> | |
20 | #include <stdio.h> | |
21 | #include <assert.h> | |
22 | #include <malloc.h> | |
23 | #include <signal.h> | |
24 | #include <stdlib.h> | |
25 | #include <string.h> | |
26 | #include <unistd.h> | |
1c13f3c9 IM |
27 | #include <sys/mman.h> |
28 | #include <sys/time.h> | |
b64aa553 | 29 | #include <sys/resource.h> |
1c13f3c9 IM |
30 | #include <sys/wait.h> |
31 | #include <sys/prctl.h> | |
32 | #include <sys/types.h> | |
a8ad8329 | 33 | #include <linux/time64.h> |
1c13f3c9 IM |
34 | |
35 | #include <numa.h> | |
36 | #include <numaif.h> | |
37 | ||
38 | /* | |
 * Regular printout to the terminal, suppressed if -q is specified:
40 | */ | |
41 | #define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0) | |
42 | ||
43 | /* | |
44 | * Debug printf: | |
45 | */ | |
46 | #define dprintf(x...) do { if (g && g->p.show_details >= 1) printf(x); } while (0) | |
47 | ||
/* Per-task state, one entry per benchmark thread (lives in shared memory): */
struct thread_data {
	int			curr_cpu;	/* CPU the task last ran on; negative until first published */
	cpu_set_t		bind_cpumask;	/* CPUs this task is pinned to (-C option) */
	int			bind_node;	/* memory node to bind to; -1 == unbound (set by parse_setup_node_list()) */
	u8			*process_data;	/* base of the per-process working set */
	int			process_nr;	/* index of the owning process */
	int			thread_nr;	/* index of this thread within its process */
	int			task_nr;	/* global task index: process_nr*nr_threads + thread_nr */
	unsigned int		loops_done;	/* work iterations completed so far */
	u64			val;		/* running data-dependency value, chained through access_data() */
	u64			runtime_ns;	/* total wall-clock runtime of this task */
	u64			system_time_ns;	/* system CPU time consumed (from rusage) */
	u64			user_time_ns;	/* user CPU time consumed (from rusage) */
	double			speed_gbs;	/* achieved bandwidth, GB/sec */
	pthread_mutex_t		*process_lock;	/* serializes process-shared accesses */
};
64 | ||
/* Parameters set by options: */

struct params {
	/* Startup synchronization: */
	bool serialize_startup;		/* -S: release all tasks at once */

	/* Task hierarchy: */
	int nr_proc;			/* -p: number of worker processes */
	int nr_threads;			/* -t: threads per process */

	/* Working set sizes (textual, as given on the command line): */
	const char *mb_global_str;	/* -G */
	const char *mb_proc_str;	/* -P */
	const char *mb_proc_locked_str;	/* -L */
	const char *mb_thread_str;	/* -T */

	/* ... and their parsed numeric values, in MBs: */
	double mb_global;
	double mb_proc;
	double mb_proc_locked;
	double mb_thread;

	/* Access patterns to the working set: */
	bool data_reads;		/* -R */
	bool data_writes;		/* -W */
	bool data_backwards;		/* -B */
	bool data_zero_memset;		/* -Z: bzero() instead of read/write walk */
	bool data_rand_walk;		/* -r: 32-bit LFSR random walk */
	u32 nr_loops;			/* -l: max loops (default: unlimited) */
	u32 nr_secs;			/* -s: max seconds to run */
	u32 sleep_usecs;		/* -u: usecs to sleep per loop iteration */

	/* Working set initialization: */
	bool init_zero;			/* -z: bzero the initial allocations */
	bool init_random;		/* -I: randomize initial contents */
	bool init_cpu0;			/* -0: do initial allocations on CPU#0/node#0 */

	/* Misc options: */
	int show_details;		/* -d: verbosity; negative suppresses tprintf() */
	int run_all;			/* -a: run the whole test suite */
	int thp;			/* -H: MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE */

	/* The mb_* sizes above converted to bytes: */
	long bytes_global;
	long bytes_process;
	long bytes_process_locked;
	long bytes_thread;

	int nr_tasks;			/* total number of tasks (set during init) */
	bool show_quiet;		/* -q */

	bool show_convergence;		/* -c: print convergence details */
	bool measure_convergence;	/* -m: stop once fully converged */

	int perturb_secs;		/* -x: perturb thread 0/0 every X secs */
	int nr_cpus;			/* detected system topology */
	int nr_nodes;

	/* Affinity options -C and -N: */
	char *cpu_list_str;
	char *node_list_str;
};
125 | ||
126 | ||
/* Global, read-writable area, accessible to all processes and threads: */

struct global_info {
	u8 *data;			/* the global shared working set (-G) */

	pthread_mutex_t startup_mutex;	/* protects nr_tasks_started */
	int nr_tasks_started;

	pthread_mutex_t startup_done_mutex;	/* main blocks here until the last task unlocks it */

	pthread_mutex_t start_work_mutex;	/* workers block here until released by main */
	int nr_tasks_working;

	pthread_mutex_t stop_work_mutex;	/* protects bytes_done */
	u64 bytes_done;

	struct thread_data *threads;	/* per-task state array, nr_tasks entries */

	/* Convergence latency measurement: */
	bool all_converged;
	bool stop_work;			/* tells workers to wind down */

	int print_once;			/* one-shot latch for the THP warning in alloc_data() */

	struct params p;		/* global copy of the option parameters */
};
153 | ||
154 | static struct global_info *g = NULL; | |
155 | ||
156 | static int parse_cpus_opt(const struct option *opt, const char *arg, int unset); | |
157 | static int parse_nodes_opt(const struct option *opt, const char *arg, int unset); | |
158 | ||
159 | struct params p0; | |
160 | ||
161 | static const struct option options[] = { | |
162 | OPT_INTEGER('p', "nr_proc" , &p0.nr_proc, "number of processes"), | |
163 | OPT_INTEGER('t', "nr_threads" , &p0.nr_threads, "number of threads per process"), | |
164 | ||
165 | OPT_STRING('G', "mb_global" , &p0.mb_global_str, "MB", "global memory (MBs)"), | |
166 | OPT_STRING('P', "mb_proc" , &p0.mb_proc_str, "MB", "process memory (MBs)"), | |
167 | OPT_STRING('L', "mb_proc_locked", &p0.mb_proc_locked_str,"MB", "process serialized/locked memory access (MBs), <= process_memory"), | |
168 | OPT_STRING('T', "mb_thread" , &p0.mb_thread_str, "MB", "thread memory (MBs)"), | |
169 | ||
b0d22e52 IM |
170 | OPT_UINTEGER('l', "nr_loops" , &p0.nr_loops, "max number of loops to run (default: unlimited)"), |
171 | OPT_UINTEGER('s', "nr_secs" , &p0.nr_secs, "max number of seconds to run (default: 5 secs)"), | |
1c13f3c9 IM |
172 | OPT_UINTEGER('u', "usleep" , &p0.sleep_usecs, "usecs to sleep per loop iteration"), |
173 | ||
174 | OPT_BOOLEAN('R', "data_reads" , &p0.data_reads, "access the data via writes (can be mixed with -W)"), | |
175 | OPT_BOOLEAN('W', "data_writes" , &p0.data_writes, "access the data via writes (can be mixed with -R)"), | |
176 | OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards, "access the data backwards as well"), | |
177 | OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"), | |
178 | OPT_BOOLEAN('r', "data_rand_walk", &p0.data_rand_walk, "access the data with random (32bit LFSR) walk"), | |
179 | ||
180 | ||
181 | OPT_BOOLEAN('z', "init_zero" , &p0.init_zero, "bzero the initial allocations"), | |
182 | OPT_BOOLEAN('I', "init_random" , &p0.init_random, "randomize the contents of the initial allocations"), | |
183 | OPT_BOOLEAN('0', "init_cpu0" , &p0.init_cpu0, "do the initial allocations on CPU#0"), | |
184 | OPT_INTEGER('x', "perturb_secs", &p0.perturb_secs, "perturb thread 0/0 every X secs, to test convergence stability"), | |
185 | ||
186 | OPT_INCR ('d', "show_details" , &p0.show_details, "Show details"), | |
187 | OPT_INCR ('a', "all" , &p0.run_all, "Run all tests in the suite"), | |
188 | OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"), | |
189 | OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"), | |
190 | OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"), | |
24f1ced1 | 191 | OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "quiet mode"), |
1c13f3c9 IM |
192 | OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"), |
193 | ||
194 | /* Special option string parsing callbacks: */ | |
195 | OPT_CALLBACK('C', "cpus", NULL, "cpu[,cpu2,...cpuN]", | |
196 | "bind the first N tasks to these specific cpus (the rest is unbound)", | |
197 | parse_cpus_opt), | |
198 | OPT_CALLBACK('M', "memnodes", NULL, "node[,node2,...nodeN]", | |
199 | "bind the first N tasks to these specific memory nodes (the rest is unbound)", | |
200 | parse_nodes_opt), | |
201 | OPT_END() | |
202 | }; | |
203 | ||
204 | static const char * const bench_numa_usage[] = { | |
205 | "perf bench numa <options>", | |
206 | NULL | |
207 | }; | |
208 | ||
209 | static const char * const numa_usage[] = { | |
210 | "perf bench numa mem [<options>]", | |
211 | NULL | |
212 | }; | |
213 | ||
214 | static cpu_set_t bind_to_cpu(int target_cpu) | |
215 | { | |
216 | cpu_set_t orig_mask, mask; | |
217 | int ret; | |
218 | ||
219 | ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask); | |
220 | BUG_ON(ret); | |
221 | ||
222 | CPU_ZERO(&mask); | |
223 | ||
224 | if (target_cpu == -1) { | |
225 | int cpu; | |
226 | ||
227 | for (cpu = 0; cpu < g->p.nr_cpus; cpu++) | |
228 | CPU_SET(cpu, &mask); | |
229 | } else { | |
230 | BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus); | |
231 | CPU_SET(target_cpu, &mask); | |
232 | } | |
233 | ||
234 | ret = sched_setaffinity(0, sizeof(mask), &mask); | |
235 | BUG_ON(ret); | |
236 | ||
237 | return orig_mask; | |
238 | } | |
239 | ||
240 | static cpu_set_t bind_to_node(int target_node) | |
241 | { | |
242 | int cpus_per_node = g->p.nr_cpus/g->p.nr_nodes; | |
243 | cpu_set_t orig_mask, mask; | |
244 | int cpu; | |
245 | int ret; | |
246 | ||
247 | BUG_ON(cpus_per_node*g->p.nr_nodes != g->p.nr_cpus); | |
248 | BUG_ON(!cpus_per_node); | |
249 | ||
250 | ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask); | |
251 | BUG_ON(ret); | |
252 | ||
253 | CPU_ZERO(&mask); | |
254 | ||
255 | if (target_node == -1) { | |
256 | for (cpu = 0; cpu < g->p.nr_cpus; cpu++) | |
257 | CPU_SET(cpu, &mask); | |
258 | } else { | |
259 | int cpu_start = (target_node + 0) * cpus_per_node; | |
260 | int cpu_stop = (target_node + 1) * cpus_per_node; | |
261 | ||
262 | BUG_ON(cpu_stop > g->p.nr_cpus); | |
263 | ||
264 | for (cpu = cpu_start; cpu < cpu_stop; cpu++) | |
265 | CPU_SET(cpu, &mask); | |
266 | } | |
267 | ||
268 | ret = sched_setaffinity(0, sizeof(mask), &mask); | |
269 | BUG_ON(ret); | |
270 | ||
271 | return orig_mask; | |
272 | } | |
273 | ||
/* Apply a previously saved CPU affinity mask to the calling task: */
static void bind_to_cpumask(cpu_set_t mask)
{
	int err = sched_setaffinity(0, sizeof(mask), &mask);

	BUG_ON(err);
}
281 | ||
/*
 * Reset the NUMA memory policy of the calling task back to the system
 * default - undoes a prior bind_to_memnode():
 */
static void mempol_restore(void)
{
	int ret;

	/* NOTE(review): the maxnode argument here (nr_nodes-1) differs from
	 * the sizeof(nodemask)*8 used in bind_to_memnode(); for MPOL_DEFAULT
	 * the nodemask is empty so this is presumably harmless - confirm
	 * against set_mempolicy(2). */
	ret = set_mempolicy(MPOL_DEFAULT, NULL, g->p.nr_nodes-1);

	BUG_ON(ret);
}
290 | ||
/*
 * Bind all future memory allocations of the calling task to @node.
 * A @node of -1 means 'unbound' and is a no-op. Undone by
 * mempol_restore():
 */
static void bind_to_memnode(int node)
{
	unsigned long nodemask;
	int ret;

	if (node == -1)
		return;

	/* The single-word nodemask can only express this many nodes: */
	BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)*8);
	nodemask = 1L << node;

	ret = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8);
	dprintf("binding to node %d, mask: %016lx => %d\n", node, nodemask, ret);

	BUG_ON(ret);
}
307 | ||
/* Huge-page size, used for the 2MB alignment/padding in alloc_data(): */
#define HPSIZE (2*1024*1024)

/*
 * Set the comm name of the calling task (as shown by 'top', 'ps' etc.),
 * printf-style. Longer names are silently truncated by snprintf().
 * (Fixed: the buffer size was hard-coded as '20' in two places.)
 */
#define set_taskname(fmt...)				\
do {							\
	char name[20];					\
							\
	snprintf(name, sizeof(name), fmt);		\
	prctl(PR_SET_NAME, name);			\
} while (0)
317 | ||
/*
 * Map an anonymous memory area of @bytes0 size (plus HPSIZE of padding
 * for alignment), optionally bound to CPU#0/node#0, optionally zeroed
 * or filled with random contents, with THP enabled/disabled per @thp.
 *
 * Returns a pointer aligned up to a 2MB boundary inside the mapping,
 * or NULL when @bytes0 is zero.
 */
static u8 *alloc_data(ssize_t bytes0, int map_flags,
		      int init_zero, int init_cpu0, int thp, int init_random)
{
	cpu_set_t orig_mask;
	ssize_t bytes;
	u8 *buf;
	int ret;

	if (!bytes0)
		return NULL;

	/* Allocate and initialize all memory on CPU#0: */
	if (init_cpu0) {
		orig_mask = bind_to_node(0);
		bind_to_memnode(0);
	}

	/* Pad the mapping so the 2MB align-up below stays in-bounds: */
	bytes = bytes0 + HPSIZE;

	buf = (void *)mmap(0, bytes, PROT_READ|PROT_WRITE, MAP_ANON|map_flags, -1, 0);
	BUG_ON(buf == (void *)-1);

	/* THP madvise() only makes sense for private mappings: */
	if (map_flags == MAP_PRIVATE) {
		if (thp > 0) {
			ret = madvise(buf, bytes, MADV_HUGEPAGE);
			if (ret && !g->print_once) {
				/* Warn only once, globally: */
				g->print_once = 1;
				printf("WARNING: Could not enable THP - do: 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled'\n");
			}
		}
		if (thp < 0) {
			ret = madvise(buf, bytes, MADV_NOHUGEPAGE);
			if (ret && !g->print_once) {
				g->print_once = 1;
				printf("WARNING: Could not disable THP: run a CONFIG_TRANSPARENT_HUGEPAGE kernel?\n");
			}
		}
	}

	if (init_zero) {
		bzero(buf, bytes);
	} else {
		/* Initialize random contents, different in each word: */
		if (init_random) {
			u64 *wbuf = (void *)buf;
			long off = rand();
			long i;

			for (i = 0; i < bytes/8; i++)
				wbuf[i] = i + off;
		}
	}

	/* Align to 2MB boundary: */
	buf = (void *)(((unsigned long)buf + HPSIZE-1) & ~(HPSIZE-1));

	/* Restore affinity: */
	if (init_cpu0) {
		bind_to_cpumask(orig_mask);
		mempol_restore();
	}

	return buf;
}
382 | ||
/*
 * Unmap a buffer allocated via alloc_data(); NULL is a no-op.
 *
 * NOTE(review): alloc_data() returns a pointer aligned up inside a
 * mapping of bytes0+HPSIZE - unmapping the aligned pointer with the
 * unpadded size stays within the original mapping, but the head/tail
 * padding pages remain mapped until process exit. TODO: confirm this
 * is intentional.
 */
static void free_data(void *data, ssize_t bytes)
{
	int ret;

	if (!data)
		return;

	ret = munmap(data, bytes);
	BUG_ON(ret);
}
393 | ||
/*
 * Create a shared memory buffer that can be shared between processes, zeroed
 * (init_zero == 1 overrides -I random initialization):
 */
static void * zalloc_shared_data(ssize_t bytes)
{
	return alloc_data(bytes, MAP_SHARED, 1,  g->p.init_cpu0,  g->p.thp, g->p.init_random);
}
401 | ||
/*
 * Create a shared memory buffer that can be shared between processes,
 * initialized per the -z/-I options:
 */
static void * setup_shared_data(ssize_t bytes)
{
	return alloc_data(bytes, MAP_SHARED, 0, g->p.init_cpu0,  g->p.thp, g->p.init_random);
}
409 | ||
/*
 * Allocate process-local memory - this will either be shared between
 * threads of this process, or only be accessed by this thread:
 */
static void * setup_private_data(ssize_t bytes)
{
	return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0,  g->p.thp, g->p.init_random);
}
418 | ||
/*
 * Initialize a process-shared (global) mutex - the mutex object itself
 * must live in MAP_SHARED memory for cross-process use.
 * (Fixed: the old comment claimed this 'returns' a mutex; also the
 * mutexattr object was never destroyed.)
 */
static void init_global_mutex(pthread_mutex_t *mutex)
{
	pthread_mutexattr_t attr;

	pthread_mutexattr_init(&attr);
	pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
	pthread_mutex_init(mutex, &attr);
	/* The attr object is copied into the mutex; release it: */
	pthread_mutexattr_destroy(&attr);
}
430 | ||
/*
 * Remember the raw -C argument; the actual parsing happens later in
 * parse_setup_cpu_list(), once the topology is known. Always returns 0.
 */
static int parse_cpu_list(const char *arg)
{
	p0.cpu_list_str = strdup(arg);

	dprintf("got CPU list: {%s}\n", p0.cpu_list_str);

	return 0;
}
439 | ||
/*
 * Parse the -C CPU binding list into per-task CPU affinity masks.
 *
 * Grammar per comma-separated token: "cpu" or "cpu1-cpu2" (range),
 * with optional modifiers "#step" (stride), "_len" (mask length) and
 * "xmul" (repeat count). Returns 0 on success, -1 when the list
 * references CPUs this system does not have:
 */
static int parse_setup_cpu_list(void)
{
	struct thread_data *td;
	char *str0, *str;
	int t;

	if (!g->p.cpu_list_str)
		return 0;

	dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);

	str0 = str = strdup(g->p.cpu_list_str);
	t = 0;

	BUG_ON(!str);

	tprintf("# binding tasks to CPUs:\n");
	tprintf("# ");

	while (true) {
		int bind_cpu, bind_cpu_0, bind_cpu_1;
		char *tok, *tok_end, *tok_step, *tok_len, *tok_mul;
		int bind_len;
		int step;
		int mul;

		tok = strsep(&str, ",");
		if (!tok)
			break;

		tok_end = strstr(tok, "-");

		dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end);
		if (!tok_end) {
			/* Single CPU specified: */
			bind_cpu_0 = bind_cpu_1 = atol(tok);
		} else {
			/* CPU range specified (for example: "5-11"): */
			bind_cpu_0 = atol(tok);
			bind_cpu_1 = atol(tok_end + 1);
		}

		step = 1;
		tok_step = strstr(tok, "#");
		if (tok_step) {
			step = atol(tok_step + 1);
			BUG_ON(step <= 0 || step >= g->p.nr_cpus);
		}

		/*
		 * Mask length.
		 * Eg: "--cpus 8_4-16#4" means: '--cpus 8_4,12_4,16_4',
		 * where the _4 means the next 4 CPUs are allowed.
		 */
		bind_len = 1;
		tok_len = strstr(tok, "_");
		if (tok_len) {
			bind_len = atol(tok_len + 1);
			BUG_ON(bind_len <= 0 || bind_len > g->p.nr_cpus);
		}

		/* Multiplicator shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */
		mul = 1;
		tok_mul = strstr(tok, "x");
		if (tok_mul) {
			mul = atol(tok_mul + 1);
			BUG_ON(mul <= 0);
		}

		dprintf("CPUs: %d_%d-%d#%dx%d\n", bind_cpu_0, bind_len, bind_cpu_1, step, mul);

		if (bind_cpu_0 >= g->p.nr_cpus || bind_cpu_1 >= g->p.nr_cpus) {
			printf("\nTest not applicable, system has only %d CPUs.\n", g->p.nr_cpus);
			return -1;
		}

		BUG_ON(bind_cpu_0 < 0 || bind_cpu_1 < 0);
		BUG_ON(bind_cpu_0 > bind_cpu_1);

		for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) {
			int i;

			for (i = 0; i < mul; i++) {
				int cpu;

				/* More binding specs than tasks - leave the rest unbound: */
				if (t >= g->p.nr_tasks) {
					printf("\n# NOTE: ignoring bind CPUs starting at CPU#%d\n #", bind_cpu);
					goto out;
				}
				td = g->threads + t;

				if (t)
					tprintf(",");
				if (bind_len > 1) {
					tprintf("%2d/%d", bind_cpu, bind_len);
				} else {
					tprintf("%2d", bind_cpu);
				}

				/* Allow 'bind_len' consecutive CPUs starting at bind_cpu: */
				CPU_ZERO(&td->bind_cpumask);
				for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) {
					BUG_ON(cpu < 0 || cpu >= g->p.nr_cpus);
					CPU_SET(cpu, &td->bind_cpumask);
				}
				t++;
			}
		}
	}
out:

	tprintf("\n");

	if (t < g->p.nr_tasks)
		printf("# NOTE: %d tasks bound, %d tasks unbound\n", t, g->p.nr_tasks - t);

	/* NOTE(review): str0 leaks on the early -1 return above - minor,
	 * the benchmark bails out on that path anyway. */
	free(str0);
	return 0;
}
558 | ||
559 | static int parse_cpus_opt(const struct option *opt __maybe_unused, | |
560 | const char *arg, int unset __maybe_unused) | |
561 | { | |
562 | if (!arg) | |
563 | return -1; | |
564 | ||
565 | return parse_cpu_list(arg); | |
566 | } | |
567 | ||
/*
 * Remember the raw -M argument; the actual parsing happens later in
 * parse_setup_node_list(), once the topology is known. Always returns 0.
 */
static int parse_node_list(const char *arg)
{
	p0.node_list_str = strdup(arg);

	dprintf("got NODE list: {%s}\n", p0.node_list_str);

	return 0;
}
576 | ||
/*
 * Parse the -M memory-node binding list into per-task bind_node fields.
 *
 * Grammar per comma-separated token: "node" or "node1-node2" (range),
 * with optional modifiers "#step" (stride) and "xmul" (repeat count).
 * Returns 0 on success, -1 when the list references nodes this system
 * does not have:
 */
static int parse_setup_node_list(void)
{
	struct thread_data *td;
	char *str0, *str;
	int t;

	if (!g->p.node_list_str)
		return 0;

	dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);

	str0 = str = strdup(g->p.node_list_str);
	t = 0;

	BUG_ON(!str);

	tprintf("# binding tasks to NODEs:\n");
	tprintf("# ");

	while (true) {
		int bind_node, bind_node_0, bind_node_1;
		char *tok, *tok_end, *tok_step, *tok_mul;
		int step;
		int mul;

		tok = strsep(&str, ",");
		if (!tok)
			break;

		tok_end = strstr(tok, "-");

		dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end);
		if (!tok_end) {
			/* Single NODE specified: */
			bind_node_0 = bind_node_1 = atol(tok);
		} else {
			/* NODE range specified (for example: "5-11"): */
			bind_node_0 = atol(tok);
			bind_node_1 = atol(tok_end + 1);
		}

		step = 1;
		tok_step = strstr(tok, "#");
		if (tok_step) {
			step = atol(tok_step + 1);
			BUG_ON(step <= 0 || step >= g->p.nr_nodes);
		}

		/* Multiplicator shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */
		mul = 1;
		tok_mul = strstr(tok, "x");
		if (tok_mul) {
			mul = atol(tok_mul + 1);
			BUG_ON(mul <= 0);
		}

		dprintf("NODEs: %d-%d #%d\n", bind_node_0, bind_node_1, step);

		if (bind_node_0 >= g->p.nr_nodes || bind_node_1 >= g->p.nr_nodes) {
			printf("\nTest not applicable, system has only %d nodes.\n", g->p.nr_nodes);
			return -1;
		}

		BUG_ON(bind_node_0 < 0 || bind_node_1 < 0);
		BUG_ON(bind_node_0 > bind_node_1);

		for (bind_node = bind_node_0; bind_node <= bind_node_1; bind_node += step) {
			int i;

			for (i = 0; i < mul; i++) {
				/* More binding specs than tasks - leave the rest unbound: */
				if (t >= g->p.nr_tasks) {
					printf("\n# NOTE: ignoring bind NODEs starting at NODE#%d\n", bind_node);
					goto out;
				}
				td = g->threads + t;

				if (!t)
					tprintf(" %2d", bind_node);
				else
					tprintf(",%2d", bind_node);

				td->bind_node = bind_node;
				t++;
			}
		}
	}
out:

	tprintf("\n");

	if (t < g->p.nr_tasks)
		printf("# NOTE: %d tasks mem-bound, %d tasks unbound\n", t, g->p.nr_tasks - t);

	/* NOTE(review): str0 leaks on the early -1 return above - minor,
	 * the benchmark bails out on that path anyway. */
	free(str0);
	return 0;
}
673 | ||
674 | static int parse_nodes_opt(const struct option *opt __maybe_unused, | |
675 | const char *arg, int unset __maybe_unused) | |
676 | { | |
677 | if (!arg) | |
678 | return -1; | |
679 | ||
680 | return parse_node_list(arg); | |
681 | ||
682 | return 0; | |
683 | } | |
684 | ||
/* (Fixed: parenthesize the macro argument so expressions expand safely.) */
#define BIT(x) (1ul << (x))

/*
 * One step of a 32-bit Galois LFSR with taps at bits 31, 6, 5 and 1 -
 * a cheap deterministic pseudo-random sequence used for the -r
 * random-walk access pattern. Never returns to 0 from a non-zero seed.
 */
static inline uint32_t lfsr_32(uint32_t lfsr)
{
	const uint32_t taps = BIT(1) | BIT(5) | BIT(6) | BIT(31);

	/* (0 - lsb) is an all-ones mask when the shifted-out bit was set: */
	return (lfsr>>1) ^ ((0x0u - (lfsr & 0x1u)) & taps);
}
692 | ||
/*
 * Make sure there's real data dependency to RAM (when read
 * accesses are enabled), so the compiler, the CPU and the
 * kernel (KSM, zero page, etc.) cannot optimize away RAM
 * accesses:
 */
static inline u64 access_data(u64 *data __attribute__((unused)), u64 val)
{
	if (g->p.data_reads)
		val += *data;		/* -R: fold the loaded word into the chain */
	if (g->p.data_writes)
		*data = val + 1;	/* -W: store a value derived from the chain */
	return val;
}
707 | ||
/*
 * The worker process does two types of work, a forwards going
 * loop and a backwards going loop.
 *
 * We do this so that on multiprocessor systems we do not create
 * a 'train' of processing, with highly synchronized processes,
 * skewing the whole benchmark.
 *
 * Walks the @bytes-sized buffer once, starting at an offset derived
 * from task nr and loop nr, chaining each access through @val (see
 * access_data()); returns the updated chain value.
 */
static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val)
{
	long words = bytes/sizeof(u64);
	u64 *data = (void *)__data;
	long chunk_0, chunk_1;
	u64 *d0, *d, *d1;
	long off;
	long i;

	BUG_ON(!data && words);
	BUG_ON(data && !words);

	if (!data)
		return val;

	/* Very simple memset() work variant: */
	if (g->p.data_zero_memset && !g->p.data_rand_walk) {
		bzero(data, bytes);
		return val;
	}

	/* Spread out by PID/TID nr and by loop nr: */
	chunk_0 = words/nr_max;
	chunk_1 = words/g->p.nr_loops;
	off = nr*chunk_0 + loop*chunk_1;

	/* Wrap the start offset back into the buffer: */
	while (off >= words)
		off -= words;

	if (g->p.data_rand_walk) {
		/* -r: visit ~1024-word chunks at LFSR-random positions: */
		u32 lfsr = nr + loop + val;
		int j;

		for (i = 0; i < words/1024; i++) {
			long start, end;

			lfsr = lfsr_32(lfsr);

			start = lfsr % words;
			end = min(start + 1024, words-1);

			if (g->p.data_zero_memset) {
				bzero(data + start, (end-start) * sizeof(u64));
			} else {
				for (j = start; j < end; j++)
					val = access_data(data + j, val);
			}
		}
	} else if (!g->p.data_backwards || (nr + loop) & 1) {
		/* Process data forwards, wrapping around, until back at off: */

		d0 = data + off;
		d  = data + off + 1;
		d1 = data + words;

		for (;;) {
			if (unlikely(d >= d1))
				d = data;
			if (unlikely(d == d0))
				break;

			val = access_data(d, val);

			d++;
		}
	} else {
		/* Process data backwards, wrapping around, until back at off: */

		d0 = data + off;
		d  = data + off - 1;
		d1 = data + words;

		for (;;) {
			if (unlikely(d < data))
				d = data + words-1;
			if (unlikely(d == d0))
				break;

			val = access_data(d, val);

			d--;
		}
	}

	return val;
}
803 | ||
/*
 * Publish the CPU this task currently runs on, for convergence
 * monitoring by the main thread:
 */
static void update_curr_cpu(int task_nr, unsigned long bytes_worked)
{
	unsigned int cpu;

	cpu = sched_getcpu();

	g->threads[task_nr].curr_cpu = cpu;
	/* NOTE(review): option 0 is not a valid prctl() command - this looks
	 * like a debugging aid that makes bytes_worked visible in strace
	 * output; the error return is deliberately ignored. TODO: confirm. */
	prctl(0, bytes_worked);
}
813 | ||
814 | #define MAX_NR_NODES 64 | |
815 | ||
816 | /* | |
817 | * Count the number of nodes a process's threads | |
818 | * are spread out on. | |
819 | * | |
820 | * A count of 1 means that the process is compressed | |
821 | * to a single node. A count of g->p.nr_nodes means it's | |
822 | * spread out on the whole system. | |
823 | */ | |
824 | static int count_process_nodes(int process_nr) | |
825 | { | |
826 | char node_present[MAX_NR_NODES] = { 0, }; | |
827 | int nodes; | |
828 | int n, t; | |
829 | ||
830 | for (t = 0; t < g->p.nr_threads; t++) { | |
831 | struct thread_data *td; | |
832 | int task_nr; | |
833 | int node; | |
834 | ||
835 | task_nr = process_nr*g->p.nr_threads + t; | |
836 | td = g->threads + task_nr; | |
837 | ||
838 | node = numa_node_of_cpu(td->curr_cpu); | |
1d90a685 PH |
839 | if (node < 0) /* curr_cpu was likely still -1 */ |
840 | return 0; | |
841 | ||
1c13f3c9 IM |
842 | node_present[node] = 1; |
843 | } | |
844 | ||
845 | nodes = 0; | |
846 | ||
847 | for (n = 0; n < MAX_NR_NODES; n++) | |
848 | nodes += node_present[n]; | |
849 | ||
850 | return nodes; | |
851 | } | |
852 | ||
853 | /* | |
854 | * Count the number of distinct process-threads a node contains. | |
855 | * | |
856 | * A count of 1 means that the node contains only a single | |
857 | * process. If all nodes on the system contain at most one | |
858 | * process then we are well-converged. | |
859 | */ | |
860 | static int count_node_processes(int node) | |
861 | { | |
862 | int processes = 0; | |
863 | int t, p; | |
864 | ||
865 | for (p = 0; p < g->p.nr_proc; p++) { | |
866 | for (t = 0; t < g->p.nr_threads; t++) { | |
867 | struct thread_data *td; | |
868 | int task_nr; | |
869 | int n; | |
870 | ||
871 | task_nr = p*g->p.nr_threads + t; | |
872 | td = g->threads + task_nr; | |
873 | ||
874 | n = numa_node_of_cpu(td->curr_cpu); | |
875 | if (n == node) { | |
876 | processes++; | |
877 | break; | |
878 | } | |
879 | } | |
880 | } | |
881 | ||
882 | return processes; | |
883 | } | |
884 | ||
/*
 * Determine whether convergence is 'strong': every process compressed
 * onto exactly one node. Sets *strong to 1 or 0; bails out with 0 if
 * any process has threads that were not scheduled yet:
 */
static void calc_convergence_compression(int *strong)
{
	unsigned int nodes_min, nodes_max;
	int p;

	nodes_min = -1;		/* == UINT_MAX */
	nodes_max =  0;

	for (p = 0; p < g->p.nr_proc; p++) {
		unsigned int nodes = count_process_nodes(p);

		if (!nodes) {
			/* Some thread has no CPU placement yet - no verdict: */
			*strong = 0;
			return;
		}

		nodes_min = min(nodes, nodes_min);
		nodes_max = max(nodes, nodes_max);
	}

	/* Strong convergence: all threads compress on a single node: */
	if (nodes_min == 1 && nodes_max == 1) {
		*strong = 1;
	} else {
		*strong = 0;
		tprintf(" {%d-%d}", nodes_min, nodes_max);
	}
}
913 | ||
/*
 * Sample the current placement of all tasks, print a convergence status
 * line and - once every process has compressed onto its own node -
 * record the convergence latency in *convergence (and optionally stop
 * the run when -m was given):
 */
static void calc_convergence(double runtime_ns_max, double *convergence)
{
	unsigned int loops_done_min, loops_done_max;
	int process_groups;
	int nodes[MAX_NR_NODES];
	int distance;
	int nr_min;
	int nr_max;
	int strong;
	int sum;
	int nr;
	int node;
	int cpu;
	int t;

	if (!g->p.show_convergence && !g->p.measure_convergence)
		return;

	/* NOTE(review): nodes[] is MAX_NR_NODES long - assumes
	 * g->p.nr_nodes <= MAX_NR_NODES; verify on very large systems. */
	for (node = 0; node < g->p.nr_nodes; node++)
		nodes[node] = 0;

	loops_done_min = -1;	/* == UINT_MAX */
	loops_done_max = 0;

	/* Histogram tasks per node, and track the loop-progress spread: */
	for (t = 0; t < g->p.nr_tasks; t++) {
		struct thread_data *td = g->threads + t;
		unsigned int loops_done;

		cpu = td->curr_cpu;

		/* Not all threads have written it yet: */
		if (cpu < 0)
			continue;

		node = numa_node_of_cpu(cpu);

		nodes[node]++;

		loops_done = td->loops_done;
		loops_done_min = min(loops_done, loops_done_min);
		loops_done_max = max(loops_done, loops_done_max);
	}

	nr_max = 0;
	nr_min = g->p.nr_tasks;
	sum = 0;

	for (node = 0; node < g->p.nr_nodes; node++) {
		nr = nodes[node];
		nr_min = min(nr, nr_min);
		nr_max = max(nr, nr_max);
		sum += nr;
	}
	BUG_ON(nr_min > nr_max);

	BUG_ON(sum > g->p.nr_tasks);

	/* Disabled (dead) check: would skip the printout while some
	 * tasks have not been scheduled yet: */
	if (0 && (sum < g->p.nr_tasks))
		return;

	/*
	 * Count the number of distinct process groups present
	 * on nodes - when we are converged this will decrease
	 * to g->p.nr_proc:
	 */
	process_groups = 0;

	for (node = 0; node < g->p.nr_nodes; node++) {
		int processes = count_node_processes(node);

		nr = nodes[node];
		tprintf(" %2d/%-2d", nr, processes);

		process_groups += processes;
	}

	distance = nr_max - nr_min;

	tprintf(" [%2d/%-2d]", distance, process_groups);

	tprintf(" l:%3d-%-3d (%3d)",
		loops_done_min, loops_done_max, loops_done_max-loops_done_min);

	/* Relative progress skew between the slowest and fastest task: */
	if (loops_done_min && loops_done_max) {
		double skew = 1.0 - (double)loops_done_min/loops_done_max;

		tprintf(" [%4.1f%%]", skew * 100.0);
	}

	calc_convergence_compression(&strong);

	if (strong && process_groups == g->p.nr_proc) {
		/* Record the first moment of full convergence only: */
		if (!*convergence) {
			*convergence = runtime_ns_max;
			tprintf(" (%6.1fs converged)\n", *convergence / NSEC_PER_SEC);
			if (g->p.measure_convergence) {
				g->all_converged = true;
				g->stop_work = true;
			}
		}
	} else {
		if (*convergence) {
			tprintf(" (%6.1fs de-converged)", runtime_ns_max / NSEC_PER_SEC);
			*convergence = 0;
		}
		tprintf("\n");
	}
}
1022 | ||
/*
 * Print the periodic progress line: percentage of loops done, elapsed
 * wall-clock time, plus the convergence status when -c/-m is active:
 */
static void show_summary(double runtime_ns_max, int l, double *convergence)
{
	tprintf("\r # %5.1f%% [%.1f mins]",
		(double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max / NSEC_PER_SEC / 60.0);

	calc_convergence(runtime_ns_max, convergence);

	/* Progress uses \r - flush so partial lines become visible: */
	if (g->p.show_details >= 0)
		fflush(stdout);
}
1033 | ||
1034 | static void *worker_thread(void *__tdata) | |
1035 | { | |
1036 | struct thread_data *td = __tdata; | |
1037 | struct timeval start0, start, stop, diff; | |
1038 | int process_nr = td->process_nr; | |
1039 | int thread_nr = td->thread_nr; | |
1040 | unsigned long last_perturbance; | |
1041 | int task_nr = td->task_nr; | |
1042 | int details = g->p.show_details; | |
1043 | int first_task, last_task; | |
1044 | double convergence = 0; | |
1045 | u64 val = td->val; | |
1046 | double runtime_ns_max; | |
1047 | u8 *global_data; | |
1048 | u8 *process_data; | |
1049 | u8 *thread_data; | |
1050 | u64 bytes_done; | |
1051 | long work_done; | |
1052 | u32 l; | |
b64aa553 | 1053 | struct rusage rusage; |
1c13f3c9 IM |
1054 | |
1055 | bind_to_cpumask(td->bind_cpumask); | |
1056 | bind_to_memnode(td->bind_node); | |
1057 | ||
1058 | set_taskname("thread %d/%d", process_nr, thread_nr); | |
1059 | ||
1060 | global_data = g->data; | |
1061 | process_data = td->process_data; | |
1062 | thread_data = setup_private_data(g->p.bytes_thread); | |
1063 | ||
1064 | bytes_done = 0; | |
1065 | ||
1066 | last_task = 0; | |
1067 | if (process_nr == g->p.nr_proc-1 && thread_nr == g->p.nr_threads-1) | |
1068 | last_task = 1; | |
1069 | ||
1070 | first_task = 0; | |
1071 | if (process_nr == 0 && thread_nr == 0) | |
1072 | first_task = 1; | |
1073 | ||
1074 | if (details >= 2) { | |
1075 | printf("# thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n", | |
1076 | process_nr, thread_nr, global_data, process_data, thread_data); | |
1077 | } | |
1078 | ||
1079 | if (g->p.serialize_startup) { | |
1080 | pthread_mutex_lock(&g->startup_mutex); | |
1081 | g->nr_tasks_started++; | |
1082 | pthread_mutex_unlock(&g->startup_mutex); | |
1083 | ||
1084 | /* Here we will wait for the main process to start us all at once: */ | |
1085 | pthread_mutex_lock(&g->start_work_mutex); | |
1086 | g->nr_tasks_working++; | |
1087 | ||
1088 | /* Last one wake the main process: */ | |
1089 | if (g->nr_tasks_working == g->p.nr_tasks) | |
1090 | pthread_mutex_unlock(&g->startup_done_mutex); | |
1091 | ||
1092 | pthread_mutex_unlock(&g->start_work_mutex); | |
1093 | } | |
1094 | ||
1095 | gettimeofday(&start0, NULL); | |
1096 | ||
1097 | start = stop = start0; | |
1098 | last_perturbance = start.tv_sec; | |
1099 | ||
1100 | for (l = 0; l < g->p.nr_loops; l++) { | |
1101 | start = stop; | |
1102 | ||
1103 | if (g->stop_work) | |
1104 | break; | |
1105 | ||
1106 | val += do_work(global_data, g->p.bytes_global, process_nr, g->p.nr_proc, l, val); | |
1107 | val += do_work(process_data, g->p.bytes_process, thread_nr, g->p.nr_threads, l, val); | |
1108 | val += do_work(thread_data, g->p.bytes_thread, 0, 1, l, val); | |
1109 | ||
1110 | if (g->p.sleep_usecs) { | |
1111 | pthread_mutex_lock(td->process_lock); | |
1112 | usleep(g->p.sleep_usecs); | |
1113 | pthread_mutex_unlock(td->process_lock); | |
1114 | } | |
1115 | /* | |
1116 | * Amount of work to be done under a process-global lock: | |
1117 | */ | |
1118 | if (g->p.bytes_process_locked) { | |
1119 | pthread_mutex_lock(td->process_lock); | |
1120 | val += do_work(process_data, g->p.bytes_process_locked, thread_nr, g->p.nr_threads, l, val); | |
1121 | pthread_mutex_unlock(td->process_lock); | |
1122 | } | |
1123 | ||
1124 | work_done = g->p.bytes_global + g->p.bytes_process + | |
1125 | g->p.bytes_process_locked + g->p.bytes_thread; | |
1126 | ||
1127 | update_curr_cpu(task_nr, work_done); | |
1128 | bytes_done += work_done; | |
1129 | ||
1130 | if (details < 0 && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs) | |
1131 | continue; | |
1132 | ||
1133 | td->loops_done = l; | |
1134 | ||
1135 | gettimeofday(&stop, NULL); | |
1136 | ||
1137 | /* Check whether our max runtime timed out: */ | |
1138 | if (g->p.nr_secs) { | |
1139 | timersub(&stop, &start0, &diff); | |
2100f778 | 1140 | if ((u32)diff.tv_sec >= g->p.nr_secs) { |
1c13f3c9 IM |
1141 | g->stop_work = true; |
1142 | break; | |
1143 | } | |
1144 | } | |
1145 | ||
1146 | /* Update the summary at most once per second: */ | |
1147 | if (start.tv_sec == stop.tv_sec) | |
1148 | continue; | |
1149 | ||
1150 | /* | |
1151 | * Perturb the first task's equilibrium every g->p.perturb_secs seconds, | |
1152 | * by migrating to CPU#0: | |
1153 | */ | |
1154 | if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) { | |
1155 | cpu_set_t orig_mask; | |
1156 | int target_cpu; | |
1157 | int this_cpu; | |
1158 | ||
1159 | last_perturbance = stop.tv_sec; | |
1160 | ||
1161 | /* | |
1162 | * Depending on where we are running, move into | |
1163 | * the other half of the system, to create some | |
1164 | * real disturbance: | |
1165 | */ | |
1166 | this_cpu = g->threads[task_nr].curr_cpu; | |
1167 | if (this_cpu < g->p.nr_cpus/2) | |
1168 | target_cpu = g->p.nr_cpus-1; | |
1169 | else | |
1170 | target_cpu = 0; | |
1171 | ||
1172 | orig_mask = bind_to_cpu(target_cpu); | |
1173 | ||
1174 | /* Here we are running on the target CPU already */ | |
1175 | if (details >= 1) | |
1176 | printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu); | |
1177 | ||
1178 | bind_to_cpumask(orig_mask); | |
1179 | } | |
1180 | ||
1181 | if (details >= 3) { | |
1182 | timersub(&stop, &start, &diff); | |
a8ad8329 ACM |
1183 | runtime_ns_max = diff.tv_sec * NSEC_PER_SEC; |
1184 | runtime_ns_max += diff.tv_usec * NSEC_PER_USEC; | |
1c13f3c9 IM |
1185 | |
1186 | if (details >= 0) { | |
2100f778 | 1187 | printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016"PRIx64"]\n", |
1c13f3c9 IM |
1188 | process_nr, thread_nr, runtime_ns_max / bytes_done, val); |
1189 | } | |
1190 | fflush(stdout); | |
1191 | } | |
1192 | if (!last_task) | |
1193 | continue; | |
1194 | ||
1195 | timersub(&stop, &start0, &diff); | |
a8ad8329 ACM |
1196 | runtime_ns_max = diff.tv_sec * NSEC_PER_SEC; |
1197 | runtime_ns_max += diff.tv_usec * NSEC_PER_USEC; | |
1c13f3c9 IM |
1198 | |
1199 | show_summary(runtime_ns_max, l, &convergence); | |
1200 | } | |
1201 | ||
1202 | gettimeofday(&stop, NULL); | |
1203 | timersub(&stop, &start0, &diff); | |
a8ad8329 ACM |
1204 | td->runtime_ns = diff.tv_sec * NSEC_PER_SEC; |
1205 | td->runtime_ns += diff.tv_usec * NSEC_PER_USEC; | |
1206 | td->speed_gbs = bytes_done / (td->runtime_ns / NSEC_PER_SEC) / 1e9; | |
b64aa553 PH |
1207 | |
1208 | getrusage(RUSAGE_THREAD, &rusage); | |
a8ad8329 ACM |
1209 | td->system_time_ns = rusage.ru_stime.tv_sec * NSEC_PER_SEC; |
1210 | td->system_time_ns += rusage.ru_stime.tv_usec * NSEC_PER_USEC; | |
1211 | td->user_time_ns = rusage.ru_utime.tv_sec * NSEC_PER_SEC; | |
1212 | td->user_time_ns += rusage.ru_utime.tv_usec * NSEC_PER_USEC; | |
1c13f3c9 IM |
1213 | |
1214 | free_data(thread_data, g->p.bytes_thread); | |
1215 | ||
1216 | pthread_mutex_lock(&g->stop_work_mutex); | |
1217 | g->bytes_done += bytes_done; | |
1218 | pthread_mutex_unlock(&g->stop_work_mutex); | |
1219 | ||
1220 | return NULL; | |
1221 | } | |
1222 | ||
1223 | /* | |
1224 | * A worker process starts a couple of threads: | |
1225 | */ | |
1226 | static void worker_process(int process_nr) | |
1227 | { | |
1228 | pthread_mutex_t process_lock; | |
1229 | struct thread_data *td; | |
1230 | pthread_t *pthreads; | |
1231 | u8 *process_data; | |
1232 | int task_nr; | |
1233 | int ret; | |
1234 | int t; | |
1235 | ||
1236 | pthread_mutex_init(&process_lock, NULL); | |
1237 | set_taskname("process %d", process_nr); | |
1238 | ||
1239 | /* | |
1240 | * Pick up the memory policy and the CPU binding of our first thread, | |
1241 | * so that we initialize memory accordingly: | |
1242 | */ | |
1243 | task_nr = process_nr*g->p.nr_threads; | |
1244 | td = g->threads + task_nr; | |
1245 | ||
1246 | bind_to_memnode(td->bind_node); | |
1247 | bind_to_cpumask(td->bind_cpumask); | |
1248 | ||
1249 | pthreads = zalloc(g->p.nr_threads * sizeof(pthread_t)); | |
1250 | process_data = setup_private_data(g->p.bytes_process); | |
1251 | ||
1252 | if (g->p.show_details >= 3) { | |
1253 | printf(" # process %2d global mem: %p, process mem: %p\n", | |
1254 | process_nr, g->data, process_data); | |
1255 | } | |
1256 | ||
1257 | for (t = 0; t < g->p.nr_threads; t++) { | |
1258 | task_nr = process_nr*g->p.nr_threads + t; | |
1259 | td = g->threads + task_nr; | |
1260 | ||
1261 | td->process_data = process_data; | |
1262 | td->process_nr = process_nr; | |
1263 | td->thread_nr = t; | |
1264 | td->task_nr = task_nr; | |
1265 | td->val = rand(); | |
1266 | td->curr_cpu = -1; | |
1267 | td->process_lock = &process_lock; | |
1268 | ||
1269 | ret = pthread_create(pthreads + t, NULL, worker_thread, td); | |
1270 | BUG_ON(ret); | |
1271 | } | |
1272 | ||
1273 | for (t = 0; t < g->p.nr_threads; t++) { | |
1274 | ret = pthread_join(pthreads[t], NULL); | |
1275 | BUG_ON(ret); | |
1276 | } | |
1277 | ||
1278 | free_data(process_data, g->p.bytes_process); | |
1279 | free(pthreads); | |
1280 | } | |
1281 | ||
/*
 * Print the benchmark configuration (task/node/CPU counts and the
 * global/process/thread memory sizes) before the run starts.
 * Suppressed entirely in quiet mode (show_details < 0).
 */
static void print_summary(void)
{
	if (g->p.show_details < 0)
		return;

	printf("\n ###\n");
	printf(" # %d %s will execute (on %d nodes, %d CPUs):\n",
		g->p.nr_tasks, g->p.nr_tasks == 1 ? "task" : "tasks", g->p.nr_nodes, g->p.nr_cpus);
	printf(" #      %5dx %5ldMB global  shared mem operations\n",
			g->p.nr_loops, g->p.bytes_global/1024/1024);
	printf(" #      %5dx %5ldMB process shared mem operations\n",
			g->p.nr_loops, g->p.bytes_process/1024/1024);
	printf(" #      %5dx %5ldMB thread  local  mem operations\n",
			g->p.nr_loops, g->p.bytes_thread/1024/1024);

	printf(" ###\n");

	printf("\n ###\n"); fflush(stdout);
}
1301 | ||
1302 | static void init_thread_data(void) | |
1303 | { | |
1304 | ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; | |
1305 | int t; | |
1306 | ||
1307 | g->threads = zalloc_shared_data(size); | |
1308 | ||
1309 | for (t = 0; t < g->p.nr_tasks; t++) { | |
1310 | struct thread_data *td = g->threads + t; | |
1311 | int cpu; | |
1312 | ||
1313 | /* Allow all nodes by default: */ | |
1314 | td->bind_node = -1; | |
1315 | ||
1316 | /* Allow all CPUs by default: */ | |
1317 | CPU_ZERO(&td->bind_cpumask); | |
1318 | for (cpu = 0; cpu < g->p.nr_cpus; cpu++) | |
1319 | CPU_SET(cpu, &td->bind_cpumask); | |
1320 | } | |
1321 | } | |
1322 | ||
1323 | static void deinit_thread_data(void) | |
1324 | { | |
1325 | ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; | |
1326 | ||
1327 | free_data(g->threads, size); | |
1328 | } | |
1329 | ||
/*
 * Allocate and populate the global benchmark state 'g' from the parsed
 * option block p0, convert the MB option strings into byte counts, set
 * up the shared data area and the startup-serialization mutexes.
 *
 * Returns 0 on success, -1 when no memory size was specified or the
 * CPU/node binding lists fail to parse.
 */
static int init(void)
{
	/* 'g' lives in MAP_SHARED memory so forked worker processes see it: */
	g = (void *)alloc_data(sizeof(*g), MAP_SHARED, 1, 0, 0 /* THP */, 0);

	/* Copy over options: */
	g->p = p0;

	g->p.nr_cpus = numa_num_configured_cpus();

	g->p.nr_nodes = numa_max_node() + 1;

	/* char array in count_process_nodes(): */
	BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0);

	/* Quiet mode maps to show_details == -1 unless details were requested: */
	if (g->p.show_quiet && !g->p.show_details)
		g->p.show_details = -1;

	/* Some memory should be specified: */
	if (!g->p.mb_global_str && !g->p.mb_proc_str && !g->p.mb_thread_str)
		return -1;

	if (g->p.mb_global_str) {
		g->p.mb_global = atof(g->p.mb_global_str);
		BUG_ON(g->p.mb_global < 0);
	}

	if (g->p.mb_proc_str) {
		g->p.mb_proc = atof(g->p.mb_proc_str);
		BUG_ON(g->p.mb_proc < 0);
	}

	if (g->p.mb_proc_locked_str) {
		g->p.mb_proc_locked = atof(g->p.mb_proc_locked_str);
		BUG_ON(g->p.mb_proc_locked < 0);
		/* The locked portion is a subset of the process working set: */
		BUG_ON(g->p.mb_proc_locked > g->p.mb_proc);
	}

	if (g->p.mb_thread_str) {
		g->p.mb_thread = atof(g->p.mb_thread_str);
		BUG_ON(g->p.mb_thread < 0);
	}

	BUG_ON(g->p.nr_threads <= 0);
	BUG_ON(g->p.nr_proc <= 0);

	g->p.nr_tasks = g->p.nr_proc*g->p.nr_threads;

	/* Convert the MB-valued options to byte counts: */
	g->p.bytes_global		= g->p.mb_global	*1024L*1024L;
	g->p.bytes_process		= g->p.mb_proc		*1024L*1024L;
	g->p.bytes_process_locked	= g->p.mb_proc_locked	*1024L*1024L;
	g->p.bytes_thread		= g->p.mb_thread	*1024L*1024L;

	g->data = setup_shared_data(g->p.bytes_global);

	/* Startup serialization: */
	init_global_mutex(&g->start_work_mutex);
	init_global_mutex(&g->startup_mutex);
	init_global_mutex(&g->startup_done_mutex);
	init_global_mutex(&g->stop_work_mutex);

	init_thread_data();

	tprintf("#\n");
	if (parse_setup_cpu_list() || parse_setup_node_list())
		return -1;
	tprintf("#\n");

	print_summary();

	return 0;
}
1401 | ||
1402 | static void deinit(void) | |
1403 | { | |
1404 | free_data(g->data, g->p.bytes_global); | |
1405 | g->data = NULL; | |
1406 | ||
1407 | deinit_thread_data(); | |
1408 | ||
1409 | free_data(g, sizeof(*g)); | |
1410 | g = NULL; | |
1411 | } | |
1412 | ||
1413 | /* | |
1414 | * Print a short or long result, depending on the verbosity setting: | |
1415 | */ | |
1416 | static void print_res(const char *name, double val, | |
1417 | const char *txt_unit, const char *txt_short, const char *txt_long) | |
1418 | { | |
1419 | if (!name) | |
1420 | name = "main,"; | |
1421 | ||
24f1ced1 | 1422 | if (!g->p.show_quiet) |
1c13f3c9 IM |
1423 | printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short); |
1424 | else | |
1425 | printf(" %14.3f %s\n", val, txt_long); | |
1426 | } | |
1427 | ||
1428 | static int __bench_numa(const char *name) | |
1429 | { | |
1430 | struct timeval start, stop, diff; | |
1431 | u64 runtime_ns_min, runtime_ns_sum; | |
1432 | pid_t *pids, pid, wpid; | |
1433 | double delta_runtime; | |
1434 | double runtime_avg; | |
1435 | double runtime_sec_max; | |
1436 | double runtime_sec_min; | |
1437 | int wait_stat; | |
1438 | double bytes; | |
b64aa553 | 1439 | int i, t, p; |
1c13f3c9 IM |
1440 | |
1441 | if (init()) | |
1442 | return -1; | |
1443 | ||
1444 | pids = zalloc(g->p.nr_proc * sizeof(*pids)); | |
1445 | pid = -1; | |
1446 | ||
1447 | /* All threads try to acquire it, this way we can wait for them to start up: */ | |
1448 | pthread_mutex_lock(&g->start_work_mutex); | |
1449 | ||
1450 | if (g->p.serialize_startup) { | |
1451 | tprintf(" #\n"); | |
1452 | tprintf(" # Startup synchronization: ..."); fflush(stdout); | |
1453 | } | |
1454 | ||
1455 | gettimeofday(&start, NULL); | |
1456 | ||
1457 | for (i = 0; i < g->p.nr_proc; i++) { | |
1458 | pid = fork(); | |
1459 | dprintf(" # process %2d: PID %d\n", i, pid); | |
1460 | ||
1461 | BUG_ON(pid < 0); | |
1462 | if (!pid) { | |
1463 | /* Child process: */ | |
1464 | worker_process(i); | |
1465 | ||
1466 | exit(0); | |
1467 | } | |
1468 | pids[i] = pid; | |
1469 | ||
1470 | } | |
1471 | /* Wait for all the threads to start up: */ | |
1472 | while (g->nr_tasks_started != g->p.nr_tasks) | |
a8ad8329 | 1473 | usleep(USEC_PER_MSEC); |
1c13f3c9 IM |
1474 | |
1475 | BUG_ON(g->nr_tasks_started != g->p.nr_tasks); | |
1476 | ||
1477 | if (g->p.serialize_startup) { | |
1478 | double startup_sec; | |
1479 | ||
1480 | pthread_mutex_lock(&g->startup_done_mutex); | |
1481 | ||
1482 | /* This will start all threads: */ | |
1483 | pthread_mutex_unlock(&g->start_work_mutex); | |
1484 | ||
1485 | /* This mutex is locked - the last started thread will wake us: */ | |
1486 | pthread_mutex_lock(&g->startup_done_mutex); | |
1487 | ||
1488 | gettimeofday(&stop, NULL); | |
1489 | ||
1490 | timersub(&stop, &start, &diff); | |
1491 | ||
a8ad8329 ACM |
1492 | startup_sec = diff.tv_sec * NSEC_PER_SEC; |
1493 | startup_sec += diff.tv_usec * NSEC_PER_USEC; | |
1494 | startup_sec /= NSEC_PER_SEC; | |
1c13f3c9 IM |
1495 | |
1496 | tprintf(" threads initialized in %.6f seconds.\n", startup_sec); | |
1497 | tprintf(" #\n"); | |
1498 | ||
1499 | start = stop; | |
1500 | pthread_mutex_unlock(&g->startup_done_mutex); | |
1501 | } else { | |
1502 | gettimeofday(&start, NULL); | |
1503 | } | |
1504 | ||
1505 | /* Parent process: */ | |
1506 | ||
1507 | ||
1508 | for (i = 0; i < g->p.nr_proc; i++) { | |
1509 | wpid = waitpid(pids[i], &wait_stat, 0); | |
1510 | BUG_ON(wpid < 0); | |
1511 | BUG_ON(!WIFEXITED(wait_stat)); | |
1512 | ||
1513 | } | |
1514 | ||
1515 | runtime_ns_sum = 0; | |
1516 | runtime_ns_min = -1LL; | |
1517 | ||
1518 | for (t = 0; t < g->p.nr_tasks; t++) { | |
1519 | u64 thread_runtime_ns = g->threads[t].runtime_ns; | |
1520 | ||
1521 | runtime_ns_sum += thread_runtime_ns; | |
1522 | runtime_ns_min = min(thread_runtime_ns, runtime_ns_min); | |
1523 | } | |
1524 | ||
1525 | gettimeofday(&stop, NULL); | |
1526 | timersub(&stop, &start, &diff); | |
1527 | ||
1528 | BUG_ON(bench_format != BENCH_FORMAT_DEFAULT); | |
1529 | ||
1530 | tprintf("\n ###\n"); | |
1531 | tprintf("\n"); | |
1532 | ||
a8ad8329 ACM |
1533 | runtime_sec_max = diff.tv_sec * NSEC_PER_SEC; |
1534 | runtime_sec_max += diff.tv_usec * NSEC_PER_USEC; | |
1535 | runtime_sec_max /= NSEC_PER_SEC; | |
1c13f3c9 | 1536 | |
a8ad8329 | 1537 | runtime_sec_min = runtime_ns_min / NSEC_PER_SEC; |
1c13f3c9 IM |
1538 | |
1539 | bytes = g->bytes_done; | |
a8ad8329 | 1540 | runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / NSEC_PER_SEC; |
1c13f3c9 IM |
1541 | |
1542 | if (g->p.measure_convergence) { | |
1543 | print_res(name, runtime_sec_max, | |
1544 | "secs,", "NUMA-convergence-latency", "secs latency to NUMA-converge"); | |
1545 | } | |
1546 | ||
1547 | print_res(name, runtime_sec_max, | |
1548 | "secs,", "runtime-max/thread", "secs slowest (max) thread-runtime"); | |
1549 | ||
1550 | print_res(name, runtime_sec_min, | |
1551 | "secs,", "runtime-min/thread", "secs fastest (min) thread-runtime"); | |
1552 | ||
1553 | print_res(name, runtime_avg, | |
1554 | "secs,", "runtime-avg/thread", "secs average thread-runtime"); | |
1555 | ||
1556 | delta_runtime = (runtime_sec_max - runtime_sec_min)/2.0; | |
1557 | print_res(name, delta_runtime / runtime_sec_max * 100.0, | |
1558 | "%,", "spread-runtime/thread", "% difference between max/avg runtime"); | |
1559 | ||
1560 | print_res(name, bytes / g->p.nr_tasks / 1e9, | |
1561 | "GB,", "data/thread", "GB data processed, per thread"); | |
1562 | ||
1563 | print_res(name, bytes / 1e9, | |
1564 | "GB,", "data-total", "GB data processed, total"); | |
1565 | ||
a8ad8329 | 1566 | print_res(name, runtime_sec_max * NSEC_PER_SEC / (bytes / g->p.nr_tasks), |
1c13f3c9 IM |
1567 | "nsecs,", "runtime/byte/thread","nsecs/byte/thread runtime"); |
1568 | ||
1569 | print_res(name, bytes / g->p.nr_tasks / 1e9 / runtime_sec_max, | |
1570 | "GB/sec,", "thread-speed", "GB/sec/thread speed"); | |
1571 | ||
1572 | print_res(name, bytes / runtime_sec_max / 1e9, | |
1573 | "GB/sec,", "total-speed", "GB/sec total speed"); | |
1574 | ||
b64aa553 PH |
1575 | if (g->p.show_details >= 2) { |
1576 | char tname[32]; | |
1577 | struct thread_data *td; | |
1578 | for (p = 0; p < g->p.nr_proc; p++) { | |
1579 | for (t = 0; t < g->p.nr_threads; t++) { | |
1580 | memset(tname, 0, 32); | |
1581 | td = g->threads + p*g->p.nr_threads + t; | |
1582 | snprintf(tname, 32, "process%d:thread%d", p, t); | |
1583 | print_res(tname, td->speed_gbs, | |
1584 | "GB/sec", "thread-speed", "GB/sec/thread speed"); | |
a8ad8329 | 1585 | print_res(tname, td->system_time_ns / NSEC_PER_SEC, |
b64aa553 | 1586 | "secs", "thread-system-time", "system CPU time/thread"); |
a8ad8329 | 1587 | print_res(tname, td->user_time_ns / NSEC_PER_SEC, |
b64aa553 PH |
1588 | "secs", "thread-user-time", "user CPU time/thread"); |
1589 | } | |
1590 | } | |
1591 | } | |
1592 | ||
1c13f3c9 IM |
1593 | free(pids); |
1594 | ||
1595 | deinit(); | |
1596 | ||
1597 | return 0; | |
1598 | } | |
1599 | ||
1600 | #define MAX_ARGS 50 | |
1601 | ||
1602 | static int command_size(const char **argv) | |
1603 | { | |
1604 | int size = 0; | |
1605 | ||
1606 | while (*argv) { | |
1607 | size++; | |
1608 | argv++; | |
1609 | } | |
1610 | ||
1611 | BUG_ON(size >= MAX_ARGS); | |
1612 | ||
1613 | return size; | |
1614 | } | |
1615 | ||
1616 | static void init_params(struct params *p, const char *name, int argc, const char **argv) | |
1617 | { | |
1618 | int i; | |
1619 | ||
1620 | printf("\n # Running %s \"perf bench numa", name); | |
1621 | ||
1622 | for (i = 0; i < argc; i++) | |
1623 | printf(" %s", argv[i]); | |
1624 | ||
1625 | printf("\"\n"); | |
1626 | ||
1627 | memset(p, 0, sizeof(*p)); | |
1628 | ||
1629 | /* Initialize nonzero defaults: */ | |
1630 | ||
1631 | p->serialize_startup = 1; | |
1632 | p->data_reads = true; | |
1633 | p->data_writes = true; | |
1634 | p->data_backwards = true; | |
1635 | p->data_rand_walk = true; | |
1636 | p->nr_loops = -1; | |
1637 | p->init_random = true; | |
40ba93e3 RR |
1638 | p->mb_global_str = "1"; |
1639 | p->nr_proc = 1; | |
1640 | p->nr_threads = 1; | |
1641 | p->nr_secs = 5; | |
0fae799e | 1642 | p->run_all = argc == 1; |
1c13f3c9 IM |
1643 | } |
1644 | ||
1645 | static int run_bench_numa(const char *name, const char **argv) | |
1646 | { | |
1647 | int argc = command_size(argv); | |
1648 | ||
1649 | init_params(&p0, name, argc, argv); | |
1650 | argc = parse_options(argc, argv, options, bench_numa_usage, 0); | |
1651 | if (argc) | |
1652 | goto err; | |
1653 | ||
1654 | if (__bench_numa(name)) | |
1655 | goto err; | |
1656 | ||
1657 | return 0; | |
1658 | ||
1659 | err: | |
1c13f3c9 IM |
1660 | return -1; |
1661 | } | |
1662 | ||
/*
 * Shared option suffixes for the test table below.  The *_NOTHP
 * variants append a second "--thp -1", which (being last) presumably
 * overrides the earlier "--thp 1" — verify against the --thp option
 * parser.
 */
#define OPT_BW_RAM		"-s",  "20", "-zZq",    "--thp", " 1", "--no-data_rand_walk"
#define OPT_BW_RAM_NOTHP	OPT_BW_RAM,		"--thp", "-1"

#define OPT_CONV		"-s", "100", "-zZ0qcm", "--thp", " 1"
#define OPT_CONV_NOTHP		OPT_CONV,		"--thp", "-1"

#define OPT_BW			"-s",  "20", "-zZ0q",   "--thp", " 1"
#define OPT_BW_NOTHP		OPT_BW,			"--thp", "-1"

/*
 * The built-in test-suite executed by "perf bench numa -a".
 *
 * (A minimum of 4 nodes and 16GB of RAM is recommended.)
 *
 * Each row is: display name, then the argv passed to run_bench_numa()
 * (entry [0] is the name, entries [1..] the arguments; rows are
 * NULL-terminated by the MAX_ARGS-sized array).
 */
static const char *tests[][MAX_ARGS] = {
   /* Basic single-stream NUMA bandwidth measurements: */
   { "RAM-bw-local,",	  "mem",  "-p",  "1",  "-t",  "1", "-P", "1024",
			  "-C" ,   "0", "-M",   "0", OPT_BW_RAM },
   { "RAM-bw-local-NOTHP,",
			  "mem",  "-p",  "1",  "-t",  "1", "-P", "1024",
			  "-C" ,   "0", "-M",   "0", OPT_BW_RAM_NOTHP },
   { "RAM-bw-remote,",	  "mem",  "-p",  "1",  "-t",  "1", "-P", "1024",
			  "-C" ,   "0", "-M",   "1", OPT_BW_RAM },

   /* 2-stream NUMA bandwidth measurements: */
   { "RAM-bw-local-2x,",  "mem",  "-p",  "2",  "-t",  "1", "-P", "1024",
			   "-C", "0,2", "-M", "0x2", OPT_BW_RAM },
   { "RAM-bw-remote-2x,", "mem",  "-p",  "2",  "-t",  "1", "-P", "1024",
		 	   "-C", "0,2", "-M", "1x2", OPT_BW_RAM },

   /* Cross-stream NUMA bandwidth measurement: */
   { "RAM-bw-cross,",     "mem",  "-p",  "2",  "-t",  "1", "-P", "1024",
		 	   "-C", "0,8", "-M", "1,0", OPT_BW_RAM },

   /* Convergence latency measurements: */
   { " 1x3-convergence,", "mem",  "-p",  "1", "-t",  "3", "-P",  "512", OPT_CONV },
   { " 1x4-convergence,", "mem",  "-p",  "1", "-t",  "4", "-P",  "512", OPT_CONV },
   { " 1x6-convergence,", "mem",  "-p",  "1", "-t",  "6", "-P", "1020", OPT_CONV },
   { " 2x3-convergence,", "mem",  "-p",  "3", "-t",  "3", "-P", "1020", OPT_CONV },
   { " 3x3-convergence,", "mem",  "-p",  "3", "-t",  "3", "-P", "1020", OPT_CONV },
   { " 4x4-convergence,", "mem",  "-p",  "4", "-t",  "4", "-P",  "512", OPT_CONV },
   { " 4x4-convergence-NOTHP,",
			  "mem",  "-p",  "4", "-t",  "4", "-P",  "512", OPT_CONV_NOTHP },
   { " 4x6-convergence,", "mem",  "-p",  "4", "-t",  "6", "-P", "1020", OPT_CONV },
   { " 4x8-convergence,", "mem",  "-p",  "4", "-t",  "8", "-P",  "512", OPT_CONV },
   { " 8x4-convergence,", "mem",  "-p",  "8", "-t",  "4", "-P",  "512", OPT_CONV },
   { " 8x4-convergence-NOTHP,",
			  "mem",  "-p",  "8", "-t",  "4", "-P",  "512", OPT_CONV_NOTHP },
   { " 3x1-convergence,", "mem",  "-p",  "3", "-t",  "1", "-P",  "512", OPT_CONV },
   { " 4x1-convergence,", "mem",  "-p",  "4", "-t",  "1", "-P",  "512", OPT_CONV },
   { " 8x1-convergence,", "mem",  "-p",  "8", "-t",  "1", "-P",  "512", OPT_CONV },
   { "16x1-convergence,", "mem",  "-p", "16", "-t",  "1", "-P",  "256", OPT_CONV },
   { "32x1-convergence,", "mem",  "-p", "32", "-t",  "1", "-P",  "128", OPT_CONV },

   /* Various NUMA process/thread layout bandwidth measurements: */
   { " 2x1-bw-process,",  "mem",  "-p",  "2", "-t",  "1", "-P", "1024", OPT_BW },
   { " 3x1-bw-process,",  "mem",  "-p",  "3", "-t",  "1", "-P", "1024", OPT_BW },
   { " 4x1-bw-process,",  "mem",  "-p",  "4", "-t",  "1", "-P", "1024", OPT_BW },
   { " 8x1-bw-process,",  "mem",  "-p",  "8", "-t",  "1", "-P", " 512", OPT_BW },
   { " 8x1-bw-process-NOTHP,",
			  "mem",  "-p",  "8", "-t",  "1", "-P", " 512", OPT_BW_NOTHP },
   { "16x1-bw-process,",  "mem",  "-p", "16", "-t",  "1", "-P",  "256", OPT_BW },

   { " 4x1-bw-thread,",	  "mem",  "-p",  "1", "-t",  "4", "-T",  "256", OPT_BW },
   { " 8x1-bw-thread,",	  "mem",  "-p",  "1", "-t",  "8", "-T",  "256", OPT_BW },
   { "16x1-bw-thread,",   "mem",  "-p",  "1", "-t", "16", "-T",  "128", OPT_BW },
   { "32x1-bw-thread,",   "mem",  "-p",  "1", "-t", "32", "-T",   "64", OPT_BW },

   { " 2x3-bw-thread,",	  "mem",  "-p",  "2", "-t",  "3", "-P",  "512", OPT_BW },
   { " 4x4-bw-thread,",	  "mem",  "-p",  "4", "-t",  "4", "-P",  "512", OPT_BW },
   { " 4x6-bw-thread,",	  "mem",  "-p",  "4", "-t",  "6", "-P",  "512", OPT_BW },
   { " 4x8-bw-thread,",	  "mem",  "-p",  "4", "-t",  "8", "-P",  "512", OPT_BW },
   { " 4x8-bw-thread-NOTHP,",
			  "mem",  "-p",  "4", "-t",  "8", "-P",  "512", OPT_BW_NOTHP },
   { " 3x3-bw-thread,",	  "mem",  "-p",  "3", "-t",  "3", "-P",  "512", OPT_BW },
   { " 5x5-bw-thread,",	  "mem",  "-p",  "5", "-t",  "5", "-P",  "512", OPT_BW },

   { "2x16-bw-thread,",   "mem",  "-p",  "2", "-t", "16", "-P",  "512", OPT_BW },
   { "1x32-bw-thread,",   "mem",  "-p",  "1", "-t", "32", "-P", "2048", OPT_BW },

   { "numa02-bw,",	  "mem",  "-p",  "1", "-t", "32", "-T",   "32", OPT_BW },
   { "numa02-bw-NOTHP,",  "mem",  "-p",  "1", "-t", "32", "-T",   "32", OPT_BW_NOTHP },
   { "numa01-bw-thread,", "mem",  "-p",  "2", "-t", "16", "-T",  "192", OPT_BW },
   { "numa01-bw-thread-NOTHP,",
			  "mem",  "-p",  "2", "-t", "16", "-T",  "192", OPT_BW_NOTHP },
};
1749 | ||
1750 | static int bench_all(void) | |
1751 | { | |
1752 | int nr = ARRAY_SIZE(tests); | |
1753 | int ret; | |
1754 | int i; | |
1755 | ||
1756 | ret = system("echo ' #'; echo ' # Running test on: '$(uname -a); echo ' #'"); | |
1757 | BUG_ON(ret < 0); | |
1758 | ||
1759 | for (i = 0; i < nr; i++) { | |
b81a48ea | 1760 | run_bench_numa(tests[i][0], tests[i] + 1); |
1c13f3c9 IM |
1761 | } |
1762 | ||
1763 | printf("\n"); | |
1764 | ||
1765 | return 0; | |
1766 | } | |
1767 | ||
1768 | int bench_numa(int argc, const char **argv, const char *prefix __maybe_unused) | |
1769 | { | |
1770 | init_params(&p0, "main,", argc, argv); | |
1771 | argc = parse_options(argc, argv, options, bench_numa_usage, 0); | |
1772 | if (argc) | |
1773 | goto err; | |
1774 | ||
1775 | if (p0.run_all) | |
1776 | return bench_all(); | |
1777 | ||
1778 | if (__bench_numa(NULL)) | |
1779 | goto err; | |
1780 | ||
1781 | return 0; | |
1782 | ||
1783 | err: | |
1784 | usage_with_options(numa_usage, options); | |
1785 | return -1; | |
1786 | } |