]> git.proxmox.com Git - ovs.git/blob - lib/ovs-numa.c
netdev-tc-offloads: Fix vxlan tunnel offloading
[ovs.git] / lib / ovs-numa.c
1 /*
2 * Copyright (c) 2014 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "ovs-numa.h"
19
20 #include <ctype.h>
21 #include <errno.h>
22 #ifdef __linux__
23 #include <dirent.h>
24 #include <stddef.h>
25 #include <string.h>
26 #include <sys/types.h>
27 #include <unistd.h>
28 #endif /* __linux__ */
29
30 #include "hash.h"
31 #include "openvswitch/hmap.h"
32 #include "openvswitch/list.h"
33 #include "ovs-thread.h"
34 #include "openvswitch/vlog.h"
35 #include "util.h"
36
37 VLOG_DEFINE_THIS_MODULE(ovs_numa);
38
/* ovs-numa module
 * ===============
 *
 * This module stores the affinity information of numa nodes and cpu cores.
 * It also provides functions to keep track of the pinning of threads to cpu
 * cores.
 *
 * It is assumed that the numa node ids and cpu core ids all start from 0 and
 * are contiguous.  So, for example, if 'ovs_numa_get_n_cores()' returns N,
 * the user can assume that core ids from 0 to N-1 are all valid and that
 * there is a 'struct cpu_core' for each id.
 *
 * NOTE: this module should only be used by the main thread.
 *
 * NOTE: the assumption above fails when cpu hotplug is used.  In that case
 * ovs-numa will not function correctly.  For now, a TODO entry is kept here
 * so that it can be addressed in the future.
 *
 * TODO: Fix ovs-numa when cpu hotplug is used.
 */
58
/* Upper bound (exclusive) on the numa node ids this module will probe or
 * accept. */
#define MAX_NUMA_NODES 128
60
61 /* numa node. */
62 struct numa_node {
63 struct hmap_node hmap_node; /* In the 'all_numa_nodes'. */
64 struct ovs_list cores; /* List of cpu cores on the numa node. */
65 int numa_id; /* numa node id. */
66 };
67
68 /* Cpu core on a numa node. */
69 struct cpu_core {
70 struct hmap_node hmap_node;/* In the 'all_cpu_cores'. */
71 struct ovs_list list_node; /* In 'numa_node->cores' list. */
72 struct numa_node *numa; /* numa node containing the core. */
73 unsigned core_id; /* Core id. */
74 };
75
76 /* Contains all 'struct numa_node's. */
77 static struct hmap all_numa_nodes = HMAP_INITIALIZER(&all_numa_nodes);
78 /* Contains all 'struct cpu_core's. */
79 static struct hmap all_cpu_cores = HMAP_INITIALIZER(&all_cpu_cores);
80 /* True if numa node and core info are correctly extracted. */
81 static bool found_numa_and_core;
82 /* True if the module was initialized with dummy options. In this case, the
83 * module must not interact with the actual cpus/nodes in the system. */
84 static bool dummy_numa = false;
85 /* If 'dummy_numa' is true, contains a copy of the dummy numa configuration
86 * parameter */
87 static char *dummy_config;
88
89 static struct numa_node *get_numa_by_numa_id(int numa_id);
90
91 #ifdef __linux__
/* Returns true if 'str' is non-empty and consists solely of the decimal
 * digits '0'-'9'.  Returns false otherwise.
 *
 * The empty string is rejected on purpose: it does not spell a number, so a
 * caller that goes on to parse 'str' as an id must not be told it is valid
 * (otherwise a directory entry named exactly "cpu" would be taken for
 * core 0). */
static bool
contain_all_digits(const char *str)
{
    return str[0] != '\0' && str[strspn(str, "0123456789")] == '\0';
}
98 #endif /* __linux__ */
99
100 static struct numa_node *
101 insert_new_numa_node(int numa_id)
102 {
103 struct numa_node *n = xzalloc(sizeof *n);
104
105 hmap_insert(&all_numa_nodes, &n->hmap_node, hash_int(numa_id, 0));
106 ovs_list_init(&n->cores);
107 n->numa_id = numa_id;
108
109 return n;
110 }
111
112 static struct cpu_core *
113 insert_new_cpu_core(struct numa_node *n, unsigned core_id)
114 {
115 struct cpu_core *c = xzalloc(sizeof *c);
116
117 hmap_insert(&all_cpu_cores, &c->hmap_node, hash_int(core_id, 0));
118 ovs_list_insert(&n->cores, &c->list_node);
119 c->core_id = core_id;
120 c->numa = n;
121
122 return c;
123 }
124
125 /* Has the same effect as discover_numa_and_core(), but instead of reading
126 * sysfs entries, extracts the info from 'dummy_config'.
127 *
128 * 'dummy_config' lists the numa_ids of each CPU separated by a comma, e.g.
129 * - "0,0,0,0": four cores on numa socket 0.
130 * - "0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1": 16 cores on two numa sockets.
131 * - "0,0,0,0,1,1,1,1": 8 cores on two numa sockets.
132 *
133 * The different numa ids must be consecutives or the function will abort. */
134 static void
135 discover_numa_and_core_dummy(const char *dummy_config)
136 {
137 char *conf = xstrdup(dummy_config);
138 char *id, *saveptr = NULL;
139 unsigned i = 0;
140 long max_numa_id = 0;
141
142 for (id = strtok_r(conf, ",", &saveptr); id;
143 id = strtok_r(NULL, ",", &saveptr)) {
144 struct hmap_node *hnode;
145 struct numa_node *n;
146 long numa_id;
147
148 numa_id = strtol(id, NULL, 10);
149 if (numa_id < 0 || numa_id >= MAX_NUMA_NODES) {
150 VLOG_WARN("Invalid numa node %ld", numa_id);
151 continue;
152 }
153
154 max_numa_id = MAX(max_numa_id, numa_id);
155
156 hnode = hmap_first_with_hash(&all_numa_nodes, hash_int(numa_id, 0));
157
158 if (hnode) {
159 n = CONTAINER_OF(hnode, struct numa_node, hmap_node);
160 } else {
161 n = insert_new_numa_node(numa_id);
162 }
163
164 insert_new_cpu_core(n, i);
165
166 i++;
167 }
168
169 free(conf);
170
171 if (max_numa_id + 1 != hmap_count(&all_numa_nodes)) {
172 ovs_fatal(0, "dummy numa contains non consecutive numa ids");
173 }
174 }
175
176 /* Discovers all numa nodes and the corresponding cpu cores.
177 * Constructs the 'struct numa_node' and 'struct cpu_core'. */
178 static void
179 discover_numa_and_core(void)
180 {
181 #ifdef __linux__
182 int i;
183 DIR *dir;
184 bool numa_supported = true;
185
186 /* Check if NUMA supported on this system. */
187 dir = opendir("/sys/devices/system/node");
188
189 if (!dir && errno == ENOENT) {
190 numa_supported = false;
191 }
192 if (dir) {
193 closedir(dir);
194 }
195
196 for (i = 0; i < MAX_NUMA_NODES; i++) {
197 char* path;
198
199 if (numa_supported) {
200 /* Constructs the path to node /sys/devices/system/nodeX. */
201 path = xasprintf("/sys/devices/system/node/node%d", i);
202 } else {
203 path = xasprintf("/sys/devices/system/cpu/");
204 }
205
206 dir = opendir(path);
207
208 /* Creates 'struct numa_node' if the 'dir' is non-null. */
209 if (dir) {
210 struct numa_node *n;
211 struct dirent *subdir;
212
213 n = insert_new_numa_node(i);
214
215 while ((subdir = readdir(dir)) != NULL) {
216 if (!strncmp(subdir->d_name, "cpu", 3)
217 && contain_all_digits(subdir->d_name + 3)) {
218 unsigned core_id;
219
220 core_id = strtoul(subdir->d_name + 3, NULL, 10);
221 insert_new_cpu_core(n, core_id);
222 }
223 }
224 closedir(dir);
225 } else if (errno != ENOENT) {
226 VLOG_WARN("opendir(%s) failed (%s)", path,
227 ovs_strerror(errno));
228 }
229
230 free(path);
231 if (!dir || !numa_supported) {
232 break;
233 }
234 }
235 #endif /* __linux__ */
236 }
237
238 /* Gets 'struct cpu_core' by 'core_id'. */
239 static struct cpu_core*
240 get_core_by_core_id(unsigned core_id)
241 {
242 struct cpu_core *core;
243
244 HMAP_FOR_EACH_WITH_HASH (core, hmap_node, hash_int(core_id, 0),
245 &all_cpu_cores) {
246 if (core->core_id == core_id) {
247 return core;
248 }
249 }
250
251 return NULL;
252 }
253
254 /* Gets 'struct numa_node' by 'numa_id'. */
255 static struct numa_node*
256 get_numa_by_numa_id(int numa_id)
257 {
258 struct numa_node *numa;
259
260 HMAP_FOR_EACH_WITH_HASH (numa, hmap_node, hash_int(numa_id, 0),
261 &all_numa_nodes) {
262 if (numa->numa_id == numa_id) {
263 return numa;
264 }
265 }
266
267 return NULL;
268 }
269
270 \f
271
272 static bool
273 ovs_numa_init__(const char *dummy_config)
274 {
275 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
276
277 if (ovsthread_once_start(&once)) {
278 const struct numa_node *n;
279
280 if (!dummy_config) {
281 discover_numa_and_core();
282 } else {
283 discover_numa_and_core_dummy(dummy_config);
284 }
285
286 HMAP_FOR_EACH(n, hmap_node, &all_numa_nodes) {
287 VLOG_INFO("Discovered %"PRIuSIZE" CPU cores on NUMA node %d",
288 ovs_list_size(&n->cores), n->numa_id);
289 }
290
291 VLOG_INFO("Discovered %"PRIuSIZE" NUMA nodes and %"PRIuSIZE" CPU cores",
292 hmap_count(&all_numa_nodes), hmap_count(&all_cpu_cores));
293
294 if (hmap_count(&all_numa_nodes) && hmap_count(&all_cpu_cores)) {
295 found_numa_and_core = true;
296 }
297
298 ovsthread_once_done(&once);
299
300 return true;
301 } else {
302 return false;
303 }
304 }
305
306 /* Extracts the numa node and core info from the 'config'. This is useful for
307 * testing purposes. The function must be called once, before ovs_numa_init().
308 *
309 * The format of 'config' is explained in the comment above
310 * discover_numa_and_core_dummy().*/
311 void
312 ovs_numa_set_dummy(const char *config)
313 {
314 dummy_numa = true;
315 ovs_assert(config);
316 free(dummy_config);
317 dummy_config = xstrdup(config);
318 }
319
320 /* Initializes the numa module. */
321 void
322 ovs_numa_init(void)
323 {
324 if (dummy_numa) {
325 ovs_numa_init__(dummy_config);
326 } else {
327 ovs_numa_init__(NULL);
328 }
329 }
330
331 bool
332 ovs_numa_numa_id_is_valid(int numa_id)
333 {
334 return found_numa_and_core && numa_id < ovs_numa_get_n_numas();
335 }
336
337 bool
338 ovs_numa_core_id_is_valid(unsigned core_id)
339 {
340 return found_numa_and_core && core_id < ovs_numa_get_n_cores();
341 }
342
343 /* Returns the number of numa nodes. */
344 int
345 ovs_numa_get_n_numas(void)
346 {
347 return found_numa_and_core ? hmap_count(&all_numa_nodes)
348 : OVS_NUMA_UNSPEC;
349 }
350
351 /* Returns the number of cpu cores. */
352 int
353 ovs_numa_get_n_cores(void)
354 {
355 return found_numa_and_core ? hmap_count(&all_cpu_cores)
356 : OVS_CORE_UNSPEC;
357 }
358
359 /* Given 'core_id', returns the corresponding numa node id. Returns
360 * OVS_NUMA_UNSPEC if 'core_id' is invalid. */
361 int
362 ovs_numa_get_numa_id(unsigned core_id)
363 {
364 struct cpu_core *core = get_core_by_core_id(core_id);
365
366 if (core) {
367 return core->numa->numa_id;
368 }
369
370 return OVS_NUMA_UNSPEC;
371 }
372
373 /* Returns the number of cpu cores on numa node. Returns OVS_CORE_UNSPEC
374 * if 'numa_id' is invalid. */
375 int
376 ovs_numa_get_n_cores_on_numa(int numa_id)
377 {
378 struct numa_node *numa = get_numa_by_numa_id(numa_id);
379
380 if (numa) {
381 return ovs_list_size(&numa->cores);
382 }
383
384 return OVS_CORE_UNSPEC;
385 }
386
387 static struct ovs_numa_dump *
388 ovs_numa_dump_create(void)
389 {
390 struct ovs_numa_dump *dump = xmalloc(sizeof *dump);
391
392 hmap_init(&dump->cores);
393 hmap_init(&dump->numas);
394
395 return dump;
396 }
397
398 static void
399 ovs_numa_dump_add(struct ovs_numa_dump *dump, int numa_id, int core_id)
400 {
401 struct ovs_numa_info_core *c = xzalloc(sizeof *c);
402 struct ovs_numa_info_numa *n;
403
404 c->numa_id = numa_id;
405 c->core_id = core_id;
406 hmap_insert(&dump->cores, &c->hmap_node, hash_2words(numa_id, core_id));
407
408 HMAP_FOR_EACH_WITH_HASH (n, hmap_node, hash_int(numa_id, 0),
409 &dump->numas) {
410 if (n->numa_id == numa_id) {
411 n->n_cores++;
412 return;
413 }
414 }
415
416 n = xzalloc(sizeof *n);
417 n->numa_id = numa_id;
418 n->n_cores = 1;
419 hmap_insert(&dump->numas, &n->hmap_node, hash_int(numa_id, 0));
420 }
421
422 /* Given the 'numa_id', returns dump of all cores on the numa node. */
423 struct ovs_numa_dump *
424 ovs_numa_dump_cores_on_numa(int numa_id)
425 {
426 struct ovs_numa_dump *dump = ovs_numa_dump_create();
427 struct numa_node *numa = get_numa_by_numa_id(numa_id);
428
429 if (numa) {
430 struct cpu_core *core;
431
432 LIST_FOR_EACH (core, list_node, &numa->cores) {
433 ovs_numa_dump_add(dump, numa->numa_id, core->core_id);
434 }
435 }
436
437 return dump;
438 }
439
440 struct ovs_numa_dump *
441 ovs_numa_dump_cores_with_cmask(const char *cmask)
442 {
443 struct ovs_numa_dump *dump = ovs_numa_dump_create();
444 int core_id = 0;
445 int end_idx;
446
447 /* Ignore leading 0x. */
448 end_idx = 0;
449 if (!strncmp(cmask, "0x", 2) || !strncmp(cmask, "0X", 2)) {
450 end_idx = 2;
451 }
452
453 for (int i = strlen(cmask) - 1; i >= end_idx; i--) {
454 char hex = cmask[i];
455 int bin;
456
457 bin = hexit_value(hex);
458 if (bin == -1) {
459 VLOG_WARN("Invalid cpu mask: %c", cmask[i]);
460 bin = 0;
461 }
462
463 for (int j = 0; j < 4; j++) {
464 if ((bin >> j) & 0x1) {
465 struct cpu_core *core = get_core_by_core_id(core_id);
466
467 if (core) {
468 ovs_numa_dump_add(dump,
469 core->numa->numa_id,
470 core->core_id);
471 }
472 }
473
474 core_id++;
475 }
476 }
477
478 return dump;
479 }
480
481 struct ovs_numa_dump *
482 ovs_numa_dump_n_cores_per_numa(int cores_per_numa)
483 {
484 struct ovs_numa_dump *dump = ovs_numa_dump_create();
485 const struct numa_node *n;
486
487 HMAP_FOR_EACH (n, hmap_node, &all_numa_nodes) {
488 const struct cpu_core *core;
489 int i = 0;
490
491 LIST_FOR_EACH (core, list_node, &n->cores) {
492 if (i++ >= cores_per_numa) {
493 break;
494 }
495
496 ovs_numa_dump_add(dump, core->numa->numa_id, core->core_id);
497 }
498 }
499
500 return dump;
501 }
502
503 bool
504 ovs_numa_dump_contains_core(const struct ovs_numa_dump *dump,
505 int numa_id, unsigned core_id)
506 {
507 struct ovs_numa_info_core *core;
508
509 HMAP_FOR_EACH_WITH_HASH (core, hmap_node, hash_2words(numa_id, core_id),
510 &dump->cores) {
511 if (core->core_id == core_id && core->numa_id == numa_id) {
512 return true;
513 }
514 }
515
516 return false;
517 }
518
519 size_t
520 ovs_numa_dump_count(const struct ovs_numa_dump *dump)
521 {
522 return hmap_count(&dump->cores);
523 }
524
525 void
526 ovs_numa_dump_destroy(struct ovs_numa_dump *dump)
527 {
528 struct ovs_numa_info_core *c;
529 struct ovs_numa_info_numa *n;
530
531 if (!dump) {
532 return;
533 }
534
535 HMAP_FOR_EACH_POP (c, hmap_node, &dump->cores) {
536 free(c);
537 }
538
539 HMAP_FOR_EACH_POP (n, hmap_node, &dump->numas) {
540 free(n);
541 }
542
543 hmap_destroy(&dump->cores);
544 hmap_destroy(&dump->numas);
545
546 free(dump);
547 }
548
549 int ovs_numa_thread_setaffinity_core(unsigned core_id OVS_UNUSED)
550 {
551 if (dummy_numa) {
552 /* Nothing to do */
553 return 0;
554 }
555
556 #ifdef __linux__
557 cpu_set_t cpuset;
558 int err;
559
560 CPU_ZERO(&cpuset);
561 CPU_SET(core_id, &cpuset);
562 err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
563 if (err) {
564 VLOG_ERR("Thread affinity error %d",err);
565 return err;
566 }
567
568 return 0;
569 #else /* !__linux__ */
570 return EOPNOTSUPP;
571 #endif /* __linux__ */
572 }