2 * Copyright (c) 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
26 #include <sys/types.h>
28 #endif /* __linux__ */
31 #include "openvswitch/hmap.h"
32 #include "openvswitch/list.h"
33 #include "ovs-thread.h"
34 #include "openvswitch/vlog.h"
36 VLOG_DEFINE_THIS_MODULE(ovs_numa
);
41 * This module stores the affinity information of numa nodes and cpu cores.
42 * It also provides functions to keep track of which cpu cores have threads
 * pinned to them.
44 * It is assumed that the numa node ids and cpu core ids all start from 0 and
45 * range continuously. So, for example, if 'ovs_numa_get_n_cores()' returns N,
46 * user can assume core ids from 0 to N-1 are all valid and there is a
47 * 'struct cpu_core' for each id.
49 * NOTE, this module should only be used by the main thread.
51 * NOTE, the assumption above will fail when cpu hotplug is used. In that
52 * case ovs-numa will not function correctly. For now, add a TODO entry
53 * for addressing it in the future.
55 * TODO: Fix ovs-numa when cpu hotplug is used.
58 #define MAX_NUMA_NODES 128
62 struct hmap_node hmap_node
; /* In the 'all_numa_nodes'. */
63 struct ovs_list cores
; /* List of cpu cores on the numa node. */
64 int numa_id
; /* numa node id. */
67 /* Cpu core on a numa node. */
69 struct hmap_node hmap_node
;/* In the 'all_cpu_cores'. */
70 struct ovs_list list_node
; /* In 'numa_node->cores' list. */
71 struct numa_node
*numa
; /* numa node containing the core. */
72 unsigned core_id
; /* Core id. */
73 bool available
; /* If the core can be pinned. */
74 bool pinned
; /* If a thread has been pinned to the core. */
77 /* Contains all 'struct numa_node's. */
78 static struct hmap all_numa_nodes
= HMAP_INITIALIZER(&all_numa_nodes
);
79 /* Contains all 'struct cpu_core's. */
80 static struct hmap all_cpu_cores
= HMAP_INITIALIZER(&all_cpu_cores
);
81 /* True if numa node and core info are correctly extracted. */
82 static bool found_numa_and_core
;
83 /* True if the module was initialized with dummy options. In this case, the
84 * module must not interact with the actual cpus/nodes in the system. */
85 static bool dummy_numa
= false;
86 /* If 'dummy_numa' is true, contains a copy of the dummy numa configuration
88 static char *dummy_config
;
90 static struct numa_node
*get_numa_by_numa_id(int numa_id
);
/* Returns true if 'str' consists only of the decimal digits '0'..'9'.
 * Returns false otherwise.
 *
 * An empty string contains no non-digit character, so it also yields
 * true. */
static bool
contain_all_digits(const char *str)
{
    /* strspn() gives the length of the leading run of digits; the string is
     * all digits exactly when that run ends at the terminating NUL. */
    return str[strspn(str, "0123456789")] == '\0';
}
99 #endif /* __linux__ */
101 static struct numa_node
*
102 insert_new_numa_node(int numa_id
)
104 struct numa_node
*n
= xzalloc(sizeof *n
);
106 hmap_insert(&all_numa_nodes
, &n
->hmap_node
, hash_int(numa_id
, 0));
107 ovs_list_init(&n
->cores
);
108 n
->numa_id
= numa_id
;
113 static struct cpu_core
*
114 insert_new_cpu_core(struct numa_node
*n
, unsigned core_id
)
116 struct cpu_core
*c
= xzalloc(sizeof *c
);
118 hmap_insert(&all_cpu_cores
, &c
->hmap_node
, hash_int(core_id
, 0));
119 ovs_list_insert(&n
->cores
, &c
->list_node
);
120 c
->core_id
= core_id
;
127 /* Has the same effect as discover_numa_and_core(), but instead of reading
128 * sysfs entries, extracts the info from 'dummy_config'.
130 * 'dummy_config' lists the numa_ids of each CPU separated by a comma, e.g.
131 * - "0,0,0,0": four cores on numa socket 0.
132 * - "0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1": 16 cores on two numa sockets.
133 * - "0,0,0,0,1,1,1,1": 8 cores on two numa sockets.
135 * The different numa ids must be consecutives or the function will abort. */
137 discover_numa_and_core_dummy(const char *dummy_config
)
139 char *conf
= xstrdup(dummy_config
);
140 char *id
, *saveptr
= NULL
;
142 long max_numa_id
= 0;
144 for (id
= strtok_r(conf
, ",", &saveptr
); id
;
145 id
= strtok_r(NULL
, ",", &saveptr
)) {
146 struct hmap_node
*hnode
;
150 numa_id
= strtol(id
, NULL
, 10);
151 if (numa_id
< 0 || numa_id
>= MAX_NUMA_NODES
) {
152 VLOG_WARN("Invalid numa node %ld", numa_id
);
156 max_numa_id
= MAX(max_numa_id
, numa_id
);
158 hnode
= hmap_first_with_hash(&all_numa_nodes
, hash_int(numa_id
, 0));
161 n
= CONTAINER_OF(hnode
, struct numa_node
, hmap_node
);
163 n
= insert_new_numa_node(numa_id
);
166 insert_new_cpu_core(n
, i
);
173 if (max_numa_id
+ 1 != hmap_count(&all_numa_nodes
)) {
174 ovs_fatal(0, "dummy numa contains non consecutive numa ids");
178 /* Discovers all numa nodes and the corresponding cpu cores.
179 * Constructs the 'struct numa_node' and 'struct cpu_core'. */
181 discover_numa_and_core(void)
186 bool numa_supported
= true;
188 /* Check if NUMA supported on this system. */
189 dir
= opendir("/sys/devices/system/node");
191 if (!dir
&& errno
== ENOENT
) {
192 numa_supported
= false;
198 for (i
= 0; i
< MAX_NUMA_NODES
; i
++) {
201 if (numa_supported
) {
202 /* Constructs the path to node /sys/devices/system/nodeX. */
203 path
= xasprintf("/sys/devices/system/node/node%d", i
);
205 path
= xasprintf("/sys/devices/system/cpu/");
210 /* Creates 'struct numa_node' if the 'dir' is non-null. */
213 struct dirent
*subdir
;
215 n
= insert_new_numa_node(i
);
217 while ((subdir
= readdir(dir
)) != NULL
) {
218 if (!strncmp(subdir
->d_name
, "cpu", 3)
219 && contain_all_digits(subdir
->d_name
+ 3)) {
222 core_id
= strtoul(subdir
->d_name
+ 3, NULL
, 10);
223 insert_new_cpu_core(n
, core_id
);
227 } else if (errno
!= ENOENT
) {
228 VLOG_WARN("opendir(%s) failed (%s)", path
,
229 ovs_strerror(errno
));
233 if (!dir
|| !numa_supported
) {
237 #endif /* __linux__ */
240 /* Gets 'struct cpu_core' by 'core_id'. */
241 static struct cpu_core
*
242 get_core_by_core_id(unsigned core_id
)
244 struct cpu_core
*core
= NULL
;
246 if (ovs_numa_core_id_is_valid(core_id
)) {
247 core
= CONTAINER_OF(hmap_first_with_hash(&all_cpu_cores
,
248 hash_int(core_id
, 0)),
249 struct cpu_core
, hmap_node
);
255 /* Gets 'struct numa_node' by 'numa_id'. */
256 static struct numa_node
*
257 get_numa_by_numa_id(int numa_id
)
259 struct numa_node
*numa
= NULL
;
261 if (ovs_numa_numa_id_is_valid(numa_id
)) {
262 numa
= CONTAINER_OF(hmap_first_with_hash(&all_numa_nodes
,
263 hash_int(numa_id
, 0)),
264 struct numa_node
, hmap_node
);
273 ovs_numa_init__(const char *dummy_config
)
275 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
277 if (ovsthread_once_start(&once
)) {
278 const struct numa_node
*n
;
281 discover_numa_and_core();
283 discover_numa_and_core_dummy(dummy_config
);
286 HMAP_FOR_EACH(n
, hmap_node
, &all_numa_nodes
) {
287 VLOG_INFO("Discovered %"PRIuSIZE
" CPU cores on NUMA node %d",
288 ovs_list_size(&n
->cores
), n
->numa_id
);
291 VLOG_INFO("Discovered %"PRIuSIZE
" NUMA nodes and %"PRIuSIZE
" CPU cores",
292 hmap_count(&all_numa_nodes
), hmap_count(&all_cpu_cores
));
294 if (hmap_count(&all_numa_nodes
) && hmap_count(&all_cpu_cores
)) {
295 found_numa_and_core
= true;
298 ovsthread_once_done(&once
);
306 /* Extracts the numa node and core info from the 'config'. This is useful for
307 * testing purposes. The function must be called once, before ovs_numa_init().
309 * The format of 'config' is explained in the comment above
310 * discover_numa_and_core_dummy().*/
312 ovs_numa_set_dummy(const char *config
)
317 dummy_config
= xstrdup(config
);
320 /* Initializes the numa module. */
325 ovs_numa_init__(dummy_config
);
327 ovs_numa_init__(NULL
);
332 ovs_numa_numa_id_is_valid(int numa_id
)
334 return found_numa_and_core
&& numa_id
< ovs_numa_get_n_numas();
338 ovs_numa_core_id_is_valid(unsigned core_id
)
340 return found_numa_and_core
&& core_id
< ovs_numa_get_n_cores();
344 ovs_numa_core_is_pinned(unsigned core_id
)
346 struct cpu_core
*core
= get_core_by_core_id(core_id
);
355 /* Returns the number of numa nodes. */
357 ovs_numa_get_n_numas(void)
359 return found_numa_and_core
? hmap_count(&all_numa_nodes
)
363 /* Returns the number of cpu cores. */
365 ovs_numa_get_n_cores(void)
367 return found_numa_and_core
? hmap_count(&all_cpu_cores
)
371 /* Given 'core_id', returns the corresponding numa node id. Returns
372 * OVS_NUMA_UNSPEC if 'core_id' is invalid. */
374 ovs_numa_get_numa_id(unsigned core_id
)
376 struct cpu_core
*core
= get_core_by_core_id(core_id
);
379 return core
->numa
->numa_id
;
382 return OVS_NUMA_UNSPEC
;
385 /* Returns the number of cpu cores on numa node. Returns OVS_CORE_UNSPEC
386 * if 'numa_id' is invalid. */
388 ovs_numa_get_n_cores_on_numa(int numa_id
)
390 struct numa_node
*numa
= get_numa_by_numa_id(numa_id
);
393 return ovs_list_size(&numa
->cores
);
396 return OVS_CORE_UNSPEC
;
399 /* Returns the number of cpu cores that are available and unpinned
400 * on numa node. Returns OVS_CORE_UNSPEC if 'numa_id' is invalid. */
402 ovs_numa_get_n_unpinned_cores_on_numa(int numa_id
)
404 struct numa_node
*numa
= get_numa_by_numa_id(numa_id
);
407 struct cpu_core
*core
;
410 LIST_FOR_EACH(core
, list_node
, &numa
->cores
) {
411 if (core
->available
&& !core
->pinned
) {
418 return OVS_CORE_UNSPEC
;
421 /* Given 'core_id', tries to pin that core. Returns true, if succeeds.
422 * False, if the core has already been pinned, or if it is invalid or
425 ovs_numa_try_pin_core_specific(unsigned core_id
)
427 struct cpu_core
*core
= get_core_by_core_id(core_id
);
430 if (core
->available
&& !core
->pinned
) {
439 /* Searches through all cores for an unpinned and available core. Returns
440 * the 'core_id' if found and sets the 'core->pinned' to true. Otherwise,
441 * returns OVS_CORE_UNSPEC. */
443 ovs_numa_get_unpinned_core_any(void)
445 struct cpu_core
*core
;
447 HMAP_FOR_EACH(core
, hmap_node
, &all_cpu_cores
) {
448 if (core
->available
&& !core
->pinned
) {
450 return core
->core_id
;
454 return OVS_CORE_UNSPEC
;
457 /* Searches through all cores on numa node with 'numa_id' for an
458 * unpinned and available core. Returns the core_id if found and
459 * sets the 'core->pinned' to true. Otherwise, returns OVS_CORE_UNSPEC. */
461 ovs_numa_get_unpinned_core_on_numa(int numa_id
)
463 struct numa_node
*numa
= get_numa_by_numa_id(numa_id
);
466 struct cpu_core
*core
;
468 LIST_FOR_EACH(core
, list_node
, &numa
->cores
) {
469 if (core
->available
&& !core
->pinned
) {
471 return core
->core_id
;
476 return OVS_CORE_UNSPEC
;
479 /* Unpins the core with 'core_id'. */
481 ovs_numa_unpin_core(unsigned core_id
)
483 struct cpu_core
*core
= get_core_by_core_id(core_id
);
486 core
->pinned
= false;
490 /* Given the 'numa_id', returns dump of all cores on the numa node. */
491 struct ovs_numa_dump
*
492 ovs_numa_dump_cores_on_numa(int numa_id
)
494 struct ovs_numa_dump
*dump
= xmalloc(sizeof *dump
);
495 struct numa_node
*numa
= get_numa_by_numa_id(numa_id
);
497 ovs_list_init(&dump
->dump
);
500 struct cpu_core
*core
;
502 LIST_FOR_EACH(core
, list_node
, &numa
->cores
) {
503 struct ovs_numa_info
*info
= xmalloc(sizeof *info
);
505 info
->numa_id
= numa
->numa_id
;
506 info
->core_id
= core
->core_id
;
507 ovs_list_insert(&dump
->dump
, &info
->list_node
);
515 ovs_numa_dump_destroy(struct ovs_numa_dump
*dump
)
517 struct ovs_numa_info
*iter
;
523 LIST_FOR_EACH_POP (iter
, list_node
, &dump
->dump
) {
530 /* Reads the cpu mask configuration from 'cmask' and sets the
531 * 'available' of corresponding cores. For unspecified cores,
532 * sets 'available' to false. */
534 ovs_numa_set_cpu_mask(const char *cmask
)
539 if (!found_numa_and_core
) {
543 /* If no mask specified, resets the 'available' to true for all cores. */
545 struct cpu_core
*core
;
547 HMAP_FOR_EACH(core
, hmap_node
, &all_cpu_cores
) {
548 core
->available
= true;
554 for (i
= strlen(cmask
) - 1; i
>= 0; i
--) {
555 char hex
= toupper((unsigned char)cmask
[i
]);
558 if (hex
>= '0' && hex
<= '9') {
560 } else if (hex
>= 'A' && hex
<= 'F') {
561 bin
= hex
- 'A' + 10;
564 VLOG_WARN("Invalid cpu mask: %c", cmask
[i
]);
567 for (j
= 0; j
< 4; j
++) {
568 struct cpu_core
*core
;
570 core
= CONTAINER_OF(hmap_first_with_hash(&all_cpu_cores
,
571 hash_int(core_id
++, 0)),
572 struct cpu_core
, hmap_node
);
573 core
->available
= (bin
>> j
) & 0x1;
575 if (core_id
>= hmap_count(&all_cpu_cores
)) {
581 /* For unspecified cores, sets 'available' to false. */
582 while (core_id
< hmap_count(&all_cpu_cores
)) {
583 struct cpu_core
*core
;
585 core
= CONTAINER_OF(hmap_first_with_hash(&all_cpu_cores
,
586 hash_int(core_id
++, 0)),
587 struct cpu_core
, hmap_node
);
588 core
->available
= false;
592 int ovs_numa_thread_setaffinity_core(unsigned core_id OVS_UNUSED
)
604 CPU_SET(core_id
, &cpuset
);
605 err
= pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t
), &cpuset
);
607 VLOG_ERR("Thread affinity error %d",err
);
612 #else /* !__linux__ */
614 #endif /* __linux__ */