]> git.proxmox.com Git - ceph.git/blob - ceph/src/common/numa.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / common / numa.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "numa.h"
5
6 #include <cstring>
7 #include <errno.h>
8 #include <iostream>
9
10 #include "include/stringify.h"
11 #include "common/safe_io.h"
12
13 using namespace std::literals;
14
15 using std::set;
16
17
18 // list
19 #if defined(__linux__)
/**
 * Parse a CPU list such as "0-3,8,10-11" into a cpu_set_t.
 *
 * The list is a comma-separated sequence of decimal CPU ids and inclusive
 * "first-last" ranges.
 *
 * @param s             NUL-terminated list to parse
 * @param cpu_set_size  out: highest CPU id in the list plus one, regardless
 *                      of the order of the elements (0 for an empty list);
 *                      left untouched when an error is returned
 * @param cpu_set       out: cleared first, then populated with every listed
 *                      CPU (may be partially populated on error)
 * @return 0 on success; -EINVAL if an element is not a number, a separator
 *         other than ',' is found, a range is reversed, or a CPU id is
 *         outside [0, CPU_SETSIZE)
 */
int parse_cpu_set_list(const char *s,
                       size_t *cpu_set_size,
                       cpu_set_t *cpu_set)
{
  CPU_ZERO(cpu_set);
  size_t size = 0;
  while (*s) {
    char *end;
    // keep the full long so out-of-range input is rejected, not truncated
    long first = strtol(s, &end, 10);
    if (end == s || first < 0 || first >= CPU_SETSIZE) {
      return -EINVAL;
    }
    long last = first;
    if (*end == '-') {
      s = end + 1;
      last = strtol(s, &end, 10);
      if (end == s || last < first || last >= CPU_SETSIZE) {
        return -EINVAL;
      }
    }
    for (long cpu = first; cpu <= last; ++cpu) {
      CPU_SET(cpu, cpu_set);
    }
    // track the maximum so that unordered lists ("4-5,0") report the right size
    if (static_cast<size_t>(last) + 1 > size) {
      size = static_cast<size_t>(last) + 1;
    }
    if (*end == 0) {
      break;
    }
    if (*end != ',') {
      return -EINVAL;
    }
    s = end + 1;
  }
  *cpu_set_size = size;
  return 0;
}
55
56 std::string cpu_set_to_str_list(size_t cpu_set_size,
57 const cpu_set_t *cpu_set)
58 {
59 std::string r;
60 unsigned a = 0;
61 while (true) {
62 while (a < cpu_set_size && !CPU_ISSET(a, cpu_set)) {
63 ++a;
64 }
65 if (a >= cpu_set_size) {
66 break;
67 }
68 unsigned b = a + 1;
69 while (b < cpu_set_size && CPU_ISSET(b, cpu_set)) {
70 ++b;
71 }
72 if (r.size()) {
73 r += ",";
74 }
75 if (b > a + 1) {
76 r += stringify(a) + "-" + stringify(b - 1);
77 } else {
78 r += stringify(a);
79 }
80 a = b;
81 }
82 return r;
83 }
84
/**
 * Collect the ids of the CPUs that are set in cpu_set.
 *
 * @param cpu_set_size  number of CPU ids to examine, starting at 0
 * @param cpu_set       set to inspect
 * @return the ids in [0, cpu_set_size) whose bit is set
 */
std::set<int> cpu_set_to_set(size_t cpu_set_size,
                             const cpu_set_t *cpu_set)
{
  std::set<int> cpus;
  for (unsigned cpu = 0; cpu < cpu_set_size; ++cpu) {
    if (CPU_ISSET(cpu, cpu_set)) {
      cpus.insert(cpu);
    }
  }
  return cpus;
}
108
109
110 int get_numa_node_cpu_set(
111 int node,
112 size_t *cpu_set_size,
113 cpu_set_t *cpu_set)
114 {
115 std::string fn = "/sys/devices/system/node/node";
116 fn += stringify(node);
117 fn += "/cpulist";
118 int fd = ::open(fn.c_str(), O_RDONLY);
119 if (fd < 0) {
120 return -errno;
121 }
122 char buf[1024];
123 int r = safe_read(fd, &buf, sizeof(buf));
124 if (r < 0) {
125 goto out;
126 }
127 buf[r] = 0;
128 while (r > 0 && ::isspace(buf[--r])) {
129 buf[r] = 0;
130 }
131 r = parse_cpu_set_list(buf, cpu_set_size, cpu_set);
132 if (r < 0) {
133 goto out;
134 }
135 r = 0;
136 out:
137 ::close(fd);
138 return r;
139 }
140
/**
 * List the entries of a directory, excluding "." and "..".
 *
 * @param dir  path of the directory to read
 * @param out  set the entry names are inserted into (existing content is kept)
 * @return 0 on success, -errno if the directory cannot be opened or if
 *         reading it fails part-way (entries read so far remain in out)
 */
static int easy_readdir(const std::string& dir, std::set<std::string> *out)
{
  DIR *h = ::opendir(dir.c_str());
  if (!h) {
    return -errno;
  }
  int r = 0;
  while (true) {
    // readdir() returns NULL both at end-of-directory and on error; only
    // errno tells them apart, so it has to be cleared before every call
    errno = 0;
    struct dirent *de = ::readdir(h);
    if (!de) {
      r = -errno;  // 0 at end-of-directory
      break;
    }
    if (strcmp(de->d_name, ".") == 0 ||
        strcmp(de->d_name, "..") == 0) {
      continue;
    }
    out->insert(de->d_name);
  }
  ::closedir(h);
  return r;
}
158
159 static std::string get_task_comm(pid_t tid)
160 {
161 static const char* comm_fmt = "/proc/self/task/%d/comm";
162 char comm_name[strlen(comm_fmt) + 8];
163 snprintf(comm_name, sizeof(comm_name), comm_fmt, tid);
164 int fd = open(comm_name, O_CLOEXEC | O_RDONLY);
165 if (fd == -1) {
166 return "";
167 }
168 // see linux/sched.h
169 static constexpr int TASK_COMM_LEN = 16;
170 char name[TASK_COMM_LEN];
171 ssize_t n = safe_read(fd, name, sizeof(name));
172 close(fd);
173 if (n < 0) {
174 return "";
175 }
176 assert(n <= sizeof(name));
177 if (name[n - 1] == '\n') {
178 name[n - 1] = '\0';
179 } else {
180 name[n] = '\0';
181 }
182 return name;
183 }
184
185 int set_cpu_affinity_all_threads(size_t cpu_set_size, cpu_set_t *cpu_set)
186 {
187 // first set my affinity
188 int r = sched_setaffinity(getpid(), cpu_set_size, cpu_set);
189 if (r < 0) {
190 return -errno;
191 }
192
193 // make 2 passes here so that we (hopefully) catch racing threads creating
194 // threads.
195 for (unsigned pass = 0; pass < 2; ++pass) {
196 // enumerate all child threads from /proc
197 std::set<std::string> ls;
198 std::string path = "/proc/"s + stringify(getpid()) + "/task";
199 r = easy_readdir(path, &ls);
200 if (r < 0) {
201 return r;
202 }
203 for (auto& i : ls) {
204 pid_t tid = atoll(i.c_str());
205 if (!tid) {
206 continue; // wtf
207 }
208 #ifdef HAVE_DPDK
209 std::string thread_name = get_task_comm(tid);
210 static const char *dpdk_worker_name = "lcore-worker";
211 if (!thread_name.compare(0, strlen(dpdk_worker_name), dpdk_worker_name)) {
212 // ignore dpdk reactor thread, as it takes case of numa by itself
213 continue;
214 }
215 #endif
216 r = sched_setaffinity(tid, cpu_set_size, cpu_set);
217 if (r < 0) {
218 return -errno;
219 }
220 }
221 }
222 return 0;
223 }
224
225 #else
// Stub for builds where __linux__ is not defined: always fails with
// -ENOTSUP and leaves both output parameters untouched.
int parse_cpu_set_list(const char *s,
                       size_t *cpu_set_size,
                       cpu_set_t *cpu_set)
{
  (void)s;
  (void)cpu_set_size;
  (void)cpu_set;
  return -ENOTSUP;
}
232
// Stub for builds where __linux__ is not defined: ignores its arguments and
// returns an empty string.
std::string cpu_set_to_str_list(size_t cpu_set_size,
                                const cpu_set_t *cpu_set)
{
  (void)cpu_set_size;
  (void)cpu_set;
  return std::string();
}
238
// Stub for builds where __linux__ is not defined: ignores its arguments and
// returns an empty set.
std::set<int> cpu_set_to_set(size_t cpu_set_size,
                             const cpu_set_t *cpu_set)
{
  (void)cpu_set_size;
  (void)cpu_set;
  return std::set<int>();
}
244
// Stub for builds where __linux__ is not defined: always fails with
// -ENOTSUP and leaves both output parameters untouched.
int get_numa_node_cpu_set(int node,
                          size_t *cpu_set_size,
                          cpu_set_t *cpu_set)
{
  (void)node;
  (void)cpu_set_size;
  (void)cpu_set;
  return -ENOTSUP;
}
251
// Stub for builds where __linux__ is not defined: changes nothing and
// always fails with -ENOTSUP.
int set_cpu_affinity_all_threads(size_t cpu_set_size,
                                 cpu_set_t *cpu_set)
{
  (void)cpu_set_size;
  (void)cpu_set;
  return -ENOTSUP;
}
257
258 #endif