]>
Commit | Line | Data |
---|---|---|
28e407b8 AA |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
91327a77 AA |
3 | /* |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2011 New Dream Network | |
7 | * Copyright (C) 2018 Red Hat, Inc. | |
8 | * | |
9 | * This is free software; you can redistribute it and/or | |
10 | * modify it under the terms of the GNU Lesser General Public | |
11 | * License version 2.1, as published by the Free Software | |
12 | * Foundation. See file COPYING. | |
13 | * | |
14 | */ | |
28e407b8 | 15 | |
9f95a23c TL |
16 | #include <cstdio> |
17 | ||
91327a77 | 18 | #include <errno.h> |
28e407b8 AA |
19 | #include <fcntl.h> |
20 | #include <stdint.h> | |
f67539c2 | 21 | #include <stdio.h> |
20effc67 TL |
22 | #include "acconfig.h" |
23 | #ifdef HAVE_MEMSET_S | |
24 | # define __STDC_WANT_LIB_EXT1__ 1 | |
25 | #endif | |
28e407b8 | 26 | #include <string.h> |
f67539c2 TL |
27 | #include <thread> |
28 | #ifndef _WIN32 | |
28e407b8 | 29 | #include <sys/mount.h> |
f67539c2 TL |
30 | #else |
31 | #include <stdlib.h> | |
32 | #endif | |
91327a77 AA |
33 | #include <sys/param.h> |
34 | #include <sys/socket.h> | |
35 | #include <sys/stat.h> | |
36 | #include <sys/types.h> | |
37 | #include <unistd.h> | |
28e407b8 AA |
38 | #if defined(__linux__) |
39 | #include <sys/vfs.h> | |
40 | #endif | |
41 | ||
91327a77 AA |
42 | #include "include/compat.h" |
43 | #include "include/sock_compat.h" | |
28e407b8 AA |
44 | #include "common/safe_io.h" |
45 | ||
46 | // The type-value for a ZFS FS in fstatfs. | |
47 | #define FS_ZFS_TYPE 0xde | |
48 | ||
49 | // On FreeBSD, ZFS fallocate always fails since it is considered impossible to | |
50 | // reserve space on a COW filesystem. posix_fallocate() returns EINVAL | |
51 | // Linux in this case already emulates the reservation in glibc | |
52 | // In which case it is allocated manually, and still that is not a real guarantee | |
53 | // that a full buffer is allocated on disk, since it could be compressed. | |
54 | // To prevent this the written buffer needs to be loaded with random data. | |
55 | int manual_fallocate(int fd, off_t offset, off_t len) { | |
56 | int r = lseek(fd, offset, SEEK_SET); | |
57 | if (r == -1) | |
58 | return errno; | |
59 | char data[1024*128]; | |
60 | // TODO: compressing filesystems would require random data | |
92f5a8d4 | 61 | // FIPS zeroization audit 20191115: this memset is not security related. |
28e407b8 AA |
62 | memset(data, 0x42, sizeof(data)); |
63 | for (off_t off = 0; off < len; off += sizeof(data)) { | |
11fdf7f2 | 64 | if (off + static_cast<off_t>(sizeof(data)) > len) |
28e407b8 AA |
65 | r = safe_write(fd, data, len - off); |
66 | else | |
67 | r = safe_write(fd, data, sizeof(data)); | |
68 | if (r == -1) { | |
69 | return errno; | |
70 | } | |
71 | } | |
72 | return 0; | |
73 | } | |
74 | ||
75 | int on_zfs(int basedir_fd) { | |
f67539c2 | 76 | #ifndef _WIN32 |
28e407b8 AA |
77 | struct statfs basefs; |
78 | (void)fstatfs(basedir_fd, &basefs); | |
79 | return (basefs.f_type == FS_ZFS_TYPE); | |
f67539c2 TL |
80 | #else |
81 | return 0; | |
82 | #endif | |
28e407b8 AA |
83 | } |
84 | ||
85 | int ceph_posix_fallocate(int fd, off_t offset, off_t len) { | |
86 | // Return 0 if oke, otherwise errno > 0 | |
87 | ||
88 | #ifdef HAVE_POSIX_FALLOCATE | |
89 | if (on_zfs(fd)) { | |
90 | return manual_fallocate(fd, offset, len); | |
91 | } else { | |
92 | return posix_fallocate(fd, offset, len); | |
93 | } | |
94 | #elif defined(__APPLE__) | |
95 | fstore_t store; | |
96 | store.fst_flags = F_ALLOCATECONTIG; | |
97 | store.fst_posmode = F_PEOFPOSMODE; | |
98 | store.fst_offset = offset; | |
99 | store.fst_length = len; | |
100 | ||
101 | int ret = fcntl(fd, F_PREALLOCATE, &store); | |
102 | if (ret == -1) { | |
103 | ret = errno; | |
104 | } | |
105 | return ret; | |
106 | #else | |
107 | return manual_fallocate(fd, offset, len); | |
108 | #endif | |
109 | } | |
110 | ||
9f95a23c | 111 | int pipe_cloexec(int pipefd[2], int flags) |
91327a77 AA |
112 | { |
113 | #if defined(HAVE_PIPE2) | |
9f95a23c | 114 | return pipe2(pipefd, O_CLOEXEC | flags); |
91327a77 AA |
115 | #else |
116 | if (pipe(pipefd) == -1) | |
117 | return -1; | |
118 | ||
f67539c2 | 119 | #ifndef _WIN32 |
91327a77 AA |
120 | /* |
121 | * The old-fashioned, race-condition prone way that we have to fall | |
122 | * back on if pipe2 does not exist. | |
123 | */ | |
124 | if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) < 0) { | |
125 | goto fail; | |
126 | } | |
127 | ||
128 | if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) < 0) { | |
129 | goto fail; | |
130 | } | |
f67539c2 | 131 | #endif |
91327a77 AA |
132 | |
133 | return 0; | |
134 | fail: | |
135 | int save_errno = errno; | |
136 | VOID_TEMP_FAILURE_RETRY(close(pipefd[0])); | |
137 | VOID_TEMP_FAILURE_RETRY(close(pipefd[1])); | |
138 | return (errno = save_errno, -1); | |
139 | #endif | |
140 | } | |
141 | ||
142 | ||
// Create a socket with the close-on-exec flag set.
//
// Uses SOCK_CLOEXEC when the platform defines it; otherwise creates the
// socket first and sets FD_CLOEXEC with fcntl() (skipped on Windows).
// Returns the descriptor, or -1 with errno preserved from the failing call.
int socket_cloexec(int domain, int type, int protocol)
{
#ifdef SOCK_CLOEXEC
  return socket(domain, type|SOCK_CLOEXEC, protocol);
#else
  int sock = socket(domain, type, protocol);
  if (sock == -1)
    return -1;

#ifndef _WIN32
  if (fcntl(sock, F_SETFD, FD_CLOEXEC) < 0) {
    // close() may clobber errno; report the fcntl() failure instead.
    int save_errno = errno;
    VOID_TEMP_FAILURE_RETRY(close(sock));
    errno = save_errno;
    return -1;
  }
#endif

  return sock;
#endif
}
164 | ||
// Create a connected socket pair with the close-on-exec flag set on both
// ends.
//
// Uses SOCK_CLOEXEC when the platform defines it. Without it, Windows
// builds return -ENOTSUP; other builds create the pair and then set
// FD_CLOEXEC with fcntl(). In the fcntl() path a failure returns -1 with
// errno preserved and both descriptors closed.
int socketpair_cloexec(int domain, int type, int protocol, int sv[2])
{
#ifdef SOCK_CLOEXEC
  return socketpair(domain, type|SOCK_CLOEXEC, protocol, sv);
#elif _WIN32
  /* TODO */
  return -ENOTSUP;
#else
  if (socketpair(domain, type, protocol, sv) == -1)
    return -1;

  // Second fcntl() only runs when the first one succeeded.
  if (fcntl(sv[0], F_SETFD, FD_CLOEXEC) >= 0 &&
      fcntl(sv[1], F_SETFD, FD_CLOEXEC) >= 0) {
    return 0;
  }

  // close() may clobber errno; report the fcntl() failure instead.
  int save_errno = errno;
  VOID_TEMP_FAILURE_RETRY(close(sv[0]));
  VOID_TEMP_FAILURE_RETRY(close(sv[1]));
  errno = save_errno;
  return -1;
#endif
}
193 | ||
194 | int accept_cloexec(int sockfd, struct sockaddr* addr, socklen_t* addrlen) | |
195 | { | |
196 | #ifdef HAVE_ACCEPT4 | |
197 | return accept4(sockfd, addr, addrlen, SOCK_CLOEXEC); | |
198 | #else | |
199 | int fd = accept(sockfd, addr, addrlen); | |
200 | if (fd == -1) | |
201 | return -1; | |
202 | ||
f67539c2 | 203 | #ifndef _WIN32 |
91327a77 AA |
204 | if (fcntl(fd, F_SETFD, FD_CLOEXEC) < 0) |
205 | goto fail; | |
f67539c2 | 206 | #endif |
91327a77 AA |
207 | |
208 | return fd; | |
209 | fail: | |
210 | int save_errno = errno; | |
211 | VOID_TEMP_FAILURE_RETRY(close(fd)); | |
212 | return (errno = save_errno, -1); | |
213 | #endif | |
214 | } | |
11fdf7f2 TL |
215 | |
#if defined(__FreeBSD__)
// FreeBSD stand-in for sched_setaffinity(): ignores every argument and
// reports success without changing any CPU affinity.
int sched_setaffinity(pid_t pid, size_t cpusetsize,
                      cpu_set_t *mask)
{
  (void)pid;
  (void)cpusetsize;
  (void)mask;
  return 0;
}
#endif
223 | ||
9f95a23c TL |
// Portable strerror_r() wrapper that always hands back a char pointer.
//
// - Windows: formats into `buf` with strerror_s() and returns `buf`.
// - STRERROR_R_CHAR_P: returns whatever strerror_r() returns.
// - Otherwise: returns `buf`; when strerror_r() reports a non-zero result,
//   `buf` is overwritten with "Unknown error <errnum>".
char *ceph_strerror_r(int errnum, char *buf, size_t buflen)
{
#ifdef _WIN32
  strerror_s(buf, buflen, errnum);
  return buf;
#elif defined(STRERROR_R_CHAR_P)
  return strerror_r(errnum, buf, buflen);
#else
  if (strerror_r(errnum, buf, buflen) == 0)
    return buf;

  snprintf(buf, buflen, "Unknown error %d", errnum);
  return buf;
#endif
}
f67539c2 TL |
238 | |
// Zero `count` bytes of the `destsz`-byte buffer at `dest` in a way the
// compiler is not supposed to optimize away.
//
// With HAVE_MEMSET_S this is memset_s() and its result is returned.
// Otherwise SecureZeroMemory() (Windows) or explicit_bzero() is used. In
// those paths `destsz` is now enforced: at most `destsz` bytes are cleared,
// and a NULL `dest` or `count > destsz` yields EINVAL instead of a crash or
// a write past the end of the buffer.
//
// Returns 0 on success, non-zero on error.
int ceph_memzero_s(void *dest, size_t destsz, size_t count) {
#ifdef HAVE_MEMSET_S
  return memset_s(dest, destsz, 0, count);
#else
  if (dest == NULL)
    return EINVAL;

  // Never clear more than the caller says the buffer can hold.
  const size_t n = count < destsz ? count : destsz;
#if defined(_WIN32)
  SecureZeroMemory(dest, n);
#else
  explicit_bzero(dest, n);
#endif
  return count > destsz ? EINVAL : 0;
#endif
}
249 | ||
250 | #ifdef _WIN32 | |
251 | ||
252 | #include <iomanip> | |
253 | #include <ctime> | |
254 | ||
// chown is not available on Windows. Plus, changing file owners is not
// a common practice on Windows. This stand-in ignores its arguments and
// always reports success.
int chown(const char *path, uid_t owner, gid_t group) {
  (void)path;
  (void)owner;
  (void)group;
  return 0;
}
260 | ||
// Windows stand-in for fchown(): ignores its arguments and always reports
// success.
int fchown(int fd, uid_t owner, gid_t group) {
  (void)fd;
  (void)owner;
  (void)group;
  return 0;
}
264 | ||
// Windows stand-in for lchown(): ignores its arguments and always reports
// success.
int lchown(const char *path, uid_t owner, gid_t group) {
  (void)path;
  (void)owner;
  (void)group;
  return 0;
}
268 | ||
269 | int posix_memalign(void **memptr, size_t alignment, size_t size) { | |
270 | *memptr = _aligned_malloc(size, alignment); | |
271 | return *memptr ? 0 : errno; | |
272 | } | |
273 | ||
274 | char *strptime(const char *s, const char *format, struct tm *tm) { | |
275 | std::istringstream input(s); | |
276 | input.imbue(std::locale(setlocale(LC_ALL, nullptr))); | |
277 | input >> std::get_time(tm, format); | |
278 | if (input.fail()) { | |
279 | return nullptr; | |
280 | } | |
281 | return (char*)(s + input.tellg()); | |
282 | } | |
283 | ||
284 | int pipe(int pipefd[2]) { | |
285 | // We'll use the same pipe size as Linux (64kb). | |
286 | return _pipe(pipefd, 0x10000, O_NOINHERIT); | |
287 | } | |
288 | ||
// lrand48 is not available on Windows. We'll generate a pseudo-random
// value in the 0 - 2^31 range by calling rand twice: the first result is
// shifted left by 16 bits and the second one is added to it.
long int lrand48(void) {
  long int high = (long int) rand();
  long int low = (long int) rand();
  return (high << 16) + low;
}
298 | ||
299 | int random() { | |
300 | return rand(); | |
301 | } | |
302 | ||
303 | int fsync(int fd) { | |
304 | HANDLE handle = (HANDLE*)_get_osfhandle(fd); | |
305 | if (handle == INVALID_HANDLE_VALUE) | |
306 | return -1; | |
307 | if (!FlushFileBuffers(handle)) | |
308 | return -1; | |
309 | return 0; | |
310 | } | |
311 | ||
312 | ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset) { | |
313 | DWORD bytes_written = 0; | |
314 | ||
315 | HANDLE handle = (HANDLE*)_get_osfhandle(fd); | |
316 | if (handle == INVALID_HANDLE_VALUE) | |
317 | return -1; | |
318 | ||
319 | OVERLAPPED overlapped = { 0 }; | |
320 | ULARGE_INTEGER offsetUnion; | |
321 | offsetUnion.QuadPart = offset; | |
322 | ||
323 | overlapped.Offset = offsetUnion.LowPart; | |
324 | overlapped.OffsetHigh = offsetUnion.HighPart; | |
325 | ||
326 | if (!WriteFile(handle, buf, count, &bytes_written, &overlapped)) | |
327 | // we may consider mapping error codes, although that may | |
328 | // not be exhaustive. | |
329 | return -1; | |
330 | ||
331 | return bytes_written; | |
332 | } | |
333 | ||
334 | ssize_t pread(int fd, void *buf, size_t count, off_t offset) { | |
335 | DWORD bytes_read = 0; | |
336 | ||
337 | HANDLE handle = (HANDLE*)_get_osfhandle(fd); | |
338 | if (handle == INVALID_HANDLE_VALUE) | |
339 | return -1; | |
340 | ||
341 | OVERLAPPED overlapped = { 0 }; | |
342 | ULARGE_INTEGER offsetUnion; | |
343 | offsetUnion.QuadPart = offset; | |
344 | ||
345 | overlapped.Offset = offsetUnion.LowPart; | |
346 | overlapped.OffsetHigh = offsetUnion.HighPart; | |
347 | ||
348 | if (!ReadFile(handle, buf, count, &bytes_read, &overlapped)) { | |
349 | if (GetLastError() != ERROR_HANDLE_EOF) | |
350 | return -1; | |
351 | } | |
352 | ||
353 | return bytes_read; | |
354 | } | |
355 | ||
356 | ssize_t preadv(int fd, const struct iovec *iov, int iov_cnt) { | |
357 | ssize_t read = 0; | |
358 | ||
359 | for (int i = 0; i < iov_cnt; i++) { | |
360 | int r = ::read(fd, iov[i].iov_base, iov[i].iov_len); | |
361 | if (r < 0) | |
362 | return r; | |
363 | read += r; | |
364 | if (r < iov[i].iov_len) | |
365 | break; | |
366 | } | |
367 | ||
368 | return read; | |
369 | } | |
370 | ||
371 | ssize_t writev(int fd, const struct iovec *iov, int iov_cnt) { | |
372 | ssize_t written = 0; | |
373 | ||
374 | for (int i = 0; i < iov_cnt; i++) { | |
375 | int r = ::write(fd, iov[i].iov_base, iov[i].iov_len); | |
376 | if (r < 0) | |
377 | return r; | |
378 | written += r; | |
379 | if (r < iov[i].iov_len) | |
380 | break; | |
381 | } | |
382 | ||
383 | return written; | |
384 | } | |
385 | ||
// Touch a function-local thread-local counter: increments it and returns a
// reference to it. Called by apply_tls_workaround().
int &alloc_tls() {
  static __thread int tlsvar;
  return ++tlsvar;
}
391 | ||
392 | void apply_tls_workaround() { | |
393 | // Workaround for the following Mingw bugs: | |
394 | // https://sourceforge.net/p/mingw-w64/bugs/727/ | |
395 | // https://sourceforge.net/p/mingw-w64/bugs/527/ | |
396 | // https://sourceforge.net/p/mingw-w64/bugs/445/ | |
397 | // https://gcc.gnu.org/bugzilla/attachment.cgi?id=41382 | |
398 | pthread_key_t key; | |
399 | pthread_key_create(&key, nullptr); | |
400 | // Use a TLS slot for emutls | |
401 | alloc_tls(); | |
402 | // Free up a slot that can now be used for c++ destructors | |
403 | pthread_key_delete(key); | |
404 | } | |
405 | ||
406 | CEPH_CONSTRUCTOR(ceph_windows_init) { | |
407 | // This will run at startup time before invoking main(). | |
408 | WSADATA wsaData; | |
409 | int error; | |
410 | ||
411 | #ifdef __MINGW32__ | |
412 | apply_tls_workaround(); | |
413 | #endif | |
414 | ||
415 | error = WSAStartup(MAKEWORD(2, 2), &wsaData); | |
416 | if (error != 0) { | |
417 | fprintf(stderr, "WSAStartup failed: %d", WSAGetLastError()); | |
418 | exit(error); | |
419 | } | |
420 | } | |
421 | ||
422 | int _win_socketpair(int socks[2]) | |
423 | { | |
424 | union { | |
425 | struct sockaddr_in inaddr; | |
426 | struct sockaddr addr; | |
427 | } a; | |
428 | SOCKET listener; | |
429 | int e; | |
430 | socklen_t addrlen = sizeof(a.inaddr); | |
431 | int reuse = 1; | |
432 | ||
433 | if (socks == 0) { | |
434 | WSASetLastError(WSAEINVAL); | |
435 | return -1; | |
436 | } | |
437 | ||
438 | listener = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); | |
439 | if (listener == INVALID_SOCKET) { | |
440 | return -1; | |
441 | } | |
442 | ||
443 | memset(&a, 0, sizeof(a)); | |
444 | a.inaddr.sin_family = AF_INET; | |
445 | a.inaddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); | |
446 | a.inaddr.sin_port = 0; | |
447 | ||
448 | socks[0] = socks[1] = -1; | |
449 | SOCKET s[2] = { INVALID_SOCKET, INVALID_SOCKET }; | |
450 | ||
451 | do { | |
452 | if (setsockopt(listener, SOL_SOCKET, SO_REUSEADDR, | |
453 | (char*) &reuse, (socklen_t) sizeof(reuse)) == -1) | |
454 | break; | |
455 | if (bind(listener, &a.addr, sizeof(a.inaddr)) == SOCKET_ERROR) | |
456 | break; | |
457 | if (getsockname(listener, &a.addr, &addrlen) == SOCKET_ERROR) | |
458 | break; | |
459 | if (listen(listener, 1) == SOCKET_ERROR) | |
460 | break; | |
461 | s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); | |
462 | if (s[0] == INVALID_SOCKET) | |
463 | break; | |
464 | if (connect(s[0], &a.addr, sizeof(a.inaddr)) == SOCKET_ERROR) | |
465 | break; | |
466 | s[1] = accept(listener, NULL, NULL); | |
467 | if (s[1] == INVALID_SOCKET) | |
468 | break; | |
469 | ||
470 | closesocket(listener); | |
471 | ||
472 | // The Windows socket API is mostly compatible with the Berkeley | |
473 | // API, with a few exceptions. The Windows socket functions use | |
474 | // SOCKET instead of int. The issue is that on x64 systems, | |
475 | // SOCKET uses 64b while int uses 32b. There's been much debate | |
476 | // whether casting a Windows socket to an int is safe or not. | |
477 | // Worth noting that Windows kernel objects use 32b. For now, | |
478 | // we're just adding a check. | |
479 | // | |
480 | // Ideally, we should update ceph to use the right type but this | |
481 | // can be quite difficult, especially considering that there are | |
482 | // a significant number of functions that accept both sockets and | |
483 | // file descriptors. | |
484 | if (s[0] >> 32 || s[1] >> 32) { | |
485 | WSASetLastError(WSAENAMETOOLONG); | |
486 | break; | |
487 | } | |
488 | ||
489 | socks[0] = s[0]; | |
490 | socks[1] = s[1]; | |
491 | ||
492 | return 0; | |
493 | ||
494 | } while (0); | |
495 | ||
496 | e = WSAGetLastError(); | |
497 | closesocket(listener); | |
498 | closesocket(s[0]); | |
499 | closesocket(s[1]); | |
500 | WSASetLastError(e); | |
501 | return -1; | |
502 | } | |
503 | ||
504 | int win_socketpair(int socks[2]) { | |
505 | int r = 0; | |
506 | for (int i = 0; i < 15; i++) { | |
507 | r = _win_socketpair(socks); | |
508 | if (r && WSAGetLastError() == WSAEADDRINUSE) { | |
509 | sleep(2); | |
510 | continue; | |
511 | } | |
512 | else { | |
513 | break; | |
514 | } | |
515 | } | |
516 | return r; | |
517 | } | |
518 | ||
519 | unsigned get_page_size() { | |
520 | SYSTEM_INFO system_info; | |
521 | GetSystemInfo(&system_info); | |
522 | return system_info.dwPageSize; | |
523 | } | |
524 | ||
525 | int setenv(const char *name, const char *value, int overwrite) { | |
526 | if (!overwrite && getenv(name)) { | |
527 | return 0; | |
528 | } | |
529 | return _putenv_s(name, value); | |
530 | } | |
531 | ||
532 | ssize_t get_self_exe_path(char* path, int buff_length) { | |
533 | return GetModuleFileName(NULL, path, buff_length - 1); | |
534 | } | |
535 | ||
// Windows stand-in for geteuid(): always reports 0.
int geteuid()
{
  const int id = 0;
  return id;
}
540 | ||
// Windows stand-in for getegid(): always reports 0.
int getegid()
{
  const int id = 0;
  return id;
}
545 | ||
// Windows stand-in for getuid(): always reports 0.
int getuid()
{
  const int id = 0;
  return id;
}
550 | ||
// Windows stand-in for getgid(): always reports 0.
int getgid()
{
  const int id = 0;
  return id;
}
555 | ||
556 | #else | |
557 | ||
// Return the page size reported by sysconf(_SC_PAGESIZE), converted to
// unsigned.
unsigned get_page_size() {
  const long page_size = sysconf(_SC_PAGESIZE);
  return page_size;
}
561 | ||
// Store the path of the current executable in `path` by resolving the
// /proc/self/exe symlink.
//
// `buff_length` is the size of the `path` buffer in bytes. At most
// `buff_length - 1` bytes are written, followed by a terminating NUL.
// Returns the number of path bytes stored (excluding the NUL), or -1 with
// errno set on failure.
ssize_t get_self_exe_path(char* path, int buff_length) {
  if (path == NULL || buff_length <= 0) {
    errno = EINVAL;
    return -1;
  }

  // Use the caller's buffer size. sizeof(buff_length) is the size of an
  // int, which capped the result at 3 bytes regardless of the buffer.
  ssize_t len = readlink("/proc/self/exe", path, buff_length - 1);
  if (len >= 0) {
    // readlink() does not NUL-terminate; one byte was reserved for it.
    path[len] = '\0';
  }
  return len;
}
566 | ||
567 | #endif /* _WIN32 */ |