]>
Commit | Line | Data |
---|---|---|
19b276cb | 1 | /* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2017 Nicira, Inc. |
f85f8ebb BP |
2 | * |
3 | * Licensed under the Apache License, Version 2.0 (the "License"); | |
4 | * you may not use this file except in compliance with the License. | |
5 | * You may obtain a copy of the License at: | |
6 | * | |
7 | * http://www.apache.org/licenses/LICENSE-2.0 | |
8 | * | |
9 | * Unless required by applicable law or agreed to in writing, software | |
10 | * distributed under the License is distributed on an "AS IS" BASIS, | |
11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
12 | * See the License for the specific language governing permissions and | |
13 | * limitations under the License. | |
14 | */ | |
15 | ||
16 | #include <config.h> | |
17 | ||
41709ccc | 18 | #include "log.h" |
f85f8ebb | 19 | |
f85f8ebb BP |
20 | #include <errno.h> |
21 | #include <fcntl.h> | |
22 | #include <stdlib.h> | |
23 | #include <string.h> | |
3762274e | 24 | #include <sys/stat.h> |
f85f8ebb BP |
25 | #include <unistd.h> |
26 | ||
f70b61d3 | 27 | #include "lockfile.h" |
c9167341 | 28 | #include "openvswitch/dynamic-string.h" |
ee89ea7b | 29 | #include "openvswitch/json.h" |
c7007aa7 | 30 | #include "openvswitch/vlog.h" |
f70b61d3 BP |
31 | #include "ovs-atomic.h" |
32 | #include "ovs-rcu.h" | |
33 | #include "ovs-thread.h" | |
f85f8ebb | 34 | #include "ovsdb-error.h" |
f70b61d3 BP |
35 | #include "ovsdb.h" |
36 | #include "openvswitch/poll-loop.h" | |
37 | #include "seq.h" | |
f85f8ebb | 38 | #include "sha1.h" |
8e71cf88 | 39 | #include "socket-util.h" |
41709ccc | 40 | #include "transaction.h" |
f85f8ebb | 41 | #include "util.h" |
5136ce49 | 42 | |
c7007aa7 BP |
43 | VLOG_DEFINE_THIS_MODULE(ovsdb_log); |
44 | ||
/* The states of a log's state machine.
 *
 * OVSDB_LOG_READ: the initial state of a newly opened log.  Only reads of log
 * records are possible in this state.  Hitting end of file does not change
 * the state.  A read error moves the log to OVSDB_LOG_READ_ERROR.
 *
 * OVSDB_LOG_READ_ERROR: no further read can succeed; each one reports the
 * same error as before.  A log write leaves this state for OVSDB_LOG_WRITE or
 * OVSDB_LOG_WRITE_ERROR.
 *
 * OVSDB_LOG_WRITE: the state after a call to ovsdb_log_write() that went
 * well.  It may be entered from any state except OVSDB_LOG_BROKEN.  A write
 * error moves the log to OVSDB_LOG_WRITE_ERROR.
 *
 * OVSDB_LOG_WRITE_ERROR: the state after a write error.  Later writes retry
 * and might bring the log back to OVSDB_LOG_WRITE.
 *
 * OVSDB_LOG_BROKEN: the state after a call to ovsdb_log_replace() or
 * ovsdb_log_replace_commit() that failed so badly that no further read or
 * write can succeed.  This state is terminal.
 */
enum ovsdb_log_state {
    OVSDB_LOG_READ,             /* Ready to read. */
    OVSDB_LOG_READ_ERROR,       /* Read failed, see 'error' for details. */
    OVSDB_LOG_WRITE,            /* Ready to write. */
    OVSDB_LOG_WRITE_ERROR,      /* Write failed, see 'error' for details. */
    OVSDB_LOG_BROKEN,           /* Disk on fire, see 'error' for details. */
};
73 | ||
41709ccc | 74 | struct ovsdb_log { |
421fc8a1 BP |
75 | enum ovsdb_log_state state; |
76 | struct ovsdb_error *error; | |
77 | ||
43675e26 | 78 | off_t prev_offset; |
f85f8ebb | 79 | off_t offset; |
02f4f231 BP |
80 | char *name; /* Absolute name of file. */ |
81 | char *display_name; /* For use in log messages, etc. */ | |
19b276cb | 82 | char *magic; |
f85f8ebb BP |
83 | struct lockfile *lockfile; |
84 | FILE *stream; | |
4cc9d1f0 | 85 | off_t base; |
f70b61d3 | 86 | struct afsync *afsync; |
f85f8ebb BP |
87 | }; |
88 | ||
/* True if the OS allows a file to be renamed while it is open.
 *
 * (This is a variable rather than a compile-time constant so that both
 * strategies are easier to test on Unix-like systems.) */
#ifndef _WIN32
static bool rename_open_files = true;
#else
static bool rename_open_files = false;
#endif
98 | ||
fed27759 BP |
99 | static bool parse_header(char *header, const char **magicp, |
100 | unsigned long int *length, | |
101 | uint8_t sha1[SHA1_DIGEST_SIZE]); | |
102 | static bool is_magic_ok(const char *needle, const char *haystack); | |
103 | ||
f70b61d3 BP |
104 | static struct afsync *afsync_create(int fd, uint64_t initial_ticket); |
105 | static uint64_t afsync_destroy(struct afsync *); | |
106 | ||
7446f148 BP |
107 | /* Attempts to open 'name' with the specified 'open_mode'. On success, stores |
108 | * the new log into '*filep' and returns NULL; otherwise returns NULL and | |
109 | * stores NULL into '*filep'. | |
110 | * | |
19b276cb BP |
111 | * 'magic' is a short text string put at the beginning of every record and used |
112 | * to distinguish one kind of log file from another. For a conventional OVSDB | |
fed27759 BP |
113 | * log file, use the OVSDB_MAGIC macro. To accept more than one magic string, |
114 | * separate them with "|", e.g. "MAGIC 1|MAGIC 2". | |
19b276cb | 115 | * |
7446f148 BP |
116 | * Whether the file will be locked using lockfile_lock() depends on 'locking': |
117 | * use true to lock it, false not to lock it, or -1 to lock it only if | |
118 | * 'open_mode' is a mode that allows writing. | |
4cc9d1f0 BP |
119 | * |
120 | * A log consists of a series of records. After opening or creating a log with | |
121 | * this function, the client may use ovsdb_log_read() to read any existing | |
122 | * records, one by one. The client may also use ovsdb_log_write() to write new | |
123 | * records (if some records have not yet been read at this point, then the | |
124 | * first write truncates them). | |
7446f148 | 125 | */ |
f85f8ebb | 126 | struct ovsdb_error * |
19b276cb BP |
127 | ovsdb_log_open(const char *name, const char *magic, |
128 | enum ovsdb_log_open_mode open_mode, | |
7446f148 | 129 | int locking, struct ovsdb_log **filep) |
f85f8ebb BP |
130 | { |
131 | struct lockfile *lockfile; | |
132 | struct ovsdb_error *error; | |
f85f8ebb BP |
133 | struct stat s; |
134 | FILE *stream; | |
7446f148 | 135 | int flags; |
f85f8ebb BP |
136 | int fd; |
137 | ||
fed27759 BP |
138 | /* If we can create a new file, we need to know what kind of magic to |
139 | * use, so there must be only one kind. */ | |
140 | if (open_mode == OVSDB_LOG_CREATE_EXCL || open_mode == OVSDB_LOG_CREATE) { | |
141 | ovs_assert(!strchr(magic, '|')); | |
142 | } | |
02f4f231 | 143 | |
f85f8ebb BP |
144 | *filep = NULL; |
145 | ||
02f4f231 BP |
146 | /* Get the absolute name of the file because we might need to access it by |
147 | * name again later after the process has changed directory (e.g. because | |
148 | * daemonize() chdirs to "/"). | |
149 | * | |
150 | * We save the user-provided name of the file for use in log messages, to | |
151 | * reduce user confusion. */ | |
152 | char *abs_name = abs_file_name(NULL, name); | |
153 | if (!abs_name) { | |
154 | error = ovsdb_io_error(0, "could not determine current " | |
155 | "working directory"); | |
156 | goto error; | |
157 | } | |
158 | ||
cb22974d | 159 | ovs_assert(locking == -1 || locking == false || locking == true); |
7446f148 BP |
160 | if (locking < 0) { |
161 | locking = open_mode != OVSDB_LOG_READ_ONLY; | |
162 | } | |
163 | if (locking) { | |
4770e795 | 164 | int retval = lockfile_lock(name, &lockfile); |
f85f8ebb BP |
165 | if (retval) { |
166 | error = ovsdb_io_error(retval, "%s: failed to lock lockfile", | |
167 | name); | |
168 | goto error; | |
169 | } | |
170 | } else { | |
171 | lockfile = NULL; | |
172 | } | |
173 | ||
1e0b7e94 BP |
174 | switch (open_mode) { |
175 | case OVSDB_LOG_READ_ONLY: | |
7446f148 | 176 | flags = O_RDONLY; |
1e0b7e94 BP |
177 | break; |
178 | ||
179 | case OVSDB_LOG_READ_WRITE: | |
7446f148 | 180 | flags = O_RDWR; |
1e0b7e94 BP |
181 | break; |
182 | ||
183 | case OVSDB_LOG_CREATE_EXCL: | |
7470e8e6 | 184 | #ifndef _WIN32 |
01ca539f BP |
185 | if (stat(name, &s) == -1 && errno == ENOENT |
186 | && lstat(name, &s) == 0 && S_ISLNK(s.st_mode)) { | |
187 | /* 'name' is a dangling symlink. We want to create the file that | |
188 | * the symlink points to, but POSIX says that open() with O_EXCL | |
189 | * must fail with EEXIST if the named file is a symlink. So, we | |
190 | * have to leave off O_EXCL and accept the race. */ | |
191 | flags = O_RDWR | O_CREAT; | |
192 | } else { | |
193 | flags = O_RDWR | O_CREAT | O_EXCL; | |
194 | } | |
7470e8e6 GS |
195 | #else |
196 | flags = O_RDWR | O_CREAT | O_EXCL; | |
197 | #endif | |
1e0b7e94 BP |
198 | break; |
199 | ||
200 | case OVSDB_LOG_CREATE: | |
201 | flags = O_RDWR | O_CREAT; | |
202 | break; | |
203 | ||
204 | default: | |
428b2edd | 205 | OVS_NOT_REACHED(); |
7446f148 | 206 | } |
3e94784c GS |
207 | #ifdef _WIN32 |
208 | flags = flags | O_BINARY; | |
209 | #endif | |
71e4030f BP |
210 | /* Special case for /dev/stdin to make it work even if the operating system |
211 | * doesn't support it under that name. */ | |
212 | if (!strcmp(name, "/dev/stdin") && open_mode == OVSDB_LOG_READ_ONLY) { | |
213 | fd = dup(STDIN_FILENO); | |
214 | } else { | |
215 | fd = open(name, flags, 0666); | |
216 | } | |
f85f8ebb | 217 | if (fd < 0) { |
1e0b7e94 BP |
218 | const char *op = (open_mode == OVSDB_LOG_CREATE_EXCL ? "create" |
219 | : open_mode == OVSDB_LOG_CREATE ? "create or open" | |
220 | : "open"); | |
a6be657b | 221 | error = ovsdb_io_error(errno, "%s: %s failed", name, op); |
f85f8ebb BP |
222 | goto error_unlock; |
223 | } | |
224 | ||
7446f148 | 225 | stream = fdopen(fd, open_mode == OVSDB_LOG_READ_ONLY ? "rb" : "w+b"); |
f85f8ebb BP |
226 | if (!stream) { |
227 | error = ovsdb_io_error(errno, "%s: fdopen failed", name); | |
fed27759 BP |
228 | close(fd); |
229 | goto error_unlock; | |
230 | } | |
231 | ||
232 | /* Read the magic from the first log record. */ | |
233 | char header[128]; | |
234 | const char *actual_magic; | |
235 | if (!fgets(header, sizeof header, stream)) { | |
236 | if (ferror(stream)) { | |
237 | error = ovsdb_io_error(errno, "%s: read error", name); | |
238 | goto error_fclose; | |
239 | } | |
240 | ||
241 | /* We need to be able to report what kind of file this is but we can't | |
242 | * if it's empty and we accept more than one. */ | |
243 | if (strchr(magic, '|')) { | |
244 | error = ovsdb_error(NULL, "%s: cannot identify file type", name); | |
245 | goto error_fclose; | |
246 | } | |
247 | actual_magic = magic; | |
248 | ||
249 | /* It's an empty file and therefore probably a new file, so fsync() | |
250 | * its parent directory to ensure that its directory entry is | |
251 | * committed to disk. */ | |
252 | fsync_parent_dir(name); | |
253 | } else { | |
254 | unsigned long int length; | |
255 | uint8_t sha1[SHA1_DIGEST_SIZE]; | |
256 | if (!parse_header(header, &actual_magic, &length, sha1)) { | |
257 | error = ovsdb_error(NULL, "%s: unexpected file format", name); | |
258 | goto error_fclose; | |
259 | } else if (!is_magic_ok(actual_magic, magic)) { | |
260 | error = ovsdb_error(NULL, "%s: cannot identify file type", name); | |
261 | goto error_fclose; | |
262 | } | |
263 | } | |
264 | ||
265 | if (fseek(stream, 0, SEEK_SET)) { | |
266 | error = ovsdb_io_error(errno, "%s: seek failed", name); | |
267 | goto error_fclose; | |
f85f8ebb BP |
268 | } |
269 | ||
fed27759 | 270 | struct ovsdb_log *file = xmalloc(sizeof *file); |
421fc8a1 BP |
271 | file->state = OVSDB_LOG_READ; |
272 | file->error = NULL; | |
02f4f231 BP |
273 | file->name = abs_name; |
274 | file->display_name = xstrdup(name); | |
fed27759 | 275 | file->magic = xstrdup(actual_magic); |
f85f8ebb BP |
276 | file->lockfile = lockfile; |
277 | file->stream = stream; | |
43675e26 | 278 | file->prev_offset = 0; |
f85f8ebb | 279 | file->offset = 0; |
4cc9d1f0 | 280 | file->base = 0; |
f70b61d3 | 281 | file->afsync = NULL; |
f85f8ebb BP |
282 | *filep = file; |
283 | return NULL; | |
284 | ||
fed27759 BP |
285 | error_fclose: |
286 | fclose(stream); | |
f85f8ebb BP |
287 | error_unlock: |
288 | lockfile_unlock(lockfile); | |
289 | error: | |
02f4f231 | 290 | free(abs_name); |
f85f8ebb BP |
291 | return error; |
292 | } | |
293 | ||
/* Returns true if 'needle' is exactly one of the words in 'haystack', where
 * the words in 'haystack' are separated by '|'.  A 'needle' that itself
 * contains '|' never matches. */
static bool
is_magic_ok(const char *needle, const char *haystack)
{
    /* 'needle' has to be a single word. */
    if (strchr(needle, '|') != NULL) {
        return false;
    }

    const size_t needle_len = strlen(needle);
    const char *word = haystack;
    while (word) {
        /* 'word' matches if it starts with 'needle' and the word ends right
         * after it, either at a '|' delimiter or at the end of the string. */
        if (strncmp(needle, word, needle_len) == 0
            && (word[needle_len] == '|' || word[needle_len] == '\0')) {
            return true;
        }

        /* Move on to the word after the next delimiter, if there is one. */
        const char *bar = strchr(word, '|');
        word = bar ? bar + 1 : NULL;
    }
    return false;
}
315 | ||
f85f8ebb | 316 | void |
41709ccc | 317 | ovsdb_log_close(struct ovsdb_log *file) |
f85f8ebb BP |
318 | { |
319 | if (file) { | |
421fc8a1 | 320 | ovsdb_error_destroy(file->error); |
f70b61d3 | 321 | afsync_destroy(file->afsync); |
f85f8ebb | 322 | free(file->name); |
02f4f231 | 323 | free(file->display_name); |
19b276cb | 324 | free(file->magic); |
421fc8a1 BP |
325 | if (file->stream) { |
326 | fclose(file->stream); | |
327 | } | |
f85f8ebb | 328 | lockfile_unlock(file->lockfile); |
f85f8ebb BP |
329 | free(file); |
330 | } | |
331 | } | |
332 | ||
fed27759 BP |
333 | const char * |
334 | ovsdb_log_get_magic(const struct ovsdb_log *log) | |
335 | { | |
336 | return log->magic; | |
337 | } | |
338 | ||
339 | /* Attempts to parse 'header' as a header line for an OVSDB log record (as | |
340 | * described in ovsdb(5)). Stores a pointer to the magic string in '*magicp', | |
341 | * the length in *length, and the parsed sha1 value in sha1[]. | |
342 | * | |
343 | * Modifies 'header' and points '*magicp' inside it. | |
344 | * | |
345 | * Returns true if successful, false on failure. */ | |
f85f8ebb | 346 | static bool |
fed27759 BP |
347 | parse_header(char *header, const char **magicp, |
348 | unsigned long int *length, uint8_t sha1[SHA1_DIGEST_SIZE]) | |
f85f8ebb | 349 | { |
fed27759 BP |
350 | /* 'header' must consist of "OVSDB "... */ |
351 | const char lead[] = "OVSDB "; | |
352 | if (strncmp(lead, header, strlen(lead))) { | |
353 | return false; | |
354 | } | |
f85f8ebb | 355 | |
fed27759 BP |
356 | /* ...followed by a magic string... */ |
357 | char *magic = header + strlen(lead); | |
358 | size_t magic_len = strcspn(magic, " "); | |
359 | if (magic[magic_len] != ' ') { | |
f85f8ebb BP |
360 | return false; |
361 | } | |
fed27759 BP |
362 | magic[magic_len] = '\0'; |
363 | *magicp = magic; | |
f85f8ebb BP |
364 | |
365 | /* ...followed by a length in bytes... */ | |
fed27759 BP |
366 | char *p; |
367 | *length = strtoul(magic + magic_len + 1, &p, 10); | |
f85f8ebb BP |
368 | if (!*length || *length == ULONG_MAX || *p != ' ') { |
369 | return false; | |
370 | } | |
371 | p++; | |
372 | ||
373 | /* ...followed by a SHA-1 hash... */ | |
374 | if (!sha1_from_hex(sha1, p)) { | |
375 | return false; | |
376 | } | |
377 | p += SHA1_HEX_DIGEST_LEN; | |
378 | ||
379 | /* ...and ended by a new-line. */ | |
380 | if (*p != '\n') { | |
381 | return false; | |
382 | } | |
383 | ||
384 | return true; | |
385 | } | |
386 | ||
f85f8ebb | 387 | static struct ovsdb_error * |
41709ccc | 388 | parse_body(struct ovsdb_log *file, off_t offset, unsigned long int length, |
f85f8ebb BP |
389 | uint8_t sha1[SHA1_DIGEST_SIZE], struct json **jsonp) |
390 | { | |
f85f8ebb BP |
391 | struct json_parser *parser; |
392 | struct sha1_ctx ctx; | |
393 | ||
394 | sha1_init(&ctx); | |
395 | parser = json_parser_create(JSPF_TRAILER); | |
396 | ||
f85f8ebb BP |
397 | while (length > 0) { |
398 | char input[BUFSIZ]; | |
399 | int chunk; | |
400 | ||
401 | chunk = MIN(length, sizeof input); | |
402 | if (fread(input, 1, chunk, file->stream) != chunk) { | |
403 | json_parser_abort(parser); | |
404 | return ovsdb_io_error(ferror(file->stream) ? errno : EOF, | |
405 | "%s: error reading %lu bytes " | |
02f4f231 BP |
406 | "starting at offset %lld", |
407 | file->display_name, length, | |
408 | (long long int) offset); | |
f85f8ebb BP |
409 | } |
410 | sha1_update(&ctx, input, chunk); | |
411 | json_parser_feed(parser, input, chunk); | |
412 | length -= chunk; | |
413 | } | |
414 | ||
415 | sha1_final(&ctx, sha1); | |
416 | *jsonp = json_parser_finish(parser); | |
417 | return NULL; | |
418 | } | |
419 | ||
4407aa48 BP |
420 | /* Attempts to read a log record from 'file'. |
421 | * | |
422 | * If successful, returns NULL and stores in '*jsonp' the JSON object that the | |
423 | * record contains. The caller owns the data and must eventually free it (with | |
424 | * json_destroy()). | |
425 | * | |
426 | * If a read error occurs, returns the error and stores NULL in '*jsonp'. | |
427 | * | |
428 | * If the read reaches end of file, returns NULL and stores NULL in | |
429 | * '*jsonp'. */ | |
f85f8ebb | 430 | struct ovsdb_error * |
41709ccc | 431 | ovsdb_log_read(struct ovsdb_log *file, struct json **jsonp) |
f85f8ebb | 432 | { |
421fc8a1 BP |
433 | *jsonp = NULL; |
434 | switch (file->state) { | |
435 | case OVSDB_LOG_READ: | |
436 | break; | |
437 | ||
438 | case OVSDB_LOG_READ_ERROR: | |
439 | case OVSDB_LOG_WRITE_ERROR: | |
440 | case OVSDB_LOG_BROKEN: | |
441 | return ovsdb_error_clone(file->error); | |
442 | ||
443 | case OVSDB_LOG_WRITE: | |
444 | return NULL; | |
445 | } | |
446 | ||
f85f8ebb BP |
447 | uint8_t expected_sha1[SHA1_DIGEST_SIZE]; |
448 | uint8_t actual_sha1[SHA1_DIGEST_SIZE]; | |
449 | struct ovsdb_error *error; | |
f85f8ebb BP |
450 | unsigned long data_length; |
451 | struct json *json; | |
452 | char header[128]; | |
453 | ||
421fc8a1 | 454 | json = NULL; |
f85f8ebb BP |
455 | |
456 | if (!fgets(header, sizeof header, file->stream)) { | |
457 | if (feof(file->stream)) { | |
421fc8a1 | 458 | return NULL; |
f85f8ebb | 459 | } |
02f4f231 | 460 | error = ovsdb_io_error(errno, "%s: read failed", file->display_name); |
f85f8ebb BP |
461 | goto error; |
462 | } | |
fed27759 | 463 | off_t data_offset = file->offset + strlen(header); |
f85f8ebb | 464 | |
fed27759 BP |
465 | const char *magic; |
466 | if (!parse_header(header, &magic, &data_length, expected_sha1) | |
467 | || strcmp(magic, file->magic)) { | |
f85f8ebb BP |
468 | error = ovsdb_syntax_error(NULL, NULL, "%s: parse error at offset " |
469 | "%lld in header line \"%.*s\"", | |
02f4f231 BP |
470 | file->display_name, |
471 | (long long int) file->offset, | |
f85f8ebb BP |
472 | (int) strcspn(header, "\n"), header); |
473 | goto error; | |
474 | } | |
475 | ||
f85f8ebb BP |
476 | error = parse_body(file, data_offset, data_length, actual_sha1, &json); |
477 | if (error) { | |
478 | goto error; | |
479 | } | |
480 | ||
481 | if (memcmp(expected_sha1, actual_sha1, SHA1_DIGEST_SIZE)) { | |
482 | error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at " | |
483 | "offset %lld have SHA-1 hash "SHA1_FMT" " | |
484 | "but should have hash "SHA1_FMT, | |
02f4f231 | 485 | file->display_name, data_length, |
f85f8ebb BP |
486 | (long long int) data_offset, |
487 | SHA1_ARGS(actual_sha1), | |
488 | SHA1_ARGS(expected_sha1)); | |
489 | goto error; | |
490 | } | |
491 | ||
492 | if (json->type == JSON_STRING) { | |
493 | error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at " | |
494 | "offset %lld are not valid JSON (%s)", | |
02f4f231 | 495 | file->display_name, data_length, |
f85f8ebb | 496 | (long long int) data_offset, |
fa37affa | 497 | json->string); |
f85f8ebb BP |
498 | goto error; |
499 | } | |
4407aa48 BP |
500 | if (json->type != JSON_OBJECT) { |
501 | error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at " | |
502 | "offset %lld are not a JSON object", | |
02f4f231 | 503 | file->display_name, data_length, |
4407aa48 BP |
504 | (long long int) data_offset); |
505 | goto error; | |
506 | } | |
f85f8ebb | 507 | |
43675e26 | 508 | file->prev_offset = file->offset; |
f85f8ebb BP |
509 | file->offset = data_offset + data_length; |
510 | *jsonp = json; | |
e3c17733 | 511 | return NULL; |
f85f8ebb BP |
512 | |
513 | error: | |
421fc8a1 BP |
514 | file->state = OVSDB_LOG_READ_ERROR; |
515 | file->error = ovsdb_error_clone(error); | |
f85f8ebb BP |
516 | json_destroy(json); |
517 | return error; | |
518 | } | |
519 | ||
43675e26 BP |
520 | /* Causes the log record read by the previous call to ovsdb_log_read() to be |
521 | * effectively discarded. The next call to ovsdb_log_write() will overwrite | |
522 | * that previously read record. | |
523 | * | |
524 | * Calling this function more than once has no additional effect. | |
525 | * | |
526 | * This function is useful when ovsdb_log_read() successfully reads a record | |
527 | * but that record does not make sense at a higher level (e.g. it specifies an | |
528 | * invalid transaction). */ | |
529 | void | |
530 | ovsdb_log_unread(struct ovsdb_log *file) | |
531 | { | |
421fc8a1 | 532 | ovs_assert(file->state == OVSDB_LOG_READ); |
43675e26 BP |
533 | file->offset = file->prev_offset; |
534 | } | |
535 | ||
02f4f231 BP |
536 | static struct ovsdb_error * |
537 | ovsdb_log_truncate(struct ovsdb_log *file) | |
538 | { | |
539 | file->state = OVSDB_LOG_WRITE; | |
540 | ||
541 | struct ovsdb_error *error = NULL; | |
542 | if (fseeko(file->stream, file->offset, SEEK_SET)) { | |
543 | error = ovsdb_io_error(errno, "%s: cannot seek to offset %lld", | |
544 | file->display_name, | |
545 | (long long int) file->offset); | |
546 | } else if (ftruncate(fileno(file->stream), file->offset)) { | |
547 | error = ovsdb_io_error(errno, "%s: cannot truncate to length %lld", | |
548 | file->display_name, | |
549 | (long long int) file->offset); | |
550 | } | |
551 | return error; | |
552 | } | |
553 | ||
c9167341 BP |
554 | /* Composes a log record for 'json' by filling 'header' with a header line and |
555 | * 'data' with a data line (each ending with a new-line). To write the record | |
556 | * to a file, write 'header' followed by 'data'. | |
557 | * | |
02f4f231 BP |
558 | * 'magic' is the magic to use in the header record, e.g. OVSDB_MAGIC. |
559 | * | |
c9167341 BP |
560 | * The caller must initialize 'header' and 'data' to empty strings. */ |
561 | void | |
562 | ovsdb_log_compose_record(const struct json *json, | |
19b276cb | 563 | const char *magic, struct ds *header, struct ds *data) |
c9167341 BP |
564 | { |
565 | ovs_assert(json->type == JSON_OBJECT || json->type == JSON_ARRAY); | |
566 | ovs_assert(!header->length); | |
567 | ovs_assert(!data->length); | |
568 | ||
569 | /* Compose content. */ | |
570 | json_to_ds(json, 0, data); | |
571 | ds_put_char(data, '\n'); | |
572 | ||
573 | /* Compose header. */ | |
574 | uint8_t sha1[SHA1_DIGEST_SIZE]; | |
575 | sha1_bytes(data->string, data->length, sha1); | |
fed27759 | 576 | ds_put_format(header, "OVSDB %s %"PRIuSIZE" "SHA1_FMT"\n", |
c9167341 BP |
577 | magic, data->length, SHA1_ARGS(sha1)); |
578 | } | |
579 | ||
226600d9 | 580 | /* Writes log record 'json' to 'file'. Returns NULL if successful or an error |
4cc9d1f0 BP |
581 | * (which the caller must eventually destroy) on failure. |
582 | * | |
583 | * If the log contains some records that have not yet been read, then calling | |
584 | * this function truncates them. | |
585 | * | |
586 | * Log writes are atomic. A client may use ovsdb_log_commit() to ensure that | |
587 | * they are durable. | |
588 | */ | |
f85f8ebb | 589 | struct ovsdb_error * |
226600d9 | 590 | ovsdb_log_write(struct ovsdb_log *file, const struct json *json) |
f85f8ebb | 591 | { |
421fc8a1 BP |
592 | switch (file->state) { |
593 | case OVSDB_LOG_WRITE: | |
594 | break; | |
595 | ||
596 | case OVSDB_LOG_READ: | |
597 | case OVSDB_LOG_READ_ERROR: | |
598 | case OVSDB_LOG_WRITE_ERROR: | |
421fc8a1 | 599 | ovsdb_error_destroy(file->error); |
02f4f231 BP |
600 | file->error = ovsdb_log_truncate(file); |
601 | if (file->error) { | |
602 | file->state = OVSDB_LOG_WRITE_ERROR; | |
603 | return ovsdb_error_clone(file->error); | |
f85f8ebb | 604 | } |
02f4f231 | 605 | file->state = OVSDB_LOG_WRITE; |
421fc8a1 BP |
606 | break; |
607 | ||
608 | case OVSDB_LOG_BROKEN: | |
609 | return ovsdb_error_clone(file->error); | |
f85f8ebb BP |
610 | } |
611 | ||
612 | if (json->type != JSON_OBJECT && json->type != JSON_ARRAY) { | |
02f4f231 | 613 | return OVSDB_BUG("bad JSON type"); |
f85f8ebb BP |
614 | } |
615 | ||
c9167341 BP |
616 | struct ds header = DS_EMPTY_INITIALIZER; |
617 | struct ds data = DS_EMPTY_INITIALIZER; | |
19b276cb | 618 | ovsdb_log_compose_record(json, file->magic, &header, &data); |
c9167341 | 619 | size_t total_length = header.length + data.length; |
f85f8ebb BP |
620 | |
621 | /* Write. */ | |
c9167341 BP |
622 | bool ok = (fwrite(header.string, header.length, 1, file->stream) == 1 |
623 | && fwrite(data.string, data.length, 1, file->stream) == 1 | |
624 | && fflush(file->stream) == 0); | |
625 | ds_destroy(&header); | |
626 | ds_destroy(&data); | |
627 | if (!ok) { | |
02f4f231 | 628 | int error = errno; |
f85f8ebb | 629 | |
c7007aa7 BP |
630 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); |
631 | VLOG_WARN_RL(&rl, "%s: write failed (%s)", | |
02f4f231 | 632 | file->name, ovs_strerror(error)); |
c7007aa7 | 633 | |
f85f8ebb BP |
634 | /* Remove any partially written data, ignoring errors since there is |
635 | * nothing further we can do. */ | |
18b9283b | 636 | ignore(ftruncate(fileno(file->stream), file->offset)); |
f85f8ebb | 637 | |
02f4f231 BP |
638 | file->error = ovsdb_io_error(error, "%s: write failed", |
639 | file->display_name); | |
640 | file->state = OVSDB_LOG_WRITE_ERROR; | |
641 | return ovsdb_error_clone(file->error); | |
f85f8ebb BP |
642 | } |
643 | ||
c9167341 | 644 | file->offset += total_length; |
e3c17733 | 645 | return NULL; |
f85f8ebb BP |
646 | } |
647 | ||
1b1d2e6d BP |
648 | struct ovsdb_error * OVS_WARN_UNUSED_RESULT |
649 | ovsdb_log_write_and_free(struct ovsdb_log *log, struct json *json) | |
650 | { | |
651 | struct ovsdb_error *error = ovsdb_log_write(log, json); | |
652 | json_destroy(json); | |
653 | return error; | |
654 | } | |
655 | ||
f70b61d3 BP |
656 | /* Attempts to commit 'file' to disk. Waits for the commit to succeed or fail. |
657 | * Returns NULL if successful, otherwise the error that occurred. */ | |
f85f8ebb | 658 | struct ovsdb_error * |
f70b61d3 | 659 | ovsdb_log_commit_block(struct ovsdb_log *file) |
f85f8ebb | 660 | { |
3738d929 AI |
661 | #if (_POSIX_C_SOURCE >= 199309L || _XOPEN_SOURCE >= 500) |
662 | /* we do not check metadata - mtime, atime, anywhere, so we | |
663 | * do not need to update it every time we sync the log. | |
664 | * if the system supports it, the log update should be | |
665 | * data only | |
666 | */ | |
667 | if (file->stream && fdatasync(fileno(file->stream))) { | |
668 | #else | |
421fc8a1 | 669 | if (file->stream && fsync(fileno(file->stream))) { |
3738d929 | 670 | #endif |
02f4f231 | 671 | return ovsdb_io_error(errno, "%s: fsync failed", file->display_name); |
f85f8ebb | 672 | } |
e3c17733 | 673 | return NULL; |
f85f8ebb | 674 | } |
41709ccc | 675 | |
4cc9d1f0 BP |
676 | /* Sets the current position in 'log' as the "base", that is, the initial size |
677 | * of the log that ovsdb_log_grew_lots() uses to determine whether the log has | |
678 | * grown enough to make compacting worthwhile. */ | |
679 | void | |
680 | ovsdb_log_mark_base(struct ovsdb_log *log) | |
ada496b5 | 681 | { |
4cc9d1f0 BP |
682 | log->base = log->offset; |
683 | } | |
684 | ||
685 | /* Returns true if 'log' has grown enough above the base that it's worthwhile | |
686 | * to compact it, false otherwise. */ | |
687 | bool | |
688 | ovsdb_log_grew_lots(const struct ovsdb_log *log) | |
689 | { | |
1cfdc175 | 690 | return log->offset > 10 * 1024 * 1024 && log->offset / 2 > log->base; |
ada496b5 | 691 | } |
421fc8a1 BP |
692 | \f |
693 | /* Attempts to atomically replace the contents of 'log', on disk, by the 'n' | |
694 | * entries in 'entries'. If successful, returns NULL, otherwise returns an | |
695 | * error (which the caller must eventually free). | |
696 | * | |
697 | * If successful, 'log' will be in write mode at the end of the log. */ | |
698 | struct ovsdb_error * OVS_WARN_UNUSED_RESULT | |
699 | ovsdb_log_replace(struct ovsdb_log *log, struct json **entries, size_t n) | |
700 | { | |
701 | struct ovsdb_error *error; | |
702 | struct ovsdb_log *new; | |
703 | ||
704 | error = ovsdb_log_replace_start(log, &new); | |
705 | if (error) { | |
706 | return error; | |
707 | } | |
708 | ||
709 | for (size_t i = 0; i < n; i++) { | |
710 | error = ovsdb_log_write(new, entries[i]); | |
711 | if (error) { | |
712 | ovsdb_log_replace_abort(new); | |
713 | return error; | |
714 | } | |
715 | } | |
4cc9d1f0 | 716 | ovsdb_log_mark_base(new); |
421fc8a1 BP |
717 | |
718 | return ovsdb_log_replace_commit(log, new); | |
719 | } | |
720 | ||
721 | struct ovsdb_error * OVS_WARN_UNUSED_RESULT | |
722 | ovsdb_log_replace_start(struct ovsdb_log *old, | |
723 | struct ovsdb_log **newp) | |
724 | { | |
725 | /* If old->name is a symlink, then we want the new file to be in the same | |
726 | * directory as the symlink's referent. */ | |
727 | char *deref_name = follow_symlinks(old->name); | |
728 | char *tmp_name = xasprintf("%s.tmp", deref_name); | |
729 | free(deref_name); | |
730 | ||
731 | struct ovsdb_error *error; | |
732 | ||
733 | ovs_assert(old->lockfile); | |
734 | ||
735 | /* Remove temporary file. (It might not exist.) */ | |
736 | if (unlink(tmp_name) < 0 && errno != ENOENT) { | |
737 | error = ovsdb_io_error(errno, "failed to remove %s", tmp_name); | |
738 | free(tmp_name); | |
739 | *newp = NULL; | |
740 | return error; | |
741 | } | |
742 | ||
743 | /* Create temporary file. */ | |
744 | error = ovsdb_log_open(tmp_name, old->magic, OVSDB_LOG_CREATE_EXCL, | |
745 | false, newp); | |
746 | free(tmp_name); | |
747 | return error; | |
748 | } | |
749 | ||
750 | /* Rename 'old' to 'new', replacing 'new' if it exists. Returns NULL if | |
751 | * successful, otherwise an ovsdb_error that the caller must destroy. */ | |
752 | static struct ovsdb_error * OVS_WARN_UNUSED_RESULT | |
753 | ovsdb_rename(const char *old, const char *new) | |
754 | { | |
755 | #ifdef _WIN32 | |
756 | /* Avoid rename() because it fails if the destination exists. */ | |
757 | int error = (MoveFileEx(old, new, MOVEFILE_REPLACE_EXISTING | |
758 | | MOVEFILE_WRITE_THROUGH | MOVEFILE_COPY_ALLOWED) | |
759 | ? 0 : EACCES); | |
760 | #else | |
761 | int error = rename(old, new) ? errno : 0; | |
762 | #endif | |
763 | ||
764 | return (error | |
765 | ? ovsdb_io_error(error, "failed to rename \"%s\" to \"%s\"", | |
766 | old, new) | |
767 | : NULL); | |
768 | } | |
769 | ||
770 | struct ovsdb_error * OVS_WARN_UNUSED_RESULT | |
771 | ovsdb_log_replace_commit(struct ovsdb_log *old, struct ovsdb_log *new) | |
772 | { | |
f70b61d3 | 773 | struct ovsdb_error *error = ovsdb_log_commit_block(new); |
421fc8a1 BP |
774 | if (error) { |
775 | ovsdb_log_replace_abort(new); | |
776 | return error; | |
777 | } | |
778 | ||
779 | /* Replace original file by the temporary file. | |
780 | * | |
781 | * We support two strategies: | |
782 | * | |
783 | * - The preferred strategy is to rename the temporary file over the | |
784 | * original one in-place, then close the original one. This works on | |
785 | * Unix-like systems. It does not work on Windows, which does not | |
786 | * allow open files to be renamed. The approach has the advantage | |
787 | * that, at any point, we can drop back to something that already | |
788 | * works. | |
789 | * | |
790 | * - Alternatively, we can close both files, rename, then open the new | |
791 | * file (which now has the original name). This works on all | |
792 | * systems, but if reopening the file fails then 'old' is broken. | |
793 | * | |
794 | * We make the strategy a variable instead of an #ifdef to make it easier | |
795 | * to test both strategies on Unix-like systems, and to make the code | |
796 | * easier to read. */ | |
797 | if (!rename_open_files) { | |
798 | fclose(old->stream); | |
799 | old->stream = NULL; | |
800 | ||
801 | fclose(new->stream); | |
802 | new->stream = NULL; | |
803 | } | |
804 | ||
805 | /* Rename 'old' to 'new'. We dereference the old name because, if it is a | |
806 | * symlink, we want to replace the referent of the symlink instead of the | |
807 | * symlink itself. */ | |
808 | char *deref_name = follow_symlinks(old->name); | |
809 | error = ovsdb_rename(new->name, deref_name); | |
810 | free(deref_name); | |
811 | ||
812 | if (error) { | |
813 | ovsdb_log_replace_abort(new); | |
814 | return error; | |
815 | } | |
816 | if (rename_open_files) { | |
817 | fsync_parent_dir(old->name); | |
818 | fclose(old->stream); | |
819 | old->stream = new->stream; | |
820 | new->stream = NULL; | |
821 | } else { | |
822 | old->stream = fopen(old->name, "r+b"); | |
823 | if (!old->stream) { | |
824 | old->error = ovsdb_io_error(errno, "%s: could not reopen log", | |
825 | old->name); | |
826 | old->state = OVSDB_LOG_BROKEN; | |
827 | return ovsdb_error_clone(old->error); | |
828 | } | |
829 | ||
830 | if (fseek(old->stream, new->offset, SEEK_SET)) { | |
831 | old->error = ovsdb_io_error(errno, "%s: seek failed", old->name); | |
832 | old->state = OVSDB_LOG_BROKEN; | |
833 | return ovsdb_error_clone(old->error); | |
834 | } | |
835 | } | |
836 | ||
837 | /* Replace 'old' by 'new' in memory. | |
838 | * | |
839 | * 'old' transitions to OVSDB_LOG_WRITE (it was probably in that mode | |
840 | * anyway). */ | |
841 | old->state = OVSDB_LOG_WRITE; | |
842 | ovsdb_error_destroy(old->error); | |
843 | old->error = NULL; | |
844 | /* prev_offset only matters for OVSDB_LOG_READ. */ | |
f70b61d3 BP |
845 | if (old->afsync) { |
846 | uint64_t ticket = afsync_destroy(old->afsync); | |
847 | old->afsync = afsync_create(fileno(old->stream), ticket + 1); | |
848 | } | |
421fc8a1 BP |
849 | old->offset = new->offset; |
850 | /* Keep old->name. */ | |
851 | free(old->magic); | |
852 | old->magic = new->magic; | |
853 | new->magic = NULL; | |
854 | /* Keep old->lockfile. */ | |
4cc9d1f0 BP |
855 | old->base = new->base; |
856 | ||
421fc8a1 BP |
857 | /* Free 'new'. */ |
858 | ovsdb_log_close(new); | |
859 | ||
860 | return NULL; | |
861 | } | |
862 | ||
863 | void | |
864 | ovsdb_log_replace_abort(struct ovsdb_log *new) | |
865 | { | |
866 | if (new) { | |
867 | /* Unlink the new file, but only after we close it (because Windows | |
868 | * does not allow removing an open file). */ | |
869 | char *name = xstrdup(new->name); | |
870 | ovsdb_log_close(new); | |
871 | unlink(name); | |
872 | free(name); | |
873 | } | |
874 | } | |
875 | ||
876 | void | |
877 | ovsdb_log_disable_renaming_open_files(void) | |
878 | { | |
879 | rename_open_files = false; | |
880 | } | |
f70b61d3 BP |
881 | \f |
882 | struct afsync { | |
883 | pthread_t thread; | |
884 | atomic_uint64_t cur, next; | |
885 | struct seq *request, *complete; | |
886 | int fd; | |
887 | }; | |
888 | ||
889 | static void * | |
890 | afsync_thread(void *afsync_) | |
891 | { | |
892 | struct afsync *afsync = afsync_; | |
893 | uint64_t cur = 0; | |
894 | for (;;) { | |
895 | ovsrcu_quiesce_start(); | |
896 | ||
897 | uint64_t request_seq = seq_read(afsync->request); | |
898 | ||
899 | uint64_t next; | |
900 | atomic_read_explicit(&afsync->next, &next, memory_order_acquire); | |
901 | if (next == UINT64_MAX) { | |
902 | break; | |
903 | } | |
904 | ||
905 | if (cur != next && afsync->fd != -1) { | |
906 | int error = fsync(afsync->fd) ? errno : 0; | |
907 | if (!error) { | |
908 | cur = next; | |
909 | atomic_store_explicit(&afsync->cur, cur, memory_order_release); | |
910 | seq_change(afsync->complete); | |
911 | } else { | |
912 | VLOG_WARN("fsync failed (%s)", ovs_strerror(error)); | |
913 | } | |
914 | } | |
915 | ||
916 | seq_wait(afsync->request, request_seq); | |
917 | poll_block(); | |
918 | } | |
919 | return NULL; | |
920 | } | |
921 | ||
922 | static struct afsync * | |
923 | afsync_create(int fd, uint64_t initial_ticket) | |
924 | { | |
925 | struct afsync *afsync = xzalloc(sizeof *afsync); | |
926 | atomic_init(&afsync->cur, initial_ticket); | |
927 | atomic_init(&afsync->next, initial_ticket); | |
928 | afsync->request = seq_create(); | |
929 | afsync->complete = seq_create(); | |
930 | afsync->thread = ovs_thread_create("log_fsync", afsync_thread, afsync); | |
931 | afsync->fd = fd; | |
932 | return afsync; | |
933 | } | |
934 | ||
935 | static uint64_t | |
936 | afsync_destroy(struct afsync *afsync) | |
937 | { | |
938 | if (!afsync) { | |
939 | return 0; | |
940 | } | |
941 | ||
942 | uint64_t next; | |
943 | atomic_read(&afsync->next, &next); | |
944 | atomic_store(&afsync->next, UINT64_MAX); | |
945 | seq_change(afsync->request); | |
946 | xpthread_join(afsync->thread, NULL); | |
947 | ||
948 | seq_destroy(afsync->request); | |
949 | seq_destroy(afsync->complete); | |
950 | ||
951 | free(afsync); | |
952 | ||
953 | return next; | |
954 | } | |
955 | ||
956 | static struct afsync * | |
957 | ovsdb_log_get_afsync(struct ovsdb_log *log) | |
958 | { | |
959 | if (!log->afsync) { | |
960 | log->afsync = afsync_create(log->stream ? fileno(log->stream) : -1, 0); | |
961 | } | |
962 | return log->afsync; | |
963 | } | |
964 | ||
965 | /* Starts committing 'log' to disk. Returns a ticket that can be passed to | |
966 | * ovsdb_log_commit_wait() or compared against the return value of | |
967 | * ovsdb_log_commit_progress() later. */ | |
968 | uint64_t | |
969 | ovsdb_log_commit_start(struct ovsdb_log *log) | |
970 | { | |
971 | struct afsync *afsync = ovsdb_log_get_afsync(log); | |
972 | ||
973 | uint64_t orig; | |
974 | atomic_add_explicit(&afsync->next, 1, &orig, memory_order_acq_rel); | |
975 | ||
976 | seq_change(afsync->request); | |
977 | ||
978 | return orig + 1; | |
979 | } | |
980 | ||
981 | /* Returns a ticket value that represents the current progress of commits to | |
982 | * 'log'. Suppose that some call to ovsdb_log_commit_start() returns X and any | |
983 | * call ovsdb_log_commit_progress() returns Y, for the same 'log'. Then commit | |
984 | * X is complete if and only if X <= Y. */ | |
985 | uint64_t | |
986 | ovsdb_log_commit_progress(struct ovsdb_log *log) | |
987 | { | |
988 | struct afsync *afsync = ovsdb_log_get_afsync(log); | |
989 | uint64_t cur; | |
990 | atomic_read_explicit(&afsync->cur, &cur, memory_order_acquire); | |
991 | return cur; | |
992 | } | |
993 | ||
994 | /* Causes poll_block() to wake up if and when ovsdb_log_commit_progress(log) | |
995 | * would return at least 'goal'. */ | |
996 | void | |
997 | ovsdb_log_commit_wait(struct ovsdb_log *log, uint64_t goal) | |
998 | { | |
999 | struct afsync *afsync = ovsdb_log_get_afsync(log); | |
1000 | uint64_t complete = seq_read(afsync->complete); | |
1001 | uint64_t cur = ovsdb_log_commit_progress(log); | |
1002 | if (cur < goal) { | |
1003 | seq_wait(afsync->complete, complete); | |
1004 | } else { | |
1005 | poll_immediate_wake(); | |
1006 | } | |
1007 | } |