]> git.proxmox.com Git - mirror_ovs.git/blob - ovsdb/log.c
ovn-nbctl: Fix the ovn-nbctl test "LBs - daemon" which fails during rpm build
[mirror_ovs.git] / ovsdb / log.c
1 /* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2017 Nicira, Inc.
2 *
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include <config.h>
17
18 #include "log.h"
19
20 #include <errno.h>
21 #include <fcntl.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <sys/stat.h>
25 #include <unistd.h>
26
27 #include "lockfile.h"
28 #include "openvswitch/dynamic-string.h"
29 #include "openvswitch/json.h"
30 #include "openvswitch/vlog.h"
31 #include "ovs-atomic.h"
32 #include "ovs-rcu.h"
33 #include "ovs-thread.h"
34 #include "ovsdb-error.h"
35 #include "ovsdb.h"
36 #include "openvswitch/poll-loop.h"
37 #include "seq.h"
38 #include "sha1.h"
39 #include "socket-util.h"
40 #include "transaction.h"
41 #include "util.h"
42
43 VLOG_DEFINE_THIS_MODULE(ovsdb_log);
44
/* State in a log's state machine.
 *
 * OVSDB_LOG_READ is the initial state for a newly opened log.  Log records
 * may be read in this state only.  Reaching end of file does not cause a
 * state transition.  A read error transitions to OVSDB_LOG_READ_ERROR.
 *
 * OVSDB_LOG_READ_ERROR prevents further reads from succeeding; they will
 * report the same error as before.  A log write transitions away to
 * OVSDB_LOG_WRITE or OVSDB_LOG_WRITE_ERROR.
 *
 * OVSDB_LOG_WRITE is the state following a call to ovsdb_log_write(), when
 * all goes well.  Any state other than OVSDB_LOG_BROKEN may transition to
 * this state.  A write error transitions to OVSDB_LOG_WRITE_ERROR.  In this
 * state ovsdb_log_read() behaves as if it had reached end of file.
 *
 * OVSDB_LOG_WRITE_ERROR is the state following a write error.  Further
 * writes retry and might transition back to OVSDB_LOG_WRITE.
 *
 * OVSDB_LOG_BROKEN is the state following a call to ovsdb_log_replace() or
 * ovsdb_log_replace_commit(), if it fails in a spectacular enough way that no
 * further reads or writes can succeed.  This is a terminal state.
 */
enum ovsdb_log_state {
    OVSDB_LOG_READ,             /* Ready to read. */
    OVSDB_LOG_READ_ERROR,       /* Read failed, see 'error' for details. */
    OVSDB_LOG_WRITE,            /* Ready to write. */
    OVSDB_LOG_WRITE_ERROR,      /* Write failed, see 'error' for details. */
    OVSDB_LOG_BROKEN,           /* Disk on fire, see 'error' for details. */
};
73
/* An open log file together with the bookkeeping needed to read records from
 * it and to append records to it. */
struct ovsdb_log {
    enum ovsdb_log_state state; /* Position in the state machine above. */
    struct ovsdb_error *error;  /* Saved error for the error states. */

    off_t prev_offset;          /* 'offset' before the last successful
                                 * ovsdb_log_read(); restored by
                                 * ovsdb_log_unread(). */
    off_t offset;               /* Offset just past the last record that was
                                 * successfully read or written. */
    char *name;                 /* Absolute name of file. */
    char *display_name;         /* For use in log messages, etc. */
    char *magic;                /* Magic string used in record headers. */
    struct lockfile *lockfile;  /* Lock on the file, NULL if not locked. */
    FILE *stream;               /* Underlying stream.  NULL while
                                 * ovsdb_log_replace_commit() has it closed,
                                 * and afterward if reopening fails. */
    off_t base;                 /* 'offset' as of the most recent
                                 * ovsdb_log_mark_base(). */
    struct afsync *afsync;      /* Background fsync state, created on demand
                                 * by ovsdb_log_get_afsync(); NULL before. */
};
88
/* True if the OS lets a file be renamed while it is open, false otherwise.
 *
 * (This is a variable rather than a compile-time constant so that both
 * replacement strategies can be exercised on Unix-like systems; see
 * ovsdb_log_disable_renaming_open_files().) */
static bool rename_open_files =
#ifdef _WIN32
    false;
#else
    true;
#endif
98
/* Forward declarations for helpers that are defined later in this file. */

/* Record header parsing and magic matching. */
static bool parse_header(char *header, const char **magicp,
                         unsigned long int *length,
                         uint8_t sha1[SHA1_DIGEST_SIZE]);
static bool is_magic_ok(const char *needle, const char *haystack);

/* Background fsync helper. */
static struct afsync *afsync_create(int fd, uint64_t initial_ticket);
static uint64_t afsync_destroy(struct afsync *);
106
/* Attempts to open 'name' with the specified 'open_mode'.  On success, stores
 * the new log into '*filep' and returns NULL; otherwise returns an error and
 * stores NULL into '*filep'.
 *
 * 'magic' is a short text string put at the beginning of every record and used
 * to distinguish one kind of log file from another.  For a conventional OVSDB
 * log file, use the OVSDB_MAGIC macro.  To accept more than one magic string,
 * separate them with "|", e.g. "MAGIC 1|MAGIC 2".  Only a single magic string
 * is allowed when 'open_mode' may create the file.
 *
 * Whether the file will be locked using lockfile_lock() depends on 'locking':
 * use true to lock it, false not to lock it, or -1 to lock it only if
 * 'open_mode' is a mode that allows writing.
 *
 * A log consists of a series of records.  After opening or creating a log with
 * this function, the client may use ovsdb_log_read() to read any existing
 * records, one by one.  The client may also use ovsdb_log_write() to write new
 * records (if some records have not yet been read at this point, then the
 * first write truncates them).
 */
struct ovsdb_error *
ovsdb_log_open(const char *name, const char *magic,
               enum ovsdb_log_open_mode open_mode,
               int locking, struct ovsdb_log **filep)
{
    struct lockfile *lockfile;
    struct ovsdb_error *error;
    struct stat s;
    FILE *stream;
    int flags;
    int fd;

    /* If we can create a new file, we need to know what kind of magic to
     * use, so there must be only one kind. */
    if (open_mode == OVSDB_LOG_CREATE_EXCL || open_mode == OVSDB_LOG_CREATE) {
        ovs_assert(!strchr(magic, '|'));
    }

    *filep = NULL;

    /* Get the absolute name of the file because we might need to access it by
     * name again later after the process has changed directory (e.g. because
     * daemonize() chdirs to "/").
     *
     * We save the user-provided name of the file for use in log messages, to
     * reduce user confusion. */
    char *abs_name = abs_file_name(NULL, name);
    if (!abs_name) {
        error = ovsdb_io_error(0, "could not determine current "
                               "working directory");
        goto error;
    }

    /* Resolve the tri-state 'locking' into a plain yes/no. */
    ovs_assert(locking == -1 || locking == false || locking == true);
    if (locking < 0) {
        locking = open_mode != OVSDB_LOG_READ_ONLY;
    }
    if (locking) {
        int retval = lockfile_lock(name, &lockfile);
        if (retval) {
            error = ovsdb_io_error(retval, "%s: failed to lock lockfile",
                                   name);
            goto error;
        }
    } else {
        lockfile = NULL;
    }

    /* Translate 'open_mode' into open() flags. */
    switch (open_mode) {
    case OVSDB_LOG_READ_ONLY:
        flags = O_RDONLY;
        break;

    case OVSDB_LOG_READ_WRITE:
        flags = O_RDWR;
        break;

    case OVSDB_LOG_CREATE_EXCL:
#ifndef _WIN32
        if (stat(name, &s) == -1 && errno == ENOENT
            && lstat(name, &s) == 0 && S_ISLNK(s.st_mode)) {
            /* 'name' is a dangling symlink.  We want to create the file that
             * the symlink points to, but POSIX says that open() with O_EXCL
             * must fail with EEXIST if the named file is a symlink.  So, we
             * have to leave off O_EXCL and accept the race. */
            flags = O_RDWR | O_CREAT;
        } else {
            flags = O_RDWR | O_CREAT | O_EXCL;
        }
#else
        flags = O_RDWR | O_CREAT | O_EXCL;
#endif
        break;

    case OVSDB_LOG_CREATE:
        flags = O_RDWR | O_CREAT;
        break;

    default:
        OVS_NOT_REACHED();
    }
#ifdef _WIN32
    flags = flags | O_BINARY;
#endif
    /* Special case for /dev/stdin to make it work even if the operating system
     * doesn't support it under that name. */
    if (!strcmp(name, "/dev/stdin") && open_mode == OVSDB_LOG_READ_ONLY) {
        fd = dup(STDIN_FILENO);
    } else {
        fd = open(name, flags, 0666);
    }
    if (fd < 0) {
        const char *op = (open_mode == OVSDB_LOG_CREATE_EXCL ? "create"
            : open_mode == OVSDB_LOG_CREATE ? "create or open"
            : "open");
        error = ovsdb_io_error(errno, "%s: %s failed", name, op);
        goto error_unlock;
    }

    stream = fdopen(fd, open_mode == OVSDB_LOG_READ_ONLY ? "rb" : "w+b");
    if (!stream) {
        error = ovsdb_io_error(errno, "%s: fdopen failed", name);
        /* 'stream' does not own 'fd' yet, so close it directly. */
        close(fd);
        goto error_unlock;
    }

    /* Read the magic from the first log record. */
    char header[128];
    const char *actual_magic;
    if (!fgets(header, sizeof header, stream)) {
        if (ferror(stream)) {
            error = ovsdb_io_error(errno, "%s: read error", name);
            goto error_fclose;
        }

        /* We need to be able to report what kind of file this is but we can't
         * if it's empty and we accept more than one. */
        if (strchr(magic, '|')) {
            error = ovsdb_error(NULL, "%s: cannot identify file type", name);
            goto error_fclose;
        }
        actual_magic = magic;

        /* It's an empty file and therefore probably a new file, so fsync()
         * its parent directory to ensure that its directory entry is
         * committed to disk. */
        fsync_parent_dir(name);
    } else {
        unsigned long int length;
        uint8_t sha1[SHA1_DIGEST_SIZE];
        /* Only the magic is of interest here; 'length' and 'sha1' are parsed
         * to validate the header and then discarded.  On success
         * 'actual_magic' points into 'header'. */
        if (!parse_header(header, &actual_magic, &length, sha1)) {
            error = ovsdb_error(NULL, "%s: unexpected file format", name);
            goto error_fclose;
        } else if (!is_magic_ok(actual_magic, magic)) {
            error = ovsdb_error(NULL, "%s: cannot identify file type", name);
            goto error_fclose;
        }
    }

    /* Rewind so that the first ovsdb_log_read() sees the first record. */
    if (fseek(stream, 0, SEEK_SET)) {
        error = ovsdb_io_error(errno, "%s: seek failed", name);
        goto error_fclose;
    }

    struct ovsdb_log *file = xmalloc(sizeof *file);
    file->state = OVSDB_LOG_READ;
    file->error = NULL;
    file->name = abs_name;      /* Ownership of 'abs_name' moves to 'file'. */
    file->display_name = xstrdup(name);
    file->magic = xstrdup(actual_magic);
    file->lockfile = lockfile;
    file->stream = stream;
    file->prev_offset = 0;
    file->offset = 0;
    file->base = 0;
    file->afsync = NULL;
    *filep = file;
    return NULL;

    /* Cleanup labels release resources in reverse order of acquisition. */
error_fclose:
    fclose(stream);
error_unlock:
    lockfile_unlock(lockfile);
error:
    free(abs_name);
    return error;
}
293
/* Reports whether 'needle' exactly matches one of the words in 'haystack',
 * where 'haystack' is a list of words separated by "|".
 *
 * A 'needle' that itself contains "|" never matches. */
static bool
is_magic_ok(const char *needle, const char *haystack)
{
    if (strchr(needle, '|') != NULL) {
        return false;
    }

    const size_t needle_len = strlen(needle);
    const char *word = haystack;
    while (word != NULL) {
        /* A match requires the word to start with 'needle' and to end right
         * there, either at a separator or at the end of the list. */
        if (strncmp(needle, word, needle_len) == 0) {
            const char terminator = word[needle_len];
            if (terminator == '|' || terminator == '\0') {
                return true;
            }
        }

        /* Advance to the word following the next separator, if any. */
        const char *separator = strchr(word, '|');
        word = separator != NULL ? separator + 1 : NULL;
    }
    return false;
}
315
316 void
317 ovsdb_log_close(struct ovsdb_log *file)
318 {
319 if (file) {
320 ovsdb_error_destroy(file->error);
321 afsync_destroy(file->afsync);
322 free(file->name);
323 free(file->display_name);
324 free(file->magic);
325 if (file->stream) {
326 fclose(file->stream);
327 }
328 lockfile_unlock(file->lockfile);
329 free(file);
330 }
331 }
332
333 const char *
334 ovsdb_log_get_magic(const struct ovsdb_log *log)
335 {
336 return log->magic;
337 }
338
339 /* Attempts to parse 'header' as a header line for an OVSDB log record (as
340 * described in ovsdb(5)). Stores a pointer to the magic string in '*magicp',
341 * the length in *length, and the parsed sha1 value in sha1[].
342 *
343 * Modifies 'header' and points '*magicp' inside it.
344 *
345 * Returns true if successful, false on failure. */
346 static bool
347 parse_header(char *header, const char **magicp,
348 unsigned long int *length, uint8_t sha1[SHA1_DIGEST_SIZE])
349 {
350 /* 'header' must consist of "OVSDB "... */
351 const char lead[] = "OVSDB ";
352 if (strncmp(lead, header, strlen(lead))) {
353 return false;
354 }
355
356 /* ...followed by a magic string... */
357 char *magic = header + strlen(lead);
358 size_t magic_len = strcspn(magic, " ");
359 if (magic[magic_len] != ' ') {
360 return false;
361 }
362 magic[magic_len] = '\0';
363 *magicp = magic;
364
365 /* ...followed by a length in bytes... */
366 char *p;
367 *length = strtoul(magic + magic_len + 1, &p, 10);
368 if (!*length || *length == ULONG_MAX || *p != ' ') {
369 return false;
370 }
371 p++;
372
373 /* ...followed by a SHA-1 hash... */
374 if (!sha1_from_hex(sha1, p)) {
375 return false;
376 }
377 p += SHA1_HEX_DIGEST_LEN;
378
379 /* ...and ended by a new-line. */
380 if (*p != '\n') {
381 return false;
382 }
383
384 return true;
385 }
386
387 static struct ovsdb_error *
388 parse_body(struct ovsdb_log *file, off_t offset, unsigned long int length,
389 uint8_t sha1[SHA1_DIGEST_SIZE], struct json **jsonp)
390 {
391 struct json_parser *parser;
392 struct sha1_ctx ctx;
393
394 sha1_init(&ctx);
395 parser = json_parser_create(JSPF_TRAILER);
396
397 while (length > 0) {
398 char input[BUFSIZ];
399 int chunk;
400
401 chunk = MIN(length, sizeof input);
402 if (fread(input, 1, chunk, file->stream) != chunk) {
403 json_parser_abort(parser);
404 return ovsdb_io_error(ferror(file->stream) ? errno : EOF,
405 "%s: error reading %lu bytes "
406 "starting at offset %lld",
407 file->display_name, length,
408 (long long int) offset);
409 }
410 sha1_update(&ctx, input, chunk);
411 json_parser_feed(parser, input, chunk);
412 length -= chunk;
413 }
414
415 sha1_final(&ctx, sha1);
416 *jsonp = json_parser_finish(parser);
417 return NULL;
418 }
419
420 /* Attempts to read a log record from 'file'.
421 *
422 * If successful, returns NULL and stores in '*jsonp' the JSON object that the
423 * record contains. The caller owns the data and must eventually free it (with
424 * json_destroy()).
425 *
426 * If a read error occurs, returns the error and stores NULL in '*jsonp'.
427 *
428 * If the read reaches end of file, returns NULL and stores NULL in
429 * '*jsonp'. */
430 struct ovsdb_error *
431 ovsdb_log_read(struct ovsdb_log *file, struct json **jsonp)
432 {
433 *jsonp = NULL;
434 switch (file->state) {
435 case OVSDB_LOG_READ:
436 break;
437
438 case OVSDB_LOG_READ_ERROR:
439 case OVSDB_LOG_WRITE_ERROR:
440 case OVSDB_LOG_BROKEN:
441 return ovsdb_error_clone(file->error);
442
443 case OVSDB_LOG_WRITE:
444 return NULL;
445 }
446
447 uint8_t expected_sha1[SHA1_DIGEST_SIZE];
448 uint8_t actual_sha1[SHA1_DIGEST_SIZE];
449 struct ovsdb_error *error;
450 unsigned long data_length;
451 struct json *json;
452 char header[128];
453
454 json = NULL;
455
456 if (!fgets(header, sizeof header, file->stream)) {
457 if (feof(file->stream)) {
458 return NULL;
459 }
460 error = ovsdb_io_error(errno, "%s: read failed", file->display_name);
461 goto error;
462 }
463 off_t data_offset = file->offset + strlen(header);
464
465 const char *magic;
466 if (!parse_header(header, &magic, &data_length, expected_sha1)
467 || strcmp(magic, file->magic)) {
468 error = ovsdb_syntax_error(NULL, NULL, "%s: parse error at offset "
469 "%lld in header line \"%.*s\"",
470 file->display_name,
471 (long long int) file->offset,
472 (int) strcspn(header, "\n"), header);
473 goto error;
474 }
475
476 error = parse_body(file, data_offset, data_length, actual_sha1, &json);
477 if (error) {
478 goto error;
479 }
480
481 if (memcmp(expected_sha1, actual_sha1, SHA1_DIGEST_SIZE)) {
482 error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at "
483 "offset %lld have SHA-1 hash "SHA1_FMT" "
484 "but should have hash "SHA1_FMT,
485 file->display_name, data_length,
486 (long long int) data_offset,
487 SHA1_ARGS(actual_sha1),
488 SHA1_ARGS(expected_sha1));
489 goto error;
490 }
491
492 if (json->type == JSON_STRING) {
493 error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at "
494 "offset %lld are not valid JSON (%s)",
495 file->display_name, data_length,
496 (long long int) data_offset,
497 json->string);
498 goto error;
499 }
500 if (json->type != JSON_OBJECT) {
501 error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at "
502 "offset %lld are not a JSON object",
503 file->display_name, data_length,
504 (long long int) data_offset);
505 goto error;
506 }
507
508 file->prev_offset = file->offset;
509 file->offset = data_offset + data_length;
510 *jsonp = json;
511 return NULL;
512
513 error:
514 file->state = OVSDB_LOG_READ_ERROR;
515 file->error = ovsdb_error_clone(error);
516 json_destroy(json);
517 return error;
518 }
519
520 /* Causes the log record read by the previous call to ovsdb_log_read() to be
521 * effectively discarded. The next call to ovsdb_log_write() will overwrite
522 * that previously read record.
523 *
524 * Calling this function more than once has no additional effect.
525 *
526 * This function is useful when ovsdb_log_read() successfully reads a record
527 * but that record does not make sense at a higher level (e.g. it specifies an
528 * invalid transaction). */
529 void
530 ovsdb_log_unread(struct ovsdb_log *file)
531 {
532 ovs_assert(file->state == OVSDB_LOG_READ);
533 file->offset = file->prev_offset;
534 }
535
536 static struct ovsdb_error *
537 ovsdb_log_truncate(struct ovsdb_log *file)
538 {
539 file->state = OVSDB_LOG_WRITE;
540
541 struct ovsdb_error *error = NULL;
542 if (fseeko(file->stream, file->offset, SEEK_SET)) {
543 error = ovsdb_io_error(errno, "%s: cannot seek to offset %lld",
544 file->display_name,
545 (long long int) file->offset);
546 } else if (ftruncate(fileno(file->stream), file->offset)) {
547 error = ovsdb_io_error(errno, "%s: cannot truncate to length %lld",
548 file->display_name,
549 (long long int) file->offset);
550 }
551 return error;
552 }
553
554 /* Composes a log record for 'json' by filling 'header' with a header line and
555 * 'data' with a data line (each ending with a new-line). To write the record
556 * to a file, write 'header' followed by 'data'.
557 *
558 * 'magic' is the magic to use in the header record, e.g. OVSDB_MAGIC.
559 *
560 * The caller must initialize 'header' and 'data' to empty strings. */
561 void
562 ovsdb_log_compose_record(const struct json *json,
563 const char *magic, struct ds *header, struct ds *data)
564 {
565 ovs_assert(json->type == JSON_OBJECT || json->type == JSON_ARRAY);
566 ovs_assert(!header->length);
567 ovs_assert(!data->length);
568
569 /* Compose content. */
570 json_to_ds(json, 0, data);
571 ds_put_char(data, '\n');
572
573 /* Compose header. */
574 uint8_t sha1[SHA1_DIGEST_SIZE];
575 sha1_bytes(data->string, data->length, sha1);
576 ds_put_format(header, "OVSDB %s %"PRIuSIZE" "SHA1_FMT"\n",
577 magic, data->length, SHA1_ARGS(sha1));
578 }
579
580 /* Writes log record 'json' to 'file'. Returns NULL if successful or an error
581 * (which the caller must eventually destroy) on failure.
582 *
583 * If the log contains some records that have not yet been read, then calling
584 * this function truncates them.
585 *
586 * Log writes are atomic. A client may use ovsdb_log_commit() to ensure that
587 * they are durable.
588 */
589 struct ovsdb_error *
590 ovsdb_log_write(struct ovsdb_log *file, const struct json *json)
591 {
592 switch (file->state) {
593 case OVSDB_LOG_WRITE:
594 break;
595
596 case OVSDB_LOG_READ:
597 case OVSDB_LOG_READ_ERROR:
598 case OVSDB_LOG_WRITE_ERROR:
599 ovsdb_error_destroy(file->error);
600 file->error = ovsdb_log_truncate(file);
601 if (file->error) {
602 file->state = OVSDB_LOG_WRITE_ERROR;
603 return ovsdb_error_clone(file->error);
604 }
605 file->state = OVSDB_LOG_WRITE;
606 break;
607
608 case OVSDB_LOG_BROKEN:
609 return ovsdb_error_clone(file->error);
610 }
611
612 if (json->type != JSON_OBJECT && json->type != JSON_ARRAY) {
613 return OVSDB_BUG("bad JSON type");
614 }
615
616 struct ds header = DS_EMPTY_INITIALIZER;
617 struct ds data = DS_EMPTY_INITIALIZER;
618 ovsdb_log_compose_record(json, file->magic, &header, &data);
619 size_t total_length = header.length + data.length;
620
621 /* Write. */
622 bool ok = (fwrite(header.string, header.length, 1, file->stream) == 1
623 && fwrite(data.string, data.length, 1, file->stream) == 1
624 && fflush(file->stream) == 0);
625 ds_destroy(&header);
626 ds_destroy(&data);
627 if (!ok) {
628 int error = errno;
629
630 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
631 VLOG_WARN_RL(&rl, "%s: write failed (%s)",
632 file->name, ovs_strerror(error));
633
634 /* Remove any partially written data, ignoring errors since there is
635 * nothing further we can do. */
636 ignore(ftruncate(fileno(file->stream), file->offset));
637
638 file->error = ovsdb_io_error(error, "%s: write failed",
639 file->display_name);
640 file->state = OVSDB_LOG_WRITE_ERROR;
641 return ovsdb_error_clone(file->error);
642 }
643
644 file->offset += total_length;
645 return NULL;
646 }
647
648 struct ovsdb_error * OVS_WARN_UNUSED_RESULT
649 ovsdb_log_write_and_free(struct ovsdb_log *log, struct json *json)
650 {
651 struct ovsdb_error *error = ovsdb_log_write(log, json);
652 json_destroy(json);
653 return error;
654 }
655
656 /* Attempts to commit 'file' to disk. Waits for the commit to succeed or fail.
657 * Returns NULL if successful, otherwise the error that occurred. */
658 struct ovsdb_error *
659 ovsdb_log_commit_block(struct ovsdb_log *file)
660 {
661 if (file->stream && fsync(fileno(file->stream))) {
662 return ovsdb_io_error(errno, "%s: fsync failed", file->display_name);
663 }
664 return NULL;
665 }
666
667 /* Sets the current position in 'log' as the "base", that is, the initial size
668 * of the log that ovsdb_log_grew_lots() uses to determine whether the log has
669 * grown enough to make compacting worthwhile. */
670 void
671 ovsdb_log_mark_base(struct ovsdb_log *log)
672 {
673 log->base = log->offset;
674 }
675
676 /* Returns true if 'log' has grown enough above the base that it's worthwhile
677 * to compact it, false otherwise. */
678 bool
679 ovsdb_log_grew_lots(const struct ovsdb_log *log)
680 {
681 return log->offset > 10 * 1024 * 1024 && log->offset / 2 > log->base;
682 }
683 \f
684 /* Attempts to atomically replace the contents of 'log', on disk, by the 'n'
685 * entries in 'entries'. If successful, returns NULL, otherwise returns an
686 * error (which the caller must eventually free).
687 *
688 * If successful, 'log' will be in write mode at the end of the log. */
689 struct ovsdb_error * OVS_WARN_UNUSED_RESULT
690 ovsdb_log_replace(struct ovsdb_log *log, struct json **entries, size_t n)
691 {
692 struct ovsdb_error *error;
693 struct ovsdb_log *new;
694
695 error = ovsdb_log_replace_start(log, &new);
696 if (error) {
697 return error;
698 }
699
700 for (size_t i = 0; i < n; i++) {
701 error = ovsdb_log_write(new, entries[i]);
702 if (error) {
703 ovsdb_log_replace_abort(new);
704 return error;
705 }
706 }
707 ovsdb_log_mark_base(new);
708
709 return ovsdb_log_replace_commit(log, new);
710 }
711
712 struct ovsdb_error * OVS_WARN_UNUSED_RESULT
713 ovsdb_log_replace_start(struct ovsdb_log *old,
714 struct ovsdb_log **newp)
715 {
716 /* If old->name is a symlink, then we want the new file to be in the same
717 * directory as the symlink's referent. */
718 char *deref_name = follow_symlinks(old->name);
719 char *tmp_name = xasprintf("%s.tmp", deref_name);
720 free(deref_name);
721
722 struct ovsdb_error *error;
723
724 ovs_assert(old->lockfile);
725
726 /* Remove temporary file. (It might not exist.) */
727 if (unlink(tmp_name) < 0 && errno != ENOENT) {
728 error = ovsdb_io_error(errno, "failed to remove %s", tmp_name);
729 free(tmp_name);
730 *newp = NULL;
731 return error;
732 }
733
734 /* Create temporary file. */
735 error = ovsdb_log_open(tmp_name, old->magic, OVSDB_LOG_CREATE_EXCL,
736 false, newp);
737 free(tmp_name);
738 return error;
739 }
740
741 /* Rename 'old' to 'new', replacing 'new' if it exists. Returns NULL if
742 * successful, otherwise an ovsdb_error that the caller must destroy. */
743 static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
744 ovsdb_rename(const char *old, const char *new)
745 {
746 #ifdef _WIN32
747 /* Avoid rename() because it fails if the destination exists. */
748 int error = (MoveFileEx(old, new, MOVEFILE_REPLACE_EXISTING
749 | MOVEFILE_WRITE_THROUGH | MOVEFILE_COPY_ALLOWED)
750 ? 0 : EACCES);
751 #else
752 int error = rename(old, new) ? errno : 0;
753 #endif
754
755 return (error
756 ? ovsdb_io_error(error, "failed to rename \"%s\" to \"%s\"",
757 old, new)
758 : NULL);
759 }
760
761 struct ovsdb_error * OVS_WARN_UNUSED_RESULT
762 ovsdb_log_replace_commit(struct ovsdb_log *old, struct ovsdb_log *new)
763 {
764 struct ovsdb_error *error = ovsdb_log_commit_block(new);
765 if (error) {
766 ovsdb_log_replace_abort(new);
767 return error;
768 }
769
770 /* Replace original file by the temporary file.
771 *
772 * We support two strategies:
773 *
774 * - The preferred strategy is to rename the temporary file over the
775 * original one in-place, then close the original one. This works on
776 * Unix-like systems. It does not work on Windows, which does not
777 * allow open files to be renamed. The approach has the advantage
778 * that, at any point, we can drop back to something that already
779 * works.
780 *
781 * - Alternatively, we can close both files, rename, then open the new
782 * file (which now has the original name). This works on all
783 * systems, but if reopening the file fails then 'old' is broken.
784 *
785 * We make the strategy a variable instead of an #ifdef to make it easier
786 * to test both strategies on Unix-like systems, and to make the code
787 * easier to read. */
788 if (!rename_open_files) {
789 fclose(old->stream);
790 old->stream = NULL;
791
792 fclose(new->stream);
793 new->stream = NULL;
794 }
795
796 /* Rename 'old' to 'new'. We dereference the old name because, if it is a
797 * symlink, we want to replace the referent of the symlink instead of the
798 * symlink itself. */
799 char *deref_name = follow_symlinks(old->name);
800 error = ovsdb_rename(new->name, deref_name);
801 free(deref_name);
802
803 if (error) {
804 ovsdb_log_replace_abort(new);
805 return error;
806 }
807 if (rename_open_files) {
808 fsync_parent_dir(old->name);
809 fclose(old->stream);
810 old->stream = new->stream;
811 new->stream = NULL;
812 } else {
813 old->stream = fopen(old->name, "r+b");
814 if (!old->stream) {
815 old->error = ovsdb_io_error(errno, "%s: could not reopen log",
816 old->name);
817 old->state = OVSDB_LOG_BROKEN;
818 return ovsdb_error_clone(old->error);
819 }
820
821 if (fseek(old->stream, new->offset, SEEK_SET)) {
822 old->error = ovsdb_io_error(errno, "%s: seek failed", old->name);
823 old->state = OVSDB_LOG_BROKEN;
824 return ovsdb_error_clone(old->error);
825 }
826 }
827
828 /* Replace 'old' by 'new' in memory.
829 *
830 * 'old' transitions to OVSDB_LOG_WRITE (it was probably in that mode
831 * anyway). */
832 old->state = OVSDB_LOG_WRITE;
833 ovsdb_error_destroy(old->error);
834 old->error = NULL;
835 /* prev_offset only matters for OVSDB_LOG_READ. */
836 if (old->afsync) {
837 uint64_t ticket = afsync_destroy(old->afsync);
838 old->afsync = afsync_create(fileno(old->stream), ticket + 1);
839 }
840 old->offset = new->offset;
841 /* Keep old->name. */
842 free(old->magic);
843 old->magic = new->magic;
844 new->magic = NULL;
845 /* Keep old->lockfile. */
846 old->base = new->base;
847
848 /* Free 'new'. */
849 ovsdb_log_close(new);
850
851 return NULL;
852 }
853
854 void
855 ovsdb_log_replace_abort(struct ovsdb_log *new)
856 {
857 if (new) {
858 /* Unlink the new file, but only after we close it (because Windows
859 * does not allow removing an open file). */
860 char *name = xstrdup(new->name);
861 ovsdb_log_close(new);
862 unlink(name);
863 free(name);
864 }
865 }
866
/* Makes ovsdb_log_replace_commit() use the strategy for systems that cannot
 * rename open files (close both files, rename, reopen), regardless of the
 * platform default.  There is no function here to turn it back on. */
void
ovsdb_log_disable_renaming_open_files(void)
{
    rename_open_files = false;
}
872 \f
/* State shared between a log and the background thread that calls fsync() on
 * its behalf. */
struct afsync {
    pthread_t thread;               /* Thread running afsync_thread(). */
    atomic_uint64_t cur, next;      /* 'cur': last ticket whose fsync()
                                     * completed.  'next': most recently
                                     * issued ticket, or UINT64_MAX to tell
                                     * the thread to exit. */
    struct seq *request, *complete; /* 'request' changes when 'next' changes;
                                     * 'complete' changes when 'cur'
                                     * advances. */
    int fd;                         /* File descriptor to fsync(), or -1 for
                                     * none. */
};
879
/* Body of the thread started by afsync_create().  'afsync_' is the
 * 'struct afsync' to serve.
 *
 * Loops until 'next' becomes UINT64_MAX.  On each pass, if 'next' differs
 * from the last ticket this thread completed and there is a file descriptor,
 * it calls fsync() and, on success, publishes the ticket in 'cur' and changes
 * 'complete'.  Always returns NULL. */
static void *
afsync_thread(void *afsync_)
{
    struct afsync *afsync = afsync_;
    /* Last ticket completed by this thread.  Starts at 0 rather than at the
     * initial ticket passed to afsync_create(). */
    uint64_t cur = 0;
    for (;;) {
        /* NOTE(review): presumably keeps this thread from holding up RCU
         * while it blocks in fsync() or poll_block() -- confirm against
         * ovs-rcu.h. */
        ovsrcu_quiesce_start();

        /* Sample 'request' before reading 'next', so that a request made
         * after this point is seen by the seq_wait() at the bottom. */
        uint64_t request_seq = seq_read(afsync->request);

        uint64_t next;
        atomic_read_explicit(&afsync->next, &next, memory_order_acquire);
        if (next == UINT64_MAX) {
            /* afsync_destroy() asked us to exit. */
            break;
        }

        if (cur != next && afsync->fd != -1) {
            int error = fsync(afsync->fd) ? errno : 0;
            if (!error) {
                cur = next;
                atomic_store_explicit(&afsync->cur, cur, memory_order_release);
                seq_change(afsync->complete);
            } else {
                /* 'cur' is left alone, so the fsync() is attempted again the
                 * next time this thread wakes up. */
                VLOG_WARN("fsync failed (%s)", ovs_strerror(error));
            }
        }

        /* Sleep until 'request' changes from the value sampled above. */
        seq_wait(afsync->request, request_seq);
        poll_block();
    }
    return NULL;
}
912
913 static struct afsync *
914 afsync_create(int fd, uint64_t initial_ticket)
915 {
916 struct afsync *afsync = xzalloc(sizeof *afsync);
917 atomic_init(&afsync->cur, initial_ticket);
918 atomic_init(&afsync->next, initial_ticket);
919 afsync->request = seq_create();
920 afsync->complete = seq_create();
921 afsync->thread = ovs_thread_create("log_fsync", afsync_thread, afsync);
922 afsync->fd = fd;
923 return afsync;
924 }
925
926 static uint64_t
927 afsync_destroy(struct afsync *afsync)
928 {
929 if (!afsync) {
930 return 0;
931 }
932
933 uint64_t next;
934 atomic_read(&afsync->next, &next);
935 atomic_store(&afsync->next, UINT64_MAX);
936 seq_change(afsync->request);
937 xpthread_join(afsync->thread, NULL);
938
939 seq_destroy(afsync->request);
940 seq_destroy(afsync->complete);
941
942 free(afsync);
943
944 return next;
945 }
946
947 static struct afsync *
948 ovsdb_log_get_afsync(struct ovsdb_log *log)
949 {
950 if (!log->afsync) {
951 log->afsync = afsync_create(log->stream ? fileno(log->stream) : -1, 0);
952 }
953 return log->afsync;
954 }
955
956 /* Starts committing 'log' to disk. Returns a ticket that can be passed to
957 * ovsdb_log_commit_wait() or compared against the return value of
958 * ovsdb_log_commit_progress() later. */
959 uint64_t
960 ovsdb_log_commit_start(struct ovsdb_log *log)
961 {
962 struct afsync *afsync = ovsdb_log_get_afsync(log);
963
964 uint64_t orig;
965 atomic_add_explicit(&afsync->next, 1, &orig, memory_order_acq_rel);
966
967 seq_change(afsync->request);
968
969 return orig + 1;
970 }
971
972 /* Returns a ticket value that represents the current progress of commits to
973 * 'log'. Suppose that some call to ovsdb_log_commit_start() returns X and any
974 * call ovsdb_log_commit_progress() returns Y, for the same 'log'. Then commit
975 * X is complete if and only if X <= Y. */
976 uint64_t
977 ovsdb_log_commit_progress(struct ovsdb_log *log)
978 {
979 struct afsync *afsync = ovsdb_log_get_afsync(log);
980 uint64_t cur;
981 atomic_read_explicit(&afsync->cur, &cur, memory_order_acquire);
982 return cur;
983 }
984
985 /* Causes poll_block() to wake up if and when ovsdb_log_commit_progress(log)
986 * would return at least 'goal'. */
987 void
988 ovsdb_log_commit_wait(struct ovsdb_log *log, uint64_t goal)
989 {
990 struct afsync *afsync = ovsdb_log_get_afsync(log);
991 uint64_t complete = seq_read(afsync->complete);
992 uint64_t cur = ovsdb_log_commit_progress(log);
993 if (cur < goal) {
994 seq_wait(afsync->complete, complete);
995 } else {
996 poll_immediate_wake();
997 }
998 }